1#!/usr/bin/perl -w 2 3# !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!! 4# Any files created or read by this program should be listed in 'mktables.lst' 5# Use -makelist to regenerate it. 6 7# There was an attempt when this was first rewritten to make it 5.8 8# compatible, but that has now been abandoned, and newer constructs are used 9# as convenient. 10 11# NOTE: this script can run quite slowly in older/slower systems. 12# It can also consume a lot of memory (128 MB or more), you may need 13# to raise your process resource limits (e.g. in bash, "ulimit -a" 14# to inspect, and "ulimit -d ..." or "ulimit -m ..." to set) 15 16my $start_time; 17BEGIN { # Get the time the script started running; do it at compilation to 18 # get it as close as possible 19 $start_time= time; 20} 21 22require 5.010_001; 23use strict; 24use warnings; 25use builtin qw(refaddr); 26use Carp; 27use Config; 28use File::Find; 29use File::Path; 30use File::Spec; 31use Text::Tabs; 32use re "/aa"; 33 34use feature 'state'; 35use feature 'signatures'; 36no warnings qw( experimental::builtin ); 37 38sub DEBUG () { 0 } # Set to 0 for production; 1 for development 39$| = 1 if DEBUG; 40my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/; 41 42sub NON_ASCII_PLATFORM { ord("A") != 65 } 43 44# When a new version of Unicode is published, unfortunately the algorithms for 45# dealing with various bounds, like \b{gcb}, \b{lb} may have to be updated 46# manually. The changes may or may not be backward compatible with older 47# releases. The code is in regen/mk_invlist.pl and regexec.c. Make the 48# changes, then come back here and set the variable below to what version the 49# code is expecting. If a newer version of Unicode is being compiled than 50# expected, a warning will be generated. If an older version is being 51# compiled, any bounds tests that fail in the generated test file (-maketest 52# option) will be marked as TODO. 53my $version_of_mk_invlist_bounds = v15.0.0; 54 55########################################################################## 56# 57# mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl), 58# from the Unicode database files (lib/unicore/.../*.txt), It also generates 59# a pod file and .t files, depending on option parameters. 60# 61# The structure of this file is: 62# First these introductory comments; then 63# code needed for everywhere, such as debugging stuff; then 64# code to handle input parameters; then 65# data structures likely to be of external interest (some of which depend on 66# the input parameters, so follows them; then 67# more data structures and subroutine and package (class) definitions; then 68# the small actual loop to process the input files and finish up; then 69# a __DATA__ section, for the .t tests 70# 71# This program works on all releases of Unicode so far. The outputs have been 72# scrutinized most intently for release 5.1. The others have been checked for 73# somewhat more than just sanity. It can handle all non-provisional Unicode 74# character properties in those releases. 75# 76# This program is mostly about Unicode character (or code point) properties. 77# A property describes some attribute or quality of a code point, like if it 78# is lowercase or not, its name, what version of Unicode it was first defined 79# in, or what its uppercase equivalent is. Unicode deals with these disparate 80# possibilities by making all properties into mappings from each code point 81# into some corresponding value. In the case of it being lowercase or not, 82# the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each 83# property maps each Unicode code point to a single value, called a "property 84# value". (Some more recently defined properties, map a code point to a set 85# of values.) 86# 87# When using a property in a regular expression, what is desired isn't the 88# mapping of the code point to its property's value, but the reverse (or the 89# mathematical "inverse relation"): starting with the property value, "Does a 90# code point map to it?" These are written in a "compound" form: 91# \p{property=value}, e.g., \p{category=punctuation}. This program generates 92# files containing the lists of code points that map to each such regular 93# expression property value, one file per list 94# 95# There is also a single form shortcut that Perl adds for many of the commonly 96# used properties. This happens for all binary properties, plus script, 97# general_category, and block properties. 98# 99# Thus the outputs of this program are files. There are map files, mostly in 100# the 'To' directory; and there are list files for use in regular expression 101# matching, all in subdirectories of the 'lib' directory, with each 102# subdirectory being named for the property that the lists in it are for. 103# Bookkeeping, test, and documentation files are also generated. 104 105my $matches_directory = 'lib'; # Where match (\p{}) files go. 106my $map_directory = 'To'; # Where map files go. 107 108# DATA STRUCTURES 109# 110# The major data structures of this program are Property, of course, but also 111# Table. There are two kinds of tables, very similar to each other. 112# "Match_Table" is the data structure giving the list of code points that have 113# a particular property value, mentioned above. There is also a "Map_Table" 114# data structure which gives the property's mapping from code point to value. 115# There are two structures because the match tables need to be combined in 116# various ways, such as constructing unions, intersections, complements, etc., 117# and the map ones don't. And there would be problems, perhaps subtle, if 118# a map table were inadvertently operated on in some of those ways. 119# The use of separate classes with operations defined on one but not the other 120# prevents accidentally confusing the two. 121# 122# At the heart of each table's data structure is a "Range_List", which is just 123# an ordered list of "Ranges", plus ancillary information, and methods to 124# operate on them. A Range is a compact way to store property information. 125# Each range has a starting code point, an ending code point, and a value that 126# is meant to apply to all the code points between the two end points, 127# inclusive. For a map table, this value is the property value for those 128# code points. Two such ranges could be written like this: 129# 0x41 .. 0x5A, 'Upper', 130# 0x61 .. 0x7A, 'Lower' 131# 132# Each range also has a type used as a convenience to classify the values. 133# Most ranges in this program will be Type 0, or normal, but there are some 134# ranges that have a non-zero type. These are used only in map tables, and 135# are for mappings that don't fit into the normal scheme of things. Mappings 136# that require a hash entry to communicate with utf8.c are one example; 137# another example is mappings for charnames.pm to use which indicate a name 138# that is algorithmically determinable from its code point (and the reverse). 139# These are used to significantly compact these tables, instead of listing 140# each one of the tens of thousands individually. 141# 142# In a match table, the value of a range is irrelevant (and hence the type as 143# well, which will always be 0), and arbitrarily set to the empty string. 144# Using the example above, there would be two match tables for those two 145# entries, one named Upper would contain the 0x41..0x5A range, and the other 146# named Lower would contain 0x61..0x7A. 147# 148# Actually, there are two types of range lists, "Range_Map" is the one 149# associated with map tables, and "Range_List" with match tables. 150# Again, this is so that methods can be defined on one and not the others so 151# as to prevent operating on them in incorrect ways. 152# 153# Eventually, most tables are written out to files to be read by Unicode::UCD. 154# All tables could in theory be written, but some are suppressed because there 155# is no current practical use for them. It is easy to change which get 156# written by changing various lists that are near the top of the actual code 157# in this file. The table data structures contain enough ancillary 158# information to allow them to be treated as separate entities for writing, 159# such as the path to each one's file. There is a heading in each map table 160# that gives the format of its entries, and what the map is for all the code 161# points missing from it. (This allows tables to be more compact.) 162# 163# The Property data structure contains one or more tables. All properties 164# contain a map table (except the $perl property which is a 165# pseudo-property containing only match tables), and any properties that 166# are usable in regular expression matches also contain various matching 167# tables, one for each value the property can have. A binary property can 168# have two values, True and False (or Y and N, which are preferred by Unicode 169# terminology). Thus each of these properties will have a map table that 170# takes every code point and maps it to Y or N (but having ranges cuts the 171# number of entries in that table way down), and two match tables, one 172# which has a list of all the code points that map to Y, and one for all the 173# code points that map to N. (For each binary property, a third table is also 174# generated for the pseudo Perl property. It contains the identical code 175# points as the Y table, but can be written in regular expressions, not in the 176# compound form, but in a "single" form like \p{IsUppercase}.) Many 177# properties are binary, but some properties have several possible values, 178# some have many, and properties like Name have a different value for every 179# named code point. Those will not, unless the controlling lists are changed, 180# have their match tables written out. But all the ones which can be used in 181# regular expression \p{} and \P{} constructs will. Prior to 5.14, generally 182# a property would have either its map table or its match tables written but 183# not both. Again, what gets written is controlled by lists which can easily 184# be changed. Starting in 5.14, advantage was taken of this, and all the map 185# tables needed to reconstruct the Unicode db are now written out, while 186# suppressing the Unicode .txt files that contain the data. Our tables are 187# much more compact than the .txt files, so a significant space savings was 188# achieved. Also, tables are not written out that are trivially derivable 189# from tables that do get written. So, there typically is no file containing 190# the code points not matched by a binary property (the table for \P{} versus 191# lowercase \p{}), since you just need to invert the True table to get the 192# False table. 193 194# Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on 195# how many match tables there are and the content of the maps. This 'Type' is 196# different than a range 'Type', so don't get confused by the two concepts 197# having the same name. 198# 199# For information about the Unicode properties, see Unicode's UAX44 document: 200 201my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/'; 202 203# As stated earlier, this program will work on any release of Unicode so far. 204# Most obvious problems in earlier data have NOT been corrected except when 205# necessary to make Perl or this program work reasonably, and to keep out 206# potential security issues. For example, no folding information was given in 207# early releases, so this program substitutes lower case instead, just so that 208# a regular expression with the /i option will do something that actually 209# gives the right results in many cases. There are also a couple other 210# corrections for version 1.1.5, commented at the point they are made. As an 211# example of corrections that weren't made (but could be) is this statement 212# from DerivedAge.txt: "The supplementary private use code points and the 213# non-character code points were assigned in version 2.0, but not specifically 214# listed in the UCD until versions 3.0 and 3.1 respectively." (To be precise 215# it was 3.0.1 not 3.0.0) More information on Unicode version glitches is 216# further down in these introductory comments. 217# 218# This program works on all non-provisional properties as of the current 219# Unicode release, though the files for some are suppressed for various 220# reasons. You can change which are output by changing lists in this program. 221# 222# The old version of mktables emphasized the term "Fuzzy" to mean Unicode's 223# loose matchings rules (from Unicode TR18): 224# 225# The recommended names for UCD properties and property values are in 226# PropertyAliases.txt [Prop] and PropertyValueAliases.txt 227# [PropValue]. There are both abbreviated names and longer, more 228# descriptive names. It is strongly recommended that both names be 229# recognized, and that loose matching of property names be used, 230# whereby the case distinctions, whitespace, hyphens, and underbar 231# are ignored. 232# 233# The program still allows Fuzzy to override its determination of if loose 234# matching should be used, but it isn't currently used, as it is no longer 235# needed; the calculations it makes are good enough. 236# 237# SUMMARY OF HOW IT WORKS: 238# 239# Process arguments 240# 241# A list is constructed containing each input file that is to be processed 242# 243# Each file on the list is processed in a loop, using the associated handler 244# code for each: 245# The PropertyAliases.txt and PropValueAliases.txt files are processed 246# first. These files name the properties and property values. 247# Objects are created of all the property and property value names 248# that the rest of the input should expect, including all synonyms. 249# The other input files give mappings from properties to property 250# values. That is, they list code points and say what the mapping 251# is under the given property. Some files give the mappings for 252# just one property; and some for many. This program goes through 253# each file and populates the properties and their map tables from 254# them. Some properties are listed in more than one file, and 255# Unicode has set up a precedence as to which has priority if there 256# is a conflict. Thus the order of processing matters, and this 257# program handles the conflict possibility by processing the 258# overriding input files last, so that if necessary they replace 259# earlier values. 260# After this is all done, the program creates the property mappings not 261# furnished by Unicode, but derivable from what it does give. 262# The tables of code points that match each property value in each 263# property that is accessible by regular expressions are created. 264# The Perl-defined properties are created and populated. Many of these 265# require data determined from the earlier steps 266# Any Perl-defined synonyms are created, and name clashes between Perl 267# and Unicode are reconciled and warned about. 268# All the properties are written to files 269# Any other files are written, and final warnings issued. 270# 271# For clarity, a number of operators have been overloaded to work on tables: 272# ~ means invert (take all characters not in the set). The more 273# conventional '!' is not used because of the possibility of confusing 274# it with the actual boolean operation. 275# + means union 276# - means subtraction 277# & means intersection 278# The precedence of these is the order listed. Parentheses should be 279# copiously used. These are not a general scheme. The operations aren't 280# defined for a number of things, deliberately, to avoid getting into trouble. 281# Operations are done on references and affect the underlying structures, so 282# that the copy constructors for them have been overloaded to not return a new 283# clone, but the input object itself. 284# 285# The bool operator is deliberately not overloaded to avoid confusion with 286# "should it mean if the object merely exists, or also is non-empty?". 287# 288# WHY CERTAIN DESIGN DECISIONS WERE MADE 289# 290# This program needs to be able to run under miniperl. Therefore, it uses a 291# minimum of other modules, and hence implements some things itself that could 292# be gotten from CPAN 293# 294# This program uses inputs published by the Unicode Consortium. These can 295# change incompatibly between releases without the Perl maintainers realizing 296# it. Therefore this program is now designed to try to flag these. It looks 297# at the directories where the inputs are, and flags any unrecognized files. 298# It keeps track of all the properties in the files it handles, and flags any 299# that it doesn't know how to handle. It also flags any input lines that 300# don't match the expected syntax, among other checks. 301# 302# It is also designed so if a new input file matches one of the known 303# templates, one hopefully just needs to add it to a list to have it 304# processed. 305# 306# As mentioned earlier, some properties are given in more than one file. In 307# particular, the files in the extracted directory are supposedly just 308# reformattings of the others. But they contain information not easily 309# derivable from the other files, including results for Unihan (which isn't 310# usually available to this program) and for unassigned code points. They 311# also have historically had errors or been incomplete. In an attempt to 312# create the best possible data, this program thus processes them first to 313# glean information missing from the other files; then processes those other 314# files to override any errors in the extracted ones. Much of the design was 315# driven by this need to store things and then possibly override them. 316# 317# It tries to keep fatal errors to a minimum, to generate something usable for 318# testing purposes. It always looks for files that could be inputs, and will 319# warn about any that it doesn't know how to handle (the -q option suppresses 320# the warning). 321# 322# Why is there more than one type of range? 323# This simplified things. There are some very specialized code points that 324# have to be handled specially for output, such as Hangul syllable names. 325# By creating a range type (done late in the development process), it 326# allowed this to be stored with the range, and overridden by other input. 327# Originally these were stored in another data structure, and it became a 328# mess trying to decide if a second file that was for the same property was 329# overriding the earlier one or not. 330# 331# Why are there two kinds of tables, match and map? 332# (And there is a base class shared by the two as well.) As stated above, 333# they actually are for different things. Development proceeded much more 334# smoothly when I (khw) realized the distinction. Map tables are used to 335# give the property value for every code point (actually every code point 336# that doesn't map to a default value). Match tables are used for regular 337# expression matches, and are essentially the inverse mapping. Separating 338# the two allows more specialized methods, and error checks so that one 339# can't just take the intersection of two map tables, for example, as that 340# is nonsensical. 341# 342# What about 'fate' and 'status'. The concept of a table's fate was created 343# late when it became clear that something more was needed. The difference 344# between this and 'status' is unclean, and could be improved if someone 345# wanted to spend the effort. 346# 347# DEBUGGING 348# 349# This program is written so it will run under miniperl. Occasionally changes 350# will cause an error where the backtrace doesn't work well under miniperl. 351# To diagnose the problem, you can instead run it under regular perl, if you 352# have one compiled. 353# 354# There is a good trace facility. To enable it, first sub DEBUG must be set 355# to return true. Then a line like 356# 357# local $to_trace = 1 if main::DEBUG; 358# 359# can be added to enable tracing in its lexical scope (plus dynamic) or until 360# you insert another line: 361# 362# local $to_trace = 0 if main::DEBUG; 363# 364# To actually trace, use a line like "trace $a, @b, %c, ...; 365# 366# Some of the more complex subroutines already have trace statements in them. 367# Permanent trace statements should be like: 368# 369# trace ... if main::DEBUG && $to_trace; 370# 371# main::stack_trace() will display what its name implies 372# 373# If there is just one or a few files that you're debugging, you can easily 374# cause most everything else to be skipped. Change the line 375# 376# my $debug_skip = 0; 377# 378# to 1, and every file whose object is in @input_file_objects and doesn't have 379# a, 'non_skip => 1,' in its constructor will be skipped. However, skipping 380# Jamo.txt or UnicodeData.txt will likely cause fatal errors. 381# 382# To compare the output tables, it may be useful to specify the -annotate 383# flag. (As of this writing, this can't be done on a clean workspace, due to 384# requirements in Text::Tabs used in this option; so first run mktables 385# without this option.) This option adds comment lines to each table, one for 386# each non-algorithmically named character giving, currently its code point, 387# name, and graphic representation if printable (and you have a font that 388# knows about it). This makes it easier to see what the particular code 389# points are in each output table. Non-named code points are annotated with a 390# description of their status, and contiguous ones with the same description 391# will be output as a range rather than individually. Algorithmically named 392# characters are also output as ranges, except when there are just a few 393# contiguous ones. 394# 395# FUTURE ISSUES 396# 397# The program would break if Unicode were to change its names so that 398# interior white space, underscores, or dashes differences were significant 399# within property and property value names. 400# 401# It might be easier to use the xml versions of the UCD if this program ever 402# would need heavy revision, and the ability to handle old versions was not 403# required. 404# 405# There is the potential for name collisions, in that Perl has chosen names 406# that Unicode could decide it also likes. There have been such collisions in 407# the past, with mostly Perl deciding to adopt the Unicode definition of the 408# name. However in the 5.2 Unicode beta testing, there were a number of such 409# collisions, which were withdrawn before the final release, because of Perl's 410# and other's protests. These all involved new properties which began with 411# 'Is'. Based on the protests, Unicode is unlikely to try that again. Also, 412# many of the Perl-defined synonyms, like Any, Word, etc, are listed in a 413# Unicode document, so they are unlikely to be used by Unicode for another 414# purpose. However, they might try something beginning with 'In', or use any 415# of the other Perl-defined properties. This program will warn you of name 416# collisions, and refuse to generate tables with them, but manual intervention 417# will be required in this event. One scheme that could be implemented, if 418# necessary, would be to have this program generate another file, or add a 419# field to mktables.lst that gives the date of first definition of a property. 420# Each new release of Unicode would use that file as a basis for the next 421# iteration. And the Perl synonym addition code could sort based on the age 422# of the property, so older properties get priority, and newer ones that clash 423# would be refused; hence existing code would not be impacted, and some other 424# synonym would have to be used for the new property. This is ugly, and 425# manual intervention would certainly be easier to do in the short run; lets 426# hope it never comes to this. 427# 428# A NOTE ON UNIHAN 429# 430# This program can generate tables from the Unihan database. But that DB 431# isn't normally available, so it is marked as optional. Prior to version 432# 5.2, this database was in a single file, Unihan.txt. In 5.2 the database 433# was split into 8 different files, all beginning with the letters 'Unihan'. 434# If you plunk those files down into the directory mktables ($0) is in, this 435# program will read them and automatically create tables for the properties 436# from it that are listed in PropertyAliases.txt and PropValueAliases.txt, 437# plus any you add to the @cjk_properties array and the @cjk_property_values 438# array, being sure to add necessary '# @missings' lines to the latter. For 439# Unicode versions earlier than 5.2, most of the Unihan properties are not 440# listed at all in PropertyAliases nor PropValueAliases. This program assumes 441# for these early releases that you want the properties that are specified in 442# the 5.2 release. 443# 444# You may need to adjust the entries to suit your purposes. setup_unihan(), 445# and filter_unihan_line() are the functions where this is done. This program 446# already does some adjusting to make the lines look more like the rest of the 447# Unicode DB; You can see what that is in filter_unihan_line() 448# 449# There is a bug in the 3.2 data file in which some values for the 450# kPrimaryNumeric property have commas and an unexpected comment. A filter 451# could be added to correct these; or for a particular installation, the 452# Unihan.txt file could be edited to fix them. 453# 454# HOW TO ADD A FILE TO BE PROCESSED 455# 456# A new file from Unicode needs to have an object constructed for it in 457# @input_file_objects, probably at the end or at the end of the extracted 458# ones. The program should warn you if its name will clash with others on 459# restrictive file systems, like DOS. If so, figure out a better name, and 460# add lines to the README.perl file giving that. If the file is a character 461# property, it should be in the format that Unicode has implicitly 462# standardized for such files for the more recently introduced ones. 463# If so, the Input_file constructor for @input_file_objects can just be the 464# file name and release it first appeared in. If not, then it should be 465# possible to construct an each_line_handler() to massage the line into the 466# standardized form. 467# 468# For non-character properties, more code will be needed. You can look at 469# the existing entries for clues. 470# 471# UNICODE VERSIONS NOTES 472# 473# The Unicode UCD has had a number of errors in it over the versions. And 474# these remain, by policy, in the standard for that version. Therefore it is 475# risky to correct them, because code may be expecting the error. So this 476# program doesn't generally make changes, unless the error breaks the Perl 477# core. As an example, some versions of 2.1.x Jamo.txt have the wrong value 478# for U+1105, which causes real problems for the algorithms for Jamo 479# calculations, so it is changed here. 480# 481# But it isn't so clear cut as to what to do about concepts that are 482# introduced in a later release; should they extend back to earlier releases 483# where the concept just didn't exist? It was easier to do this than to not, 484# so that's what was done. For example, the default value for code points not 485# in the files for various properties was probably undefined until changed by 486# some version. No_Block for blocks is such an example. This program will 487# assign No_Block even in Unicode versions that didn't have it. This has the 488# benefit that code being written doesn't have to special case earlier 489# versions; and the detriment that it doesn't match the Standard precisely for 490# the affected versions. 491# 492# Here are some observations about some of the issues in early versions: 493# 494# Prior to version 3.0, there were 3 character decompositions. These are not 495# handled by Unicode::Normalize, nor will it compile when presented a version 496# that has them. However, you can trivially get it to compile by simply 497# ignoring those decompositions, by changing the croak to a carp. At the time 498# of this writing, the line (in dist/Unicode-Normalize/Normalize.pm or 499# dist/Unicode-Normalize/mkheader) reads 500# 501# croak("Weird Canonical Decomposition of U+$h"); 502# 503# Simply comment it out. It will compile, but will not know about any three 504# character decompositions. 505 506# The number of code points in \p{alpha=True} halved in 2.1.9. It turns out 507# that the reason is that the CJK block starting at 4E00 was removed from 508# PropList, and was not put back in until 3.1.0. The Perl extension (the 509# single property name \p{alpha}) has the correct values. But the compound 510# form is simply not generated until 3.1, as it can be argued that prior to 511# this release, this was not an official property. The comments for 512# filter_old_style_proplist() give more details. 513# 514# Unicode introduced the synonym Space for White_Space in 4.1. Perl has 515# always had a \p{Space}. In release 3.2 only, they are not synonymous. The 516# reason is that 3.2 introduced U+205F=medium math space, which was not 517# classed as white space, but Perl figured out that it should have been. 4.0 518# reclassified it correctly. 519# 520# Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2 521# this was erroneously a synonym for 202 (it should be 200). In 4.0, ATB 522# became 202, and ATBL was left with no code points, as all the ones that 523# mapped to 202 stayed mapped to 202. Thus if your program used the numeric 524# name for the class, it would not have been affected, but if it used the 525# mnemonic, it would have been. 526# 527# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that, code 528# points which eventually came to have this script property value, instead 529# mapped to "Unknown". But in the next release all these code points were 530# moved to \p{sc=common} instead. 531 532# The tests furnished by Unicode for testing WordBreak and SentenceBreak 533# generate errors in 5.0 and earlier. 534# 535# The default for missing code points for BidiClass is complicated. Starting 536# in 3.1.1, the derived file DBidiClass.txt handles this, but this program 537# tries to do the best it can for earlier releases. It is done in 538# process_PropertyAliases() 539# 540# In version 2.1.2, the entry in UnicodeData.txt: 541# 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F; 542# should instead be 543# 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F 544# Without this change, there are casing problems for this character. 545# 546# Search for $string_compare_versions to see how to compare changes to 547# properties between Unicode versions 548# 549############################################################################## 550 551my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing 552 # and errors 553my $MAX_LINE_WIDTH = 78; 554 555# Debugging aid to skip most files so as to not be distracted by them when 556# concentrating on the ones being debugged. Add 557# non_skip => 1, 558# to the constructor for those files you want processed when you set this. 559# Files with a first version number of 0 are special: they are always 560# processed regardless of the state of this flag. Generally, Jamo.txt and 561# UnicodeData.txt must not be skipped if you want this program to not die 562# before normal completion. 563my $debug_skip = 0; 564 565 566# Normally these are suppressed. 567my $write_Unicode_deprecated_tables = 0; 568 569# Set to 1 to enable tracing. 570our $to_trace = 0; 571 572{ # Closure for trace: debugging aid 573 my $print_caller = 1; # ? Include calling subroutine name 574 my $main_with_colon = 'main::'; 575 my $main_colon_length = length($main_with_colon); 576 577 sub trace { 578 return unless $to_trace; # Do nothing if global flag not set 579 580 my @input = @_; 581 582 local $DB::trace = 0; 583 $DB::trace = 0; # Quiet 'used only once' message 584 585 my $line_number; 586 587 # Loop looking up the stack to get the first non-trace caller 588 my $caller_line; 589 my $caller_name; 590 my $i = 0; 591 do { 592 $line_number = $caller_line; 593 (my $pkg, my $file, $caller_line, my $caller) = caller $i++; 594 $caller = $main_with_colon unless defined $caller; 595 596 $caller_name = $caller; 597 598 # get rid of pkg 599 $caller_name =~ s/.*:://; 600 if (substr($caller_name, 0, $main_colon_length) 601 eq $main_with_colon) 602 { 603 $caller_name = substr($caller_name, $main_colon_length); 604 } 605 606 } until ($caller_name ne 'trace'); 607 608 # If the stack was empty, we were called from the top level 609 $caller_name = 'main' if ($caller_name eq "" 610 || $caller_name eq 'trace'); 611 612 my $output = ""; 613 #print STDERR __LINE__, ": ", join ", ", @input, "\n"; 614 foreach my $string (@input) { 615 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') { 616 $output .= simple_dumper($string); 617 } 618 else { 619 $string = "$string" if ref $string; 620 $string = $UNDEF unless defined $string; 621 chomp $string; 622 $string = '""' if $string eq ""; 623 $output .= " " if $output ne "" 624 && $string ne "" 625 && substr($output, -1, 1) ne " " 626 && substr($string, 0, 1) ne " "; 627 $output .= $string; 628 } 629 } 630 631 print STDERR sprintf "%4d: ", $line_number if defined $line_number; 632 print STDERR "$caller_name: " if $print_caller; 633 print STDERR $output, "\n"; 634 return; 635 } 636} 637 638sub stack_trace() { 639 local $to_trace = 1 if main::DEBUG; 640 my $line = (caller(0))[2]; 641 my $i = 1; 642 643 # Accumulate the stack trace 644 while (1) { 645 my ($pkg, $file, $caller_line, $caller) = caller $i++; 646 647 last unless defined $caller; 648 649 trace "called from $caller() at line $line"; 650 $line = $caller_line; 651 } 652} 653 654# This is for a rarely used development feature that allows you to compare two 655# versions of the Unicode standard without having to deal with changes caused 656# by the code points introduced in the later version. You probably also want 657# to use the -annotate option when using this. Run this program on a unicore 658# containing the starting release you want to compare. Save that output 659# structure. Then, switching to a unicore with the ending release, change the 660# "" in the $string_compare_versions definition just below to a string 661# containing a SINGLE dotted Unicode release number (e.g. "2.1") corresponding 662# to the starting release. This program will then compile, but throw away all 663# code points introduced after the starting release. Finally use a diff tool 664# to compare the two directory structures. They include only the code points 665# common to both releases, and you can see the changes caused just by the 666# underlying release semantic changes. For versions earlier than 3.2, you 667# must copy a version of DAge.txt into the directory. 668my $string_compare_versions = DEBUG && ""; 669my $compare_versions = DEBUG 670 && $string_compare_versions 671 && pack "C*", split /\./, $string_compare_versions; 672 673sub uniques { 674 # Returns non-duplicated input values. From "Perl Best Practices: 675 # Encapsulated Cleverness". p. 455 in first edition. 676 677 my %seen; 678 # Arguably this breaks encapsulation, if the goal is to permit multiple 679 # distinct objects to stringify to the same value, and be interchangeable. 680 # However, for this program, no two objects stringify identically, and all 681 # lists passed to this function are either objects or strings. So this 682 # doesn't affect correctness, but it does give a couple of percent speedup. 683 no overloading; 684 return grep { ! $seen{$_}++ } @_; 685} 686 687$0 = File::Spec->canonpath($0); 688 689my $make_test_script = 0; # ? Should we output a test script 690my $make_norm_test_script = 0; # ? Should we output a normalization test script 691my $write_unchanged_files = 0; # ? Should we update the output files even if 692 # we don't think they have changed 693my $use_directory = ""; # ? Should we chdir somewhere. 694my $pod_directory; # input directory to store the pod file. 695my $pod_file = 'perluniprops'; 696my $t_path; # Path to the .t test file 697my $file_list = 'mktables.lst'; # File to store input and output file names. 698 # This is used to speed up the build, by not 699 # executing the main body of the program if 700 # nothing on the list has changed since the 701 # previous build 702my $make_list = 1; # ? Should we write $file_list. Set to always 703 # make a list so that when the release manager 704 # is preparing a release, they won't have to do 705 # special things 706my $glob_list = 0; # ? Should we try to include unknown .txt files 707 # in the input. 708my $output_range_counts = $debugging_build; # ? Should we include the number 709 # of code points in ranges in 710 # the output 711my $annotate = 0; # ? Should character names be in the output 712 713# Verbosity levels; 0 is quiet 714my $NORMAL_VERBOSITY = 1; 715my $PROGRESS = 2; 716my $VERBOSE = 3; 717 718my $verbosity = $NORMAL_VERBOSITY; 719 720# Stored in mktables.lst so that if this program is called with different 721# options, will regenerate even if the files otherwise look like they're 722# up-to-date. 723my $command_line_arguments = join " ", @ARGV; 724 725# Process arguments 726while (@ARGV) { 727 my $arg = shift @ARGV; 728 if ($arg eq '-v') { 729 $verbosity = $VERBOSE; 730 } 731 elsif ($arg eq '-p') { 732 $verbosity = $PROGRESS; 733 $| = 1; # Flush buffers as we go. 734 } 735 elsif ($arg eq '-q') { 736 $verbosity = 0; 737 } 738 elsif ($arg eq '-w') { 739 # update the files even if they haven't changed 740 $write_unchanged_files = 1; 741 } 742 elsif ($arg eq '-check') { 743 my $this = shift @ARGV; 744 my $ok = shift @ARGV; 745 if ($this ne $ok) { 746 print "Skipping as check params are not the same.\n"; 747 exit(0); 748 } 749 } 750 elsif ($arg eq '-P' && defined ($pod_directory = shift)) { 751 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist"; 752 } 753 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift))) 754 { 755 $make_test_script = 1; 756 } 757 elsif ($arg eq '-makenormtest') 758 { 759 $make_norm_test_script = 1; 760 } 761 elsif ($arg eq '-makelist') { 762 $make_list = 1; 763 } 764 elsif ($arg eq '-C' && defined ($use_directory = shift)) { 765 -d $use_directory or croak "Unknown directory '$use_directory'"; 766 } 767 elsif ($arg eq '-L') { 768 769 # Existence not tested until have chdir'd 770 $file_list = shift; 771 } 772 elsif ($arg eq '-globlist') { 773 $glob_list = 1; 774 } 775 elsif ($arg eq '-c') { 776 $output_range_counts = ! $output_range_counts 777 } 778 elsif ($arg eq '-annotate') { 779 $annotate = 1; 780 $debugging_build = 1; 781 $output_range_counts = 1; 782 } 783 else { 784 my $with_c = 'with'; 785 $with_c .= 'out' if $output_range_counts; # Complements the state 786 croak <<END; 787usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ] 788 [ -T test_file_path ] [-globlist] [-makelist] [-maketest] 789 [-check A B ] 790 -c : Output comments $with_c number of code points in ranges 791 -q : Quiet Mode: Only output serious warnings. 792 -p : Set verbosity level to normal plus show progress. 793 -v : Set Verbosity level high: Show progress and non-serious 794 warnings 795 -w : Write files regardless 796 -C dir : Change to this directory before proceeding. All relative paths 797 except those specified by the -P and -T options will be done 798 with respect to this directory. 799 -P dir : Output $pod_file file to directory 'dir'. 800 -T path : Create a test script as 'path'; overrides -maketest 801 -L filelist : Use alternate 'filelist' instead of standard one 802 -globlist : Take as input all non-Test *.txt files in current and sub 803 directories 804 -maketest : Make test script 'TestProp.pl' in current (or -C directory), 805 overrides -T 806 -makelist : Rewrite the file list $file_list based on current setup 807 -annotate : Output an annotation for each character in the table files; 808 useful for debugging mktables, looking at diffs; but is slow 809 and memory intensive 810 -check A B : Executes $0 only if A and B are the same 811END 812 } 813} 814 815# Stores the most-recently changed file. If none have changed, can skip the 816# build 817my $most_recent = (stat $0)[9]; # Do this before the chdir! 818 819# Change directories now, because need to read 'version' early. 820if ($use_directory) { 821 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) { 822 $pod_directory = File::Spec->rel2abs($pod_directory); 823 } 824 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) { 825 $t_path = File::Spec->rel2abs($t_path); 826 } 827 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!"; 828 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) { 829 $pod_directory = File::Spec->abs2rel($pod_directory); 830 } 831 if ($t_path && File::Spec->file_name_is_absolute($t_path)) { 832 $t_path = File::Spec->abs2rel($t_path); 833 } 834} 835 836# Get Unicode version into regular and v-string. This is done now because 837# various tables below get populated based on it. These tables are populated 838# here to be near the top of the file, and so easily seeable by those needing 839# to modify things. 840open my $VERSION, "<", "version" 841 or croak "$0: can't open required file 'version': $!\n"; 842my $string_version = <$VERSION>; 843close $VERSION; 844chomp $string_version; 845my $v_version = pack "C*", split /\./, $string_version; # v string 846 847my $unicode_version = ($compare_versions) 848 ? ( "$string_compare_versions (using " 849 . "$string_version rules)") 850 : $string_version; 851 852# The following are the complete names of properties with property values that 853# are known to not match any code points in some versions of Unicode, but that 854# may change in the future so they should be matchable, hence an empty file is 855# generated for them. 856my @tables_that_may_be_empty; 857push @tables_that_may_be_empty, 'Joining_Type=Left_Joining' 858 if $v_version lt v6.3.0; 859push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1; 860push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0; 861push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana' 862 if $v_version ge v4.1.0; 863push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana' 864 if $v_version ge v6.0.0; 865push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend' 866 if $v_version ge v6.1.0; 867push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133' 868 if $v_version ge v6.2.0; 869 870# The lists below are hashes, so the key is the item in the list, and the 871# value is the reason why it is in the list. This makes generation of 872# documentation easier. 873 874my %why_suppressed; # No file generated for these. 875 876# Files aren't generated for empty extraneous properties. This is arguable. 877# Extraneous properties generally come about because a property is no longer 878# used in a newer version of Unicode. If we generated a file without code 879# points, programs that used to work on that property will still execute 880# without errors. It just won't ever match (or will always match, with \P{}). 881# This means that the logic is now likely wrong. I (khw) think its better to 882# find this out by getting an error message. Just move them to the table 883# above to change this behavior 884my %why_suppress_if_empty_warn_if_not = ( 885 886 # It is the only property that has ever officially been removed from the 887 # Standard. The database never contained any code points for it. 888 'Special_Case_Condition' => 'Obsolete', 889 890 # Apparently never official, but there were code points in some versions of 891 # old-style PropList.txt 892 'Non_Break' => 'Obsolete', 893); 894 895# These would normally go in the warn table just above, but they were changed 896# a long time before this program was written, so warnings about them are 897# moot. 898if ($v_version gt v3.2.0) { 899 push @tables_that_may_be_empty, 900 'Canonical_Combining_Class=Attached_Below_Left' 901} 902 903# Obsoleted 904if ($v_version ge v11.0.0) { 905 push @tables_that_may_be_empty, qw( 906 Grapheme_Cluster_Break=E_Base 907 Grapheme_Cluster_Break=E_Base_GAZ 908 Grapheme_Cluster_Break=E_Modifier 909 Grapheme_Cluster_Break=Glue_After_Zwj 910 Word_Break=E_Base 911 Word_Break=E_Base_GAZ 912 Word_Break=E_Modifier 913 Word_Break=Glue_After_Zwj); 914} 915 916# Enum values for to_output_map() method in the Map_Table package. (0 is don't 917# output) 918my $EXTERNAL_MAP = 1; 919my $INTERNAL_MAP = 2; 920my $OUTPUT_ADJUSTED = 3; 921 922# To override computed values for writing the map tables for these properties. 923# The default for enum map tables is to write them out, so that the Unicode 924# .txt files can be removed, but all the data to compute any property value 925# for any code point is available in a more compact form. 926my %global_to_output_map = ( 927 # Needed by UCD.pm, but don't want to publicize that it exists, so won't 928 # get stuck supporting it if things change. Since it is a STRING 929 # property, it normally would be listed in the pod, but INTERNAL_MAP 930 # suppresses that. 931 Unicode_1_Name => $INTERNAL_MAP, 932 933 Present_In => 0, # Suppress, as easily computed from Age 934 Block => (NON_ASCII_PLATFORM) ? 1 : 0, # Suppress, as Blocks.txt is 935 # retained, but needed for 936 # non-ASCII 937 938 # Suppress, as mapping can be found instead from the 939 # Perl_Decomposition_Mapping file 940 Decomposition_Type => 0, 941); 942 943# There are several types of obsolete properties defined by Unicode. These 944# must be hand-edited for every new Unicode release. 945my %why_deprecated; # Generates a deprecated warning message if used. 946my %why_stabilized; # Documentation only 947my %why_obsolete; # Documentation only 948 949{ # Closure 950 my $simple = 'Perl uses the more complete version'; 951 my $unihan = 'Unihan properties are by default not enabled in the Perl core.'; 952 953 my $other_properties = 'other properties'; 954 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone"; 955 my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character."; 956 957 %why_deprecated = ( 958 'Grapheme_Link' => 'Duplicates ccc=vr (Canonical_Combining_Class=Virama)', 959 'Jamo_Short_Name' => $contributory, 960 'Line_Break=Surrogate' => 'Surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking', 961 'Other_Alphabetic' => $contributory, 962 'Other_Default_Ignorable_Code_Point' => $contributory, 963 'Other_Grapheme_Extend' => $contributory, 964 'Other_ID_Continue' => $contributory, 965 'Other_ID_Start' => $contributory, 966 'Other_Lowercase' => $contributory, 967 'Other_Math' => $contributory, 968 'Other_Uppercase' => $contributory, 969 'Expands_On_NFC' => $why_no_expand, 970 'Expands_On_NFD' => $why_no_expand, 971 'Expands_On_NFKC' => $why_no_expand, 972 'Expands_On_NFKD' => $why_no_expand, 973 ); 974 975 %why_suppressed = ( 976 # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which 977 # contains the same information, but without the algorithmically 978 # determinable Hangul syllables'. This file is not published, so it's 979 # existence is not noted in the comment. 980 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::', 981 982 # Don't suppress ISO_Comment, as otherwise special handling is needed 983 # to differentiate between it and gc=c, which can be written as 'isc', 984 # which is the same characters as ISO_Comment's short name. 985 986 'Name' => "Accessible via \\N{...} or 'use charnames;' or charprop() or prop_invmap() in Unicode::UCD::", 987 988 'Simple_Case_Folding' => "$simple. Can access this through casefold(), charprop(), or prop_invmap() in Unicode::UCD", 989 'Simple_Lowercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD", 990 'Simple_Titlecase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD", 991 'Simple_Uppercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD", 992 993 FC_NFKC_Closure => 'Deprecated by Unicode, and supplanted in usage by NFKC_Casefold; otherwise not useful', 994 ); 995 996 foreach my $property ( 997 998 # The following are suppressed because they were made contributory 999 # or deprecated by Unicode before Perl ever thought about 1000 # supporting them. 1001 'Jamo_Short_Name', 1002 'Grapheme_Link', 1003 'Expands_On_NFC', 1004 'Expands_On_NFD', 1005 'Expands_On_NFKC', 1006 'Expands_On_NFKD', 1007 1008 # The following are suppressed because they have been marked 1009 # as deprecated for a sufficient amount of time 1010 'Other_Alphabetic', 1011 'Other_Default_Ignorable_Code_Point', 1012 'Other_Grapheme_Extend', 1013 'Other_ID_Continue', 1014 'Other_ID_Start', 1015 'Other_Lowercase', 1016 'Other_Math', 1017 'Other_Uppercase', 1018 ) { 1019 $why_suppressed{$property} = $why_deprecated{$property}; 1020 } 1021 1022 # Customize the message for all the 'Other_' properties 1023 foreach my $property (keys %why_deprecated) { 1024 next if (my $main_property = $property) !~ s/^Other_//; 1025 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/; 1026 } 1027} 1028 1029if ($write_Unicode_deprecated_tables) { 1030 foreach my $property (keys %why_suppressed) { 1031 delete $why_suppressed{$property} if $property =~ 1032 / ^ Other | Grapheme /x; 1033 } 1034} 1035 1036if ($v_version ge 4.0.0) { 1037 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14'; 1038 if ($v_version ge 6.0.0) { 1039 $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14'; 1040 } 1041} 1042if ($v_version ge 5.2.0 && $v_version lt 6.0.0) { 1043 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed'; 1044 if ($v_version ge 6.0.0) { 1045 $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed'; 1046 } 1047} 1048 1049# Probably obsolete forever 1050if ($v_version ge v4.1.0) { 1051 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common".'; 1052} 1053if ($v_version ge v6.0.0) { 1054 $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana" (or both)'; 1055 $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"'; 1056} 1057 1058# This program can create files for enumerated-like properties, such as 1059# 'Numeric_Type'. This file would be the same format as for a string 1060# property, with a mapping from code point to its value, so you could look up, 1061# for example, the script a code point is in. But no one so far wants this 1062# mapping, or they have found another way to get it since this is a new 1063# feature. So no file is generated except if it is in this list. 1064my @output_mapped_properties = split "\n", <<END; 1065END 1066 1067# If you want more Unihan properties than the default, you need to add them to 1068# these arrays. Depending on the property type, @missing lines might have to 1069# be added to the second array. A sample entry would be (including the '#'): 1070# @missing: 0000..10FFFF; cjkAccountingNumeric; NaN 1071my @cjk_properties = split "\n", <<'END'; 1072END 1073my @cjk_property_values = split "\n", <<'END'; 1074END 1075 1076# The input files don't list every code point. Those not listed are to be 1077# defaulted to some value. Below are hard-coded what those values are for 1078# non-binary properties as of 5.1. Starting in 5.0, there are 1079# machine-parsable comment lines in the files that give the defaults; so this 1080# list shouldn't have to be extended. The claim is that all missing entries 1081# for binary properties will default to 'N'. Unicode tried to change that in 1082# 5.2, but the beta period produced enough protest that they backed off. 1083# 1084# The defaults for the fields that appear in UnicodeData.txt in this hash must 1085# be in the form that it expects. The others may be synonyms. 1086my $CODE_POINT = '<code point>'; 1087my %default_mapping = ( 1088 Age => "Unassigned", 1089 # Bidi_Class => Complicated; set in code 1090 Bidi_Mirroring_Glyph => "", 1091 Block => 'No_Block', 1092 Canonical_Combining_Class => 0, 1093 Case_Folding => $CODE_POINT, 1094 Decomposition_Mapping => $CODE_POINT, 1095 Decomposition_Type => 'None', 1096 East_Asian_Width => "Neutral", 1097 FC_NFKC_Closure => $CODE_POINT, 1098 General_Category => ($v_version le 6.3.0) ? 'Cn' : 'Unassigned', 1099 Grapheme_Cluster_Break => 'Other', 1100 Hangul_Syllable_Type => 'NA', 1101 ISO_Comment => "", 1102 Jamo_Short_Name => "", 1103 Joining_Group => "No_Joining_Group", 1104 # Joining_Type => Complicated; set in code 1105 kIICore => 'N', # Is converted to binary 1106 #Line_Break => Complicated; set in code 1107 Lowercase_Mapping => $CODE_POINT, 1108 Name => "", 1109 Name_Alias => "", 1110 NFC_QC => 'Yes', 1111 NFD_QC => 'Yes', 1112 NFKC_QC => 'Yes', 1113 NFKD_QC => 'Yes', 1114 Numeric_Type => 'None', 1115 Numeric_Value => 'NaN', 1116 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown', 1117 Sentence_Break => 'Other', 1118 Simple_Case_Folding => $CODE_POINT, 1119 Simple_Lowercase_Mapping => $CODE_POINT, 1120 Simple_Titlecase_Mapping => $CODE_POINT, 1121 Simple_Uppercase_Mapping => $CODE_POINT, 1122 Titlecase_Mapping => $CODE_POINT, 1123 Unicode_1_Name => "", 1124 Unicode_Radical_Stroke => "", 1125 Uppercase_Mapping => $CODE_POINT, 1126 Word_Break => 'Other', 1127); 1128 1129### End of externally interesting definitions, except for @input_file_objects 1130 1131my $HEADER=<<"EOF"; 1132# !!!!!!! DO NOT EDIT THIS FILE !!!!!!! 1133# This file is machine-generated by $0 from the Unicode 1134# database, Version $unicode_version. Any changes made here will be lost! 1135EOF 1136 1137my $INTERNAL_ONLY_HEADER = <<"EOF"; 1138 1139# !!!!!!! INTERNAL PERL USE ONLY !!!!!!! 1140# This file is for internal use by core Perl only. The format and even the 1141# name or existence of this file are subject to change without notice. Don't 1142# use it directly. Use Unicode::UCD to access the Unicode character data 1143# base. 1144EOF 1145 1146my $DEVELOPMENT_ONLY=<<"EOF"; 1147# !!!!!!! DEVELOPMENT USE ONLY !!!!!!! 1148# This file contains information artificially constrained to code points 1149# present in Unicode release $string_compare_versions. 1150# IT CANNOT BE RELIED ON. It is for use during development only and should 1151# not be used for production. 1152 1153EOF 1154 1155my $MAX_UNICODE_CODEPOINT_STRING = ($v_version ge v2.0.0) 1156 ? "10FFFF" 1157 : "FFFF"; 1158my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING; 1159my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1; 1160 1161# We work with above-Unicode code points, up to IV_MAX, but we may want to use 1162# sentinels above that number. Therefore for internal use, we use a much 1163# smaller number, translating it to IV_MAX only for output. The exact number 1164# is immaterial (all above-Unicode code points are treated exactly the same), 1165# but the algorithm requires it to be at least 1166# 2 * $MAX_UNICODE_CODEPOINTS + 1 1167my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8; 1168my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1; 1169my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT); 1170 1171my $MAX_PLATFORM_CODEPOINT = ~0 >> 1; 1172 1173# Matches legal code point. 4-6 hex numbers, If there are 6, the first 1174# two must be 10; if there are 5, the first must not be a 0. Written this way 1175# to decrease backtracking. The first regex allows the code point to be at 1176# the end of a word, but to work properly, the word shouldn't end with a valid 1177# hex character. The second one won't match a code point at the end of a 1178# word, and doesn't have the run-on issue 1179my $run_on_code_point_re = 1180 qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x; 1181my $code_point_re = qr/\b$run_on_code_point_re/; 1182 1183# This matches the beginning of the line in the Unicode DB files that give the 1184# defaults for code points not listed (i.e., missing) in the file. The code 1185# depends on this ending with a semi-colon, so it can assume it is a valid 1186# field when the line is split() by semi-colons 1187my $missing_defaults_prefix = qr/ ^ \# \s+ \@missing: \s+ 1188 ($code_point_re) 1189 \.\. 1190 ($code_point_re) 1191 \s* ; 1192 /x; 1193 1194# Property types. Unicode has more types, but these are sufficient for our 1195# purposes. 1196my $UNKNOWN = -1; # initialized to illegal value 1197my $NON_STRING = 1; # Either binary or enum 1198my $BINARY = 2; 1199my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal 1200 # tables, additional true and false tables are 1201 # generated so that false is anything matching the 1202 # default value, and true is everything else. 1203my $ENUM = 4; # Include catalog 1204my $STRING = 5; # Anything else: string or misc 1205 1206# Some input files have lines that give default values for code points not 1207# contained in the file. Sometimes these should be ignored. 1208my $NO_DEFAULTS = 0; # Must evaluate to false 1209my $NOT_IGNORED = 1; 1210my $IGNORED = 2; 1211 1212# Range types. Each range has a type. Most ranges are type 0, for normal, 1213# and will appear in the main body of the tables in the output files, but 1214# there are other types of ranges as well, listed below, that are specially 1215# handled. There are pseudo-types as well that will never be stored as a 1216# type, but will affect the calculation of the type. 1217 1218# 0 is for normal, non-specials 1219my $MULTI_CP = 1; # Sequence of more than code point 1220my $HANGUL_SYLLABLE = 2; 1221my $CP_IN_NAME = 3; # The NAME contains the code point appended to it. 1222my $NULL = 4; # The map is to the null string; utf8.c can't 1223 # handle these, nor is there an accepted syntax 1224 # for them in \p{} constructs 1225my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would 1226 # otherwise be $MULTI_CP type are instead type 0 1227 1228# process_generic_property_file() can accept certain overrides in its input. 1229# Each of these must begin AND end with $CMD_DELIM. 1230my $CMD_DELIM = "\a"; 1231my $REPLACE_CMD = 'replace'; # Override the Replace 1232my $MAP_TYPE_CMD = 'map_type'; # Override the Type 1233 1234my $NO = 0; 1235my $YES = 1; 1236 1237# Values for the Replace argument to add_range. 1238# $NO # Don't replace; add only the code points not 1239 # already present. 1240my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in 1241 # the comments at the subroutine definition. 1242my $UNCONDITIONALLY = 2; # Replace without conditions. 1243my $MULTIPLE_BEFORE = 4; # Don't replace, but add a duplicate record if 1244 # already there 1245my $MULTIPLE_AFTER = 5; # Don't replace, but add a duplicate record if 1246 # already there 1247my $CROAK = 6; # Die with an error if is already there 1248 1249# Flags to give property statuses. The phrases are to remind maintainers that 1250# if the flag is changed, the indefinite article referring to it in the 1251# documentation may need to be as well. 1252my $NORMAL = ""; 1253my $DEPRECATED = 'D'; 1254my $a_bold_deprecated = "a 'B<$DEPRECATED>'"; 1255my $A_bold_deprecated = "A 'B<$DEPRECATED>'"; 1256my $DISCOURAGED = 'X'; 1257my $a_bold_discouraged = "an 'B<$DISCOURAGED>'"; 1258my $A_bold_discouraged = "An 'B<$DISCOURAGED>'"; 1259my $STRICTER = 'T'; 1260my $a_bold_stricter = "a 'B<$STRICTER>'"; 1261my $A_bold_stricter = "A 'B<$STRICTER>'"; 1262my $STABILIZED = 'S'; 1263my $a_bold_stabilized = "an 'B<$STABILIZED>'"; 1264my $A_bold_stabilized = "An 'B<$STABILIZED>'"; 1265my $OBSOLETE = 'O'; 1266my $a_bold_obsolete = "an 'B<$OBSOLETE>'"; 1267my $A_bold_obsolete = "An 'B<$OBSOLETE>'"; 1268 1269# Aliases can also have an extra status: 1270my $INTERNAL_ALIAS = 'P'; 1271 1272my %status_past_participles = ( 1273 $DISCOURAGED => 'discouraged', 1274 $STABILIZED => 'stabilized', 1275 $OBSOLETE => 'obsolete', 1276 $DEPRECATED => 'deprecated', 1277 $INTERNAL_ALIAS => 'reserved for Perl core internal use only', 1278); 1279 1280# Table fates. These are somewhat ordered, so that fates < $MAP_PROXIED should be 1281# externally documented. 1282my $ORDINARY = 0; # The normal fate. 1283my $MAP_PROXIED = 1; # The map table for the property isn't written out, 1284 # but there is a file written that can be used to 1285 # reconstruct this table 1286my $INTERNAL_ONLY = 2; # The file for this table is written out, but it is 1287 # for Perl's internal use only 1288my $SUPPRESSED = 3; # The file for this table is not written out, and as a 1289 # result, we don't bother to do many computations on 1290 # it. 1291my $PLACEHOLDER = 4; # Like $SUPPRESSED, but we go through all the 1292 # computations anyway, as the values are needed for 1293 # things to work. This happens when we have Perl 1294 # extensions that depend on Unicode tables that 1295 # wouldn't normally be in a given Unicode version. 1296 1297# The format of the values of the tables: 1298my $EMPTY_FORMAT = ""; 1299my $BINARY_FORMAT = 'b'; 1300my $DECIMAL_FORMAT = 'd'; 1301my $FLOAT_FORMAT = 'f'; 1302my $INTEGER_FORMAT = 'i'; 1303my $HEX_FORMAT = 'x'; 1304my $RATIONAL_FORMAT = 'r'; 1305my $STRING_FORMAT = 's'; 1306my $ADJUST_FORMAT = 'a'; 1307my $HEX_ADJUST_FORMAT = 'ax'; 1308my $DECOMP_STRING_FORMAT = 'c'; 1309my $STRING_WHITE_SPACE_LIST = 'sw'; 1310 1311my %map_table_formats = ( 1312 $BINARY_FORMAT => 'binary', 1313 $DECIMAL_FORMAT => 'single decimal digit', 1314 $FLOAT_FORMAT => 'floating point number', 1315 $INTEGER_FORMAT => 'integer', 1316 $HEX_FORMAT => 'non-negative hex whole number; a code point', 1317 $RATIONAL_FORMAT => 'rational: an integer or a fraction', 1318 $STRING_FORMAT => 'string', 1319 $ADJUST_FORMAT => 'some entries need adjustment', 1320 $HEX_ADJUST_FORMAT => 'mapped value in hex; some entries need adjustment', 1321 $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping', 1322 $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators' 1323); 1324 1325# Unicode didn't put such derived files in a separate directory at first. 1326my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : ""; 1327my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : ""; 1328my $AUXILIARY = 'auxiliary'; 1329my $EMOJI = 'emoji'; 1330 1331# Hashes and arrays that will eventually go into UCD.pl for the use of UCD.pm 1332my %loose_to_file_of; # loosely maps table names to their respective 1333 # files 1334my %stricter_to_file_of; # same; but for stricter mapping. 1335my %loose_property_to_file_of; # Maps a loose property name to its map file 1336my %strict_property_to_file_of; # Same, but strict 1337my @inline_definitions = "V0"; # Each element gives a definition of a unique 1338 # inversion list. When a definition is inlined, 1339 # its value in the hash it's in (one of the two 1340 # defined just above) will include an index into 1341 # this array. The 0th element is initialized to 1342 # the definition for a zero length inversion list 1343my %file_to_swash_name; # Maps the file name to its corresponding key name 1344 # in the hash %Unicode::UCD::SwashInfo 1345my %nv_floating_to_rational; # maps numeric values floating point numbers to 1346 # their rational equivalent 1347my %loose_property_name_of; # Loosely maps (non_string) property names to 1348 # standard form 1349my %strict_property_name_of; # Strictly maps (non_string) property names to 1350 # standard form 1351my %string_property_loose_to_name; # Same, for string properties. 1352my %loose_defaults; # keys are of form "prop=value", where 'prop' is 1353 # the property name in standard loose form, and 1354 # 'value' is the default value for that property, 1355 # also in standard loose form. 1356my %loose_to_standard_value; # loosely maps table names to the canonical 1357 # alias for them 1358my %ambiguous_names; # keys are alias names (in standard form) that 1359 # have more than one possible meaning. 1360my %combination_property; # keys are alias names (in standard form) that 1361 # have both a map table, and a binary one that 1362 # yields true for all non-null maps. 1363my %prop_aliases; # Keys are standard property name; values are each 1364 # one's aliases 1365my %prop_value_aliases; # Keys of top level are standard property name; 1366 # values are keys to another hash, Each one is 1367 # one of the property's values, in standard form. 1368 # The values are that prop-val's aliases. 1369my %skipped_files; # List of files that we skip 1370my %ucd_pod; # Holds entries that will go into the UCD section of the pod 1371 1372# Most properties are immune to caseless matching, otherwise you would get 1373# nonsensical results, as properties are a function of a code point, not 1374# everything that is caselessly equivalent to that code point. For example, 1375# Changes_When_Case_Folded('s') should be false, whereas caselessly it would 1376# be true because 's' and 'S' are equivalent caselessly. However, 1377# traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we 1378# extend that concept to those very few properties that are like this. Each 1379# such property will match the full range caselessly. They are hard-coded in 1380# the program; it's not worth trying to make it general as it's extremely 1381# unlikely that they will ever change. 1382my %caseless_equivalent_to; 1383 1384# This is the range of characters that were in Release 1 of Unicode, and 1385# removed in Release 2 (replaced with the current Hangul syllables starting at 1386# U+AC00). The range was reused starting in Release 3 for other purposes. 1387my $FIRST_REMOVED_HANGUL_SYLLABLE = 0x3400; 1388my $FINAL_REMOVED_HANGUL_SYLLABLE = 0x4DFF; 1389 1390# These constants names and values were taken from the Unicode standard, 1391# version 5.1, section 3.12. They are used in conjunction with Hangul 1392# syllables. The '_string' versions are so generated tables can retain the 1393# hex format, which is the more familiar value 1394my $SBase_string = "0xAC00"; 1395my $SBase = CORE::hex $SBase_string; 1396my $LBase_string = "0x1100"; 1397my $LBase = CORE::hex $LBase_string; 1398my $VBase_string = "0x1161"; 1399my $VBase = CORE::hex $VBase_string; 1400my $TBase_string = "0x11A7"; 1401my $TBase = CORE::hex $TBase_string; 1402my $SCount = 11172; 1403my $LCount = 19; 1404my $VCount = 21; 1405my $TCount = 28; 1406my $NCount = $VCount * $TCount; 1407 1408# For Hangul syllables; These store the numbers from Jamo.txt in conjunction 1409# with the above published constants. 1410my %Jamo; 1411my %Jamo_L; # Leading consonants 1412my %Jamo_V; # Vowels 1413my %Jamo_T; # Trailing consonants 1414 1415# For code points whose name contains its ordinal as a '-ABCD' suffix. 1416# The key is the base name of the code point, and the value is an 1417# array giving all the ranges that use this base name. Each range 1418# is actually a hash giving the 'low' and 'high' values of it. 1419my %names_ending_in_code_point; 1420my %loose_names_ending_in_code_point; # Same as above, but has blanks, dashes 1421 # removed from the names 1422# Inverse mapping. The list of ranges that have these kinds of 1423# names. Each element contains the low, high, and base names in an 1424# anonymous hash. 1425my @code_points_ending_in_code_point; 1426 1427# To hold Unicode's normalization test suite 1428my @normalization_tests; 1429 1430# Boolean: does this Unicode version have the hangul syllables, and are we 1431# writing out a table for them? 1432my $has_hangul_syllables = 0; 1433 1434# Does this Unicode version have code points whose names end in their 1435# respective code points, and are we writing out a table for them? 0 for no; 1436# otherwise points to first property that a table is needed for them, so that 1437# if multiple tables are needed, we don't create duplicates 1438my $needing_code_points_ending_in_code_point = 0; 1439 1440my @backslash_X_tests; # List of tests read in for testing \X 1441my @LB_tests; # List of tests read in for testing \b{lb} 1442my @SB_tests; # List of tests read in for testing \b{sb} 1443my @WB_tests; # List of tests read in for testing \b{wb} 1444my @unhandled_properties; # Will contain a list of properties found in 1445 # the input that we didn't process. 1446my @match_properties; # Properties that have match tables, to be 1447 # listed in the pod 1448my @map_properties; # Properties that get map files written 1449my @named_sequences; # NamedSequences.txt contents. 1450my %potential_files; # Generated list of all .txt files in the directory 1451 # structure so we can warn if something is being 1452 # ignored. 1453my @missing_early_files; # Generated list of absent files that we need to 1454 # proceed in compiling this early Unicode version 1455my @files_actually_output; # List of files we generated. 1456my @more_Names; # Some code point names are compound; this is used 1457 # to store the extra components of them. 1458my $E_FLOAT_PRECISION = 3; # The minimum number of digits after the decimal 1459 # point of a normalized floating point number 1460 # needed to match before we consider it equivalent 1461 # to a candidate rational 1462 1463# These store references to certain commonly used property objects 1464my $age; 1465my $ccc; 1466my $gc; 1467my $perl; 1468my $block; 1469my $perl_charname; 1470my $print; 1471my $All; 1472my $Assigned; # All assigned characters in this Unicode release 1473my $DI; # Default_Ignorable_Code_Point property 1474my $NChar; # Noncharacter_Code_Point property 1475my $script; 1476my $scx; # Script_Extensions property 1477my $idt; # Identifier_Type property 1478 1479# Are there conflicting names because of beginning with 'In_', or 'Is_' 1480my $has_In_conflicts = 0; 1481my $has_Is_conflicts = 0; 1482 1483sub internal_file_to_platform ($file=undef) { 1484 # Convert our file paths which have '/' separators to those of the 1485 # platform. 1486 1487 return undef unless defined $file; 1488 1489 return File::Spec->join(split '/', $file); 1490} 1491 1492sub file_exists ($file=undef) { # platform independent '-e'. This program internally 1493 # uses slash as a path separator. 1494 return 0 unless defined $file; 1495 return -e internal_file_to_platform($file); 1496} 1497 1498sub objaddr($addr) { 1499 # Returns the address of the blessed input object. 1500 # It doesn't check for blessedness because that would do a string eval 1501 # every call, and the program is structured so that this is never called 1502 # for a non-blessed object. 1503 1504 return pack 'J', refaddr $addr; 1505} 1506 1507# These are used only if $annotate is true. 1508# The entire range of Unicode characters is examined to populate these 1509# after all the input has been processed. But most can be skipped, as they 1510# have the same descriptive phrases, such as being unassigned 1511my @viacode; # Contains the 1 million character names 1512my @age; # And their ages ("" if none) 1513my @printable; # boolean: And are those characters printable? 1514my @annotate_char_type; # Contains a type of those characters, specifically 1515 # for the purposes of annotation. 1516my $annotate_ranges; # A map of ranges of code points that have the same 1517 # name for the purposes of annotation. They map to the 1518 # upper edge of the range, so that the end point can 1519 # be immediately found. This is used to skip ahead to 1520 # the end of a range, and avoid processing each 1521 # individual code point in it. 1522my $unassigned_sans_noncharacters; # A Range_List of the unassigned 1523 # characters, but excluding those which are 1524 # also noncharacter code points 1525 1526# The annotation types are an extension of the regular range types, though 1527# some of the latter are folded into one. Make the new types negative to 1528# avoid conflicting with the regular types 1529my $SURROGATE_TYPE = -1; 1530my $UNASSIGNED_TYPE = -2; 1531my $PRIVATE_USE_TYPE = -3; 1532my $NONCHARACTER_TYPE = -4; 1533my $CONTROL_TYPE = -5; 1534my $ABOVE_UNICODE_TYPE = -6; 1535my $UNKNOWN_TYPE = -7; # Used only if there is a bug in this program 1536 1537sub populate_char_info ($i) { 1538 # Used only with the $annotate option. Populates the arrays with the 1539 # input code point's info that are needed for outputting more detailed 1540 # comments. If calling context wants a return, it is the end point of 1541 # any contiguous range of characters that share essentially the same info 1542 1543 $viacode[$i] = $perl_charname->value_of($i) || ""; 1544 $age[$i] = (defined $age) 1545 ? (($age->value_of($i) =~ / ^ \d+ \. \d+ $ /x) 1546 ? $age->value_of($i) 1547 : "") 1548 : ""; 1549 1550 # A character is generally printable if Unicode says it is, 1551 # but below we make sure that most Unicode general category 'C' types 1552 # aren't. 1553 $printable[$i] = $print->contains($i); 1554 1555 # But the characters in this range were removed in v2.0 and replaced by 1556 # different ones later. Modern fonts will be for the replacement 1557 # characters, so suppress printing them. 1558 if (($v_version lt v2.0 1559 || ($compare_versions && $compare_versions lt v2.0)) 1560 && ( $i >= $FIRST_REMOVED_HANGUL_SYLLABLE 1561 && $i <= $FINAL_REMOVED_HANGUL_SYLLABLE)) 1562 { 1563 $printable[$i] = 0; 1564 } 1565 1566 $annotate_char_type[$i] = $perl_charname->type_of($i) || 0; 1567 1568 # Only these two regular types are treated specially for annotations 1569 # purposes 1570 $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME 1571 && $annotate_char_type[$i] != $HANGUL_SYLLABLE; 1572 1573 # Give a generic name to all code points that don't have a real name. 1574 # We output ranges, if applicable, for these. Also calculate the end 1575 # point of the range. 1576 my $end; 1577 if (! $viacode[$i]) { 1578 if ($i > $MAX_UNICODE_CODEPOINT) { 1579 $viacode[$i] = 'Above-Unicode'; 1580 $annotate_char_type[$i] = $ABOVE_UNICODE_TYPE; 1581 $printable[$i] = 0; 1582 $end = $MAX_WORKING_CODEPOINT; 1583 } 1584 elsif ($gc-> table('Private_use')->contains($i)) { 1585 $viacode[$i] = 'Private Use'; 1586 $annotate_char_type[$i] = $PRIVATE_USE_TYPE; 1587 $printable[$i] = 0; 1588 $end = $gc->table('Private_Use')->containing_range($i)->end; 1589 } 1590 elsif ($NChar->contains($i)) { 1591 $viacode[$i] = 'Noncharacter'; 1592 $annotate_char_type[$i] = $NONCHARACTER_TYPE; 1593 $printable[$i] = 0; 1594 $end = $NChar->containing_range($i)->end; 1595 } 1596 elsif ($gc-> table('Control')->contains($i)) { 1597 my $name_ref = property_ref('Name_Alias'); 1598 $name_ref = property_ref('Unicode_1_Name') if ! defined $name_ref; 1599 $viacode[$i] = (defined $name_ref) 1600 ? $name_ref->value_of($i) 1601 : 'Control'; 1602 $annotate_char_type[$i] = $CONTROL_TYPE; 1603 $printable[$i] = 0; 1604 } 1605 elsif ($gc-> table('Unassigned')->contains($i)) { 1606 $annotate_char_type[$i] = $UNASSIGNED_TYPE; 1607 $printable[$i] = 0; 1608 $viacode[$i] = 'Unassigned'; 1609 1610 if (defined $block) { # No blocks in earliest releases 1611 $viacode[$i] .= ', block=' . $block-> value_of($i); 1612 $end = $gc-> table('Unassigned')->containing_range($i)->end; 1613 1614 # Because we name the unassigned by the blocks they are in, it 1615 # can't go past the end of that block, and it also can't go 1616 # past the unassigned range it is in. The special table makes 1617 # sure that the non-characters, which are unassigned, are 1618 # separated out. 1619 $end = min($block->containing_range($i)->end, 1620 $unassigned_sans_noncharacters-> 1621 containing_range($i)->end); 1622 } 1623 else { 1624 $end = $i + 1; 1625 while ($unassigned_sans_noncharacters->contains($end)) { 1626 $end++; 1627 } 1628 $end--; 1629 } 1630 } 1631 elsif ($perl->table('_Perl_Surrogate')->contains($i)) { 1632 $viacode[$i] = 'Surrogate'; 1633 $annotate_char_type[$i] = $SURROGATE_TYPE; 1634 $printable[$i] = 0; 1635 $end = $gc->table('Surrogate')->containing_range($i)->end; 1636 } 1637 else { 1638 Carp::my_carp_bug("Can't figure out how to annotate " 1639 . sprintf("U+%04X", $i) 1640 . ". Proceeding anyway."); 1641 $viacode[$i] = 'UNKNOWN'; 1642 $annotate_char_type[$i] = $UNKNOWN_TYPE; 1643 $printable[$i] = 0; 1644 } 1645 } 1646 1647 # Here, has a name, but if it's one in which the code point number is 1648 # appended to the name, do that. 1649 elsif ($annotate_char_type[$i] == $CP_IN_NAME) { 1650 $viacode[$i] .= sprintf("-%04X", $i); 1651 1652 my $limit = $perl_charname->containing_range($i)->end; 1653 if (defined $age) { 1654 # Do all these as groups of the same age, instead of individually, 1655 # because their names are so meaningless, and there are typically 1656 # large quantities of them. 1657 $end = $i + 1; 1658 while ($end <= $limit && $age->value_of($end) == $age[$i]) { 1659 $end++; 1660 } 1661 $end--; 1662 } 1663 else { 1664 $end = $limit; 1665 } 1666 } 1667 1668 # And here, has a name, but if it's a hangul syllable one, replace it with 1669 # the correct name from the Unicode algorithm 1670 elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) { 1671 use integer; 1672 my $SIndex = $i - $SBase; 1673 my $L = $LBase + $SIndex / $NCount; 1674 my $V = $VBase + ($SIndex % $NCount) / $TCount; 1675 my $T = $TBase + $SIndex % $TCount; 1676 $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}"; 1677 $viacode[$i] .= $Jamo{$T} if $T != $TBase; 1678 $end = $perl_charname->containing_range($i)->end; 1679 } 1680 1681 return if ! defined wantarray; 1682 return $i if ! defined $end; # If not a range, return the input 1683 1684 # Save this whole range so can find the end point quickly 1685 $annotate_ranges->add_map($i, $end, $end); 1686 1687 return $end; 1688} 1689 1690sub max($a, $b) { 1691 return $a >= $b ? $a : $b; 1692} 1693 1694sub min($a, $b) { 1695 return $a <= $b ? $a : $b; 1696} 1697 1698sub clarify_number ($number) { 1699 # This returns the input number with underscores inserted every 3 digits 1700 # in large (5 digits or more) numbers. Input must be entirely digits, not 1701 # checked. 1702 1703 my $pos = length($number) - 3; 1704 return $number if $pos <= 1; 1705 while ($pos > 0) { 1706 substr($number, $pos, 0) = '_'; 1707 $pos -= 3; 1708 } 1709 return $number; 1710} 1711 1712sub clarify_code_point_count ($number) { 1713 # This is like clarify_number(), but the input is assumed to be a count of 1714 # code points, rather than a generic number. 1715 1716 my $append = ""; 1717 1718 if ($number > $MAX_UNICODE_CODEPOINTS) { 1719 $number -= ($MAX_WORKING_CODEPOINTS - $MAX_UNICODE_CODEPOINTS); 1720 return "All above-Unicode code points" if $number == 0; 1721 $append = " + all above-Unicode code points"; 1722 } 1723 return clarify_number($number) . $append; 1724} 1725 1726package Carp; 1727 1728# These routines give a uniform treatment of messages in this program. They 1729# are placed in the Carp package to cause the stack trace to not include them, 1730# although an alternative would be to use another package and set @CARP_NOT 1731# for it. 1732 1733our $Verbose = 1 if main::DEBUG; # Useful info when debugging 1734 1735# This is a work-around suggested by Nicholas Clark to fix a problem with Carp 1736# and overload trying to load Scalar:Util under miniperl. See 1737# http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html 1738undef $overload::VERSION; 1739 1740sub my_carp($message="", $nofold=0) { 1741 1742 if ($message) { 1743 $message = main::join_lines($message); 1744 $message =~ s/^$0: *//; # Remove initial program name 1745 $message =~ s/[.;,]+$//; # Remove certain ending punctuation 1746 $message = "\n$0: $message;"; 1747 1748 # Fold the message with program name, semi-colon end punctuation 1749 # (which looks good with the message that carp appends to it), and a 1750 # hanging indent for continuation lines. 1751 $message = main::simple_fold($message, "", 4) unless $nofold; 1752 $message =~ s/\n$//; # Remove the trailing nl so what carp 1753 # appends is to the same line 1754 } 1755 1756 return $message if defined wantarray; # If a caller just wants the msg 1757 1758 carp $message; 1759 return; 1760} 1761 1762sub my_carp_bug($message="") { 1763 # This is called when it is clear that the problem is caused by a bug in 1764 # this program. 1765 $message =~ s/^$0: *//; 1766 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message"); 1767 carp $message; 1768 return; 1769} 1770 1771sub carp_too_few_args($args_ref, $count) { 1772 my_carp_bug("Need at least $count arguments to " 1773 . (caller 1)[3] 1774 . ". Instead got: '" 1775 . join ', ', @$args_ref 1776 . "'. No action taken."); 1777 return; 1778} 1779 1780sub carp_extra_args($args_ref) { 1781 unless (ref $args_ref) { 1782 my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments."); 1783 return; 1784 } 1785 my ($package, $file, $line) = caller; 1786 my $subroutine = (caller 1)[3]; 1787 1788 my $list; 1789 if (ref $args_ref eq 'HASH') { 1790 foreach my $key (keys %$args_ref) { 1791 $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key}; 1792 } 1793 $list = join ', ', each %{$args_ref}; 1794 } 1795 elsif (ref $args_ref eq 'ARRAY') { 1796 foreach my $arg (@$args_ref) { 1797 $arg = $UNDEF unless defined $arg; 1798 } 1799 $list = join ', ', @$args_ref; 1800 } 1801 else { 1802 my_carp_bug("Can't cope with ref " 1803 . ref($args_ref) 1804 . " . argument to 'carp_extra_args'. Not checking arguments."); 1805 return; 1806 } 1807 1808 my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped."); 1809 return; 1810} 1811 1812package main; 1813 1814{ # Closure 1815 1816 # This program uses the inside-out method for objects, as recommended in 1817 # "Perl Best Practices". (This is the best solution still, since this has 1818 # to run under miniperl.) This closure aids in generating those. There 1819 # are two routines. setup_package() is called once per package to set 1820 # things up, and then set_access() is called for each hash representing a 1821 # field in the object. These routines arrange for the object to be 1822 # properly destroyed when no longer used, and for standard accessor 1823 # functions to be generated. If you need more complex accessors, just 1824 # write your own and leave those accesses out of the call to set_access(). 1825 # More details below. 1826 1827 my %constructor_fields; # fields that are to be used in constructors; see 1828 # below 1829 1830 # The values of this hash will be the package names as keys to other 1831 # hashes containing the name of each field in the package as keys, and 1832 # references to their respective hashes as values. 1833 my %package_fields; 1834 1835 sub setup_package { 1836 # Sets up the package, creating standard DESTROY and dump methods 1837 # (unless already defined). The dump method is used in debugging by 1838 # simple_dumper(). 1839 # The optional parameters are: 1840 # a) a reference to a hash, that gets populated by later 1841 # set_access() calls with one of the accesses being 1842 # 'constructor'. The caller can then refer to this, but it is 1843 # not otherwise used by these two routines. 1844 # b) a reference to a callback routine to call during destruction 1845 # of the object, before any fields are actually destroyed 1846 1847 my %args = @_; 1848 my $constructor_ref = delete $args{'Constructor_Fields'}; 1849 my $destroy_callback = delete $args{'Destroy_Callback'}; 1850 Carp::carp_extra_args(\@_) if main::DEBUG && %args; 1851 1852 my %fields; 1853 my $package = (caller)[0]; 1854 1855 $package_fields{$package} = \%fields; 1856 $constructor_fields{$package} = $constructor_ref; 1857 1858 unless ($package->can('DESTROY')) { 1859 my $destroy_name = "${package}::DESTROY"; 1860 no strict "refs"; 1861 1862 # Use typeglob to give the anonymous subroutine the name we want 1863 *$destroy_name = sub ($self) { 1864 my $addr = pack 'J', refaddr $self; 1865 1866 $self->$destroy_callback if $destroy_callback; 1867 foreach my $field (keys %{$package_fields{$package}}) { 1868 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n"; 1869 delete $package_fields{$package}{$field}{$addr}; 1870 } 1871 return; 1872 } 1873 } 1874 1875 unless ($package->can('dump')) { 1876 my $dump_name = "${package}::dump"; 1877 no strict "refs"; 1878 *$dump_name = sub ($self, @_args) { 1879 return dump_inside_out($self, $package_fields{$package}, @_args); 1880 } 1881 } 1882 return; 1883 } 1884 1885 sub set_access($name, $field, @accessors) { 1886 # Arrange for the input field to be garbage collected when no longer 1887 # needed. Also, creates standard accessor functions for the field 1888 # based on the optional parameters-- none if none of these parameters: 1889 # 'addable' creates an 'add_NAME()' accessor function. 1890 # 'readable' or 'readable_array' creates a 'NAME()' accessor 1891 # function. 1892 # 'settable' creates a 'set_NAME()' accessor function. 1893 # 'constructor' doesn't create an accessor function, but adds the 1894 # field to the hash that was previously passed to 1895 # setup_package(); 1896 # Any of the accesses can be abbreviated down, so that 'a', 'ad', 1897 # 'add' etc. all mean 'addable'. 1898 # The read accessor function will work on both array and scalar 1899 # values. If another accessor in the parameter list is 'a', the read 1900 # access assumes an array. You can also force it to be array access 1901 # by specifying 'readable_array' instead of 'readable' 1902 # 1903 # A sort-of 'protected' access can be set-up by preceding the addable, 1904 # readable or settable with some initial portion of 'protected_' (but, 1905 # the underscore is required), like 'p_a', 'pro_set', etc. The 1906 # "protection" is only by convention. All that happens is that the 1907 # accessor functions' names begin with an underscore. So instead of 1908 # calling set_foo, the call is _set_foo. (Real protection could be 1909 # accomplished by having a new subroutine, end_package, called at the 1910 # end of each package, and then storing the __LINE__ ranges and 1911 # checking them on every accessor. But that is way overkill.) 1912 1913 # We create anonymous subroutines as the accessors and then use 1914 # typeglobs to assign them to the proper package and name 1915 1916 # $name Name of the field 1917 # $field Reference to the inside-out hash containing the 1918 # field 1919 1920 my $package = (caller)[0]; 1921 1922 if (! exists $package_fields{$package}) { 1923 croak "$0: Must call 'setup_package' before 'set_access'"; 1924 } 1925 1926 # Stash the field so DESTROY can get it. 1927 $package_fields{$package}{$name} = $field; 1928 1929 # Remaining arguments are the accessors. For each... 1930 foreach my $access (@accessors) { 1931 my $access = lc $access; 1932 1933 my $protected = ""; 1934 1935 # Match the input as far as it goes. 1936 if ($access =~ /^(p[^_]*)_/) { 1937 $protected = $1; 1938 if (substr('protected_', 0, length $protected) 1939 eq $protected) 1940 { 1941 1942 # Add 1 for the underscore not included in $protected 1943 $access = substr($access, length($protected) + 1); 1944 $protected = '_'; 1945 } 1946 else { 1947 $protected = ""; 1948 } 1949 } 1950 1951 if (substr('addable', 0, length $access) eq $access) { 1952 my $subname = "${package}::${protected}add_$name"; 1953 no strict "refs"; 1954 1955 # add_ accessor. Don't add if already there, which we 1956 # determine using 'eq' for scalars and '==' otherwise. 1957 *$subname = sub ($self, $value) { 1958 use strict "refs"; 1959 my $addr = pack 'J', refaddr $self; 1960 if (ref $value) { 1961 return if grep { $value == $_ } @{$field->{$addr}}; 1962 } 1963 else { 1964 return if grep { $value eq $_ } @{$field->{$addr}}; 1965 } 1966 push @{$field->{$addr}}, $value; 1967 return; 1968 } 1969 } 1970 elsif (substr('constructor', 0, length $access) eq $access) { 1971 if ($protected) { 1972 Carp::my_carp_bug("Can't set-up 'protected' constructors") 1973 } 1974 else { 1975 $constructor_fields{$package}{$name} = $field; 1976 } 1977 } 1978 elsif (substr('readable_array', 0, length $access) eq $access) { 1979 1980 # Here has read access. If one of the other parameters for 1981 # access is array, or this one specifies array (by being more 1982 # than just 'readable_'), then create a subroutine that 1983 # assumes the data is an array. Otherwise just a scalar 1984 my $subname = "${package}::${protected}$name"; 1985 if (grep { /^a/i } (@accessors) 1986 or length($access) > length('readable_')) 1987 { 1988 no strict "refs"; 1989 *$subname = sub ($_addr) { 1990 use strict "refs"; 1991 my $addr = pack 'J', refaddr $_addr; 1992 if (ref $field->{$addr} ne 'ARRAY') { 1993 my $type = ref $field->{$addr}; 1994 $type = 'scalar' unless $type; 1995 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems."); 1996 return; 1997 } 1998 return scalar @{$field->{$addr}} unless wantarray; 1999 2000 # Make a copy; had problems with caller modifying the 2001 # original otherwise 2002 my @return = @{$field->{$addr}}; 2003 return @return; 2004 } 2005 } 2006 else { 2007 2008 # Here not an array value, a simpler function. 2009 no strict "refs"; 2010 *$subname = sub ($addr) { 2011 use strict "refs"; 2012 return $field->{pack 'J', refaddr $addr}; 2013 } 2014 } 2015 } 2016 elsif (substr('settable', 0, length $access) eq $access) { 2017 my $subname = "${package}::${protected}set_$name"; 2018 no strict "refs"; 2019 *$subname = sub ($self, $value) { 2020 use strict "refs"; 2021 # $self is $_[0]; $value is $_[1] 2022 $field->{pack 'J', refaddr $self} = $value; 2023 return; 2024 } 2025 } 2026 else { 2027 Carp::my_carp_bug("Unknown accessor type $access. No accessor set."); 2028 } 2029 } 2030 return; 2031 } 2032} 2033 2034package Input_file; 2035 2036# All input files use this object, which stores various attributes about them, 2037# and provides for convenient, uniform handling. The run method wraps the 2038# processing. It handles all the bookkeeping of opening, reading, and closing 2039# the file, returning only significant input lines. 2040# 2041# Each object gets a handler which processes the body of the file, and is 2042# called by run(). All character property files must use the generic, 2043# default handler, which has code scrubbed to handle things you might not 2044# expect, including automatic EBCDIC handling. For files that don't deal with 2045# mapping code points to a property value, such as test files, 2046# PropertyAliases, PropValueAliases, and named sequences, you can override the 2047# handler to be a custom one. Such a handler should basically be a 2048# while(next_line()) {...} loop. 2049# 2050# You can also set up handlers to 2051# 0) call during object construction time, after everything else is done 2052# 1) call before the first line is read, for pre processing 2053# 2) call to adjust each line of the input before the main handler gets 2054# them. This can be automatically generated, if appropriately simple 2055# enough, by specifying a Properties parameter in the constructor. 2056# 3) call upon EOF before the main handler exits its loop 2057# 4) call at the end, for post processing 2058# 2059# $_ is used to store the input line, and is to be filtered by the 2060# each_line_handler()s. So, if the format of the line is not in the desired 2061# format for the main handler, these are used to do that adjusting. They can 2062# be stacked (by enclosing them in an [ anonymous array ] in the constructor, 2063# so the $_ output of one is used as the input to the next. The EOF handler 2064# is also stackable, but none of the others are, but could easily be changed 2065# to be so. 2066# 2067# Some properties are used by the Perl core but aren't defined until later 2068# Unicode releases. The perl interpreter would have problems working when 2069# compiled with an earlier Unicode version that doesn't have them, so we need 2070# to define them somehow for those releases. The 'Early' constructor 2071# parameter can be used to automatically handle this. It is essentially 2072# ignored if the Unicode version being compiled has a data file for this 2073# property. Either code to execute or a file to read can be specified. 2074# Details are at the %early definition. 2075# 2076# Most of the handlers can call insert_lines() or insert_adjusted_lines() 2077# which insert the parameters as lines to be processed before the next input 2078# file line is read. This allows the EOF handler(s) to flush buffers, for 2079# example. The difference between the two routines is that the lines inserted 2080# by insert_lines() are subjected to the each_line_handler()s. (So if you 2081# called it from such a handler, you would get infinite recursion without some 2082# mechanism to prevent that.) Lines inserted by insert_adjusted_lines() go 2083# directly to the main handler without any adjustments. If the 2084# post-processing handler calls any of these, there will be no effect. Some 2085# error checking for these conditions could be added, but it hasn't been done. 2086# 2087# carp_bad_line() should be called to warn of bad input lines, which clears $_ 2088# to prevent further processing of the line. This routine will output the 2089# message as a warning once, and then keep a count of the lines that have the 2090# same message, and output that count at the end of the file's processing. 2091# This keeps the number of messages down to a manageable amount. 2092# 2093# get_missings() should be called to retrieve any @missing input lines. 2094# Messages will be raised if this isn't done if the options aren't to ignore 2095# missings. 2096 2097sub trace { return main::trace(@_); } 2098 2099{ # Closure 2100 # Keep track of fields that are to be put into the constructor. 2101 my %constructor_fields; 2102 2103 main::setup_package(Constructor_Fields => \%constructor_fields); 2104 2105 my %file; # Input file name, required 2106 main::set_access('file', \%file, qw{ c r }); 2107 2108 my %first_released; # Unicode version file was first released in, required 2109 main::set_access('first_released', \%first_released, qw{ c r }); 2110 2111 my %handler; # Subroutine to process the input file, defaults to 2112 # 'process_generic_property_file' 2113 main::set_access('handler', \%handler, qw{ c }); 2114 2115 my %property; 2116 # name of property this file is for. defaults to none, meaning not 2117 # applicable, or is otherwise determinable, for example, from each line. 2118 main::set_access('property', \%property, qw{ c r }); 2119 2120 my %optional; 2121 # This is either an unsigned number, or a list of property names. In the 2122 # former case, if it is non-zero, it means the file is optional, so if the 2123 # file is absent, no warning about that is output. In the latter case, it 2124 # is a list of properties that the file (exclusively) defines. If the 2125 # file is present, tables for those properties will be produced; if 2126 # absent, none will, even if they are listed elsewhere (namely 2127 # PropertyAliases.txt and PropValueAliases.txt) as being in this release, 2128 # and no warnings will be raised about them not being available. (And no 2129 # warning about the file itself will be raised.) 2130 main::set_access('optional', \%optional, qw{ c readable_array } ); 2131 2132 my %non_skip; 2133 # This is used for debugging, to skip processing of all but a few input 2134 # files. Add 'non_skip => 1' to the constructor for those files you want 2135 # processed when you set the $debug_skip global. 2136 main::set_access('non_skip', \%non_skip, 'c'); 2137 2138 my %skip; 2139 # This is used to skip processing of this input file (semi-) permanently. 2140 # The value should be the reason the file is being skipped. It is used 2141 # for files that we aren't planning to process anytime soon, but want to 2142 # allow to be in the directory and be checked for their names not 2143 # conflicting with any other files on a DOS 8.3 name filesystem, but to 2144 # not otherwise be processed, and to not raise a warning about not being 2145 # handled. In the constructor call, any value that evaluates to a numeric 2146 # 0 or undef means don't skip. Any other value is a string giving the 2147 # reason it is being skipped, and this will appear in generated pod. 2148 # However, an empty string reason will suppress the pod entry. 2149 # Internally, calls that evaluate to numeric 0 are changed into undef to 2150 # distinguish them from an empty string call. 2151 main::set_access('skip', \%skip, 'c', 'r'); 2152 2153 my %each_line_handler; 2154 # list of subroutines to look at and filter each non-comment line in the 2155 # file. defaults to none. The subroutines are called in order, each is 2156 # to adjust $_ for the next one, and the final one adjusts it for 2157 # 'handler' 2158 main::set_access('each_line_handler', \%each_line_handler, 'c'); 2159 2160 my %retain_trailing_comments; 2161 # This is used to not discard the comments that end data lines. This 2162 # would be used only for files with non-typical syntax, and most code here 2163 # assumes that comments have been stripped, so special handlers would have 2164 # to be written. It is assumed that the code will use these in 2165 # single-quoted contexts, and so any "'" marks in the comment will be 2166 # prefixed by a backslash. 2167 main::set_access('retain_trailing_comments', \%retain_trailing_comments, 'c'); 2168 2169 my %properties; # Optional ordered list of the properties that occur in each 2170 # meaningful line of the input file. If present, an appropriate 2171 # each_line_handler() is automatically generated and pushed onto the stack 2172 # of such handlers. This is useful when a file contains multiple 2173 # properties per line, but no other special considerations are necessary. 2174 # The special value "<ignored>" means to discard the corresponding input 2175 # field. 2176 # Any @missing lines in the file should also match this syntax; no such 2177 # files exist as of 6.3. But if it happens in a future release, the code 2178 # could be expanded to properly parse them. 2179 main::set_access('properties', \%properties, qw{ c r }); 2180 2181 my %has_missings_defaults; 2182 # ? Are there lines in the file giving default values for code points 2183 # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is 2184 # the norm, but IGNORED means it has such lines, but the handler doesn't 2185 # use them. Having these three states allows us to catch changes to the 2186 # UCD that this program should track. XXX This could be expanded to 2187 # specify the syntax for such lines, like %properties above. 2188 main::set_access('has_missings_defaults', 2189 \%has_missings_defaults, qw{ c r }); 2190 2191 my %construction_time_handler; 2192 # Subroutine to call at the end of the new method. If undef, no such 2193 # handler is called. 2194 main::set_access('construction_time_handler', 2195 \%construction_time_handler, qw{ c }); 2196 2197 my %pre_handler; 2198 # Subroutine to call before doing anything else in the file. If undef, no 2199 # such handler is called. 2200 main::set_access('pre_handler', \%pre_handler, qw{ c }); 2201 2202 my %eof_handler; 2203 # Subroutines to call upon getting an EOF on the input file, but before 2204 # that is returned to the main handler. This is to allow buffers to be 2205 # flushed. The handler is expected to call insert_lines() or 2206 # insert_adjusted() with the buffered material 2207 main::set_access('eof_handler', \%eof_handler, qw{ c }); 2208 2209 my %post_handler; 2210 # Subroutine to call after all the lines of the file are read in and 2211 # processed. If undef, no such handler is called. Note that this cannot 2212 # add lines to be processed; instead use eof_handler 2213 main::set_access('post_handler', \%post_handler, qw{ c }); 2214 2215 my %progress_message; 2216 # Message to print to display progress in lieu of the standard one 2217 main::set_access('progress_message', \%progress_message, qw{ c }); 2218 2219 my %handle; 2220 # cache open file handle, internal. Is undef if file hasn't been 2221 # processed at all, empty if has; 2222 main::set_access('handle', \%handle); 2223 2224 my %added_lines; 2225 # cache of lines added virtually to the file, internal 2226 main::set_access('added_lines', \%added_lines); 2227 2228 my %remapped_lines; 2229 # cache of lines added virtually to the file, internal 2230 main::set_access('remapped_lines', \%remapped_lines); 2231 2232 my %errors; 2233 # cache of errors found, internal 2234 main::set_access('errors', \%errors); 2235 2236 my %missings; 2237 # storage of '@missing' defaults lines 2238 main::set_access('missings', \%missings); 2239 2240 my %early; 2241 # Used for properties that must be defined (for Perl's purposes) on 2242 # versions of Unicode earlier than Unicode itself defines them. The 2243 # parameter is an array (it would be better to be a hash, but not worth 2244 # bothering about due to its rare use). 2245 # 2246 # The first element is either a code reference to call when in a release 2247 # earlier than the Unicode file is available in, or it is an alternate 2248 # file to use instead of the non-existent one. This file must have been 2249 # plunked down in the same directory as mktables. Should you be compiling 2250 # on a release that needs such a file, mktables will abort the 2251 # compilation, and tell you where to get the necessary file(s), and what 2252 # name(s) to use to store them as. 2253 # In the case of specifying an alternate file, the array must contain two 2254 # further elements: 2255 # 2256 # [1] is the name of the property that will be generated by this file. 2257 # The class automatically takes the input file and excludes any code 2258 # points in it that were not assigned in the Unicode version being 2259 # compiled. It then uses this result to define the property in the given 2260 # version. Since the property doesn't actually exist in the Unicode 2261 # version being compiled, this should be a name accessible only by core 2262 # perl. If it is the same name as the regular property, the constructor 2263 # will mark the output table as a $PLACEHOLDER so that it doesn't actually 2264 # get output, and so will be unusable by non-core code. Otherwise it gets 2265 # marked as $INTERNAL_ONLY. 2266 # 2267 # [2] is a property value to assign (only when compiling Unicode 1.1.5) to 2268 # the Hangul syllables in that release (which were ripped out in version 2269 # 2) for the given property . (Hence it is ignored except when compiling 2270 # version 1. You only get one value that applies to all of them, which 2271 # may not be the actual reality, but probably nobody cares anyway for 2272 # these obsolete characters.) 2273 # 2274 # [3] if present is the default value for the property to assign for code 2275 # points not given in the input. If not present, the default from the 2276 # normal property is used 2277 # 2278 # [-1] If there is an extra final element that is the string 'ONLY_EARLY'. 2279 # it means to not add the name in [1] as an alias to the property name 2280 # used for these. Normally, when compiling Unicode versions that don't 2281 # invoke the early handling, the name is added as a synonym. 2282 # 2283 # Not all files can be handled in the above way, and so the code ref 2284 # alternative is available. It can do whatever it needs to. The other 2285 # array elements are optional in this case, and the code is free to use or 2286 # ignore them if they are present. 2287 # 2288 # Internally, the constructor unshifts a 0 or 1 onto this array to 2289 # indicate if an early alternative is actually being used or not. This 2290 # makes for easier testing later on. 2291 main::set_access('early', \%early, 'c'); 2292 2293 my %only_early; 2294 main::set_access('only_early', \%only_early, 'c'); 2295 2296 my %required_even_in_debug_skip; 2297 # debug_skip is used to speed up compilation during debugging by skipping 2298 # processing files that are not needed for the task at hand. However, 2299 # some files pretty much can never be skipped, and this is used to specify 2300 # that this is one of them. In order to skip this file, the call to the 2301 # constructor must be edited to comment out this parameter. 2302 main::set_access('required_even_in_debug_skip', 2303 \%required_even_in_debug_skip, 'c'); 2304 2305 my %withdrawn; 2306 # Some files get removed from the Unicode DB. This is a version object 2307 # giving the first release without this file. 2308 main::set_access('withdrawn', \%withdrawn, 'c'); 2309 2310 my %ucd; 2311 # Some files are not actually part of the Unicode Character Database. 2312 # These typically have a different way of indicating their version 2313 main::set_access('ucd', \%ucd, 'c'); 2314 2315 my %in_this_release; 2316 # Calculated value from %first_released and %withdrawn. Are we compiling 2317 # a Unicode release which includes this file? 2318 main::set_access('in_this_release', \%in_this_release); 2319 2320 sub _next_line; 2321 sub _next_line_with_remapped_range; 2322 2323 sub new { 2324 my $class = shift; 2325 2326 my $self = bless \do{ my $anonymous_scalar }, $class; 2327 my $addr = pack 'J', refaddr $self; 2328 2329 # Set defaults 2330 $handler{$addr} = \&main::process_generic_property_file; 2331 $retain_trailing_comments{$addr} = 0; 2332 $non_skip{$addr} = 0; 2333 $skip{$addr} = undef; 2334 $has_missings_defaults{$addr} = $NO_DEFAULTS; 2335 $handle{$addr} = undef; 2336 $added_lines{$addr} = [ ]; 2337 $remapped_lines{$addr} = [ ]; 2338 $each_line_handler{$addr} = [ ]; 2339 $eof_handler{$addr} = [ ]; 2340 $errors{$addr} = { }; 2341 $missings{$addr} = [ ]; 2342 $early{$addr} = [ ]; 2343 $optional{$addr} = [ ]; 2344 $ucd{$addr} = 1; 2345 2346 # Two positional parameters. 2347 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2; 2348 $file{$addr} = main::internal_file_to_platform(shift); 2349 $first_released{$addr} = shift; 2350 2351 # The rest of the arguments are key => value pairs 2352 # %constructor_fields has been set up earlier to list all possible 2353 # ones. Either set or push, depending on how the default has been set 2354 # up just above. 2355 my %args = @_; 2356 foreach my $key (keys %args) { 2357 my $argument = $args{$key}; 2358 2359 # Note that the fields are the lower case of the constructor keys 2360 my $hash = $constructor_fields{lc $key}; 2361 if (! defined $hash) { 2362 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped"); 2363 next; 2364 } 2365 if (ref $hash->{$addr} eq 'ARRAY') { 2366 if (ref $argument eq 'ARRAY') { 2367 foreach my $argument (@{$argument}) { 2368 next if ! defined $argument; 2369 push @{$hash->{$addr}}, $argument; 2370 } 2371 } 2372 else { 2373 push @{$hash->{$addr}}, $argument if defined $argument; 2374 } 2375 } 2376 else { 2377 $hash->{$addr} = $argument; 2378 } 2379 delete $args{$key}; 2380 }; 2381 2382 $non_skip{$addr} = 1 if $required_even_in_debug_skip{$addr}; 2383 2384 # Convert 0 (meaning don't skip) to undef 2385 undef $skip{$addr} unless $skip{$addr}; 2386 2387 # Handle the case where this file is optional 2388 my $pod_message_for_non_existent_optional = ""; 2389 if ($optional{$addr}->@*) { 2390 2391 # First element is the pod message 2392 $pod_message_for_non_existent_optional 2393 = shift $optional{$addr}->@*; 2394 # Convert a 0 'Optional' argument to an empty list to make later 2395 # code more concise. 2396 if ( $optional{$addr}->@* 2397 && $optional{$addr}->@* == 1 2398 && $optional{$addr}[0] ne "" 2399 && $optional{$addr}[0] !~ /\D/ 2400 && $optional{$addr}[0] == 0) 2401 { 2402 $optional{$addr} = [ ]; 2403 } 2404 else { # But if the only element doesn't evaluate to 0, make sure 2405 # that this file is indeed considered optional below. 2406 unshift $optional{$addr}->@*, 1; 2407 } 2408 } 2409 2410 my $progress; 2411 my $function_instead_of_file = 0; 2412 2413 if ($early{$addr}->@* && $early{$addr}[-1] eq 'ONLY_EARLY') { 2414 $only_early{$addr} = 1; 2415 pop $early{$addr}->@*; 2416 } 2417 2418 # If we are compiling a Unicode release earlier than the file became 2419 # available, the constructor may have supplied a substitute 2420 if ($first_released{$addr} gt $v_version && $early{$addr}->@*) { 2421 2422 # Yes, we have a substitute, that we will use; mark it so 2423 unshift $early{$addr}->@*, 1; 2424 2425 # See the definition of %early for what the array elements mean. 2426 # Note that we have just unshifted onto the array, so the numbers 2427 # below are +1 of those in the %early description. 2428 # If we have a property this defines, create a table and default 2429 # map for it now (at essentially compile time), so that it will be 2430 # available for the whole of run time. (We will want to add this 2431 # name as an alias when we are using the official property name; 2432 # but this must be deferred until run(), because at construction 2433 # time the official names have yet to be defined.) 2434 if ($early{$addr}[2]) { 2435 my $fate = ($property{$addr} 2436 && $property{$addr} eq $early{$addr}[2]) 2437 ? $PLACEHOLDER 2438 : $INTERNAL_ONLY; 2439 my $prop_object = Property->new($early{$addr}[2], 2440 Fate => $fate, 2441 Perl_Extension => 1, 2442 ); 2443 2444 # If not specified by the constructor, use the default mapping 2445 # for the regular property for this substitute one. 2446 if ($early{$addr}[4]) { 2447 $prop_object->set_default_map($early{$addr}[4]); 2448 } 2449 elsif ( defined $property{$addr} 2450 && defined $default_mapping{$property{$addr}}) 2451 { 2452 $prop_object 2453 ->set_default_map($default_mapping{$property{$addr}}); 2454 } 2455 } 2456 2457 if (ref $early{$addr}[1] eq 'CODE') { 2458 $function_instead_of_file = 1; 2459 2460 # If the first element of the array is a code ref, the others 2461 # are optional. 2462 $handler{$addr} = $early{$addr}[1]; 2463 $property{$addr} = $early{$addr}[2] 2464 if defined $early{$addr}[2]; 2465 $progress = "substitute $file{$addr}"; 2466 2467 undef $file{$addr}; 2468 } 2469 else { # Specifying a substitute file 2470 2471 if (! main::file_exists($early{$addr}[1])) { 2472 2473 # If we don't see the substitute file, generate an error 2474 # message giving the needed things, and add it to the list 2475 # of such to output before actual processing happens 2476 # (hence the user finds out all of them in one run). 2477 # Instead of creating a general method for NameAliases, 2478 # hard-code it here, as there is unlikely to ever be a 2479 # second one which needs special handling. 2480 my $string_version = ($file{$addr} eq "NameAliases.txt") 2481 ? 'at least 6.1 (the later, the better)' 2482 : sprintf "%vd", $first_released{$addr}; 2483 push @missing_early_files, <<END; 2484'$file{$addr}' version $string_version should be copied to '$early{$addr}[1]'. 2485END 2486 ; 2487 return; 2488 } 2489 $progress = $early{$addr}[1]; 2490 $progress .= ", substituting for $file{$addr}" if $file{$addr}; 2491 $file{$addr} = $early{$addr}[1]; 2492 $property{$addr} = $early{$addr}[2]; 2493 2494 # Ignore code points not in the version being compiled 2495 push $each_line_handler{$addr}->@*, \&_exclude_unassigned; 2496 2497 if ( $v_version lt v2.0 # Hanguls in this release ... 2498 && defined $early{$addr}[3]) # ... need special treatment 2499 { 2500 push $eof_handler{$addr}->@*, \&_fixup_obsolete_hanguls; 2501 } 2502 } 2503 2504 # And this substitute is valid for all releases. 2505 $first_released{$addr} = v0; 2506 } 2507 else { # Normal behavior 2508 $progress = $file{$addr}; 2509 unshift $early{$addr}->@*, 0; # No substitute 2510 } 2511 2512 my $file = $file{$addr}; 2513 $progress_message{$addr} = "Processing $progress" 2514 unless $progress_message{$addr}; 2515 2516 # A file should be there if it is within the window of versions for 2517 # which Unicode supplies it 2518 if ($withdrawn{$addr} && $withdrawn{$addr} le $v_version) { 2519 $in_this_release{$addr} = 0; 2520 $skip{$addr} = ""; 2521 } 2522 else { 2523 $in_this_release{$addr} = $first_released{$addr} le $v_version; 2524 2525 # Check that the file for this object (possibly using a substitute 2526 # for early releases) exists or we have a function alternative 2527 if ( ! $function_instead_of_file 2528 && ! main::file_exists($file)) 2529 { 2530 # Here there is nothing available for this release. This is 2531 # fine if we aren't expecting anything in this release. 2532 if (! $in_this_release{$addr}) { 2533 $skip{$addr} = ""; # Don't remark since we expected 2534 # nothing and got nothing 2535 } 2536 elsif ($optional{$addr}->@*) { 2537 2538 # Here the file is optional in this release; Use the 2539 # passed in text to document this case in the pod. 2540 $skip{$addr} = $pod_message_for_non_existent_optional; 2541 } 2542 elsif ( $in_this_release{$addr} 2543 && ! defined $skip{$addr} 2544 && defined $file) 2545 { # Doesn't exist but should. 2546 $skip{$addr} = "'$file' not found. Possibly Big problems"; 2547 Carp::my_carp($skip{$addr}); 2548 } 2549 } 2550 elsif ($debug_skip && ! defined $skip{$addr} && ! $non_skip{$addr}) 2551 { 2552 2553 # The file exists; if not skipped for another reason, and we are 2554 # skipping most everything during debugging builds, use that as 2555 # the skip reason. 2556 $skip{$addr} = '$debug_skip is on' 2557 } 2558 } 2559 2560 if ( ! $debug_skip 2561 && $non_skip{$addr} 2562 && ! $required_even_in_debug_skip{$addr} 2563 && $verbosity) 2564 { 2565 print "Warning: " . __PACKAGE__ . " constructor for $file has useless 'non_skip' in it\n"; 2566 } 2567 2568 # Here, we have figured out if we will be skipping this file or not. 2569 # If so, we add any single property it defines to any passed in 2570 # optional property list. These will be dealt with at run time. 2571 if (defined $skip{$addr}) { 2572 if ($property{$addr}) { 2573 push $optional{$addr}->@*, $property{$addr}; 2574 } 2575 } # Otherwise, are going to process the file. 2576 elsif ($property{$addr}) { 2577 2578 # If the file has a property defined in the constructor for it, it 2579 # means that the property is not listed in the file's entries. So 2580 # add a handler (to the list of line handlers) to insert the 2581 # property name into the lines, to provide a uniform interface to 2582 # the final processing subroutine. 2583 push @{$each_line_handler{$addr}}, \&_insert_property_into_line; 2584 } 2585 elsif ($properties{$addr}) { 2586 2587 # Similarly, there may be more than one property represented on 2588 # each line, with no clue but the constructor input what those 2589 # might be. Add a handler for each line in the input so that it 2590 # creates a separate input line for each property in those input 2591 # lines, thus making them suitable to handle generically. 2592 2593 push @{$each_line_handler{$addr}}, 2594 sub { 2595 my $file = shift; 2596 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 2597 my @fields = split /\s*;\s*/, $_, -1; 2598 2599 if (@fields - 1 > @{$properties{$addr}}) { 2600 $file->carp_bad_line('Extra fields'); 2601 $_ = ""; 2602 return; 2603 } 2604 my $range = shift @fields; # 0th element is always the 2605 # range 2606 2607 # The next fields in the input line correspond 2608 # respectively to the stored properties. 2609 for my $i (0 .. @{$properties{$addr}} - 1) { 2610 my $property_name = $properties{$addr}[$i]; 2611 next if $property_name eq '<ignored>'; 2612 $file->insert_adjusted_lines( 2613 "$range; $property_name; $fields[$i]"); 2614 } 2615 $_ = ""; 2616 2617 return; 2618 }; 2619 } 2620 2621 { # On non-ascii platforms, we use a special pre-handler 2622 no strict; 2623 no warnings 'once'; 2624 *next_line = (main::NON_ASCII_PLATFORM) 2625 ? *_next_line_with_remapped_range 2626 : *_next_line; 2627 } 2628 2629 &{$construction_time_handler{$addr}}($self) 2630 if $construction_time_handler{$addr}; 2631 2632 return $self; 2633 } 2634 2635 2636 use overload 2637 fallback => 0, 2638 qw("") => "_operator_stringify", 2639 "." => \&main::_operator_dot, 2640 ".=" => \&main::_operator_dot_equal, 2641 ; 2642 2643 sub _operator_stringify($self, $other="", $reversed=0) { 2644 return __PACKAGE__ . " object for " . $self->file; 2645 } 2646 2647 sub run($self) { 2648 # Process the input object $self. This opens and closes the file and 2649 # calls all the handlers for it. Currently, this can only be called 2650 # once per file, as it destroy's the EOF handlers 2651 2652 # flag to make sure extracted files are processed early 2653 state $seen_non_extracted = 0; 2654 2655 my $addr = pack 'J', refaddr $self; 2656 2657 my $file = $file{$addr}; 2658 2659 if (! $file) { 2660 $handle{$addr} = 'pretend_is_open'; 2661 } 2662 else { 2663 if ($seen_non_extracted) { 2664 if ($file =~ /$EXTRACTED/i) # Some platforms may change the 2665 # case of the file's name 2666 { 2667 Carp::my_carp_bug(main::join_lines(<<END 2668$file should be processed just after the 'Prop...Alias' files, and before 2669anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may 2670have subtle problems 2671END 2672 )); 2673 } 2674 } 2675 elsif ($EXTRACTED_DIR 2676 2677 # We only do this check for generic property files 2678 && $handler{$addr} == \&main::process_generic_property_file 2679 2680 && $file !~ /$EXTRACTED/i) 2681 { 2682 # We don't set this (by the 'if' above) if we have no 2683 # extracted directory, so if running on an early version, 2684 # this test won't work. Not worth worrying about. 2685 $seen_non_extracted = 1; 2686 } 2687 2688 # Mark the file as having being processed, and warn if it 2689 # isn't a file we are expecting. As we process the files, 2690 # they are deleted from the hash, so any that remain at the 2691 # end of the program are files that we didn't process. 2692 my $fkey = File::Spec->rel2abs($file); 2693 my $exists = delete $potential_files{lc($fkey)}; 2694 2695 Carp::my_carp("Was not expecting '$file'.") 2696 if $exists && ! $in_this_release{$addr}; 2697 2698 # If there is special handling for compiling Unicode releases 2699 # earlier than the first one in which Unicode defines this 2700 # property ... 2701 if ($early{$addr}->@* > 1) { 2702 2703 # Mark as processed any substitute file that would be used in 2704 # such a release 2705 $fkey = File::Spec->rel2abs($early{$addr}[1]); 2706 delete $potential_files{lc($fkey)}; 2707 2708 # As commented in the constructor code, when using the 2709 # official property, we still have to allow the publicly 2710 # inaccessible early name so that the core code which uses it 2711 # will work regardless. 2712 if ( ! $only_early{$addr} 2713 && ! $early{$addr}[0] 2714 && $early{$addr}->@* > 2) 2715 { 2716 my $early_property_name = $early{$addr}[2]; 2717 if ($property{$addr} ne $early_property_name) { 2718 main::property_ref($property{$addr}) 2719 ->add_alias($early_property_name); 2720 } 2721 } 2722 } 2723 2724 # We may be skipping this file ... 2725 if (defined $skip{$addr}) { 2726 2727 # If the file isn't supposed to be in this release, there is 2728 # nothing to do 2729 if ($in_this_release{$addr}) { 2730 2731 # But otherwise, we may print a message 2732 if ($debug_skip) { 2733 print STDERR "Skipping input file '$file'", 2734 " because '$skip{$addr}'\n"; 2735 } 2736 2737 # And add it to the list of skipped files, which is later 2738 # used to make the pod 2739 $skipped_files{$file} = $skip{$addr}; 2740 2741 # The 'optional' list contains properties that are also to 2742 # be skipped along with the file. (There may also be 2743 # digits which are just placeholders to make sure it isn't 2744 # an empty list 2745 foreach my $property ($optional{$addr}->@*) { 2746 next unless $property =~ /\D/; 2747 my $prop_object = main::property_ref($property); 2748 next unless defined $prop_object; 2749 $prop_object->set_fate($SUPPRESSED, $skip{$addr}); 2750 } 2751 } 2752 2753 return; 2754 } 2755 2756 # Here, we are going to process the file. Open it, converting the 2757 # slashes used in this program into the proper form for the OS 2758 my $file_handle; 2759 if (not open $file_handle, "<", $file) { 2760 Carp::my_carp("Can't open $file. Skipping: $!"); 2761 return; 2762 } 2763 $handle{$addr} = $file_handle; # Cache the open file handle 2764 2765 # If possible, make sure that the file is the correct version. 2766 # (This data isn't available on early Unicode releases or in 2767 # UnicodeData.txt.) We don't do this check if we are using a 2768 # substitute file instead of the official one (though the code 2769 # could be extended to do so). 2770 if ($in_this_release{$addr} 2771 && ! $early{$addr}[0] 2772 && lc($file) ne 'unicodedata.txt') 2773 { 2774 my $this_version; 2775 2776 if ($file !~ /^Unihan/i) { 2777 2778 # The non-Unihan files started getting version numbers in 2779 # 3.2, but some files in 4.0 are unchanged from 3.2, and 2780 # marked as 3.2. 4.0.1 is the first version where there 2781 # are no files marked as being from less than 4.0, though 2782 # some are marked as 4.0. In versions after that, the 2783 # numbers are correct. 2784 if ($v_version ge v4.0.1) { 2785 $_ = <$file_handle>; # The version number is in the 2786 # very first line if it is a 2787 # UCD file; otherwise, it 2788 # might be 2789 goto valid_version if $_ =~ / - $string_version \. /x; 2790 chomp; 2791 if ($ucd{$addr}) { 2792 $_ =~ s/^#\s*//; 2793 2794 # 4.0.1 had some valid files that weren't updated. 2795 goto valid_version 2796 if $v_version eq v4.0.1 && $_ =~ /4\.0\.0/; 2797 $this_version = $_; 2798 goto wrong_version; 2799 } 2800 else { 2801 my $BOM = "\x{FEFF}"; 2802 utf8::encode($BOM); 2803 my $BOM_re = qr/ ^ (?:$BOM)? /x; 2804 2805 do { 2806 chomp; 2807 2808 # BOM; seems to be on many lines in some 2809 # files!! 2810 $_ =~ s/$BOM_re//; 2811 2812 if (/./) { 2813 2814 # Only look for the version if in the 2815 # first comment block. 2816 goto no_version unless $_ =~ /^#/; 2817 2818 if ($_ =~ /Version:? (\S*)/) { 2819 $this_version = $1; 2820 goto valid_version 2821 if $this_version eq $string_version; 2822 goto valid_version 2823 if "$this_version.0" 2824 eq $string_version; 2825 } 2826 } 2827 } while (<$file_handle>); 2828 2829 goto no_version; 2830 } 2831 } 2832 } 2833 elsif ($v_version ge v6.0.0) { # Unihan 2834 2835 # Unihan files didn't get accurate version numbers until 2836 # 6.0. The version is somewhere in the first comment 2837 # block 2838 while (<$file_handle>) { 2839 goto no_version if $_ !~ /^#/; 2840 chomp; 2841 $_ =~ s/^#\s*//; 2842 next if $_ !~ / version: /x; 2843 goto valid_version if $_ =~ /$string_version/; 2844 goto wrong_version; 2845 } 2846 goto no_version; 2847 } 2848 else { # Old Unihan; have to assume is valid 2849 goto valid_version; 2850 } 2851 2852 wrong_version: 2853 die Carp::my_carp("File '$file' is version " 2854 . "'$this_version'. It should be " 2855 . "version $string_version"); 2856 no_version: 2857 Carp::my_carp_bug("Could not find the expected " 2858 . "version info in file '$file'"); 2859 } 2860 } 2861 2862 valid_version: 2863 print "$progress_message{$addr}\n" if $verbosity >= $PROGRESS; 2864 2865 # Call any special handler for before the file. 2866 &{$pre_handler{$addr}}($self) if $pre_handler{$addr}; 2867 2868 # Then the main handler 2869 &{$handler{$addr}}($self); 2870 2871 # Then any special post-file handler. 2872 &{$post_handler{$addr}}($self) if $post_handler{$addr}; 2873 2874 # If any errors have been accumulated, output the counts (as the first 2875 # error message in each class was output when it was encountered). 2876 if ($errors{$addr}) { 2877 my $total = 0; 2878 my $types = 0; 2879 foreach my $error (keys %{$errors{$addr}}) { 2880 $total += $errors{$addr}->{$error}; 2881 delete $errors{$addr}->{$error}; 2882 $types++; 2883 } 2884 if ($total > 1) { 2885 my $message 2886 = "A total of $total lines had errors in $file. "; 2887 2888 $message .= ($types == 1) 2889 ? '(Only the first one was displayed.)' 2890 : '(Only the first of each type was displayed.)'; 2891 Carp::my_carp($message); 2892 } 2893 } 2894 2895 if (@{$missings{$addr}}) { 2896 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong"); 2897 } 2898 2899 # If a real file handle, close it. 2900 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if 2901 ref $handle{$addr}; 2902 $handle{$addr} = ""; # Uses empty to indicate that has already seen 2903 # the file, as opposed to undef 2904 return; 2905 } 2906 2907 sub _next_line($self) { 2908 # Sets $_ to be the next logical input line, if any. Returns non-zero 2909 # if such a line exists. 'logical' means that any lines that have 2910 # been added via insert_lines() will be returned in $_ before the file 2911 # is read again. 2912 2913 my $addr = pack 'J', refaddr $self; 2914 2915 # Here the file is open (or if the handle is not a ref, is an open 2916 # 'virtual' file). Get the next line; any inserted lines get priority 2917 # over the file itself. 2918 my $adjusted; 2919 2920 LINE: 2921 while (1) { # Loop until find non-comment, non-empty line 2922 #local $to_trace = 1 if main::DEBUG; 2923 my $inserted_ref = shift @{$added_lines{$addr}}; 2924 if (defined $inserted_ref) { 2925 ($adjusted, $_) = @{$inserted_ref}; 2926 trace $adjusted, $_ if main::DEBUG && $to_trace; 2927 return 1 if $adjusted; 2928 } 2929 else { 2930 last if ! ref $handle{$addr}; # Don't read unless is real file 2931 last if ! defined ($_ = readline $handle{$addr}); 2932 } 2933 chomp; 2934 trace $_ if main::DEBUG && $to_trace; 2935 2936 # See if this line is the comment line that defines what property 2937 # value that code points that are not listed in the file should 2938 # have. The format or existence of these lines is not guaranteed 2939 # by Unicode since they are comments, but the documentation says 2940 # that this was added for machine-readability, so probably won't 2941 # change. This works starting in Unicode Version 5.0. They look 2942 # like: 2943 # 2944 # @missing: 0000..10FFFF; Not_Reordered 2945 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point> 2946 # @missing: 0000..10FFFF; ; NaN 2947 # 2948 # Save the line for a later get_missings() call. 2949 if (/$missing_defaults_prefix/) { 2950 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) { 2951 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries"); 2952 } 2953 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) { 2954 my $start = $1; # The pattern saves the beginning and 2955 my $end = $2; # end points of the range the default 2956 # is for 2957 my @defaults = split /\s* ; \s*/x, $_; 2958 2959 # The first field is the @missing, which ends in a 2960 # semi-colon, so can safely shift. 2961 shift @defaults; 2962 2963 # Some of these lines may have empty field placeholders 2964 # which get in the way. An example is: 2965 # @missing: 0000..10FFFF; ; NaN 2966 # Remove them. Process starting from the top so the 2967 # splice doesn't affect things still to be looked at. 2968 for (my $i = @defaults - 1; $i >= 0; $i--) { 2969 next if $defaults[$i] ne ""; 2970 splice @defaults, $i, 1; 2971 } 2972 2973 # What's left should be just the property (maybe) and the 2974 # default. Having only one element means it doesn't have 2975 # the property. 2976 my $default; 2977 my $property; 2978 if (@defaults >= 1) { 2979 if (@defaults == 1) { 2980 $default = $defaults[0]; 2981 } 2982 else { 2983 $property = $defaults[0]; 2984 $default = $defaults[1]; 2985 } 2986 } 2987 2988 if (@defaults < 1 2989 || @defaults > 2 2990 || ($default =~ /^</ 2991 && $default !~ /^<code *point>$/i 2992 && $default !~ /^<none>$/i 2993 && $default !~ /^<script>$/i)) 2994 { 2995 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries"); 2996 } 2997 else { 2998 2999 # If the property is missing from the line, it should 3000 # be the one for the whole file 3001 $property = $property{$addr} if ! defined $property; 3002 3003 # Change <none> to the null string, which is what it 3004 # really means. If the default is the code point 3005 # itself, set it to <code point>, which is what 3006 # Unicode uses (but sometimes they've forgotten the 3007 # space) 3008 if ($default =~ /^<none>$/i) { 3009 $default = ""; 3010 } 3011 elsif ($default =~ /^<code *point>$/i) { 3012 $default = $CODE_POINT; 3013 } 3014 elsif ($default =~ /^<script>$/i) { 3015 3016 # Special case this one. Currently is from 3017 # ScriptExtensions.txt, and means for all unlisted 3018 # code points, use their Script property values. 3019 # For the code points not listed in that file, the 3020 # default value is 'Unknown'. 3021 $default = "Unknown"; 3022 } 3023 3024 # Store them as a sub-hash as part of an array, with 3025 # both components. 3026 push @{$missings{$addr}}, { start => hex $start, 3027 end => hex $end, 3028 default => $default, 3029 property => $property 3030 }; 3031 } 3032 } 3033 3034 # There is nothing for the caller to process on this comment 3035 # line. 3036 next; 3037 } 3038 3039 # Unless to keep, remove comments. If to keep, ignore 3040 # comment-only lines 3041 if ($retain_trailing_comments{$addr}) { 3042 next if / ^ \s* \# /x; 3043 3044 # But escape any single quotes (done in both the comment and 3045 # non-comment portion; this could be a bug someday, but not 3046 # likely) 3047 s/'/\\'/g; 3048 } 3049 else { 3050 s/#.*//; 3051 } 3052 3053 # Remove trailing space, and skip this line if the result is empty 3054 s/\s+$//; 3055 next if /^$/; 3056 3057 # Call any handlers for this line, and skip further processing of 3058 # the line if the handler sets the line to null. 3059 foreach my $sub_ref (@{$each_line_handler{$addr}}) { 3060 &{$sub_ref}($self); 3061 next LINE if /^$/; 3062 } 3063 3064 # Here the line is ok. return success. 3065 return 1; 3066 } # End of looping through lines. 3067 3068 # If there are EOF handlers, call each (only once) and if it generates 3069 # more lines to process go back in the loop to handle them. 3070 while ($eof_handler{$addr}->@*) { 3071 &{$eof_handler{$addr}[0]}($self); 3072 shift $eof_handler{$addr}->@*; # Currently only get one shot at it. 3073 goto LINE if $added_lines{$addr}; 3074 } 3075 3076 # Return failure -- no more lines. 3077 return 0; 3078 3079 } 3080 3081 sub _next_line_with_remapped_range($self) { 3082 # like _next_line(), but for use on non-ASCII platforms. It sets $_ 3083 # to be the next logical input line, if any. Returns non-zero if such 3084 # a line exists. 'logical' means that any lines that have been added 3085 # via insert_lines() will be returned in $_ before the file is read 3086 # again. 3087 # 3088 # The difference from _next_line() is that this remaps the Unicode 3089 # code points in the input to those of the native platform. Each 3090 # input line contains a single code point, or a single contiguous 3091 # range of them This routine splits each range into its individual 3092 # code points and caches them. It returns the cached values, 3093 # translated into their native equivalents, one at a time, for each 3094 # call, before reading the next line. Since native values can only be 3095 # a single byte wide, no translation is needed for code points above 3096 # 0xFF, and ranges that are entirely above that number are not split. 3097 # If an input line contains the range 254-1000, it would be split into 3098 # three elements: 254, 255, and 256-1000. (The downstream table 3099 # insertion code will sort and coalesce the individual code points 3100 # into appropriate ranges.) 3101 3102 my $addr = pack 'J', refaddr $self; 3103 3104 while (1) { 3105 3106 # Look in cache before reading the next line. Return any cached 3107 # value, translated 3108 my $inserted = shift @{$remapped_lines{$addr}}; 3109 if (defined $inserted) { 3110 trace $inserted if main::DEBUG && $to_trace; 3111 $_ = $inserted =~ s/^ ( \d+ ) /sprintf("%04X", utf8::unicode_to_native($1))/xer; 3112 trace $_ if main::DEBUG && $to_trace; 3113 return 1; 3114 } 3115 3116 # Get the next line. 3117 return 0 unless _next_line($self); 3118 3119 # If there is a special handler for it, return the line, 3120 # untranslated. This should happen only for files that are 3121 # special, not being code-point related, such as property names. 3122 return 1 if $handler{$addr} 3123 != \&main::process_generic_property_file; 3124 3125 my ($range, $property_name, $map, @remainder) 3126 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 3127 3128 if (@remainder 3129 || ! defined $property_name 3130 || $range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x) 3131 { 3132 Carp::my_carp_bug("Unrecognized input line '$_'. Ignored"); 3133 } 3134 3135 my $low = hex $1; 3136 my $high = (defined $2) ? hex $2 : $low; 3137 3138 # If the input maps the range to another code point, remap the 3139 # target if it is between 0 and 255. 3140 my $tail; 3141 if (defined $map) { 3142 $map =~ s/\b 00 ( [0-9A-F]{2} ) \b/sprintf("%04X", utf8::unicode_to_native(hex $1))/gxe; 3143 $tail = "$property_name; $map"; 3144 $_ = "$range; $tail"; 3145 } 3146 else { 3147 $tail = $property_name; 3148 } 3149 3150 # If entire range is above 255, just return it, unchanged (except 3151 # any mapped-to code point, already changed above) 3152 return 1 if $low > 255; 3153 3154 # Cache an entry for every code point < 255. For those in the 3155 # range above 255, return a dummy entry for just that portion of 3156 # the range. Note that this will be out-of-order, but that is not 3157 # a problem. 3158 foreach my $code_point ($low .. $high) { 3159 if ($code_point > 255) { 3160 $_ = sprintf "%04X..%04X; $tail", $code_point, $high; 3161 return 1; 3162 } 3163 push @{$remapped_lines{$addr}}, "$code_point; $tail"; 3164 } 3165 } # End of looping through lines. 3166 3167 # NOTREACHED 3168 } 3169 3170# Not currently used, not fully tested. 3171# sub peek { 3172# # Non-destructive lookahead one non-adjusted, non-comment, non-blank 3173# # record. Not callable from an each_line_handler(), nor does it call 3174# # an each_line_handler() on the line. 3175# 3176# my $self = shift; 3177# my $addr = pack 'J', refaddr $self; 3178# 3179# foreach my $inserted_ref (@{$added_lines{$addr}}) { 3180# my ($adjusted, $line) = @{$inserted_ref}; 3181# next if $adjusted; 3182# 3183# # Remove comments and trailing space, and return a non-empty 3184# # resulting line 3185# $line =~ s/#.*//; 3186# $line =~ s/\s+$//; 3187# return $line if $line ne ""; 3188# } 3189# 3190# return if ! ref $handle{$addr}; # Don't read unless is real file 3191# while (1) { # Loop until find non-comment, non-empty line 3192# local $to_trace = 1 if main::DEBUG; 3193# trace $_ if main::DEBUG && $to_trace; 3194# return if ! defined (my $line = readline $handle{$addr}); 3195# chomp $line; 3196# push @{$added_lines{$addr}}, [ 0, $line ]; 3197# 3198# $line =~ s/#.*//; 3199# $line =~ s/\s+$//; 3200# return $line if $line ne ""; 3201# } 3202# 3203# return; 3204# } 3205 3206 3207 sub insert_lines($self, @lines) { 3208 # Lines can be inserted so that it looks like they were in the input 3209 # file at the place it was when this routine is called. See also 3210 # insert_adjusted_lines(). Lines inserted via this routine go through 3211 # any each_line_handler() 3212 3213 # Each inserted line is an array, with the first element being 0 to 3214 # indicate that this line hasn't been adjusted, and needs to be 3215 # processed. 3216 push @{$added_lines{pack 'J', refaddr $self}}, map { [ 0, $_ ] } @lines; 3217 return; 3218 } 3219 3220 sub insert_adjusted_lines($self, @lines) { 3221 # Lines can be inserted so that it looks like they were in the input 3222 # file at the place it was when this routine is called. See also 3223 # insert_lines(). Lines inserted via this routine are already fully 3224 # adjusted, ready to be processed; each_line_handler()s handlers will 3225 # not be called. This means this is not a completely general 3226 # facility, as only the last each_line_handler on the stack should 3227 # call this. It could be made more general, by passing to each of the 3228 # line_handlers their position on the stack, which they would pass on 3229 # to this routine, and that would replace the boolean first element in 3230 # the anonymous array pushed here, so that the next_line routine could 3231 # use that to call only those handlers whose index is after it on the 3232 # stack. But this is overkill for what is needed now. 3233 3234 trace $self if main::DEBUG && $to_trace; 3235 3236 # Each inserted line is an array, with the first element being 1 to 3237 # indicate that this line has been adjusted 3238 push @{$added_lines{pack 'J', refaddr $self}}, map { [ 1, $_ ] } @lines; 3239 return; 3240 } 3241 3242 sub get_missings($self) { 3243 # Returns the stored up @missings lines' values, and clears the list. 3244 # The values are in a hash, consisting of 'default' and 'property'. 3245 # However, since these lines can be stacked up, the return is an array 3246 # of all these hashes. 3247 3248 my $addr = pack 'J', refaddr $self; 3249 3250 # If not accepting a list return, just return the first one. 3251 return shift @{$missings{$addr}} unless wantarray; 3252 3253 my @return = @{$missings{$addr}}; 3254 undef @{$missings{$addr}}; 3255 return @return; 3256 } 3257 3258 sub _exclude_unassigned($self) { 3259 3260 # Takes the range in $_ and excludes code points that aren't assigned 3261 # in this release 3262 3263 state $skip_inserted_count = 0; 3264 3265 # Ignore recursive calls. 3266 if ($skip_inserted_count) { 3267 $skip_inserted_count--; 3268 return; 3269 } 3270 3271 # Find what code points are assigned in this release 3272 main::calculate_Assigned() if ! defined $Assigned; 3273 3274 my ($range, @remainder) 3275 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 3276 3277 # Examine the range. 3278 if ($range =~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x) 3279 { 3280 my $low = hex $1; 3281 my $high = (defined $2) ? hex $2 : $low; 3282 3283 # Split the range into subranges of just those code points in it 3284 # that are assigned. 3285 my @ranges = (Range_List->new(Initialize 3286 => Range->new($low, $high)) & $Assigned)->ranges; 3287 3288 # Do nothing if nothing in the original range is assigned in this 3289 # release; handle normally if everything is in this release. 3290 if (! @ranges) { 3291 $_ = ""; 3292 } 3293 elsif (@ranges != 1) { 3294 3295 # Here, some code points in the original range aren't in this 3296 # release; @ranges gives the ones that are. Create fake input 3297 # lines for each of the ranges, and set things up so that when 3298 # this routine is called on that fake input, it will do 3299 # nothing. 3300 $skip_inserted_count = @ranges; 3301 my $remainder = join ";", @remainder; 3302 for my $range (@ranges) { 3303 $self->insert_lines(sprintf("%04X..%04X;%s", 3304 $range->start, $range->end, $remainder)); 3305 } 3306 $_ = ""; # The original range is now defunct. 3307 } 3308 } 3309 3310 return; 3311 } 3312 3313 sub _fixup_obsolete_hanguls($self) { 3314 3315 # This is called only when compiling Unicode version 1. All Unicode 3316 # data for subsequent releases assumes that the code points that were 3317 # Hangul syllables in this release only are something else, so if 3318 # using such data, we have to override it 3319 3320 my $addr = pack 'J', refaddr $self; 3321 3322 my $object = main::property_ref($property{$addr}); 3323 $object->add_map($FIRST_REMOVED_HANGUL_SYLLABLE, 3324 $FINAL_REMOVED_HANGUL_SYLLABLE, 3325 $early{$addr}[3], # Passed-in value for these 3326 Replace => $UNCONDITIONALLY); 3327 } 3328 3329 sub _insert_property_into_line($self) { 3330 # Add a property field to $_, if this file requires it. 3331 3332 my $property = $property{pack 'J', refaddr $self}; 3333 $_ =~ s/(;|$)/; $property$1/; 3334 return; 3335 } 3336 3337 sub carp_bad_line($self, $message="") { 3338 # Output consistent error messages, using either a generic one, or the 3339 # one given by the optional parameter. To avoid gazillions of the 3340 # same message in case the syntax of a file is way off, this routine 3341 # only outputs the first instance of each message, incrementing a 3342 # count so the totals can be output at the end of the file. 3343 3344 my $addr = pack 'J', refaddr $self; 3345 3346 $message = 'Unexpected line' unless $message; 3347 3348 # No trailing punctuation so as to fit with our addenda. 3349 $message =~ s/[.:;,]$//; 3350 3351 # If haven't seen this exact message before, output it now. Otherwise 3352 # increment the count of how many times it has occurred 3353 unless ($errors{$addr}->{$message}) { 3354 Carp::my_carp("$message in '$_' in " 3355 . $file{$addr} 3356 . " at line $.. Skipping this line;"); 3357 $errors{$addr}->{$message} = 1; 3358 } 3359 else { 3360 $errors{$addr}->{$message}++; 3361 } 3362 3363 # Clear the line to prevent any further (meaningful) processing of it. 3364 $_ = ""; 3365 3366 return; 3367 } 3368} # End closure 3369 3370package Multi_Default; 3371 3372sub trace { return main::trace(@_); } 3373 3374# Certain properties in early versions of Unicode had more than one possible 3375# default for code points missing from the files. In these cases, one 3376# default applies to everything left over after all the others are applied, 3377# and for each of the others, there is a description of which class of code 3378# points applies to it. This object helps implement this by storing the 3379# defaults, and for all but that final default, an eval string that generates 3380# the class that it applies to. That class must be a Range_List, or contains 3381# a Range_List that the overloaded operators recognize as to be operated on. 3382# A string is used because this is called early when we know symbolically what 3383# needs to be done, but typically before any data is gathered. Thus the 3384# evaluation gets delayed until we have at hand all the needed information. 3385 3386{ # Closure 3387 3388 main::setup_package(); 3389 3390 my %class_defaults; 3391 # The defaults structure for the classes 3392 main::set_access('class_defaults', \%class_defaults, 'readable_array'); 3393 3394 my %other_default; 3395 # The default that applies to everything left over. 3396 main::set_access('other_default', \%other_default, 'r'); 3397 3398 my %iterator; 3399 3400 sub new { 3401 # The constructor is called with default => eval pairs, terminated by 3402 # the left-over default. e.g. 3403 # Multi_Default->new( 3404 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C 3405 # - 0x200D', 3406 # 'R' => 'some other expression that evaluates to code points', 3407 # . 3408 # . 3409 # . 3410 # 'U')); 3411 # It is best to leave the final value be the one that matches the 3412 # above-Unicode code points. 3413 3414 my $class = shift; 3415 3416 my $self = bless \do{my $anonymous_scalar}, $class; 3417 my $addr = pack 'J', refaddr $self; 3418 $iterator{$addr} = 0; 3419 3420 return $self unless @_; 3421 3422 while (@_ > 1) { 3423 $self->append_default(shift, shift); 3424 } 3425 3426 $self->set_final_default(shift); 3427 3428 return $self; 3429 } 3430 3431 sub append_default($self, $new_default, $eval) { 3432 my $addr = pack 'J', refaddr $self; 3433 3434 # Pushes a default setting to the current list 3435 push $class_defaults{$addr}->@*, [ $new_default, $eval ]; 3436 } 3437 3438 sub set_final_default($self, $new_default) { 3439 my $addr = pack 'J', refaddr $self; 3440 $other_default{$addr} = $new_default; 3441 } 3442 3443 sub get_next_defaults($self) { 3444 # Iterates and returns the next class of defaults. 3445 3446 my $addr = pack 'J', refaddr $self; 3447 if ($iterator{$addr}++ < $class_defaults{$addr}->@*) { 3448 return $class_defaults{$addr}->[$iterator{$addr}-1]->@*; 3449 } 3450 3451 $iterator{$addr} = 0; 3452 return undef; 3453 } 3454} 3455 3456package Alias; 3457 3458# An alias is one of the names that a table goes by. This class defines them 3459# including some attributes. Everything is currently setup in the 3460# constructor. 3461 3462 3463{ # Closure 3464 3465 main::setup_package(); 3466 3467 my %name; 3468 main::set_access('name', \%name, 'r'); 3469 3470 my %loose_match; 3471 # Should this name match loosely or not. 3472 main::set_access('loose_match', \%loose_match, 'r'); 3473 3474 my %make_re_pod_entry; 3475 # Some aliases should not get their own entries in the re section of the 3476 # pod, because they are covered by a wild-card, and some we want to 3477 # discourage use of. Binary 3478 main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's'); 3479 3480 my %ucd; 3481 # Is this documented to be accessible via Unicode::UCD 3482 main::set_access('ucd', \%ucd, 'r', 's'); 3483 3484 my %status; 3485 # Aliases have a status, like deprecated, or even suppressed (which means 3486 # they don't appear in documentation). Enum 3487 main::set_access('status', \%status, 'r'); 3488 3489 my %ok_as_filename; 3490 # Similarly, some aliases should not be considered as usable ones for 3491 # external use, such as file names, or we don't want documentation to 3492 # recommend them. Boolean 3493 main::set_access('ok_as_filename', \%ok_as_filename, 'r'); 3494 3495 sub new { 3496 my $class = shift; 3497 3498 my $self = bless \do { my $anonymous_scalar }, $class; 3499 my $addr = pack 'J', refaddr $self; 3500 3501 $name{$addr} = shift; 3502 $loose_match{$addr} = shift; 3503 $make_re_pod_entry{$addr} = shift; 3504 $ok_as_filename{$addr} = shift; 3505 $status{$addr} = shift; 3506 $ucd{$addr} = shift; 3507 3508 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3509 3510 # Null names are never ok externally 3511 $ok_as_filename{$addr} = 0 if $name{$addr} eq ""; 3512 3513 return $self; 3514 } 3515} 3516 3517package Range; 3518 3519# A range is the basic unit for storing code points, and is described in the 3520# comments at the beginning of the program. Each range has a starting code 3521# point; an ending code point (not less than the starting one); a value 3522# that applies to every code point in between the two end-points, inclusive; 3523# and an enum type that applies to the value. The type is for the user's 3524# convenience, and has no meaning here, except that a non-zero type is 3525# considered to not obey the normal Unicode rules for having standard forms. 3526# 3527# The same structure is used for both map and match tables, even though in the 3528# latter, the value (and hence type) is irrelevant and could be used as a 3529# comment. In map tables, the value is what all the code points in the range 3530# map to. Type 0 values have the standardized version of the value stored as 3531# well, so as to not have to recalculate it a lot. 3532 3533sub trace { return main::trace(@_); } 3534 3535{ # Closure 3536 3537 main::setup_package(); 3538 3539 my %start; 3540 main::set_access('start', \%start, 'r', 's'); 3541 3542 my %end; 3543 main::set_access('end', \%end, 'r', 's'); 3544 3545 my %value; 3546 main::set_access('value', \%value, 'r', 's'); 3547 3548 my %type; 3549 main::set_access('type', \%type, 'r'); 3550 3551 my %standard_form; 3552 # The value in internal standard form. Defined only if the type is 0. 3553 main::set_access('standard_form', \%standard_form); 3554 3555 # Note that if these fields change, the dump() method should as well 3556 3557 sub new($class, $_addr, $_end, @_args) { 3558 my $self = bless \do { my $anonymous_scalar }, $class; 3559 my $addr = pack 'J', refaddr $self; 3560 3561 $start{$addr} = $_addr; 3562 $end{$addr} = $_end; 3563 3564 my %args = @_args; 3565 3566 my $value = delete $args{'Value'}; # Can be 0 3567 $value = "" unless defined $value; 3568 $value{$addr} = $value; 3569 3570 $type{$addr} = delete $args{'Type'} || 0; 3571 3572 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 3573 3574 return $self; 3575 } 3576 3577 use overload 3578 fallback => 0, 3579 qw("") => "_operator_stringify", 3580 "." => \&main::_operator_dot, 3581 ".=" => \&main::_operator_dot_equal, 3582 ; 3583 3584 sub _operator_stringify($self, $other="", $reversed=0) { 3585 my $addr = pack 'J', refaddr $self; 3586 3587 # Output it like '0041..0065 (value)' 3588 my $return = sprintf("%04X", $start{$addr}) 3589 . '..' 3590 . sprintf("%04X", $end{$addr}); 3591 my $value = $value{$addr}; 3592 my $type = $type{$addr}; 3593 $return .= ' ('; 3594 $return .= "$value"; 3595 $return .= ", Type=$type" if $type != 0; 3596 $return .= ')'; 3597 3598 return $return; 3599 } 3600 3601 sub standard_form($self) { 3602 # Calculate the standard form only if needed, and cache the result. 3603 # The standard form is the value itself if the type is special. 3604 # This represents a considerable CPU and memory saving - at the time 3605 # of writing there are 368676 non-special objects, but the standard 3606 # form is only requested for 22047 of them - ie about 6%. 3607 3608 my $addr = pack 'J', refaddr $self; 3609 3610 return $standard_form{$addr} if defined $standard_form{$addr}; 3611 3612 my $value = $value{$addr}; 3613 return $value if $type{$addr}; 3614 return $standard_form{$addr} = main::standardize($value); 3615 } 3616 3617 sub dump($self, $indent) { 3618 # Human, not machine readable. For machine readable, comment out this 3619 # entire routine and let the standard one take effect. 3620 my $addr = pack 'J', refaddr $self; 3621 3622 my $return = $indent 3623 . sprintf("%04X", $start{$addr}) 3624 . '..' 3625 . sprintf("%04X", $end{$addr}) 3626 . " '$value{$addr}';"; 3627 if (! defined $standard_form{$addr}) { 3628 $return .= "(type=$type{$addr})"; 3629 } 3630 elsif ($standard_form{$addr} ne $value{$addr}) { 3631 $return .= "(standard '$standard_form{$addr}')"; 3632 } 3633 return $return; 3634 } 3635} # End closure 3636 3637package _Range_List_Base; 3638 3639# Base class for range lists. A range list is simply an ordered list of 3640# ranges, so that the ranges with the lowest starting numbers are first in it. 3641# 3642# When a new range is added that is adjacent to an existing range that has the 3643# same value and type, it merges with it to form a larger range. 3644# 3645# Ranges generally do not overlap, except that there can be multiple entries 3646# of single code point ranges. This is because of NameAliases.txt. 3647# 3648# In this program, there is a standard value such that if two different 3649# values, have the same standard value, they are considered equivalent. This 3650# value was chosen so that it gives correct results on Unicode data 3651 3652# There are a number of methods to manipulate range lists, and some operators 3653# are overloaded to handle them. 3654 3655sub trace { return main::trace(@_); } 3656 3657{ # Closure 3658 3659 our $addr; 3660 3661 # Max is initialized to a negative value that isn't adjacent to 0, for 3662 # simpler tests 3663 my $max_init = -2; 3664 3665 main::setup_package(); 3666 3667 my %ranges; 3668 # The list of ranges 3669 main::set_access('ranges', \%ranges, 'readable_array'); 3670 3671 my %max; 3672 # The highest code point in the list. This was originally a method, but 3673 # actual measurements said it was used a lot. 3674 main::set_access('max', \%max, 'r'); 3675 3676 my %each_range_iterator; 3677 # Iterator position for each_range() 3678 main::set_access('each_range_iterator', \%each_range_iterator); 3679 3680 my %owner_name_of; 3681 # Name of parent this is attached to, if any. Solely for better error 3682 # messages. 3683 main::set_access('owner_name_of', \%owner_name_of, 'p_r'); 3684 3685 my %_search_ranges_cache; 3686 # A cache of the previous result from _search_ranges(), for better 3687 # performance 3688 main::set_access('_search_ranges_cache', \%_search_ranges_cache); 3689 3690 sub new { 3691 my $class = shift; 3692 my %args = @_; 3693 3694 # Optional initialization data for the range list. NOTE: For large 3695 # ranges, it is better to use Range object rather than 3696 # [ low .. high ] 3697 # as it iterates through each one individually in the latter case. 3698 my $initialize = delete $args{'Initialize'}; 3699 3700 my $self; 3701 3702 # Use _union() to initialize. _union() returns an object of this 3703 # class, which means that it will call this constructor recursively. 3704 # But it won't have this $initialize parameter so that it won't 3705 # infinitely loop on this. 3706 return _union($class, $initialize, %args) if defined $initialize; 3707 3708 $self = bless \do { my $anonymous_scalar }, $class; 3709 my $addr = pack 'J', refaddr $self; 3710 3711 # Optional parent object, only for debug info. 3712 $owner_name_of{$addr} = delete $args{'Owner'}; 3713 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr}; 3714 3715 # Stringify, in case it is an object. 3716 $owner_name_of{$addr} = "$owner_name_of{$addr}"; 3717 3718 # This is used only for error messages, and so a colon is added 3719 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne ""; 3720 3721 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 3722 3723 $max{$addr} = $max_init; 3724 3725 $_search_ranges_cache{$addr} = 0; 3726 $ranges{$addr} = []; 3727 3728 return $self; 3729 } 3730 3731 use overload 3732 fallback => 0, 3733 qw("") => "_operator_stringify", 3734 "." => \&main::_operator_dot, 3735 ".=" => \&main::_operator_dot_equal, 3736 ; 3737 3738 sub _operator_stringify($self, $other="", $reversed=0) { 3739 my $addr = pack 'J', refaddr $self; 3740 3741 return "Range_List attached to '$owner_name_of{$addr}'" 3742 if $owner_name_of{$addr}; 3743 return "anonymous Range_List " . \$self; 3744 } 3745 3746 sub _union { 3747 # Returns the union of the input code points. It can be called as 3748 # either a constructor or a method. If called as a method, the result 3749 # will be a new() instance of the calling object, containing the union 3750 # of that object with the other parameter's code points; if called as 3751 # a constructor, the first parameter gives the class that the new object 3752 # should be, and the second parameter gives the code points to go into 3753 # it. 3754 # In either case, there are two parameters looked at by this routine; 3755 # any additional parameters are passed to the new() constructor. 3756 # 3757 # The code points can come in the form of some object that contains 3758 # ranges, and has a conventionally named method to access them; or 3759 # they can be an array of individual code points (as integers); or 3760 # just a single code point. 3761 # 3762 # If they are ranges, this routine doesn't make any effort to preserve 3763 # the range values and types of one input over the other. Therefore 3764 # this base class should not allow _union to be called from other than 3765 # initialization code, so as to prevent two tables from being added 3766 # together where the range values matter. The general form of this 3767 # routine therefore belongs in a derived class, but it was moved here 3768 # to avoid duplication of code. The failure to overload this in this 3769 # class keeps it safe. 3770 # 3771 # It does make the effort during initialization to accept tables with 3772 # multiple values for the same code point, and to preserve the order 3773 # of these. If there is only one input range or range set, it doesn't 3774 # sort (as it should already be sorted to the desired order), and will 3775 # accept multiple values per code point. Otherwise it will merge 3776 # multiple values into a single one. 3777 3778 my $self; 3779 my @args; # Arguments to pass to the constructor 3780 3781 my $class = shift; 3782 3783 # If a method call, will start the union with the object itself, and 3784 # the class of the new object will be the same as self. 3785 if (ref $class) { 3786 $self = $class; 3787 $class = ref $self; 3788 push @args, $self; 3789 } 3790 3791 # Add the other required parameter. 3792 push @args, shift; 3793 # Rest of parameters are passed on to the constructor 3794 3795 # Accumulate all records from both lists. 3796 my @records; 3797 my $input_count = 0; 3798 for my $arg (@args) { 3799 #local $to_trace = 0 if main::DEBUG; 3800 trace "argument = $arg" if main::DEBUG && $to_trace; 3801 if (! defined $arg) { 3802 my $message = ""; 3803 if (defined $self) { 3804 $message .= $owner_name_of{pack 'J', refaddr $self}; 3805 } 3806 Carp::my_carp_bug($message . "Undefined argument to _union. No union done."); 3807 return; 3808 } 3809 3810 $arg = [ $arg ] if ! ref $arg; 3811 my $type = ref $arg; 3812 if ($type eq 'ARRAY') { 3813 foreach my $element (@$arg) { 3814 push @records, Range->new($element, $element); 3815 $input_count++; 3816 } 3817 } 3818 elsif ($arg->isa('Range')) { 3819 push @records, $arg; 3820 $input_count++; 3821 } 3822 elsif ($arg->can('ranges')) { 3823 push @records, $arg->ranges; 3824 $input_count++; 3825 } 3826 else { 3827 my $message = ""; 3828 if (defined $self) { 3829 $message .= $owner_name_of{pack 'J', refaddr $self}; 3830 } 3831 Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done."); 3832 return; 3833 } 3834 } 3835 3836 # Sort with the range containing the lowest ordinal first, but if 3837 # two ranges start at the same code point, sort with the bigger range 3838 # of the two first, because it takes fewer cycles. 3839 if ($input_count > 1) { 3840 @records = sort { ($a->start <=> $b->start) 3841 or 3842 # if b is shorter than a, b->end will be 3843 # less than a->end, and we want to select 3844 # a, so want to return -1 3845 ($b->end <=> $a->end) 3846 } @records; 3847 } 3848 3849 my $new = $class->new(@_); 3850 3851 # Fold in records so long as they add new information. 3852 for my $set (@records) { 3853 my $start = $set->start; 3854 my $end = $set->end; 3855 my $value = $set->value; 3856 my $type = $set->type; 3857 if ($start > $new->max) { 3858 $new->_add_delete('+', $start, $end, $value, Type => $type); 3859 } 3860 elsif ($end > $new->max) { 3861 $new->_add_delete('+', $new->max +1, $end, $value, 3862 Type => $type); 3863 } 3864 elsif ($input_count == 1) { 3865 # Here, overlaps existing range, but is from a single input, 3866 # so preserve the multiple values from that input. 3867 $new->_add_delete('+', $start, $end, $value, Type => $type, 3868 Replace => $MULTIPLE_AFTER); 3869 } 3870 } 3871 3872 return $new; 3873 } 3874 3875 sub range_count($self) { # Return the number of ranges in the range list 3876 return scalar @{$ranges{pack 'J', refaddr $self}}; 3877 } 3878 3879 sub min($self) { 3880 # Returns the minimum code point currently in the range list, or if 3881 # the range list is empty, 2 beyond the max possible. This is a 3882 # method because used so rarely, that not worth saving between calls, 3883 # and having to worry about changing it as ranges are added and 3884 # deleted. 3885 3886 my $addr = pack 'J', refaddr $self; 3887 3888 # If the range list is empty, return a large value that isn't adjacent 3889 # to any that could be in the range list, for simpler tests 3890 return $MAX_WORKING_CODEPOINT + 2 unless scalar @{$ranges{$addr}}; 3891 return $ranges{$addr}->[0]->start; 3892 } 3893 3894 sub contains($self, $codepoint) { 3895 # Boolean: Is argument in the range list? If so returns $i such that: 3896 # range[$i]->end < $codepoint <= range[$i+1]->end 3897 # which is one beyond what you want; this is so that the 0th range 3898 # doesn't return false 3899 3900 my $i = $self->_search_ranges($codepoint); 3901 return 0 unless defined $i; 3902 3903 # The search returns $i, such that 3904 # range[$i-1]->end < $codepoint <= range[$i]->end 3905 # So is in the table if and only iff it is at least the start position 3906 # of range $i. 3907 return 0 if $ranges{pack 'J', refaddr $self}->[$i]->start > $codepoint; 3908 return $i + 1; 3909 } 3910 3911 sub containing_range($self, $codepoint) { 3912 # Returns the range object that contains the code point, undef if none 3913 my $i = $self->contains($codepoint); 3914 return unless $i; 3915 3916 # contains() returns 1 beyond where we should look 3917 return $ranges{pack 'J', refaddr $self}->[$i-1]; 3918 } 3919 3920 sub value_of($self, $codepoint) { 3921 # Returns the value associated with the code point, undef if none 3922 my $range = $self->containing_range($codepoint); 3923 return unless defined $range; 3924 3925 return $range->value; 3926 } 3927 3928 sub type_of($self, $codepoint) { 3929 # Returns the type of the range containing the code point, undef if 3930 # the code point is not in the table 3931 my $range = $self->containing_range($codepoint); 3932 return unless defined $range; 3933 3934 return $range->type; 3935 } 3936 3937 sub _search_ranges($self, $code_point) { 3938 # Find the range in the list which contains a code point, or where it 3939 # should go if were to add it. That is, it returns $i, such that: 3940 # range[$i-1]->end < $codepoint <= range[$i]->end 3941 # Returns undef if no such $i is possible (e.g. at end of table), or 3942 # if there is an error. 3943 my $addr = pack 'J', refaddr $self; 3944 3945 return if $code_point > $max{$addr}; 3946 my $r = $ranges{$addr}; # The current list of ranges 3947 my $range_list_size = scalar @$r; 3948 my $i; 3949 3950 use integer; # want integer division 3951 3952 # Use the cached result as the starting guess for this one, because, 3953 # an experiment on 5.1 showed that 90% of the time the cache was the 3954 # same as the result on the next call (and 7% it was one less). 3955 $i = $_search_ranges_cache{$addr}; 3956 $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob. 3957 # from an intervening deletion 3958 #local $to_trace = 1 if main::DEBUG; 3959 trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point); 3960 return $i if $code_point <= $r->[$i]->end 3961 && ($i == 0 || $r->[$i-1]->end < $code_point); 3962 3963 # Here the cache doesn't yield the correct $i. Try adding 1. 3964 if ($i < $range_list_size - 1 3965 && $r->[$i]->end < $code_point && 3966 $code_point <= $r->[$i+1]->end) 3967 { 3968 $i++; 3969 trace "next \$i is correct: $i" if main::DEBUG && $to_trace; 3970 $_search_ranges_cache{$addr} = $i; 3971 return $i; 3972 } 3973 3974 # Here, adding 1 also didn't work. We do a binary search to 3975 # find the correct position, starting with current $i 3976 my $lower = 0; 3977 my $upper = $range_list_size - 1; 3978 while (1) { 3979 trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace; 3980 3981 if ($code_point <= $r->[$i]->end) { 3982 3983 # Here we have met the upper constraint. We can quit if we 3984 # also meet the lower one. 3985 last if $i == 0 || $r->[$i-1]->end < $code_point; 3986 3987 $upper = $i; # Still too high. 3988 3989 } 3990 else { 3991 3992 # Here, $r[$i]->end < $code_point, so look higher up. 3993 $lower = $i; 3994 } 3995 3996 # Split search domain in half to try again. 3997 my $temp = ($upper + $lower) / 2; 3998 3999 # No point in continuing unless $i changes for next time 4000 # in the loop. 4001 if ($temp == $i) { 4002 4003 # We can't reach the highest element because of the averaging. 4004 # So if one below the upper edge, force it there and try one 4005 # more time. 4006 if ($i == $range_list_size - 2) { 4007 4008 trace "Forcing to upper edge" if main::DEBUG && $to_trace; 4009 $i = $range_list_size - 1; 4010 4011 # Change $lower as well so if fails next time through, 4012 # taking the average will yield the same $i, and we will 4013 # quit with the error message just below. 4014 $lower = $i; 4015 next; 4016 } 4017 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken."); 4018 return; 4019 } 4020 $i = $temp; 4021 } # End of while loop 4022 4023 if (main::DEBUG && $to_trace) { 4024 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i; 4025 trace "i= [ $i ]", $r->[$i]; 4026 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1; 4027 } 4028 4029 # Here we have found the offset. Cache it as a starting point for the 4030 # next call. 4031 $_search_ranges_cache{$addr} = $i; 4032 return $i; 4033 } 4034 4035 sub _add_delete { 4036 # Add, replace or delete ranges to or from a list. The $type 4037 # parameter gives which: 4038 # '+' => insert or replace a range, returning a list of any changed 4039 # ranges. 4040 # '-' => delete a range, returning a list of any deleted ranges. 4041 # 4042 # The next three parameters give respectively the start, end, and 4043 # value associated with the range. 'value' should be null unless the 4044 # operation is '+'; 4045 # 4046 # The range list is kept sorted so that the range with the lowest 4047 # starting position is first in the list, and generally, adjacent 4048 # ranges with the same values are merged into a single larger one (see 4049 # exceptions below). 4050 # 4051 # There are more parameters; all are key => value pairs: 4052 # Type gives the type of the value. It is only valid for '+'. 4053 # All ranges have types; if this parameter is omitted, 0 is 4054 # assumed. Ranges with type 0 are assumed to obey the 4055 # Unicode rules for casing, etc; ranges with other types are 4056 # not. Otherwise, the type is arbitrary, for the caller's 4057 # convenience, and looked at only by this routine to keep 4058 # adjacent ranges of different types from being merged into 4059 # a single larger range, and when Replace => 4060 # $IF_NOT_EQUIVALENT is specified (see just below). 4061 # Replace determines what to do if the range list already contains 4062 # ranges which coincide with all or portions of the input 4063 # range. It is only valid for '+': 4064 # => $NO means that the new value is not to replace 4065 # any existing ones, but any empty gaps of the 4066 # range list coinciding with the input range 4067 # will be filled in with the new value. 4068 # => $UNCONDITIONALLY means to replace the existing values with 4069 # this one unconditionally. However, if the 4070 # new and old values are identical, the 4071 # replacement is skipped to save cycles 4072 # => $IF_NOT_EQUIVALENT means to replace the existing values 4073 # (the default) with this one if they are not equivalent. 4074 # Ranges are equivalent if their types are the 4075 # same, and they are the same string; or if 4076 # both are type 0 ranges, if their Unicode 4077 # standard forms are identical. In this last 4078 # case, the routine chooses the more "modern" 4079 # one to use. This is because some of the 4080 # older files are formatted with values that 4081 # are, for example, ALL CAPs, whereas the 4082 # derived files have a more modern style, 4083 # which looks better. By looking for this 4084 # style when the pre-existing and replacement 4085 # standard forms are the same, we can move to 4086 # the modern style 4087 # => $MULTIPLE_BEFORE means that if this range duplicates an 4088 # existing one, but has a different value, 4089 # don't replace the existing one, but insert 4090 # this one so that the same range can occur 4091 # multiple times. They are stored LIFO, so 4092 # that the final one inserted is the first one 4093 # returned in an ordered search of the table. 4094 # If this is an exact duplicate, including the 4095 # value, the original will be moved to be 4096 # first, before any other duplicate ranges 4097 # with different values. 4098 # => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored 4099 # FIFO, so that this one is inserted after all 4100 # others that currently exist. If this is an 4101 # exact duplicate, including value, of an 4102 # existing range, this one is discarded 4103 # (leaving the existing one in its original, 4104 # higher priority position 4105 # => $CROAK Die with an error if is already there 4106 # => anything else is the same as => $IF_NOT_EQUIVALENT 4107 # 4108 # "same value" means identical for non-type-0 ranges, and it means 4109 # having the same standard forms for type-0 ranges. 4110 4111 return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5; 4112 4113 my $self = shift; 4114 my $operation = shift; # '+' for add/replace; '-' for delete; 4115 my $start = shift; 4116 my $end = shift; 4117 my $value = shift; 4118 4119 my %args = @_; 4120 4121 $value = "" if not defined $value; # warning: $value can be "0" 4122 4123 my $replace = delete $args{'Replace'}; 4124 $replace = $IF_NOT_EQUIVALENT unless defined $replace; 4125 4126 my $type = delete $args{'Type'}; 4127 $type = 0 unless defined $type; 4128 4129 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 4130 4131 my $addr = pack 'J', refaddr $self; 4132 4133 if ($operation ne '+' && $operation ne '-') { 4134 Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken."); 4135 return; 4136 } 4137 unless (defined $start && defined $end) { 4138 Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken."); 4139 return; 4140 } 4141 unless ($end >= $start) { 4142 Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken."); 4143 return; 4144 } 4145 #local $to_trace = 1 if main::DEBUG; 4146 4147 if ($operation eq '-') { 4148 if ($replace != $IF_NOT_EQUIVALENT) { 4149 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT."); 4150 $replace = $IF_NOT_EQUIVALENT; 4151 } 4152 if ($type) { 4153 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0."); 4154 $type = 0; 4155 } 4156 if ($value ne "") { 4157 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\"."); 4158 $value = ""; 4159 } 4160 } 4161 4162 my $r = $ranges{$addr}; # The current list of ranges 4163 my $range_list_size = scalar @$r; # And its size 4164 my $max = $max{$addr}; # The current high code point in 4165 # the list of ranges 4166 4167 # Do a special case requiring fewer machine cycles when the new range 4168 # starts after the current highest point. The Unicode input data is 4169 # structured so this is common. 4170 if ($start > $max) { 4171 4172 trace "$owner_name_of{$addr} $operation", sprintf("%04X..%04X (%s) type=%d; prev max=%04X", $start, $end, $value, $type, $max) if main::DEBUG && $to_trace; 4173 return if $operation eq '-'; # Deleting a non-existing range is a 4174 # no-op 4175 4176 # If the new range doesn't logically extend the current final one 4177 # in the range list, create a new range at the end of the range 4178 # list. (max cleverly is initialized to a negative number not 4179 # adjacent to 0 if the range list is empty, so even adding a range 4180 # to an empty range list starting at 0 will have this 'if' 4181 # succeed.) 4182 if ($start > $max + 1 # non-adjacent means can't extend. 4183 || @{$r}[-1]->value ne $value # values differ, can't extend. 4184 || @{$r}[-1]->type != $type # types differ, can't extend. 4185 ) { 4186 push @$r, Range->new($start, $end, 4187 Value => $value, 4188 Type => $type); 4189 } 4190 else { 4191 4192 # Here, the new range starts just after the current highest in 4193 # the range list, and they have the same type and value. 4194 # Extend the existing range to incorporate the new one. 4195 @{$r}[-1]->set_end($end); 4196 } 4197 4198 # This becomes the new maximum. 4199 $max{$addr} = $end; 4200 4201 return; 4202 } 4203 #local $to_trace = 0 if main::DEBUG; 4204 4205 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace; 4206 4207 # Here, the input range isn't after the whole rest of the range list. 4208 # Most likely 'splice' will be needed. The rest of the routine finds 4209 # the needed splice parameters, and if necessary, does the splice. 4210 # First, find the offset parameter needed by the splice function for 4211 # the input range. Note that the input range may span multiple 4212 # existing ones, but we'll worry about that later. For now, just find 4213 # the beginning. If the input range is to be inserted starting in a 4214 # position not currently in the range list, it must (obviously) come 4215 # just after the range below it, and just before the range above it. 4216 # Slightly less obviously, it will occupy the position currently 4217 # occupied by the range that is to come after it. More formally, we 4218 # are looking for the position, $i, in the array of ranges, such that: 4219 # 4220 # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end 4221 # 4222 # (The ordered relationships within existing ranges are also shown in 4223 # the equation above). However, if the start of the input range is 4224 # within an existing range, the splice offset should point to that 4225 # existing range's position in the list; that is $i satisfies a 4226 # somewhat different equation, namely: 4227 # 4228 #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end 4229 # 4230 # More briefly, $start can come before or after r[$i]->start, and at 4231 # this point, we don't know which it will be. However, these 4232 # two equations share these constraints: 4233 # 4234 # r[$i-1]->end < $start <= r[$i]->end 4235 # 4236 # And that is good enough to find $i. 4237 4238 my $i = $self->_search_ranges($start); 4239 if (! defined $i) { 4240 Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed"); 4241 return; 4242 } 4243 4244 # The search function returns $i such that: 4245 # 4246 # r[$i-1]->end < $start <= r[$i]->end 4247 # 4248 # That means that $i points to the first range in the range list 4249 # that could possibly be affected by this operation. We still don't 4250 # know if the start of the input range is within r[$i], or if it 4251 # points to empty space between r[$i-1] and r[$i]. 4252 trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace; 4253 4254 # Special case the insertion of data that is not to replace any 4255 # existing data. 4256 if ($replace == $NO) { # If $NO, has to be operation '+' 4257 #local $to_trace = 1 if main::DEBUG; 4258 trace "Doesn't replace" if main::DEBUG && $to_trace; 4259 4260 # Here, the new range is to take effect only on those code points 4261 # that aren't already in an existing range. This can be done by 4262 # looking through the existing range list and finding the gaps in 4263 # the ranges that this new range affects, and then calling this 4264 # function recursively on each of those gaps, leaving untouched 4265 # anything already in the list. Gather up a list of the changed 4266 # gaps first so that changes to the internal state as new ranges 4267 # are added won't be a problem. 4268 my @gap_list; 4269 4270 # First, if the starting point of the input range is outside an 4271 # existing one, there is a gap from there to the beginning of the 4272 # existing range -- add a span to fill the part that this new 4273 # range occupies 4274 if ($start < $r->[$i]->start) { 4275 push @gap_list, Range->new($start, 4276 main::min($end, 4277 $r->[$i]->start - 1), 4278 Type => $type); 4279 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace; 4280 } 4281 4282 # Then look through the range list for other gaps until we reach 4283 # the highest range affected by the input one. 4284 my $j; 4285 for ($j = $i+1; $j < $range_list_size; $j++) { 4286 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace; 4287 last if $end < $r->[$j]->start; 4288 4289 # If there is a gap between when this range starts and the 4290 # previous one ends, add a span to fill it. Note that just 4291 # because there are two ranges doesn't mean there is a 4292 # non-zero gap between them. It could be that they have 4293 # different values or types 4294 if ($r->[$j-1]->end + 1 != $r->[$j]->start) { 4295 push @gap_list, 4296 Range->new($r->[$j-1]->end + 1, 4297 $r->[$j]->start - 1, 4298 Type => $type); 4299 trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace; 4300 } 4301 } 4302 4303 # Here, we have either found an existing range in the range list, 4304 # beyond the area affected by the input one, or we fell off the 4305 # end of the loop because the input range affects the whole rest 4306 # of the range list. In either case, $j is 1 higher than the 4307 # highest affected range. If $j == $i, it means that there are no 4308 # affected ranges, that the entire insertion is in the gap between 4309 # r[$i-1], and r[$i], which we already have taken care of before 4310 # the loop. 4311 # On the other hand, if there are affected ranges, it might be 4312 # that there is a gap that needs filling after the final such 4313 # range to the end of the input range 4314 if ($r->[$j-1]->end < $end) { 4315 push @gap_list, Range->new(main::max($start, 4316 $r->[$j-1]->end + 1), 4317 $end, 4318 Type => $type); 4319 trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace; 4320 } 4321 4322 # Call recursively to fill in all the gaps. 4323 foreach my $gap (@gap_list) { 4324 $self->_add_delete($operation, 4325 $gap->start, 4326 $gap->end, 4327 $value, 4328 Type => $type); 4329 } 4330 4331 return; 4332 } 4333 4334 # Here, we have taken care of the case where $replace is $NO. 4335 # Remember that here, r[$i-1]->end < $start <= r[$i]->end 4336 # If inserting a multiple record, this is where it goes, before the 4337 # first (if any) existing one if inserting LIFO. (If this is to go 4338 # afterwards, FIFO, we below move the pointer to there.) These imply 4339 # an insertion, and no change to any existing ranges. Note that $i 4340 # can be -1 if this new range doesn't actually duplicate any existing, 4341 # and comes at the beginning of the list. 4342 if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) { 4343 4344 if ($start != $end) { 4345 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point. No action taken."); 4346 return; 4347 } 4348 4349 # If the new code point is within a current range ... 4350 if ($end >= $r->[$i]->start) { 4351 4352 # Don't add an exact duplicate, as it isn't really a multiple 4353 my $existing_value = $r->[$i]->value; 4354 my $existing_type = $r->[$i]->type; 4355 return if $value eq $existing_value && $type eq $existing_type; 4356 4357 # If the multiple value is part of an existing range, we want 4358 # to split up that range, so that only the single code point 4359 # is affected. To do this, we first call ourselves 4360 # recursively to delete that code point from the table, having 4361 # preserved its current data above. Then we call ourselves 4362 # recursively again to add the new multiple, which we know by 4363 # the test just above is different than the current code 4364 # point's value, so it will become a range containing a single 4365 # code point: just itself. Finally, we add back in the 4366 # pre-existing code point, which will again be a single code 4367 # point range. Because 'i' likely will have changed as a 4368 # result of these operations, we can't just continue on, but 4369 # do this operation recursively as well. If we are inserting 4370 # LIFO, the pre-existing code point needs to go after the new 4371 # one, so use MULTIPLE_AFTER; and vice versa. 4372 if ($r->[$i]->start != $r->[$i]->end) { 4373 $self->_add_delete('-', $start, $end, ""); 4374 $self->_add_delete('+', $start, $end, $value, Type => $type); 4375 return $self->_add_delete('+', 4376 $start, $end, 4377 $existing_value, 4378 Type => $existing_type, 4379 Replace => ($replace == $MULTIPLE_BEFORE) 4380 ? $MULTIPLE_AFTER 4381 : $MULTIPLE_BEFORE); 4382 } 4383 } 4384 4385 # If to place this new record after, move to beyond all existing 4386 # ones; but don't add this one if identical to any of them, as it 4387 # isn't really a multiple. This leaves the original order, so 4388 # that the current request is ignored. The reasoning is that the 4389 # previous request that wanted this record to have high priority 4390 # should have precedence. 4391 if ($replace == $MULTIPLE_AFTER) { 4392 while ($i < @$r && $r->[$i]->start == $start) { 4393 return if $value eq $r->[$i]->value 4394 && $type eq $r->[$i]->type; 4395 $i++; 4396 } 4397 } 4398 else { 4399 # If instead we are to place this new record before any 4400 # existing ones, remove any identical ones that come after it. 4401 # This changes the existing order so that the new one is 4402 # first, as is being requested. 4403 for (my $j = $i + 1; 4404 $j < @$r && $r->[$j]->start == $start; 4405 $j++) 4406 { 4407 if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) { 4408 splice @$r, $j, 1; 4409 last; # There should only be one instance, so no 4410 # need to keep looking 4411 } 4412 } 4413 } 4414 4415 trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace; 4416 my @return = splice @$r, 4417 $i, 4418 0, 4419 Range->new($start, 4420 $end, 4421 Value => $value, 4422 Type => $type); 4423 if (main::DEBUG && $to_trace) { 4424 trace "After splice:"; 4425 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2; 4426 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1; 4427 trace "i =[", $i, "]", $r->[$i] if $i >= 0; 4428 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1; 4429 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2; 4430 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3; 4431 } 4432 return @return; 4433 } 4434 4435 # Here, we have taken care of $NO and $MULTIPLE_foo replaces. This 4436 # leaves delete, insert, and replace either unconditionally or if not 4437 # equivalent. $i still points to the first potential affected range. 4438 # Now find the highest range affected, which will determine the length 4439 # parameter to splice. (The input range can span multiple existing 4440 # ones.) If this isn't a deletion, while we are looking through the 4441 # range list, see also if this is a replacement rather than a clean 4442 # insertion; that is if it will change the values of at least one 4443 # existing range. Start off assuming it is an insert, until find it 4444 # isn't. 4445 my $clean_insert = $operation eq '+'; 4446 my $j; # This will point to the highest affected range 4447 4448 # For non-zero types, the standard form is the value itself; 4449 my $standard_form = ($type) ? $value : main::standardize($value); 4450 4451 for ($j = $i; $j < $range_list_size; $j++) { 4452 trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace; 4453 4454 # If find a range that it doesn't overlap into, we can stop 4455 # searching 4456 last if $end < $r->[$j]->start; 4457 4458 # Here, overlaps the range at $j. If the values don't match, 4459 # and so far we think this is a clean insertion, it becomes a 4460 # non-clean insertion, i.e., a 'change' or 'replace' instead. 4461 if ($clean_insert) { 4462 if ($r->[$j]->standard_form ne $standard_form) { 4463 $clean_insert = 0; 4464 if ($replace == $CROAK) { 4465 main::croak("The range to add " 4466 . sprintf("%04X", $start) 4467 . '-' 4468 . sprintf("%04X", $end) 4469 . " with value '$value' overlaps an existing range $r->[$j]"); 4470 } 4471 } 4472 else { 4473 4474 # Here, the two values are essentially the same. If the 4475 # two are actually identical, replacing wouldn't change 4476 # anything so skip it. 4477 my $pre_existing = $r->[$j]->value; 4478 if ($pre_existing ne $value) { 4479 4480 # Here the new and old standardized values are the 4481 # same, but the non-standardized values aren't. If 4482 # replacing unconditionally, then replace 4483 if( $replace == $UNCONDITIONALLY) { 4484 $clean_insert = 0; 4485 } 4486 else { 4487 4488 # Here, are replacing conditionally. Decide to 4489 # replace or not based on which appears to look 4490 # the "nicest". If one is mixed case and the 4491 # other isn't, choose the mixed case one. 4492 my $new_mixed = $value =~ /[A-Z]/ 4493 && $value =~ /[a-z]/; 4494 my $old_mixed = $pre_existing =~ /[A-Z]/ 4495 && $pre_existing =~ /[a-z]/; 4496 4497 if ($old_mixed != $new_mixed) { 4498 $clean_insert = 0 if $new_mixed; 4499 if (main::DEBUG && $to_trace) { 4500 if ($clean_insert) { 4501 trace "Retaining $pre_existing over $value"; 4502 } 4503 else { 4504 trace "Replacing $pre_existing with $value"; 4505 } 4506 } 4507 } 4508 else { 4509 4510 # Here casing wasn't different between the two. 4511 # If one has hyphens or underscores and the 4512 # other doesn't, choose the one with the 4513 # punctuation. 4514 my $new_punct = $value =~ /[-_]/; 4515 my $old_punct = $pre_existing =~ /[-_]/; 4516 4517 if ($old_punct != $new_punct) { 4518 $clean_insert = 0 if $new_punct; 4519 if (main::DEBUG && $to_trace) { 4520 if ($clean_insert) { 4521 trace "Retaining $pre_existing over $value"; 4522 } 4523 else { 4524 trace "Replacing $pre_existing with $value"; 4525 } 4526 } 4527 } # else existing one is just as "good"; 4528 # retain it to save cycles. 4529 } 4530 } 4531 } 4532 } 4533 } 4534 } # End of loop looking for highest affected range. 4535 4536 # Here, $j points to one beyond the highest range that this insertion 4537 # affects (hence to beyond the range list if that range is the final 4538 # one in the range list). 4539 4540 # The splice length is all the affected ranges. Get it before 4541 # subtracting, for efficiency, so we don't have to later add 1. 4542 my $length = $j - $i; 4543 4544 $j--; # $j now points to the highest affected range. 4545 trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace; 4546 4547 # Here, have taken care of $NO and $MULTIPLE_foo replaces. 4548 # $j points to the highest affected range. But it can be < $i or even 4549 # -1. These happen only if the insertion is entirely in the gap 4550 # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop 4551 # above exited first time through with $end < $r->[$i]->start. (And 4552 # then we subtracted one from j) This implies also that $start < 4553 # $r->[$i]->start, but we know from above that $r->[$i-1]->end < 4554 # $start, so the entire input range is in the gap. 4555 if ($j < $i) { 4556 4557 # Here the entire input range is in the gap before $i. 4558 4559 if (main::DEBUG && $to_trace) { 4560 if ($i) { 4561 trace "Entire range is between $r->[$i-1] and $r->[$i]"; 4562 } 4563 else { 4564 trace "Entire range is before $r->[$i]"; 4565 } 4566 } 4567 return if $operation ne '+'; # Deletion of a non-existent range is 4568 # a no-op 4569 } 4570 else { 4571 4572 # Here part of the input range is not in the gap before $i. Thus, 4573 # there is at least one affected one, and $j points to the highest 4574 # such one. 4575 4576 # At this point, here is the situation: 4577 # This is not an insertion of a multiple, nor of tentative ($NO) 4578 # data. 4579 # $i points to the first element in the current range list that 4580 # may be affected by this operation. In fact, we know 4581 # that the range at $i is affected because we are in 4582 # the else branch of this 'if' 4583 # $j points to the highest affected range. 4584 # In other words, 4585 # r[$i-1]->end < $start <= r[$i]->end 4586 # And: 4587 # r[$i-1]->end < $start <= $end < r[$j+1]->start 4588 # 4589 # Also: 4590 # $clean_insert is a boolean which is set true if and only if 4591 # this is a "clean insertion", i.e., not a change nor a 4592 # deletion (multiple was handled above). 4593 4594 # We now have enough information to decide if this call is a no-op 4595 # or not. It is a no-op if this is an insertion of already 4596 # existing data. To be so, it must be contained entirely in one 4597 # range. 4598 4599 if (main::DEBUG && $to_trace && $clean_insert 4600 && $start >= $r->[$i]->start 4601 && $end <= $r->[$i]->end) 4602 { 4603 trace "no-op"; 4604 } 4605 return if $clean_insert 4606 && $start >= $r->[$i]->start 4607 && $end <= $r->[$i]->end; 4608 } 4609 4610 # Here, we know that some action will have to be taken. We have 4611 # calculated the offset and length (though adjustments may be needed) 4612 # for the splice. Now start constructing the replacement list. 4613 my @replacement; 4614 my $splice_start = $i; 4615 4616 my $extends_below; 4617 my $extends_above; 4618 4619 # See if should extend any adjacent ranges. 4620 if ($operation eq '-') { # Don't extend deletions 4621 $extends_below = $extends_above = 0; 4622 } 4623 else { # Here, should extend any adjacent ranges. See if there are 4624 # any. 4625 $extends_below = ($i > 0 4626 # can't extend unless adjacent 4627 && $r->[$i-1]->end == $start -1 4628 # can't extend unless are same standard value 4629 && $r->[$i-1]->standard_form eq $standard_form 4630 # can't extend unless share type 4631 && $r->[$i-1]->type == $type); 4632 $extends_above = ($j+1 < $range_list_size 4633 && $r->[$j+1]->start == $end +1 4634 && $r->[$j+1]->standard_form eq $standard_form 4635 && $r->[$j+1]->type == $type); 4636 } 4637 if ($extends_below && $extends_above) { # Adds to both 4638 $splice_start--; # start replace at element below 4639 $length += 2; # will replace on both sides 4640 trace "Extends both below and above ranges" if main::DEBUG && $to_trace; 4641 4642 # The result will fill in any gap, replacing both sides, and 4643 # create one large range. 4644 @replacement = Range->new($r->[$i-1]->start, 4645 $r->[$j+1]->end, 4646 Value => $value, 4647 Type => $type); 4648 } 4649 else { 4650 4651 # Here we know that the result won't just be the conglomeration of 4652 # a new range with both its adjacent neighbors. But it could 4653 # extend one of them. 4654 4655 if ($extends_below) { 4656 4657 # Here the new element adds to the one below, but not to the 4658 # one above. If inserting, and only to that one range, can 4659 # just change its ending to include the new one. 4660 if ($length == 0 && $clean_insert) { 4661 $r->[$i-1]->set_end($end); 4662 trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace; 4663 return; 4664 } 4665 else { 4666 trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace; 4667 $splice_start--; # start replace at element below 4668 $length++; # will replace the element below 4669 $start = $r->[$i-1]->start; 4670 } 4671 } 4672 elsif ($extends_above) { 4673 4674 # Here the new element adds to the one above, but not below. 4675 # Mirror the code above 4676 if ($length == 0 && $clean_insert) { 4677 $r->[$j+1]->set_start($start); 4678 trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace; 4679 return; 4680 } 4681 else { 4682 trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace; 4683 $length++; # will replace the element above 4684 $end = $r->[$j+1]->end; 4685 } 4686 } 4687 4688 trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace; 4689 4690 # Finally, here we know there will have to be a splice. 4691 # If the change or delete affects only the highest portion of the 4692 # first affected range, the range will have to be split. The 4693 # splice will remove the whole range, but will replace it by a new 4694 # range containing just the unaffected part. So, in this case, 4695 # add to the replacement list just this unaffected portion. 4696 if (! $extends_below 4697 && $start > $r->[$i]->start && $start <= $r->[$i]->end) 4698 { 4699 push @replacement, 4700 Range->new($r->[$i]->start, 4701 $start - 1, 4702 Value => $r->[$i]->value, 4703 Type => $r->[$i]->type); 4704 } 4705 4706 # In the case of an insert or change, but not a delete, we have to 4707 # put in the new stuff; this comes next. 4708 if ($operation eq '+') { 4709 push @replacement, Range->new($start, 4710 $end, 4711 Value => $value, 4712 Type => $type); 4713 } 4714 4715 trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i; 4716 #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace; 4717 4718 # And finally, if we're changing or deleting only a portion of the 4719 # highest affected range, it must be split, as the lowest one was. 4720 if (! $extends_above 4721 && $j >= 0 # Remember that j can be -1 if before first 4722 # current element 4723 && $end >= $r->[$j]->start 4724 && $end < $r->[$j]->end) 4725 { 4726 push @replacement, 4727 Range->new($end + 1, 4728 $r->[$j]->end, 4729 Value => $r->[$j]->value, 4730 Type => $r->[$j]->type); 4731 } 4732 } 4733 4734 # And do the splice, as calculated above 4735 if (main::DEBUG && $to_trace) { 4736 trace "replacing $length element(s) at $i with "; 4737 foreach my $replacement (@replacement) { 4738 trace " $replacement"; 4739 } 4740 trace "Before splice:"; 4741 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2; 4742 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1; 4743 trace "i =[", $i, "]", $r->[$i]; 4744 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1; 4745 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2; 4746 } 4747 4748 my @return = splice @$r, $splice_start, $length, @replacement; 4749 4750 if (main::DEBUG && $to_trace) { 4751 trace "After splice:"; 4752 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2; 4753 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1; 4754 trace "i =[", $i, "]", $r->[$i]; 4755 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1; 4756 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2; 4757 trace "removed ", @return if @return; 4758 } 4759 4760 # An actual deletion could have changed the maximum in the list. 4761 # There was no deletion if the splice didn't return something, but 4762 # otherwise recalculate it. This is done too rarely to worry about 4763 # performance. 4764 if ($operation eq '-' && @return) { 4765 if (@$r) { 4766 $max{$addr} = $r->[-1]->end; 4767 } 4768 else { # Now empty 4769 $max{$addr} = $max_init; 4770 } 4771 } 4772 return @return; 4773 } 4774 4775 sub reset_each_range($self) { # reset the iterator for each_range(); 4776 undef $each_range_iterator{pack 'J', refaddr $self}; 4777 return; 4778 } 4779 4780 sub each_range($self) { 4781 # Iterate over each range in a range list. Results are undefined if 4782 # the range list is changed during the iteration. 4783 my $addr = pack 'J', refaddr $self; 4784 4785 return if $self->is_empty; 4786 4787 $each_range_iterator{$addr} = -1 4788 if ! defined $each_range_iterator{$addr}; 4789 $each_range_iterator{$addr}++; 4790 return $ranges{$addr}->[$each_range_iterator{$addr}] 4791 if $each_range_iterator{$addr} < @{$ranges{$addr}}; 4792 undef $each_range_iterator{$addr}; 4793 return; 4794 } 4795 4796 sub count($self) { # Returns count of code points in range list 4797 my $addr = pack 'J', refaddr $self; 4798 4799 my $count = 0; 4800 foreach my $range (@{$ranges{$addr}}) { 4801 $count += $range->end - $range->start + 1; 4802 } 4803 return $count; 4804 } 4805 4806 sub delete_range($self, $start, $end) { # Delete a range 4807 return $self->_add_delete('-', $start, $end, ""); 4808 } 4809 4810 sub is_empty($self) { # Returns boolean as to if a range list is empty 4811 return scalar @{$ranges{pack 'J', refaddr $self}} == 0; 4812 } 4813 4814 sub hash($self) { 4815 # Quickly returns a scalar suitable for separating tables into 4816 # buckets, i.e. it is a hash function of the contents of a table, so 4817 # there are relatively few conflicts. 4818 my $addr = pack 'J', refaddr $self; 4819 4820 # These are quickly computable. Return looks like 'min..max;count' 4821 return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}}; 4822 } 4823} # End closure for _Range_List_Base 4824 4825package Range_List; 4826use parent '-norequire', '_Range_List_Base'; 4827 4828# A Range_List is a range list for match tables; i.e. the range values are 4829# not significant. Thus a number of operations can be safely added to it, 4830# such as inversion, intersection. Note that union is also an unsafe 4831# operation when range values are cared about, and that method is in the base 4832# class, not here. But things are set up so that that method is callable only 4833# during initialization. Only in this derived class, is there an operation 4834# that combines two tables. A Range_Map can thus be used to initialize a 4835# Range_List, and its mappings will be in the list, but are not significant to 4836# this class. 4837 4838sub trace { return main::trace(@_); } 4839 4840{ # Closure 4841 4842 use overload 4843 fallback => 0, 4844 '+' => sub { my $self = shift; 4845 my $other = shift; 4846 4847 return $self->_union($other) 4848 }, 4849 '+=' => sub { my $self = shift; 4850 my $other = shift; 4851 my $reversed = shift; 4852 4853 if ($reversed) { 4854 Carp::my_carp_bug("Bad news. Can't cope with '" 4855 . ref($other) 4856 . ' += ' 4857 . ref($self) 4858 . "'. undef returned."); 4859 return; 4860 } 4861 4862 return $self->_union($other) 4863 }, 4864 '&' => sub { my $self = shift; 4865 my $other = shift; 4866 4867 return $self->_intersect($other, 0); 4868 }, 4869 '&=' => sub { my $self = shift; 4870 my $other = shift; 4871 my $reversed = shift; 4872 4873 if ($reversed) { 4874 Carp::my_carp_bug("Bad news. Can't cope with '" 4875 . ref($other) 4876 . ' &= ' 4877 . ref($self) 4878 . "'. undef returned."); 4879 return; 4880 } 4881 4882 return $self->_intersect($other, 0); 4883 }, 4884 '~' => "_invert", 4885 '-' => "_subtract", 4886 ; 4887 4888 sub _invert($self, @) { 4889 # Returns a new Range_List that gives all code points not in $self. 4890 my $new = Range_List->new; 4891 4892 # Go through each range in the table, finding the gaps between them 4893 my $max = -1; # Set so no gap before range beginning at 0 4894 for my $range ($self->ranges) { 4895 my $start = $range->start; 4896 my $end = $range->end; 4897 4898 # If there is a gap before this range, the inverse will contain 4899 # that gap. 4900 if ($start > $max + 1) { 4901 $new->add_range($max + 1, $start - 1); 4902 } 4903 $max = $end; 4904 } 4905 4906 # And finally, add the gap from the end of the table to the max 4907 # possible code point 4908 if ($max < $MAX_WORKING_CODEPOINT) { 4909 $new->add_range($max + 1, $MAX_WORKING_CODEPOINT); 4910 } 4911 return $new; 4912 } 4913 4914 sub _subtract($self, $other, $reversed=0) { 4915 # Returns a new Range_List with the argument deleted from it. The 4916 # argument can be a single code point, a range, or something that has 4917 # a range, with the _range_list() method on it returning them 4918 4919 if ($reversed) { 4920 Carp::my_carp_bug("Bad news. Can't cope with '" 4921 . ref($other) 4922 . ' - ' 4923 . ref($self) 4924 . "'. undef returned."); 4925 return; 4926 } 4927 4928 my $new = Range_List->new(Initialize => $self); 4929 4930 if (! ref $other) { # Single code point 4931 $new->delete_range($other, $other); 4932 } 4933 elsif ($other->isa('Range')) { 4934 $new->delete_range($other->start, $other->end); 4935 } 4936 elsif ($other->can('_range_list')) { 4937 foreach my $range ($other->_range_list->ranges) { 4938 $new->delete_range($range->start, $range->end); 4939 } 4940 } 4941 else { 4942 Carp::my_carp_bug("Can't cope with a " 4943 . ref($other) 4944 . " argument to '-'. Subtraction ignored." 4945 ); 4946 return $self; 4947 } 4948 4949 return $new; 4950 } 4951 4952 sub _intersect($a_object, $b_object, $check_if_overlapping=0) { 4953 # Returns either a boolean giving whether the two inputs' range lists 4954 # intersect (overlap), or a new Range_List containing the intersection 4955 # of the two lists. The optional final parameter being true indicates 4956 # to do the check instead of the intersection. 4957 4958 if (! defined $b_object) { 4959 my $message = ""; 4960 $message .= $a_object->_owner_name_of if defined $a_object; 4961 Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done."); 4962 return; 4963 } 4964 4965 # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b ) 4966 # Thus the intersection could be much more simply be written: 4967 # return ~(~$a_object + ~$b_object); 4968 # But, this is slower, and when taking the inverse of a large 4969 # range_size_1 table, back when such tables were always stored that 4970 # way, it became prohibitively slow, hence the code was changed to the 4971 # below 4972 4973 if ($b_object->isa('Range')) { 4974 $b_object = Range_List->new(Initialize => $b_object, 4975 Owner => $a_object->_owner_name_of); 4976 } 4977 $b_object = $b_object->_range_list if $b_object->can('_range_list'); 4978 4979 my @a_ranges = $a_object->ranges; 4980 my @b_ranges = $b_object->ranges; 4981 4982 #local $to_trace = 1 if main::DEBUG; 4983 trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace; 4984 4985 # Start with the first range in each list 4986 my $a_i = 0; 4987 my $range_a = $a_ranges[$a_i]; 4988 my $b_i = 0; 4989 my $range_b = $b_ranges[$b_i]; 4990 4991 my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of) 4992 if ! $check_if_overlapping; 4993 4994 # If either list is empty, there is no intersection and no overlap 4995 if (! defined $range_a || ! defined $range_b) { 4996 return $check_if_overlapping ? 0 : $new; 4997 } 4998 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace; 4999 5000 # Otherwise, must calculate the intersection/overlap. Start with the 5001 # very first code point in each list 5002 my $a = $range_a->start; 5003 my $b = $range_b->start; 5004 5005 # Loop through all the ranges of each list; in each iteration, $a and 5006 # $b are the current code points in their respective lists 5007 while (1) { 5008 5009 # If $a and $b are the same code point, ... 5010 if ($a == $b) { 5011 5012 # it means the lists overlap. If just checking for overlap 5013 # know the answer now, 5014 return 1 if $check_if_overlapping; 5015 5016 # The intersection includes this code point plus anything else 5017 # common to both current ranges. 5018 my $start = $a; 5019 my $end = main::min($range_a->end, $range_b->end); 5020 if (! $check_if_overlapping) { 5021 trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace; 5022 $new->add_range($start, $end); 5023 } 5024 5025 # Skip ahead to the end of the current intersect 5026 $a = $b = $end; 5027 5028 # If the current intersect ends at the end of either range (as 5029 # it must for at least one of them), the next possible one 5030 # will be the beginning code point in it's list's next range. 5031 if ($a == $range_a->end) { 5032 $range_a = $a_ranges[++$a_i]; 5033 last unless defined $range_a; 5034 $a = $range_a->start; 5035 } 5036 if ($b == $range_b->end) { 5037 $range_b = $b_ranges[++$b_i]; 5038 last unless defined $range_b; 5039 $b = $range_b->start; 5040 } 5041 5042 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace; 5043 } 5044 elsif ($a < $b) { 5045 5046 # Not equal, but if the range containing $a encompasses $b, 5047 # change $a to be the middle of the range where it does equal 5048 # $b, so the next iteration will get the intersection 5049 if ($range_a->end >= $b) { 5050 $a = $b; 5051 } 5052 else { 5053 5054 # Here, the current range containing $a is entirely below 5055 # $b. Go try to find a range that could contain $b. 5056 $a_i = $a_object->_search_ranges($b); 5057 5058 # If no range found, quit. 5059 last unless defined $a_i; 5060 5061 # The search returns $a_i, such that 5062 # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end 5063 # Set $a to the beginning of this new range, and repeat. 5064 $range_a = $a_ranges[$a_i]; 5065 $a = $range_a->start; 5066 } 5067 } 5068 else { # Here, $b < $a. 5069 5070 # Mirror image code to the leg just above 5071 if ($range_b->end >= $a) { 5072 $b = $a; 5073 } 5074 else { 5075 $b_i = $b_object->_search_ranges($a); 5076 last unless defined $b_i; 5077 $range_b = $b_ranges[$b_i]; 5078 $b = $range_b->start; 5079 } 5080 } 5081 } # End of looping through ranges. 5082 5083 # Intersection fully computed, or now know that there is no overlap 5084 return $check_if_overlapping ? 0 : $new; 5085 } 5086 5087 sub overlaps($self, $other) { 5088 # Returns boolean giving whether the two arguments overlap somewhere 5089 return $self->_intersect($other, 1); 5090 } 5091 5092 sub add_range($self, $start, $end) { 5093 # Add a range to the list. 5094 return $self->_add_delete('+', $start, $end, ""); 5095 } 5096 5097 sub matches_identically_to($self, $other) { 5098 # Return a boolean as to whether or not two Range_Lists match identical 5099 # sets of code points. 5100 # These are ordered in increasing real time to figure out (at least 5101 # until a patch changes that and doesn't change this) 5102 return 0 if $self->max != $other->max; 5103 return 0 if $self->min != $other->min; 5104 return 0 if $self->range_count != $other->range_count; 5105 return 0 if $self->count != $other->count; 5106 5107 # Here they could be identical because all the tests above passed. 5108 # The loop below is somewhat simpler since we know they have the same 5109 # number of elements. Compare range by range, until reach the end or 5110 # find something that differs. 5111 my @a_ranges = $self->ranges; 5112 my @b_ranges = $other->ranges; 5113 for my $i (0 .. @a_ranges - 1) { 5114 my $a = $a_ranges[$i]; 5115 my $b = $b_ranges[$i]; 5116 trace "self $a; other $b" if main::DEBUG && $to_trace; 5117 return 0 if ! defined $b 5118 || $a->start != $b->start 5119 || $a->end != $b->end; 5120 } 5121 return 1; 5122 } 5123 5124 sub is_code_point_usable($code, $try_hard) { 5125 # This used only for making the test script. See if the input 5126 # proposed trial code point is one that Perl will handle. If second 5127 # parameter is 0, it won't select some code points for various 5128 # reasons, noted below. 5129 return 0 if $code < 0; # Never use a negative 5130 5131 # shun null. I'm (khw) not sure why this was done, but NULL would be 5132 # the character very frequently used. 5133 return $try_hard if $code == 0x0000; 5134 5135 # shun non-character code points. 5136 return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF; 5137 return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF 5138 5139 return $try_hard if $code > $MAX_UNICODE_CODEPOINT; # keep in range 5140 return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate 5141 5142 return 1; 5143 } 5144 5145 sub get_valid_code_point($self) { 5146 # Return a code point that's part of the range list. Returns nothing 5147 # if the table is empty or we can't find a suitable code point. This 5148 # used only for making the test script. 5149 5150 # On first pass, don't choose less desirable code points; if no good 5151 # one is found, repeat, allowing a less desirable one to be selected. 5152 for my $try_hard (0, 1) { 5153 5154 # Look through all the ranges for a usable code point. 5155 for my $set (reverse $self->ranges) { 5156 5157 # Try the edge cases first, starting with the end point of the 5158 # range. 5159 my $end = $set->end; 5160 return $end if is_code_point_usable($end, $try_hard); 5161 $end = $MAX_UNICODE_CODEPOINT + 1 if $end > $MAX_UNICODE_CODEPOINT; 5162 5163 # End point didn't, work. Start at the beginning and try 5164 # every one until find one that does work. 5165 for my $trial ($set->start .. $end - 1) { 5166 return $trial if is_code_point_usable($trial, $try_hard); 5167 } 5168 } 5169 } 5170 return (); # If none found, give up. 5171 } 5172 5173 sub get_invalid_code_point($self) { 5174 # Return a code point that's not part of the table. Returns nothing 5175 # if the table covers all code points or a suitable code point can't 5176 # be found. This used only for making the test script. 5177 5178 # Just find a valid code point of the inverse, if any. 5179 return Range_List->new(Initialize => ~ $self)->get_valid_code_point; 5180 } 5181} # end closure for Range_List 5182 5183package Range_Map; 5184use parent '-norequire', '_Range_List_Base'; 5185 5186# A Range_Map is a range list in which the range values (called maps) are 5187# significant, and hence shouldn't be manipulated by our other code, which 5188# could be ambiguous or lose things. For example, in taking the union of two 5189# lists, which share code points, but which have differing values, which one 5190# has precedence in the union? 5191# It turns out that these operations aren't really necessary for map tables, 5192# and so this class was created to make sure they aren't accidentally 5193# applied to them. 5194 5195{ # Closure 5196 5197 sub add_map($self, @add) { 5198 # Add a range containing a mapping value to the list 5199 return $self->_add_delete('+', @add); 5200 } 5201 5202 sub replace_map($self, @list) { 5203 # Replace a range 5204 return $self->_add_delete('+', @list, Replace => $UNCONDITIONALLY); 5205 } 5206 5207 sub add_duplicate { 5208 # Adds entry to a range list which can duplicate an existing entry 5209 5210 my $self = shift; 5211 my $code_point = shift; 5212 my $value = shift; 5213 my %args = @_; 5214 my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE; 5215 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 5216 5217 return $self->add_map($code_point, $code_point, 5218 $value, Replace => $replace); 5219 } 5220} # End of closure for package Range_Map 5221 5222package _Base_Table; 5223 5224# A table is the basic data structure that gets written out into a file for 5225# use by the Perl core. This is the abstract base class implementing the 5226# common elements from the derived ones. A list of the methods to be 5227# furnished by an implementing class is just after the constructor. 5228 5229sub standardize { return main::standardize($_[0]); } 5230sub trace { return main::trace(@_); } 5231 5232{ # Closure 5233 5234 main::setup_package(); 5235 5236 my %range_list; 5237 # Object containing the ranges of the table. 5238 main::set_access('range_list', \%range_list, 'p_r', 'p_s'); 5239 5240 my %full_name; 5241 # The full table name. 5242 main::set_access('full_name', \%full_name, 'r'); 5243 5244 my %name; 5245 # The table name, almost always shorter 5246 main::set_access('name', \%name, 'r'); 5247 5248 my %short_name; 5249 # The shortest of all the aliases for this table, with underscores removed 5250 main::set_access('short_name', \%short_name); 5251 5252 my %nominal_short_name_length; 5253 # The length of short_name before removing underscores 5254 main::set_access('nominal_short_name_length', 5255 \%nominal_short_name_length); 5256 5257 my %complete_name; 5258 # The complete name, including property. 5259 main::set_access('complete_name', \%complete_name, 'r'); 5260 5261 my %property; 5262 # Parent property this table is attached to. 5263 main::set_access('property', \%property, 'r'); 5264 5265 my %aliases; 5266 # Ordered list of alias objects of the table's name. The first ones in 5267 # the list are output first in comments 5268 main::set_access('aliases', \%aliases, 'readable_array'); 5269 5270 my %comment; 5271 # A comment associated with the table for human readers of the files 5272 main::set_access('comment', \%comment, 's'); 5273 5274 my %description; 5275 # A comment giving a short description of the table's meaning for human 5276 # readers of the files. 5277 main::set_access('description', \%description, 'readable_array'); 5278 5279 my %note; 5280 # A comment giving a short note about the table for human readers of the 5281 # files. 5282 main::set_access('note', \%note, 'readable_array'); 5283 5284 my %fate; 5285 # Enum; there are a number of possibilities for what happens to this 5286 # table: it could be normal, or suppressed, or not for external use. See 5287 # values at definition for $SUPPRESSED. 5288 main::set_access('fate', \%fate, 'r'); 5289 5290 my %find_table_from_alias; 5291 # The parent property passes this pointer to a hash which this class adds 5292 # all its aliases to, so that the parent can quickly take an alias and 5293 # find this table. 5294 main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r'); 5295 5296 my %locked; 5297 # After this table is made equivalent to another one; we shouldn't go 5298 # changing the contents because that could mean it's no longer equivalent 5299 main::set_access('locked', \%locked, 'r'); 5300 5301 my %file_path; 5302 # This gives the final path to the file containing the table. Each 5303 # directory in the path is an element in the array 5304 main::set_access('file_path', \%file_path, 'readable_array'); 5305 5306 my %status; 5307 # What is the table's status, normal, $OBSOLETE, etc. Enum 5308 main::set_access('status', \%status, 'r'); 5309 5310 my %status_info; 5311 # A comment about its being obsolete, or whatever non normal status it has 5312 main::set_access('status_info', \%status_info, 'r'); 5313 5314 my %caseless_equivalent; 5315 # The table this is equivalent to under /i matching, if any. 5316 main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's'); 5317 5318 my %range_size_1; 5319 # Is the table to be output with each range only a single code point? 5320 # This is done to avoid breaking existing code that may have come to rely 5321 # on this behavior in previous versions of this program.) 5322 main::set_access('range_size_1', \%range_size_1, 'r', 's'); 5323 5324 my %perl_extension; 5325 # A boolean set iff this table is a Perl extension to the Unicode 5326 # standard. 5327 main::set_access('perl_extension', \%perl_extension, 'r'); 5328 5329 my %output_range_counts; 5330 # A boolean set iff this table is to have comments written in the 5331 # output file that contain the number of code points in the range. 5332 # The constructor can override the global flag of the same name. 5333 main::set_access('output_range_counts', \%output_range_counts, 'r'); 5334 5335 my %write_as_invlist; 5336 # A boolean set iff the output file for this table is to be in the form of 5337 # an inversion list/map. 5338 main::set_access('write_as_invlist', \%write_as_invlist, 'r'); 5339 5340 my %format; 5341 # The format of the entries of the table. This is calculated from the 5342 # data in the table (or passed in the constructor). This is an enum e.g., 5343 # $STRING_FORMAT. It is marked protected as it should not be generally 5344 # used to override calculations. 5345 main::set_access('format', \%format, 'r', 'p_s'); 5346 5347 my %has_dependency; 5348 # A boolean that gives whether some other table in this property is 5349 # defined as the complement of this table. This is a crude, but currently 5350 # sufficient, mechanism to make this table not get destroyed before what 5351 # is dependent on it is. Other dependencies could be added, so the name 5352 # was chosen to reflect a more general situation than actually is 5353 # currently the case. 5354 main::set_access('has_dependency', \%has_dependency, 'r', 's'); 5355 5356 sub new { 5357 # All arguments are key => value pairs, which you can see below, most 5358 # of which match fields documented above. Otherwise: Re_Pod_Entry, 5359 # OK_as_Filename, and Fuzzy apply to the names of the table, and are 5360 # documented in the Alias package 5361 5362 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2; 5363 5364 my $class = shift; 5365 5366 my $self = bless \do { my $anonymous_scalar }, $class; 5367 my $addr = pack 'J', refaddr $self; 5368 5369 my %args = @_; 5370 5371 $name{$addr} = delete $args{'Name'}; 5372 $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'}; 5373 $full_name{$addr} = delete $args{'Full_Name'}; 5374 my $complete_name = $complete_name{$addr} 5375 = delete $args{'Complete_Name'}; 5376 $format{$addr} = delete $args{'Format'}; 5377 $output_range_counts{$addr} = delete $args{'Output_Range_Counts'}; 5378 $property{$addr} = delete $args{'_Property'}; 5379 $range_list{$addr} = delete $args{'_Range_List'}; 5380 $status{$addr} = delete $args{'Status'} || $NORMAL; 5381 $status_info{$addr} = delete $args{'_Status_Info'} || ""; 5382 $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0; 5383 $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0; 5384 $fate{$addr} = delete $args{'Fate'} || $ORDINARY; 5385 $write_as_invlist{$addr} = delete $args{'Write_As_Invlist'};# No default 5386 my $ucd = delete $args{'UCD'}; 5387 5388 my $description = delete $args{'Description'}; 5389 my $ok_as_filename = delete $args{'OK_as_Filename'}; 5390 my $loose_match = delete $args{'Fuzzy'}; 5391 my $note = delete $args{'Note'}; 5392 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'}; 5393 my $perl_extension = delete $args{'Perl_Extension'}; 5394 my $suppression_reason = delete $args{'Suppression_Reason'}; 5395 5396 # Shouldn't have any left over 5397 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 5398 5399 # Can't use || above because conceivably the name could be 0, and 5400 # can't use // operator in case this program gets used in Perl 5.8 5401 $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr}; 5402 $output_range_counts{$addr} = $output_range_counts if 5403 ! defined $output_range_counts{$addr}; 5404 5405 $aliases{$addr} = [ ]; 5406 $comment{$addr} = [ ]; 5407 $description{$addr} = [ ]; 5408 $note{$addr} = [ ]; 5409 $file_path{$addr} = [ ]; 5410 $locked{$addr} = ""; 5411 $has_dependency{$addr} = 0; 5412 5413 push @{$description{$addr}}, $description if $description; 5414 push @{$note{$addr}}, $note if $note; 5415 5416 if ($fate{$addr} == $PLACEHOLDER) { 5417 5418 # A placeholder table doesn't get documented, is a perl extension, 5419 # and quite likely will be empty 5420 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry; 5421 $perl_extension = 1 if ! defined $perl_extension; 5422 $ucd = 0 if ! defined $ucd; 5423 push @tables_that_may_be_empty, $complete_name{$addr}; 5424 $self->add_comment(<<END); 5425This is a placeholder because it is not in Version $string_version of Unicode, 5426but is needed by the Perl core to work gracefully. Because it is not in this 5427version of Unicode, it will not be listed in $pod_file.pod 5428END 5429 } 5430 elsif (exists $why_suppressed{$complete_name} 5431 # Don't suppress if overridden 5432 && ! grep { $_ eq $complete_name{$addr} } 5433 @output_mapped_properties) 5434 { 5435 $fate{$addr} = $SUPPRESSED; 5436 } 5437 elsif ($fate{$addr} == $SUPPRESSED) { 5438 Carp::my_carp_bug("Need reason for suppressing") unless $suppression_reason; 5439 # Though currently unused 5440 } 5441 elsif ($suppression_reason) { 5442 Carp::my_carp_bug("A reason was given for suppressing, but not suppressed"); 5443 } 5444 5445 # If hasn't set its status already, see if it is on one of the 5446 # lists of properties or tables that have particular statuses; if 5447 # not, is normal. The lists are prioritized so the most serious 5448 # ones are checked first 5449 if (! $status{$addr}) { 5450 if (exists $why_deprecated{$complete_name}) { 5451 $status{$addr} = $DEPRECATED; 5452 } 5453 elsif (exists $why_stabilized{$complete_name}) { 5454 $status{$addr} = $STABILIZED; 5455 } 5456 elsif (exists $why_obsolete{$complete_name}) { 5457 $status{$addr} = $OBSOLETE; 5458 } 5459 5460 # Existence above doesn't necessarily mean there is a message 5461 # associated with it. Use the most serious message. 5462 if ($status{$addr}) { 5463 if ($why_deprecated{$complete_name}) { 5464 $status_info{$addr} 5465 = $why_deprecated{$complete_name}; 5466 } 5467 elsif ($why_stabilized{$complete_name}) { 5468 $status_info{$addr} 5469 = $why_stabilized{$complete_name}; 5470 } 5471 elsif ($why_obsolete{$complete_name}) { 5472 $status_info{$addr} 5473 = $why_obsolete{$complete_name}; 5474 } 5475 } 5476 } 5477 5478 $perl_extension{$addr} = $perl_extension || 0; 5479 5480 # Don't list a property by default that is internal only 5481 if ($fate{$addr} > $MAP_PROXIED) { 5482 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry; 5483 $ucd = 0 if ! defined $ucd; 5484 } 5485 else { 5486 $ucd = 1 if ! defined $ucd; 5487 } 5488 5489 # By convention what typically gets printed only or first is what's 5490 # first in the list, so put the full name there for good output 5491 # clarity. Other routines rely on the full name being first on the 5492 # list 5493 $self->add_alias($full_name{$addr}, 5494 OK_as_Filename => $ok_as_filename, 5495 Fuzzy => $loose_match, 5496 Re_Pod_Entry => $make_re_pod_entry, 5497 Status => $status{$addr}, 5498 UCD => $ucd, 5499 ); 5500 5501 # Then comes the other name, if meaningfully different. 5502 if (standardize($full_name{$addr}) ne standardize($name{$addr})) { 5503 $self->add_alias($name{$addr}, 5504 OK_as_Filename => $ok_as_filename, 5505 Fuzzy => $loose_match, 5506 Re_Pod_Entry => $make_re_pod_entry, 5507 Status => $status{$addr}, 5508 UCD => $ucd, 5509 ); 5510 } 5511 5512 return $self; 5513 } 5514 5515 # Here are the methods that are required to be defined by any derived 5516 # class 5517 for my $sub (qw( 5518 handle_special_range 5519 append_to_body 5520 pre_body 5521 )) 5522 # write() knows how to write out normal ranges, but it calls 5523 # handle_special_range() when it encounters a non-normal one. 5524 # append_to_body() is called by it after it has handled all 5525 # ranges to add anything after the main portion of the table. 5526 # And finally, pre_body() is called after all this to build up 5527 # anything that should appear before the main portion of the 5528 # table. Doing it this way allows things in the middle to 5529 # affect what should appear before the main portion of the 5530 # table. 5531 { 5532 no strict "refs"; 5533 *$sub = sub { 5534 Carp::my_carp_bug( __LINE__ 5535 . ": Must create method '$sub()' for " 5536 . ref shift); 5537 return; 5538 } 5539 } 5540 5541 use overload 5542 fallback => 0, 5543 "." => \&main::_operator_dot, 5544 ".=" => \&main::_operator_dot_equal, 5545 '!=' => \&main::_operator_not_equal, 5546 '==' => \&main::_operator_equal, 5547 ; 5548 5549 sub ranges { 5550 # Returns the array of ranges associated with this table. 5551 5552 return $range_list{pack 'J', refaddr shift}->ranges; 5553 } 5554 5555 sub add_alias { 5556 # Add a synonym for this table. 5557 5558 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3; 5559 5560 my $self = shift; 5561 my $name = shift; # The name to add. 5562 my $pointer = shift; # What the alias hash should point to. For 5563 # map tables, this is the parent property; 5564 # for match tables, it is the table itself. 5565 5566 my %args = @_; 5567 my $loose_match = delete $args{'Fuzzy'}; 5568 5569 my $ok_as_filename = delete $args{'OK_as_Filename'}; 5570 $ok_as_filename = 1 unless defined $ok_as_filename; 5571 5572 # An internal name does not get documented, unless overridden by the 5573 # input; same for making tests for it. 5574 my $status = delete $args{'Status'} || (($name =~ /^_/) 5575 ? $INTERNAL_ALIAS 5576 : $NORMAL); 5577 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'} 5578 // (($status ne $INTERNAL_ALIAS) 5579 ? (($name =~ /^_/) ? $NO : $YES) 5580 : $NO); 5581 my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1); 5582 5583 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 5584 5585 # Capitalize the first letter of the alias unless it is one of the CJK 5586 # ones which specifically begins with a lower 'k'. Do this because 5587 # Unicode has varied whether they capitalize first letters or not, and 5588 # have later changed their minds and capitalized them, but not the 5589 # other way around. So do it always and avoid changes from release to 5590 # release 5591 $name = ucfirst($name) unless $name =~ /^k[A-Z]/; 5592 5593 my $addr = pack 'J', refaddr $self; 5594 5595 # Figure out if should be loosely matched if not already specified. 5596 if (! defined $loose_match) { 5597 5598 # Is a loose_match if isn't null, and doesn't begin with an 5599 # underscore and isn't just a number 5600 if ($name ne "" 5601 && substr($name, 0, 1) ne '_' 5602 && $name !~ qr{^[0-9_.+-/]+$}) 5603 { 5604 $loose_match = 1; 5605 } 5606 else { 5607 $loose_match = 0; 5608 } 5609 } 5610 5611 # If this alias has already been defined, do nothing. 5612 return if defined $find_table_from_alias{$addr}->{$name}; 5613 5614 # That includes if it is standardly equivalent to an existing alias, 5615 # in which case, add this name to the list, so won't have to search 5616 # for it again. 5617 my $standard_name = main::standardize($name); 5618 if (defined $find_table_from_alias{$addr}->{$standard_name}) { 5619 $find_table_from_alias{$addr}->{$name} 5620 = $find_table_from_alias{$addr}->{$standard_name}; 5621 return; 5622 } 5623 5624 # Set the index hash for this alias for future quick reference. 5625 $find_table_from_alias{$addr}->{$name} = $pointer; 5626 $find_table_from_alias{$addr}->{$standard_name} = $pointer; 5627 local $to_trace = 0 if main::DEBUG; 5628 trace "adding alias $name to $pointer" if main::DEBUG && $to_trace; 5629 trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace; 5630 5631 5632 # Put the new alias at the end of the list of aliases unless the final 5633 # element begins with an underscore (meaning it is for internal perl 5634 # use) or is all numeric, in which case, put the new one before that 5635 # one. This floats any all-numeric or underscore-beginning aliases to 5636 # the end. This is done so that they are listed last in output lists, 5637 # to encourage the user to use a better name (either more descriptive 5638 # or not an internal-only one) instead. This ordering is relied on 5639 # implicitly elsewhere in this program, like in short_name() 5640 my $list = $aliases{$addr}; 5641 my $insert_position = (@$list == 0 5642 || (substr($list->[-1]->name, 0, 1) ne '_' 5643 && $list->[-1]->name =~ /\D/)) 5644 ? @$list 5645 : @$list - 1; 5646 splice @$list, 5647 $insert_position, 5648 0, 5649 Alias->new($name, $loose_match, $make_re_pod_entry, 5650 $ok_as_filename, $status, $ucd); 5651 5652 # This name may be shorter than any existing ones, so clear the cache 5653 # of the shortest, so will have to be recalculated. 5654 undef $short_name{pack 'J', refaddr $self}; 5655 return; 5656 } 5657 5658 sub short_name($self, $nominal_length_ptr=undef) { 5659 # Returns a name suitable for use as the base part of a file name. 5660 # That is, shorter wins. It can return undef if there is no suitable 5661 # name. The name has all non-essential underscores removed. 5662 5663 # The optional second parameter is a reference to a scalar in which 5664 # this routine will store the length the returned name had before the 5665 # underscores were removed, or undef if the return is undef. 5666 5667 # The shortest name can change if new aliases are added. So using 5668 # this should be deferred until after all these are added. The code 5669 # that does that should clear this one's cache. 5670 # Any name with alphabetics is preferred over an all numeric one, even 5671 # if longer. 5672 5673 my $addr = pack 'J', refaddr $self; 5674 5675 # For efficiency, don't recalculate, but this means that adding new 5676 # aliases could change what the shortest is, so the code that does 5677 # that needs to undef this. 5678 if (defined $short_name{$addr}) { 5679 if ($nominal_length_ptr) { 5680 $$nominal_length_ptr = $nominal_short_name_length{$addr}; 5681 } 5682 return $short_name{$addr}; 5683 } 5684 5685 # Look at each alias 5686 my $is_last_resort = 0; 5687 my $deprecated_or_discouraged 5688 = qr/ ^ (?: $DEPRECATED | $DISCOURAGED ) $/x; 5689 foreach my $alias ($self->aliases()) { 5690 5691 # Don't use an alias that isn't ok to use for an external name. 5692 next if ! $alias->ok_as_filename; 5693 5694 my $name = main::Standardize($alias->name); 5695 trace $self, $name if main::DEBUG && $to_trace; 5696 5697 # Take the first one, or any non-deprecated non-discouraged one 5698 # over one that is, or a shorter one that isn't numeric. This 5699 # relies on numeric aliases always being last in the array 5700 # returned by aliases(). Any alpha one will have precedence. 5701 if ( ! defined $short_name{$addr} 5702 || ( $is_last_resort 5703 && $alias->status !~ $deprecated_or_discouraged) 5704 || ($name =~ /\D/ 5705 && length($name) < length($short_name{$addr}))) 5706 { 5707 # Remove interior underscores. 5708 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg; 5709 5710 $nominal_short_name_length{$addr} = length $name; 5711 $is_last_resort = $alias->status =~ $deprecated_or_discouraged; 5712 } 5713 } 5714 5715 # If the short name isn't a nice one, perhaps an equivalent table has 5716 # a better one. 5717 if ( $self->can('children') 5718 && ( ! defined $short_name{$addr} 5719 || $short_name{$addr} eq "" 5720 || $short_name{$addr} eq "_")) 5721 { 5722 my $return; 5723 foreach my $follower ($self->children) { # All equivalents 5724 my $follower_name = $follower->short_name; 5725 next unless defined $follower_name; 5726 5727 # Anything (except undefined) is better than underscore or 5728 # empty 5729 if (! defined $return || $return eq "_") { 5730 $return = $follower_name; 5731 next; 5732 } 5733 5734 # If the new follower name isn't "_" and is shorter than the 5735 # current best one, prefer the new one. 5736 next if $follower_name eq "_"; 5737 next if length $follower_name > length $return; 5738 $return = $follower_name; 5739 } 5740 $short_name{$addr} = $return if defined $return; 5741 } 5742 5743 # If no suitable external name return undef 5744 if (! defined $short_name{$addr}) { 5745 $$nominal_length_ptr = undef if $nominal_length_ptr; 5746 return; 5747 } 5748 5749 # Don't allow a null short name. 5750 if ($short_name{$addr} eq "") { 5751 $short_name{$addr} = '_'; 5752 $nominal_short_name_length{$addr} = 1; 5753 } 5754 5755 trace $self, $short_name{$addr} if main::DEBUG && $to_trace; 5756 5757 if ($nominal_length_ptr) { 5758 $$nominal_length_ptr = $nominal_short_name_length{$addr}; 5759 } 5760 return $short_name{$addr}; 5761 } 5762 5763 sub external_name($self) { 5764 # Returns the external name that this table should be known by. This 5765 # is usually the short_name, but not if the short_name is undefined, 5766 # in which case the external_name is arbitrarily set to the 5767 # underscore. 5768 5769 my $short = $self->short_name; 5770 return $short if defined $short; 5771 5772 return '_'; 5773 } 5774 5775 sub add_description($self, $description) { # Adds the parameter as a short description. 5776 push @{$description{pack 'J', refaddr $self}}, $description; 5777 5778 return; 5779 } 5780 5781 sub add_note($self, $note) { # Adds the parameter as a short note. 5782 push @{$note{pack 'J', refaddr $self}}, $note; 5783 5784 return; 5785 } 5786 5787 sub add_comment($self, $comment) { # Adds the parameter as a comment. 5788 5789 return unless $debugging_build; 5790 5791 chomp $comment; 5792 5793 push @{$comment{pack 'J', refaddr $self}}, $comment; 5794 5795 return; 5796 } 5797 5798 sub comment($self) { 5799 # Return the current comment for this table. If called in list 5800 # context, returns the array of comments. In scalar, returns a string 5801 # of each element joined together with a period ending each. 5802 5803 my $addr = pack 'J', refaddr $self; 5804 my @list = @{$comment{$addr}}; 5805 return @list if wantarray; 5806 my $return = ""; 5807 foreach my $sentence (@list) { 5808 $return .= '. ' if $return; 5809 $return .= $sentence; 5810 $return =~ s/\.$//; 5811 } 5812 $return .= '.' if $return; 5813 return $return; 5814 } 5815 5816 sub initialize($self, $initialization) { 5817 # Initialize the table with the argument which is any valid 5818 # initialization for range lists. 5819 5820 my $addr = pack 'J', refaddr $self; 5821 5822 # Replace the current range list with a new one of the same exact 5823 # type. 5824 my $class = ref $range_list{$addr}; 5825 $range_list{$addr} = $class->new(Owner => $self, 5826 Initialize => $initialization); 5827 return; 5828 5829 } 5830 5831 sub header($self) { 5832 # The header that is output for the table in the file it is written 5833 # in. 5834 my $return = ""; 5835 $return .= $DEVELOPMENT_ONLY if $compare_versions; 5836 $return .= $HEADER; 5837 return $return; 5838 } 5839 5840 sub merge_single_annotation_line ($output, $annotation, $annotation_column) { 5841 5842 # This appends an annotation comment, $annotation, to $output, 5843 # starting in or after column $annotation_column, removing any 5844 # pre-existing comment from $output. 5845 5846 $annotation =~ s/^ \s* \# \ //x; 5847 $output =~ s/ \s* ( \# \N* )? \n //x; 5848 $output = Text::Tabs::expand($output); 5849 5850 my $spaces = $annotation_column - length $output; 5851 $spaces = 2 if $spaces < 0; # Have 2 blanks before the comment 5852 5853 $output = sprintf "%s%*s# %s", 5854 $output, 5855 $spaces, 5856 " ", 5857 $annotation; 5858 return Text::Tabs::unexpand $output; 5859 } 5860 5861 sub write($self, $use_adjustments=0, $suppress_value=0) { 5862 # Write a representation of the table to its file. It calls several 5863 # functions furnished by sub-classes of this abstract base class to 5864 # handle non-normal ranges, to add stuff before the table, and at its 5865 # end. If the table is to be written so that adjustments are 5866 # required, this does that conversion. 5867 5868 5869 # $use_adjustments ? output in adjusted format or not 5870 # $suppress_value Optional, if the value associated with 5871 # a range equals this one, don't write 5872 # the range 5873 5874 my $addr = pack 'J', refaddr $self; 5875 my $write_as_invlist = $write_as_invlist{$addr}; 5876 5877 # Start with the header 5878 my @HEADER = $self->header; 5879 5880 # Then the comments 5881 push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n" 5882 if $comment{$addr}; 5883 5884 # Things discovered processing the main body of the document may 5885 # affect what gets output before it, therefore pre_body() isn't called 5886 # until after all other processing of the table is done. 5887 5888 # The main body looks like a 'here' document. If there are comments, 5889 # get rid of them when processing it. 5890 my @OUT; 5891 if ($annotate || $output_range_counts) { 5892 # Use the line below in Perls that don't have /r 5893 #push @OUT, 'return join "\n", map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n"; 5894 push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n"; 5895 } else { 5896 push @OUT, "return <<'END';\n"; 5897 } 5898 5899 if ($range_list{$addr}->is_empty) { 5900 5901 # This is a kludge for empty tables to silence a warning in 5902 # utf8.c, which can't really deal with empty tables, but it can 5903 # deal with a table that matches nothing, as the inverse of 'All' 5904 # does. 5905 push @OUT, "!Unicode::UCD::All\n"; 5906 } 5907 elsif ($self->name eq 'N' 5908 5909 # To save disk space and table cache space, avoid putting out 5910 # binary N tables, but instead create a file which just inverts 5911 # the Y table. Since the file will still exist and occupy a 5912 # certain number of blocks, might as well output the whole 5913 # thing if it all will fit in one block. The number of 5914 # ranges below is an approximate number for that. 5915 && ($self->property->type == $BINARY 5916 || $self->property->type == $FORCED_BINARY) 5917 # && $self->property->tables == 2 Can't do this because the 5918 # non-binary properties, like NFDQC aren't specifiable 5919 # by the notation 5920 && $range_list{$addr}->ranges > 15 5921 && ! $annotate) # Under --annotate, want to see everything 5922 { 5923 push @OUT, "!Unicode::UCD::" . $self->property->name . "\n"; 5924 } 5925 else { 5926 my $range_size_1 = $range_size_1{$addr}; 5927 5928 # To make it more readable, use a minimum indentation 5929 my $comment_indent; 5930 5931 # These are used only in $annotate option 5932 my $format; # e.g. $HEX_ADJUST_FORMAT 5933 my $include_name; # ? Include the character's name in the 5934 # annotation? 5935 my $include_cp; # ? Include its code point 5936 5937 if (! $annotate) { 5938 $comment_indent = ($self->isa('Map_Table')) 5939 ? 24 5940 : ($write_as_invlist) 5941 ? 8 5942 : 16; 5943 } 5944 else { 5945 $format = $self->format; 5946 5947 # The name of the character is output only for tables that 5948 # don't already include the name in the output. 5949 my $property = $self->property; 5950 $include_name = 5951 ! ($property == $perl_charname 5952 || $property == main::property_ref('Unicode_1_Name') 5953 || $property == main::property_ref('Name') 5954 || $property == main::property_ref('Name_Alias') 5955 ); 5956 5957 # Don't include the code point in the annotation where all 5958 # lines are a single code point, so it can be easily found in 5959 # the first column 5960 $include_cp = ! $range_size_1; 5961 5962 if (! $self->isa('Map_Table')) { 5963 $comment_indent = ($write_as_invlist) ? 8 : 16; 5964 } 5965 else { 5966 $comment_indent = 16; 5967 5968 # There are just a few short ranges in this table, so no 5969 # need to include the code point in the annotation. 5970 $include_cp = 0 if $format eq $DECOMP_STRING_FORMAT; 5971 5972 # We're trying to get this to look good, as the whole 5973 # point is to make human-readable tables. It is easier to 5974 # read if almost all the annotation comments begin in the 5975 # same column. Map tables have varying width maps, so can 5976 # create a jagged comment appearance. This code does a 5977 # preliminary pass through these tables looking for the 5978 # maximum width map in each, and causing the comments to 5979 # begin just to the right of that. However, if the 5980 # comments begin too far to the right of most lines, it's 5981 # hard to line them up horizontally with their real data. 5982 # Therefore we ignore the longest outliers 5983 my $ignore_longest_X_percent = 2; # Discard longest X% 5984 5985 # Each key in this hash is a width of at least one of the 5986 # maps in the table. Its value is how many lines have 5987 # that width. 5988 my %widths; 5989 5990 # We won't space things further left than one tab stop 5991 # after the rest of the line; initializing it to that 5992 # number saves some work. 5993 my $max_map_width = 8; 5994 5995 # Fill in the %widths hash 5996 my $total = 0; 5997 for my $set ($range_list{$addr}->ranges) { 5998 my $value = $set->value; 5999 6000 # These range types don't appear in the main table 6001 next if $set->type == 0 6002 && defined $suppress_value 6003 && $value eq $suppress_value; 6004 next if $set->type == $MULTI_CP 6005 || $set->type == $NULL; 6006 6007 # Include 2 spaces before the beginning of the 6008 # comment 6009 my $this_width = length($value) + 2; 6010 6011 # Ranges of the remaining non-zero types usually 6012 # occupy just one line (maybe occasionally two, but 6013 # this doesn't have to be dead accurate). This is 6014 # because these ranges are like "unassigned code 6015 # points" 6016 my $count = ($set->type != 0) 6017 ? 1 6018 : $set->end - $set->start + 1; 6019 $widths{$this_width} += $count; 6020 $total += $count; 6021 $max_map_width = $this_width 6022 if $max_map_width < $this_width; 6023 } 6024 6025 # If the widest map gives us less than two tab stops 6026 # worth, just take it as-is. 6027 if ($max_map_width > 16) { 6028 6029 # Otherwise go through %widths until we have included 6030 # the desired percentage of lines in the whole table. 6031 my $running_total = 0; 6032 foreach my $width (sort { $a <=> $b } keys %widths) 6033 { 6034 $running_total += $widths{$width}; 6035 use integer; 6036 if ($running_total * 100 / $total 6037 >= 100 - $ignore_longest_X_percent) 6038 { 6039 $max_map_width = $width; 6040 last; 6041 } 6042 } 6043 } 6044 $comment_indent += $max_map_width; 6045 } 6046 } 6047 6048 # Values for previous time through the loop. Initialize to 6049 # something that won't be adjacent to the first iteration; 6050 # only $previous_end matters for that. 6051 my $previous_start; 6052 my $previous_end = -2; 6053 my $previous_value; 6054 6055 # Values for next time through the portion of the loop that splits 6056 # the range. 0 in $next_start means there is no remaining portion 6057 # to deal with. 6058 my $next_start = 0; 6059 my $next_end; 6060 my $next_value; 6061 my $offset = 0; 6062 my $invlist_count = 0; 6063 6064 my $output_value_in_hex = $self->isa('Map_Table') 6065 && ($self->format eq $HEX_ADJUST_FORMAT 6066 || $self->to_output_map == $EXTERNAL_MAP); 6067 # Use leading zeroes just for files whose format should not be 6068 # changed from what it has been. Otherwise, they just take up 6069 # space and time to process. 6070 my $hex_format = ($self->isa('Map_Table') 6071 && $self->to_output_map == $EXTERNAL_MAP) 6072 ? "%04X" 6073 : "%X"; 6074 6075 # The values for some of these tables are stored in mktables as 6076 # hex strings. Normally, these are just output as strings without 6077 # change, but when we are doing adjustments, we have to operate on 6078 # these numerically, so we convert those to decimal to do that, 6079 # and back to hex for output 6080 my $convert_map_to_from_hex = 0; 6081 my $output_map_in_hex = 0; 6082 if ($self->isa('Map_Table')) { 6083 $convert_map_to_from_hex 6084 = ($use_adjustments && $self->format eq $HEX_ADJUST_FORMAT) 6085 || ($annotate && $self->format eq $HEX_FORMAT); 6086 $output_map_in_hex = $convert_map_to_from_hex 6087 || $self->format eq $HEX_FORMAT; 6088 } 6089 6090 # To store any annotations about the characters. 6091 my @annotation; 6092 6093 # Output each range as part of the here document. 6094 RANGE: 6095 for my $set ($range_list{$addr}->ranges) { 6096 if ($set->type != 0) { 6097 $self->handle_special_range($set); 6098 next RANGE; 6099 } 6100 my $start = $set->start; 6101 my $end = $set->end; 6102 my $value = $set->value; 6103 6104 # Don't output ranges whose value is the one to suppress 6105 next RANGE if defined $suppress_value 6106 && $value eq $suppress_value; 6107 6108 $value = CORE::hex $value if $convert_map_to_from_hex; 6109 6110 6111 { # This bare block encloses the scope where we may need to 6112 # 'redo' to. Consider a table that is to be written out 6113 # using single item ranges. This is given in the 6114 # $range_size_1 boolean. To accomplish this, we split the 6115 # range each time through the loop into two portions, the 6116 # first item, and the rest. We handle that first item 6117 # this time in the loop, and 'redo' to repeat the process 6118 # for the rest of the range. 6119 # 6120 # We may also have to do it, with other special handling, 6121 # if the table has adjustments. Consider the table that 6122 # contains the lowercasing maps. mktables stores the 6123 # ASCII range ones as 26 ranges: 6124 # ord('A') => ord('a'), .. ord('Z') => ord('z') 6125 # For compactness, the table that gets written has this as 6126 # just one range 6127 # ( ord('A') .. ord('Z') ) => ord('a') 6128 # and the software that reads the tables is smart enough 6129 # to "connect the dots". This change is accomplished in 6130 # this loop by looking to see if the current iteration 6131 # fits the paradigm of the previous iteration, and if so, 6132 # we merge them by replacing the final output item with 6133 # the merged data. Repeated 25 times, this gets A-Z. But 6134 # we also have to make sure we don't screw up cases where 6135 # we have internally stored 6136 # ( 0x1C4 .. 0x1C6 ) => 0x1C5 6137 # This single internal range has to be output as 3 ranges, 6138 # which is done by splitting, like we do for $range_size_1 6139 # tables. (There are very few of such ranges that need to 6140 # be split, so the gain of doing the combining of other 6141 # ranges far outweighs the splitting of these.) The 6142 # values to use for the redo at the end of this block are 6143 # set up just below in the scalars whose names begin with 6144 # '$next_'. 6145 6146 if (($use_adjustments || $range_size_1) && $end != $start) 6147 { 6148 $next_start = $start + 1; 6149 $next_end = $end; 6150 $next_value = $value; 6151 $end = $start; 6152 } 6153 6154 if ($use_adjustments && ! $range_size_1) { 6155 6156 # If this range is adjacent to the previous one, and 6157 # the values in each are integers that are also 6158 # adjacent (differ by 1), then this range really 6159 # extends the previous one that is already in element 6160 # $OUT[-1]. So we pop that element, and pretend that 6161 # the range starts with whatever it started with. 6162 # $offset is incremented by 1 each time so that it 6163 # gives the current offset from the first element in 6164 # the accumulating range, and we keep in $value the 6165 # value of that first element. 6166 if ($start == $previous_end + 1 6167 && $value =~ /^ -? \d+ $/xa 6168 && $previous_value =~ /^ -? \d+ $/xa 6169 && ($value == ($previous_value + ++$offset))) 6170 { 6171 pop @OUT; 6172 $start = $previous_start; 6173 $value = $previous_value; 6174 } 6175 else { 6176 $offset = 0; 6177 if (@annotation == 1) { 6178 $OUT[-1] = merge_single_annotation_line( 6179 $OUT[-1], $annotation[0], $comment_indent); 6180 } 6181 else { 6182 push @OUT, @annotation; 6183 } 6184 } 6185 undef @annotation; 6186 6187 # Save the current values for the next time through 6188 # the loop. 6189 $previous_start = $start; 6190 $previous_end = $end; 6191 $previous_value = $value; 6192 } 6193 6194 if ($write_as_invlist) { 6195 if ( $previous_end > 0 6196 && $output_range_counts{$addr}) 6197 { 6198 my $complement_count = $start - $previous_end - 1; 6199 if ($complement_count > 1) { 6200 $OUT[-1] = merge_single_annotation_line( 6201 $OUT[-1], 6202 "#" 6203 . (" " x 17) 6204 . "[" 6205 . main::clarify_code_point_count( 6206 $complement_count) 6207 . "] in complement\n", 6208 $comment_indent); 6209 } 6210 } 6211 6212 # Inversion list format has a single number per line, 6213 # the starting code point of a range that matches the 6214 # property 6215 push @OUT, $start, "\n"; 6216 $invlist_count++; 6217 6218 # Add a comment with the size of the range, if 6219 # requested. 6220 if ($output_range_counts{$addr}) { 6221 $OUT[-1] = merge_single_annotation_line( 6222 $OUT[-1], 6223 "# [" 6224 . main::clarify_code_point_count($end - $start + 1) 6225 . "]\n", 6226 $comment_indent); 6227 } 6228 } 6229 elsif ($start != $end) { # If there is a range 6230 if ($end == $MAX_WORKING_CODEPOINT) { 6231 push @OUT, sprintf "$hex_format\t$hex_format", 6232 $start, 6233 $MAX_PLATFORM_CODEPOINT; 6234 } 6235 else { 6236 push @OUT, sprintf "$hex_format\t$hex_format", 6237 $start, $end; 6238 } 6239 if (length $value) { 6240 if ($convert_map_to_from_hex) { 6241 $OUT[-1] .= sprintf "\t$hex_format\n", $value; 6242 } 6243 else { 6244 $OUT[-1] .= "\t$value\n"; 6245 } 6246 } 6247 6248 # Add a comment with the size of the range, if 6249 # requested. 6250 if ($output_range_counts{$addr}) { 6251 $OUT[-1] = merge_single_annotation_line( 6252 $OUT[-1], 6253 "# [" 6254 . main::clarify_code_point_count($end - $start + 1) 6255 . "]\n", 6256 $comment_indent); 6257 } 6258 } 6259 else { # Here to output a single code point per line. 6260 6261 # Use any passed in subroutine to output. 6262 if (ref $range_size_1 eq 'CODE') { 6263 for my $i ($start .. $end) { 6264 push @OUT, &{$range_size_1}($i, $value); 6265 } 6266 } 6267 else { 6268 6269 # Here, caller is ok with default output. 6270 for (my $i = $start; $i <= $end; $i++) { 6271 if ($convert_map_to_from_hex) { 6272 push @OUT, 6273 sprintf "$hex_format\t\t$hex_format\n", 6274 $i, $value; 6275 } 6276 else { 6277 push @OUT, sprintf $hex_format, $i; 6278 $OUT[-1] .= "\t\t$value" if $value ne ""; 6279 $OUT[-1] .= "\n"; 6280 } 6281 } 6282 } 6283 } 6284 6285 if ($annotate) { 6286 for (my $i = $start; $i <= $end; $i++) { 6287 my $annotation = ""; 6288 6289 # Get character information if don't have it already 6290 main::populate_char_info($i) 6291 if ! defined $viacode[$i]; 6292 my $type = $annotate_char_type[$i]; 6293 6294 # Figure out if should output the next code points 6295 # as part of a range or not. If this is not in an 6296 # annotation range, then won't output as a range, 6297 # so returns $i. Otherwise use the end of the 6298 # annotation range, but no further than the 6299 # maximum possible end point of the loop. 6300 my $range_end = 6301 $range_size_1 6302 ? $start 6303 : main::min( 6304 $annotate_ranges->value_of($i) || $i, 6305 $end); 6306 6307 # Use a range if it is a range, and either is one 6308 # of the special annotation ranges, or the range 6309 # is at most 3 long. This last case causes the 6310 # algorithmically named code points to be output 6311 # individually in spans of at most 3, as they are 6312 # the ones whose $type is > 0. 6313 if ($range_end != $i 6314 && ( $type < 0 || $range_end - $i > 2)) 6315 { 6316 # Here is to output a range. We don't allow a 6317 # caller-specified output format--just use the 6318 # standard one. 6319 my $range_name = $viacode[$i]; 6320 6321 # For the code points which end in their hex 6322 # value, we eliminate that from the output 6323 # annotation, and capitalize only the first 6324 # letter of each word. 6325 if ($type == $CP_IN_NAME) { 6326 my $hex = sprintf $hex_format, $i; 6327 $range_name =~ s/-$hex$//; 6328 my @words = split " ", $range_name; 6329 for my $word (@words) { 6330 $word = 6331 ucfirst(lc($word)) if $word ne 'CJK'; 6332 } 6333 $range_name = join " ", @words; 6334 } 6335 elsif ($type == $HANGUL_SYLLABLE) { 6336 $range_name = "Hangul Syllable"; 6337 } 6338 6339 # If the annotation would just repeat what's 6340 # already being output as the range, skip it. 6341 # (When an inversion list is being written, it 6342 # isn't a repeat, as that always is in 6343 # decimal) 6344 if ( $write_as_invlist 6345 || $i != $start 6346 || $range_end < $end) 6347 { 6348 if ($range_end < $MAX_WORKING_CODEPOINT) 6349 { 6350 $annotation = sprintf "%04X..%04X", 6351 $i, $range_end; 6352 } 6353 else { 6354 $annotation = sprintf "%04X..INFINITY", 6355 $i; 6356 } 6357 } 6358 else { # Indent if not displaying code points 6359 $annotation = " " x 4; 6360 } 6361 6362 if ($range_name) { 6363 $annotation .= " $age[$i]" if $age[$i]; 6364 $annotation .= " $range_name"; 6365 } 6366 6367 # Include the number of code points in the 6368 # range 6369 my $count = 6370 main::clarify_code_point_count($range_end - $i + 1); 6371 $annotation .= " [$count]\n"; 6372 6373 # Skip to the end of the range 6374 $i = $range_end; 6375 } 6376 else { # Not in a range. 6377 my $comment = ""; 6378 6379 # When outputting the names of each character, 6380 # use the character itself if printable 6381 $comment .= "'" . main::display_chr($i) . "' " 6382 if $printable[$i]; 6383 6384 my $output_value = $value; 6385 6386 # Determine the annotation 6387 if ($format eq $DECOMP_STRING_FORMAT) { 6388 6389 # This is very specialized, with the type 6390 # of decomposition beginning the line 6391 # enclosed in <...>, and the code points 6392 # that the code point decomposes to 6393 # separated by blanks. Create two 6394 # strings, one of the printable 6395 # characters, and one of their official 6396 # names. 6397 (my $map = $output_value) 6398 =~ s/ \ * < .*? > \ +//x; 6399 my $tostr = ""; 6400 my $to_name = ""; 6401 my $to_chr = ""; 6402 foreach my $to (split " ", $map) { 6403 $to = CORE::hex $to; 6404 $to_name .= " + " if $to_name; 6405 $to_chr .= main::display_chr($to); 6406 main::populate_char_info($to) 6407 if ! defined $viacode[$to]; 6408 $to_name .= $viacode[$to]; 6409 } 6410 6411 $comment .= 6412 "=> '$to_chr'; $viacode[$i] => $to_name"; 6413 } 6414 else { 6415 $output_value += $i - $start 6416 if $use_adjustments 6417 # Don't try to adjust a 6418 # non-integer 6419 && $output_value !~ /[-\D]/; 6420 6421 if ($output_map_in_hex) { 6422 main::populate_char_info($output_value) 6423 if ! defined $viacode[$output_value]; 6424 $comment .= " => '" 6425 . main::display_chr($output_value) 6426 . "'; " if $printable[$output_value]; 6427 } 6428 if ($include_name && $viacode[$i]) { 6429 $comment .= " " if $comment; 6430 $comment .= $viacode[$i]; 6431 } 6432 if ($output_map_in_hex) { 6433 $comment .= 6434 " => $viacode[$output_value]" 6435 if $viacode[$output_value]; 6436 $output_value = sprintf($hex_format, 6437 $output_value); 6438 } 6439 } 6440 6441 if ($include_cp) { 6442 $annotation = sprintf "%04X %s", $i, $age[$i]; 6443 if ($use_adjustments) { 6444 $annotation .= " => $output_value"; 6445 } 6446 } 6447 6448 if ($comment ne "") { 6449 $annotation .= " " if $annotation ne ""; 6450 $annotation .= $comment; 6451 } 6452 $annotation .= "\n" if $annotation ne ""; 6453 } 6454 6455 if ($annotation ne "") { 6456 push @annotation, (" " x $comment_indent) 6457 . "# $annotation"; 6458 } 6459 } 6460 6461 # If not adjusting, we don't have to go through the 6462 # loop again to know that the annotation comes next 6463 # in the output. 6464 if (! $use_adjustments) { 6465 if (@annotation == 1) { 6466 $OUT[-1] = merge_single_annotation_line( 6467 $OUT[-1], $annotation[0], $comment_indent); 6468 } 6469 else { 6470 push @OUT, map { Text::Tabs::unexpand $_ } 6471 @annotation; 6472 } 6473 undef @annotation; 6474 } 6475 } 6476 6477 # Add the beginning of the range that doesn't match the 6478 # property, except if the just added match range extends 6479 # to infinity. We do this after any annotations for the 6480 # match range. 6481 if ($write_as_invlist && $end < $MAX_WORKING_CODEPOINT) { 6482 push @OUT, $end + 1, "\n"; 6483 $invlist_count++; 6484 } 6485 6486 # If we split the range, set up so the next time through 6487 # we get the remainder, and redo. 6488 if ($next_start) { 6489 $start = $next_start; 6490 $end = $next_end; 6491 $value = $next_value; 6492 $next_start = 0; 6493 redo; 6494 } 6495 } # End of redo block 6496 } # End of loop through all the table's ranges 6497 6498 push @OUT, @annotation; # Add orphaned annotation, if any 6499 6500 splice @OUT, 1, 0, "V$invlist_count\n" if $invlist_count; 6501 } 6502 6503 # Add anything that goes after the main body, but within the here 6504 # document, 6505 my $append_to_body = $self->append_to_body; 6506 push @OUT, $append_to_body if $append_to_body; 6507 6508 # And finish the here document. 6509 push @OUT, "END\n"; 6510 6511 # Done with the main portion of the body. Can now figure out what 6512 # should appear before it in the file. 6513 my $pre_body = $self->pre_body; 6514 push @HEADER, $pre_body, "\n" if $pre_body; 6515 6516 # All these files should have a .pl suffix added to them. 6517 my @file_with_pl = @{$file_path{$addr}}; 6518 $file_with_pl[-1] .= '.pl'; 6519 6520 main::write(\@file_with_pl, 6521 $annotate, # utf8 iff annotating 6522 \@HEADER, 6523 \@OUT); 6524 return; 6525 } 6526 6527 sub set_status($self, $status, $info) { # Set the table's status 6528 # status The status enum value 6529 # info Any message associated with it. 6530 my $addr = pack 'J', refaddr $self; 6531 6532 $status{$addr} = $status; 6533 $status_info{$addr} = $info; 6534 return; 6535 } 6536 6537 sub set_fate($self, $fate, $reason=undef) { # Set the fate of a table 6538 my $addr = pack 'J', refaddr $self; 6539 6540 return if $fate{$addr} == $fate; # If no-op 6541 6542 # Can only change the ordinary fate, except if going to $MAP_PROXIED 6543 return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED; 6544 6545 $fate{$addr} = $fate; 6546 6547 # Don't document anything to do with a non-normal fated table 6548 if ($fate != $ORDINARY) { 6549 my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0; 6550 foreach my $alias ($self->aliases) { 6551 $alias->set_ucd($put_in_pod); 6552 6553 # MAP_PROXIED doesn't affect the match tables 6554 next if $fate == $MAP_PROXIED; 6555 $alias->set_make_re_pod_entry($put_in_pod); 6556 } 6557 } 6558 6559 # Save the reason for suppression for output 6560 if ($fate >= $SUPPRESSED) { 6561 $reason = "" unless defined $reason; 6562 $why_suppressed{$complete_name{$addr}} = $reason; 6563 } 6564 6565 return; 6566 } 6567 6568 sub lock($self) { 6569 # Don't allow changes to the table from now on. This stores a stack 6570 # trace of where it was called, so that later attempts to modify it 6571 # can immediately show where it got locked. 6572 my $addr = pack 'J', refaddr $self; 6573 6574 $locked{$addr} = ""; 6575 6576 my $line = (caller(0))[2]; 6577 my $i = 1; 6578 6579 # Accumulate the stack trace 6580 while (1) { 6581 my ($pkg, $file, $caller_line, $caller) = caller $i++; 6582 6583 last unless defined $caller; 6584 6585 $locked{$addr} .= " called from $caller() at line $line\n"; 6586 $line = $caller_line; 6587 } 6588 $locked{$addr} .= " called from main at line $line\n"; 6589 6590 return; 6591 } 6592 6593 sub carp_if_locked($self) { 6594 # Return whether a table is locked or not, and, by the way, complain 6595 # if is locked 6596 my $addr = pack 'J', refaddr $self; 6597 6598 return 0 if ! $locked{$addr}; 6599 Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n"); 6600 return 1; 6601 } 6602 6603 sub set_file_path($self, @path) { # Set the final directory path for this table 6604 @{$file_path{pack 'J', refaddr $self}} = @path; 6605 return 6606 } 6607 6608 # Accessors for the range list stored in this table. First for 6609 # unconditional 6610 for my $sub (qw( 6611 containing_range 6612 contains 6613 count 6614 each_range 6615 hash 6616 is_empty 6617 matches_identically_to 6618 max 6619 min 6620 range_count 6621 reset_each_range 6622 type_of 6623 value_of 6624 )) 6625 { 6626 no strict "refs"; 6627 *$sub = sub { 6628 use strict "refs"; 6629 my $self = shift; 6630 return $self->_range_list->$sub(@_); 6631 } 6632 } 6633 6634 # Then for ones that should fail if locked 6635 for my $sub (qw( 6636 delete_range 6637 )) 6638 { 6639 no strict "refs"; 6640 *$sub = sub { 6641 use strict "refs"; 6642 my $self = shift; 6643 6644 return if $self->carp_if_locked; 6645 no overloading; 6646 return $self->_range_list->$sub(@_); 6647 } 6648 } 6649 6650} # End closure 6651 6652package Map_Table; 6653use parent '-norequire', '_Base_Table'; 6654 6655# A Map Table is a table that contains the mappings from code points to 6656# values. There are two weird cases: 6657# 1) Anomalous entries are ones that aren't maps of ranges of code points, but 6658# are written in the table's file at the end of the table nonetheless. It 6659# requires specially constructed code to handle these; utf8.c can not read 6660# these in, so they should not go in $map_directory. As of this writing, 6661# the only case that these happen is for named sequences used in 6662# charnames.pm. But this code doesn't enforce any syntax on these, so 6663# something else could come along that uses it. 6664# 2) Specials are anything that doesn't fit syntactically into the body of the 6665# table. The ranges for these have a map type of non-zero. The code below 6666# knows about and handles each possible type. In most cases, these are 6667# written as part of the header. 6668# 6669# A map table deliberately can't be manipulated at will unlike match tables. 6670# This is because of the ambiguities having to do with what to do with 6671# overlapping code points. And there just isn't a need for those things; 6672# what one wants to do is just query, add, replace, or delete mappings, plus 6673# write the final result. 6674# However, there is a method to get the list of possible ranges that aren't in 6675# this table to use for defaulting missing code point mappings. And, 6676# map_add_or_replace_non_nulls() does allow one to add another table to this 6677# one, but it is clearly very specialized, and defined that the other's 6678# non-null values replace this one's if there is any overlap. 6679 6680sub trace { return main::trace(@_); } 6681 6682{ # Closure 6683 6684 main::setup_package(); 6685 6686 my %default_map; 6687 # Many input files omit some entries; this gives what the mapping for the 6688 # missing entries should be 6689 main::set_access('default_map', \%default_map, 'r'); 6690 6691 my %anomalous_entries; 6692 # Things that go in the body of the table which don't fit the normal 6693 # scheme of things, like having a range. Not much can be done with these 6694 # once there except to output them. This was created to handle named 6695 # sequences. 6696 main::set_access('anomalous_entry', \%anomalous_entries, 'a'); 6697 main::set_access('anomalous_entries', # Append singular, read plural 6698 \%anomalous_entries, 6699 'readable_array'); 6700 my %to_output_map; 6701 # Enum as to whether or not to write out this map table, and how: 6702 # 0 don't output 6703 # $EXTERNAL_MAP means its existence is noted in the documentation, and 6704 # it should not be removed nor its format changed. This 6705 # is done for those files that have traditionally been 6706 # output. 6707 # $INTERNAL_MAP means Perl reserves the right to do anything it wants 6708 # with this file 6709 # $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of 6710 # outputting the actual mappings as-is, we adjust things 6711 # to create a much more compact table. Only those few 6712 # tables where the mapping is convertible at least to an 6713 # integer and compacting makes a big difference should 6714 # have this. Hence, the default is to not do this 6715 # unless the table's default mapping is to $CODE_POINT, 6716 # and the range size is not 1. 6717 main::set_access('to_output_map', \%to_output_map, 's'); 6718 6719 sub new { 6720 my $class = shift; 6721 my $name = shift; 6722 6723 my %args = @_; 6724 6725 # Optional initialization data for the table. 6726 my $initialize = delete $args{'Initialize'}; 6727 6728 my $default_map = delete $args{'Default_Map'}; 6729 my $property = delete $args{'_Property'}; 6730 my $full_name = delete $args{'Full_Name'}; 6731 my $to_output_map = delete $args{'To_Output_Map'}; 6732 6733 # Rest of parameters passed on 6734 6735 my $range_list = Range_Map->new(Owner => $property); 6736 6737 my $self = $class->SUPER::new( 6738 Name => $name, 6739 Complete_Name => $full_name, 6740 Full_Name => $full_name, 6741 _Property => $property, 6742 _Range_List => $range_list, 6743 Write_As_Invlist => 0, 6744 %args); 6745 6746 my $addr = pack 'J', refaddr $self; 6747 6748 $anomalous_entries{$addr} = []; 6749 $default_map{$addr} = $default_map; 6750 $to_output_map{$addr} = $to_output_map; 6751 6752 $self->initialize($initialize) if defined $initialize; 6753 6754 return $self; 6755 } 6756 6757 use overload 6758 fallback => 0, 6759 qw("") => "_operator_stringify", 6760 ; 6761 6762 sub _operator_stringify($self, $other="", $reversed=0) { 6763 6764 my $name = $self->property->full_name; 6765 $name = '""' if $name eq ""; 6766 return "Map table for Property '$name'"; 6767 } 6768 6769 sub add_alias { 6770 # Add a synonym for this table (which means the property itself) 6771 my $self = shift; 6772 my $name = shift; 6773 # Rest of parameters passed on. 6774 6775 $self->SUPER::add_alias($name, $self->property, @_); 6776 return; 6777 } 6778 6779 sub add_map { 6780 # Add a range of code points to the list of specially-handled code 6781 # points. 0 is assumed if the type of special is not passed 6782 # in. 6783 6784 my $self = shift; 6785 my $lower = shift; 6786 my $upper = shift; 6787 my $string = shift; 6788 my %args = @_; 6789 6790 my $type = delete $args{'Type'} || 0; 6791 # Rest of parameters passed on 6792 6793 # Can't change the table if locked. 6794 return if $self->carp_if_locked; 6795 6796 $self->_range_list->add_map($lower, $upper, 6797 $string, 6798 @_, 6799 Type => $type); 6800 return; 6801 } 6802 6803 sub append_to_body($self) { 6804 # Adds to the written HERE document of the table's body any anomalous 6805 # entries in the table.. 6806 my $addr = pack 'J', refaddr $self; 6807 6808 return "" unless @{$anomalous_entries{$addr}}; 6809 return join("\n", @{$anomalous_entries{$addr}}) . "\n"; 6810 } 6811 6812 sub map_add_or_replace_non_nulls($self, $other) { 6813 # This adds the mappings in the table $other to $self. Non-null 6814 # mappings from $other override those in $self. It essentially merges 6815 # the two tables, with the second having priority except for null 6816 # mappings. 6817 return if $self->carp_if_locked; 6818 6819 if (! $other->isa(__PACKAGE__)) { 6820 Carp::my_carp_bug("$other should be a " 6821 . __PACKAGE__ 6822 . ". Not a '" 6823 . ref($other) 6824 . "'. Not added;"); 6825 return; 6826 } 6827 6828 local $to_trace = 0 if main::DEBUG; 6829 6830 my $self_range_list = $self->_range_list; 6831 my $other_range_list = $other->_range_list; 6832 foreach my $range ($other_range_list->ranges) { 6833 my $value = $range->value; 6834 next if $value eq ""; 6835 $self_range_list->_add_delete('+', 6836 $range->start, 6837 $range->end, 6838 $value, 6839 Type => $range->type, 6840 Replace => $UNCONDITIONALLY); 6841 } 6842 6843 return; 6844 } 6845 6846 sub set_default_map($self, $map, $use_full_name=0) { 6847 # Define what code points that are missing from the input files should 6848 # map to. The optional second parameter 'full_name' indicates to 6849 # force using the full name of the map instead of its standard name. 6850 if ($use_full_name && $use_full_name ne 'full_name') { 6851 Carp::my_carp_bug("Second parameter to set_default_map() if" 6852 . " present, must be 'full_name'"); 6853 } 6854 6855 my $addr = pack 'J', refaddr $self; 6856 6857 # Convert the input to the standard equivalent, if any (won't have any 6858 # for $STRING properties) 6859 my $standard = $self->property->table($map); 6860 if (defined $standard) { 6861 $map = ($use_full_name) 6862 ? $standard->full_name 6863 : $standard->name; 6864 } 6865 6866 # Warn if there already is a non-equivalent default map for this 6867 # property. Note that a default map can be a ref, which means that 6868 # what it actually means is delayed until later in the program, and it 6869 # IS permissible to override it here without a message. 6870 my $default_map = $default_map{$addr}; 6871 if (defined $default_map 6872 && ! ref($default_map) 6873 && $default_map ne $map 6874 && main::Standardize($map) ne $default_map) 6875 { 6876 my $property = $self->property; 6877 my $map_table = $property->table($map); 6878 my $default_table = $property->table($default_map); 6879 if (defined $map_table 6880 && defined $default_table 6881 && $map_table != $default_table) 6882 { 6883 Carp::my_carp("Changing the default mapping for " 6884 . $property 6885 . " from $default_map to $map'"); 6886 } 6887 } 6888 6889 $default_map{$addr} = $map; 6890 6891 # Don't also create any missing table for this map at this point, 6892 # because if we did, it could get done before the main table add is 6893 # done for PropValueAliases.txt; instead the caller will have to make 6894 # sure it exists, if desired. 6895 return; 6896 } 6897 6898 sub to_output_map($self) { 6899 # Returns boolean: should we write this map table? 6900 my $addr = pack 'J', refaddr $self; 6901 6902 # If overridden, use that 6903 return $to_output_map{$addr} if defined $to_output_map{$addr}; 6904 6905 my $full_name = $self->full_name; 6906 return $global_to_output_map{$full_name} 6907 if defined $global_to_output_map{$full_name}; 6908 6909 # If table says to output, do so; if says to suppress it, do so. 6910 my $fate = $self->fate; 6911 return $INTERNAL_MAP if $fate == $INTERNAL_ONLY; 6912 return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties; 6913 return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED; 6914 6915 my $type = $self->property->type; 6916 6917 # Don't want to output binary map tables even for debugging. 6918 return 0 if $type == $BINARY; 6919 6920 # But do want to output string ones. All the ones that remain to 6921 # be dealt with (i.e. which haven't explicitly been set to external) 6922 # are for internal Perl use only. The default for those that map to 6923 # $CODE_POINT and haven't been restricted to a single element range 6924 # is to use the adjusted form. 6925 if ($type == $STRING) { 6926 return $INTERNAL_MAP if $self->range_size_1 6927 || $default_map{$addr} ne $CODE_POINT; 6928 return $OUTPUT_ADJUSTED; 6929 } 6930 6931 # Otherwise is an $ENUM, do output it, for Perl's purposes 6932 return $INTERNAL_MAP; 6933 } 6934 6935 sub inverse_list($self) { 6936 # Returns a Range_List that is gaps of the current table. That is, 6937 # the inversion 6938 my $current = Range_List->new(Initialize => $self->_range_list, 6939 Owner => $self->property); 6940 return ~ $current; 6941 } 6942 6943 sub header($self) { 6944 my $return = $self->SUPER::header(); 6945 6946 if ($self->to_output_map >= $INTERNAL_MAP) { 6947 $return .= $INTERNAL_ONLY_HEADER; 6948 } 6949 else { 6950 # Other properties have fixed formats. 6951 my $property_name = $self->property->full_name; 6952 6953 $return .= <<END; 6954 6955# !!!!!!! IT IS DEPRECATED TO USE THIS FILE !!!!!!! 6956 6957# This file is for internal use by core Perl only. It is retained for 6958# backwards compatibility with applications that may have come to rely on it, 6959# but its format and even its name or existence are subject to change without 6960# notice in a future Perl version. Don't use it directly. Instead, its 6961# contents are now retrievable through a stable API in the Unicode::UCD 6962# module: Unicode::UCD::prop_invmap('$property_name') (Values for individual 6963# code points can be retrieved via Unicode::UCD::charprop()); 6964END 6965 } 6966 return $return; 6967 } 6968 6969 sub set_final_comment($self) { 6970 # Just before output, create the comment that heads the file 6971 # containing this table. 6972 6973 return unless $debugging_build; 6974 6975 # No sense generating a comment if aren't going to write it out. 6976 return if ! $self->to_output_map; 6977 6978 my $addr = pack 'J', refaddr $self; 6979 6980 my $property = $self->property; 6981 6982 # Get all the possible names for this property. Don't use any that 6983 # aren't ok for use in a file name, etc. This is perhaps causing that 6984 # flag to do double duty, and may have to be changed in the future to 6985 # have our own flag for just this purpose; but it works now to exclude 6986 # Perl generated synonyms from the lists for properties, where the 6987 # name is always the proper Unicode one. 6988 my @property_aliases = grep { $_->ok_as_filename } $self->aliases; 6989 6990 my $count = $self->count; 6991 my $default_map = $default_map{$addr}; 6992 6993 # The ranges that map to the default aren't output, so subtract that 6994 # to get those actually output. A property with matching tables 6995 # already has the information calculated. 6996 if ($property->type != $STRING && $property->type != $FORCED_BINARY) { 6997 $count -= $property->table($default_map)->count; 6998 } 6999 elsif (defined $default_map) { 7000 7001 # But for $STRING properties, must calculate now. Subtract the 7002 # count from each range that maps to the default. 7003 foreach my $range ($self->_range_list->ranges) { 7004 if ($range->value eq $default_map) { 7005 $count -= $range->end +1 - $range->start; 7006 } 7007 } 7008 7009 } 7010 7011 # Get a string version of $count with underscores in large numbers, 7012 # for clarity. 7013 my $string_count = main::clarify_code_point_count($count); 7014 7015 my $code_points = ($count == 1) 7016 ? 'single code point' 7017 : "$string_count code points"; 7018 7019 my $mapping; 7020 my $these_mappings; 7021 my $are; 7022 if (@property_aliases <= 1) { 7023 $mapping = 'mapping'; 7024 $these_mappings = 'this mapping'; 7025 $are = 'is' 7026 } 7027 else { 7028 $mapping = 'synonymous mappings'; 7029 $these_mappings = 'these mappings'; 7030 $are = 'are' 7031 } 7032 my $cp; 7033 if ($count >= $MAX_UNICODE_CODEPOINTS) { 7034 $cp = "any code point in Unicode Version $string_version"; 7035 } 7036 else { 7037 my $map_to; 7038 if ($default_map eq "") { 7039 $map_to = 'the empty string'; 7040 } 7041 elsif ($default_map eq $CODE_POINT) { 7042 $map_to = "itself"; 7043 } 7044 else { 7045 $map_to = "'$default_map'"; 7046 } 7047 if ($count == 1) { 7048 $cp = "the single code point"; 7049 } 7050 else { 7051 $cp = "one of the $code_points"; 7052 } 7053 $cp .= " in Unicode Version $unicode_version for which the mapping is not to $map_to"; 7054 } 7055 7056 my $comment = ""; 7057 7058 my $status = $self->status; 7059 if ($status ne $NORMAL) { 7060 my $warn = uc $status_past_participles{$status}; 7061 $comment .= <<END; 7062 7063!!!!!!! $warn !!!!!!!!!!!!!!!!!!! 7064 All property or property=value combinations contained in this file are $warn. 7065 See $unicode_reference_url for what this means. 7066 7067END 7068 } 7069 $comment .= "This file returns the $mapping:\n"; 7070 7071 my $ucd_accessible_name = ""; 7072 my $has_underscore_name = 0; 7073 my $full_name = $self->property->full_name; 7074 for my $i (0 .. @property_aliases - 1) { 7075 my $name = $property_aliases[$i]->name; 7076 $has_underscore_name = 1 if $name =~ /^_/; 7077 $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)'); 7078 if ($property_aliases[$i]->ucd) { 7079 if ($name eq $full_name) { 7080 $ucd_accessible_name = $full_name; 7081 } 7082 elsif (! $ucd_accessible_name) { 7083 $ucd_accessible_name = $name; 7084 } 7085 } 7086 } 7087 $comment .= "\nwhere 'cp' is $cp."; 7088 if ($ucd_accessible_name) { 7089 $comment .= " Note that $these_mappings"; 7090 if ($has_underscore_name) { 7091 $comment .= " (except for the one(s) that begin with an underscore)"; 7092 } 7093 $comment .= " $are accessible via the functions prop_invmap('$full_name') or charprop() in Unicode::UCD"; 7094 7095 } 7096 7097 # And append any commentary already set from the actual property. 7098 $comment .= "\n\n" . $self->comment if $self->comment; 7099 if ($self->description) { 7100 $comment .= "\n\n" . join " ", $self->description; 7101 } 7102 if ($self->note) { 7103 $comment .= "\n\n" . join " ", $self->note; 7104 } 7105 $comment .= "\n"; 7106 7107 if (! $self->perl_extension) { 7108 $comment .= <<END; 7109 7110For information about what this property really means, see: 7111$unicode_reference_url 7112END 7113 } 7114 7115 if ($count) { # Format differs for empty table 7116 $comment.= "\nThe format of the "; 7117 if ($self->range_size_1) { 7118 $comment.= <<END; 7119main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT 7120is in hex; MAPPING is what CODE_POINT maps to. 7121END 7122 } 7123 else { 7124 7125 # There are tables which end up only having one element per 7126 # range, but it is not worth keeping track of for making just 7127 # this comment a little better. 7128 $comment .= <<END; 7129non-comment portions of the main body of lines of this file is: 7130START\\tSTOP\\tMAPPING where START is the starting code point of the 7131range, in hex; STOP is the ending point, or if omitted, the range has just one 7132code point; MAPPING is what each code point between START and STOP maps to. 7133END 7134 if ($self->output_range_counts) { 7135 $comment .= <<END; 7136Numbers in comments in [brackets] indicate how many code points are in the 7137range (omitted when the range is a single code point or if the mapping is to 7138the null string). 7139END 7140 } 7141 } 7142 } 7143 $self->set_comment(main::join_lines($comment)); 7144 return; 7145 } 7146 7147 my %swash_keys; # Makes sure don't duplicate swash names. 7148 7149 # The remaining variables are temporaries used while writing each table, 7150 # to output special ranges. 7151 my @multi_code_point_maps; # Map is to more than one code point. 7152 7153 sub handle_special_range($self, $range) { 7154 # Called in the middle of write when it finds a range it doesn't know 7155 # how to handle. 7156 7157 my $addr = pack 'J', refaddr $self; 7158 7159 my $type = $range->type; 7160 7161 my $low = $range->start; 7162 my $high = $range->end; 7163 my $map = $range->value; 7164 7165 # No need to output the range if it maps to the default. 7166 return if $map eq $default_map{$addr}; 7167 7168 my $property = $self->property; 7169 7170 # Switch based on the map type... 7171 if ($type == $HANGUL_SYLLABLE) { 7172 7173 # These are entirely algorithmically determinable based on 7174 # some constants furnished by Unicode; for now, just set a 7175 # flag to indicate that have them. After everything is figured 7176 # out, we will output the code that does the algorithm. (Don't 7177 # output them if not needed because we are suppressing this 7178 # property.) 7179 $has_hangul_syllables = 1 if $property->to_output_map; 7180 } 7181 elsif ($type == $CP_IN_NAME) { 7182 7183 # Code points whose name ends in their code point are also 7184 # algorithmically determinable, but need information about the map 7185 # to do so. Both the map and its inverse are stored in data 7186 # structures output in the file. They are stored in the mean time 7187 # in global lists The lists will be written out later into Name.pm, 7188 # which is created only if needed. In order to prevent duplicates 7189 # in the list, only add to them for one property, should multiple 7190 # ones need them. 7191 if ($needing_code_points_ending_in_code_point == 0) { 7192 $needing_code_points_ending_in_code_point = $property; 7193 } 7194 if ($property == $needing_code_points_ending_in_code_point) { 7195 push @{$names_ending_in_code_point{$map}->{'low'}}, $low; 7196 push @{$names_ending_in_code_point{$map}->{'high'}}, $high; 7197 7198 my $squeezed = $map =~ s/[-\s]+//gr; 7199 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}}, 7200 $low; 7201 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}}, 7202 $high; 7203 7204 # Calculate the set of legal characters in names of this 7205 # series. It includes every character in the name prefix. 7206 my %legal; 7207 $legal{$_} = 1 for split //, $map; 7208 7209 # Plus the hex code point chars, blank, and minus. Also \n 7210 # can show up as being required due to anchoring 7211 for my $i ('0' .. '9', 'A' .. 'F', '-', ' ', "\n") { 7212 $legal{$i} = 1; 7213 } 7214 my $legal = join "", sort { $a cmp $b } keys %legal; 7215 7216 # The legal chars can be used in match optimizations 7217 push @code_points_ending_in_code_point, { low => $low, 7218 high => $high, 7219 name => $map, 7220 legal => $legal, 7221 }; 7222 } 7223 } 7224 elsif ($range->type == $MULTI_CP || $range->type == $NULL) { 7225 7226 # Multi-code point maps and null string maps have an entry 7227 # for each code point in the range. They use the same 7228 # output format. 7229 for my $code_point ($low .. $high) { 7230 7231 # The pack() below can't cope with surrogates. XXX This may 7232 # no longer be true 7233 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) { 7234 Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self. No map created"); 7235 next; 7236 } 7237 7238 # Generate the hash entries for these in the form that 7239 # utf8.c understands. 7240 my $tostr = ""; 7241 my $to_name = ""; 7242 my $to_chr = ""; 7243 foreach my $to (split " ", $map) { 7244 if ($to !~ /^$code_point_re$/) { 7245 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created"); 7246 next; 7247 } 7248 $tostr .= sprintf "\\x{%s}", $to; 7249 $to = CORE::hex $to; 7250 if ($annotate) { 7251 $to_name .= " + " if $to_name; 7252 $to_chr .= main::display_chr($to); 7253 main::populate_char_info($to) 7254 if ! defined $viacode[$to]; 7255 $to_name .= $viacode[$to]; 7256 } 7257 } 7258 7259 # The unpack yields a list of the bytes that comprise the 7260 # UTF-8 of $code_point, which are each placed in \xZZ format 7261 # and output in the %s to map to $tostr, so the result looks 7262 # like: 7263 # "\xC4\xB0" => "\x{0069}\x{0307}", 7264 my $utf8 = sprintf(qq["%s" => "$tostr",], 7265 join("", map { sprintf "\\x%02X", $_ } 7266 unpack("U0C*", chr $code_point))); 7267 7268 # Add a comment so that a human reader can more easily 7269 # see what's going on. 7270 push @multi_code_point_maps, 7271 sprintf("%-45s # U+%04X", $utf8, $code_point); 7272 if (! $annotate) { 7273 $multi_code_point_maps[-1] .= " => $map"; 7274 } 7275 else { 7276 main::populate_char_info($code_point) 7277 if ! defined $viacode[$code_point]; 7278 $multi_code_point_maps[-1] .= " '" 7279 . main::display_chr($code_point) 7280 . "' => '$to_chr'; $viacode[$code_point] => $to_name"; 7281 } 7282 } 7283 } 7284 else { 7285 Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Not written"); 7286 } 7287 7288 return; 7289 } 7290 7291 sub pre_body($self) { 7292 # Returns the string that should be output in the file before the main 7293 # body of this table. It isn't called until the main body is 7294 # calculated, saving a pass. The string includes some hash entries 7295 # identifying the format of the body, and what the single value should 7296 # be for all ranges missing from it. It also includes any code points 7297 # which have map_types that don't go in the main table. 7298 7299 my $addr = pack 'J', refaddr $self; 7300 7301 my $name = $self->property->swash_name; 7302 7303 # Currently there is nothing in the pre_body unless a swash is being 7304 # generated. 7305 return unless defined $name; 7306 7307 if (defined $swash_keys{$name}) { 7308 Carp::my_carp(main::join_lines(<<END 7309Already created a swash name '$name' for $swash_keys{$name}. This means that 7310the same name desired for $self shouldn't be used. Bad News. This must be 7311fixed before production use, but proceeding anyway 7312END 7313 )); 7314 } 7315 $swash_keys{$name} = "$self"; 7316 7317 my $pre_body = ""; 7318 7319 # Here we assume we were called after have gone through the whole 7320 # file. If we actually generated anything for each map type, add its 7321 # respective header and trailer 7322 my $specials_name = ""; 7323 if (@multi_code_point_maps) { 7324 $specials_name = "Unicode::UCD::ToSpec$name"; 7325 $pre_body .= <<END; 7326 7327# Some code points require special handling because their mappings are each to 7328# multiple code points. These do not appear in the main body, but are defined 7329# in the hash below. 7330 7331# Each key is the string of N bytes that together make up the UTF-8 encoding 7332# for the code point. (i.e. the same as looking at the code point's UTF-8 7333# under "use bytes"). Each value is the UTF-8 of the translation, for speed. 7334\%$specials_name = ( 7335END 7336 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n"; 7337 } 7338 7339 my $format = $self->format; 7340 7341 my $return = ""; 7342 7343 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED); 7344 if ($output_adjusted) { 7345 if ($specials_name) { 7346 $return .= <<END; 7347# The mappings in the non-hash portion of this file must be modified to get the 7348# correct values by adding the code point ordinal number to each one that is 7349# numeric. 7350END 7351 } 7352 else { 7353 $return .= <<END; 7354# The mappings must be modified to get the correct values by adding the code 7355# point ordinal number to each one that is numeric. 7356END 7357 } 7358 } 7359 7360 $return .= <<END; 7361 7362# The name this table is to be known by, with the format of the mappings in 7363# the main body of the table, and what all code points missing from this file 7364# map to. 7365\$Unicode::UCD::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format} 7366END 7367 if ($specials_name) { 7368 $return .= <<END; 7369\$Unicode::UCD::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings 7370END 7371 } 7372 my $default_map = $default_map{$addr}; 7373 7374 # For $CODE_POINT default maps and using adjustments, instead the default 7375 # becomes zero. 7376 $return .= "\$Unicode::UCD::SwashInfo{'To$name'}{'missing'} = '" 7377 . (($output_adjusted && $default_map eq $CODE_POINT) 7378 ? "0" 7379 : $default_map) 7380 . "';"; 7381 7382 if ($default_map eq $CODE_POINT) { 7383 $return .= ' # code point maps to itself'; 7384 } 7385 elsif ($default_map eq "") { 7386 $return .= ' # code point maps to the empty string'; 7387 } 7388 $return .= "\n"; 7389 7390 $return .= $pre_body; 7391 7392 return $return; 7393 } 7394 7395 sub write($self) { 7396 # Write the table to the file. 7397 7398 my $addr = pack 'J', refaddr $self; 7399 7400 # Clear the temporaries 7401 undef @multi_code_point_maps; 7402 7403 # Calculate the format of the table if not already done. 7404 my $format = $self->format; 7405 my $type = $self->property->type; 7406 my $default_map = $self->default_map; 7407 if (! defined $format) { 7408 if ($type == $BINARY) { 7409 7410 # Don't bother checking the values, because we elsewhere 7411 # verify that a binary table has only 2 values. 7412 $format = $BINARY_FORMAT; 7413 } 7414 else { 7415 my @ranges = $self->_range_list->ranges; 7416 7417 # default an empty table based on its type and default map 7418 if (! @ranges) { 7419 7420 # But it turns out that the only one we can say is a 7421 # non-string (besides binary, handled above) is when the 7422 # table is a string and the default map is to a code point 7423 if ($type == $STRING && $default_map eq $CODE_POINT) { 7424 $format = $HEX_FORMAT; 7425 } 7426 else { 7427 $format = $STRING_FORMAT; 7428 } 7429 } 7430 else { 7431 7432 # Start with the most restrictive format, and as we find 7433 # something that doesn't fit with that, change to the next 7434 # most restrictive, and so on. 7435 $format = $DECIMAL_FORMAT; 7436 foreach my $range (@ranges) { 7437 next if $range->type != 0; # Non-normal ranges don't 7438 # affect the main body 7439 my $map = $range->value; 7440 if ($map ne $default_map) { 7441 last if $format eq $STRING_FORMAT; # already at 7442 # least 7443 # restrictive 7444 $format = $INTEGER_FORMAT 7445 if $format eq $DECIMAL_FORMAT 7446 && $map !~ / ^ [0-9] $ /x; 7447 $format = $FLOAT_FORMAT 7448 if $format eq $INTEGER_FORMAT 7449 && $map !~ / ^ -? [0-9]+ $ /x; 7450 $format = $RATIONAL_FORMAT 7451 if $format eq $FLOAT_FORMAT 7452 && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x; 7453 $format = $HEX_FORMAT 7454 if ($format eq $RATIONAL_FORMAT 7455 && $map !~ 7456 m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x) 7457 # Assume a leading zero means hex, 7458 # even if all digits are 0-9 7459 || ($format eq $INTEGER_FORMAT 7460 && $map =~ /^0[0-9A-F]/); 7461 $format = $STRING_FORMAT if $format eq $HEX_FORMAT 7462 && $map =~ /[^0-9A-F]/; 7463 } 7464 } 7465 } 7466 } 7467 } # end of calculating format 7468 7469 if ($default_map eq $CODE_POINT 7470 && $format ne $HEX_FORMAT 7471 && ! defined $self->format) # manual settings are always 7472 # considered ok 7473 { 7474 Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'") 7475 } 7476 7477 # If the output is to be adjusted, the format of the table that gets 7478 # output is actually 'a' or 'ax' instead of whatever it is stored 7479 # internally as. 7480 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED); 7481 if ($output_adjusted) { 7482 if ($default_map eq $CODE_POINT) { 7483 $format = $HEX_ADJUST_FORMAT; 7484 } 7485 else { 7486 $format = $ADJUST_FORMAT; 7487 } 7488 } 7489 7490 $self->_set_format($format); 7491 7492 return $self->SUPER::write( 7493 $output_adjusted, 7494 $default_map); # don't write defaulteds 7495 } 7496 7497 # Accessors for the underlying list that should fail if locked. 7498 for my $sub (qw( 7499 add_duplicate 7500 replace_map 7501 )) 7502 { 7503 no strict "refs"; 7504 *$sub = sub { 7505 use strict "refs"; 7506 my $self = shift; 7507 7508 return if $self->carp_if_locked; 7509 return $self->_range_list->$sub(@_); 7510 } 7511 } 7512} # End closure for Map_Table 7513 7514package Match_Table; 7515use parent '-norequire', '_Base_Table'; 7516 7517# A Match table is one which is a list of all the code points that have 7518# the same property and property value, for use in \p{property=value} 7519# constructs in regular expressions. It adds very little data to the base 7520# structure, but many methods, as these lists can be combined in many ways to 7521# form new ones. 7522# There are only a few concepts added: 7523# 1) Equivalents and Relatedness. 7524# Two tables can match the identical code points, but have different names. 7525# This always happens when there is a perl single form extension 7526# \p{IsProperty} for the Unicode compound form \P{Property=True}. The two 7527# tables are set to be related, with the Perl extension being a child, and 7528# the Unicode property being the parent. 7529# 7530# It may be that two tables match the identical code points and we don't 7531# know if they are related or not. This happens most frequently when the 7532# Block and Script properties have the exact range. But note that a 7533# revision to Unicode could add new code points to the script, which would 7534# now have to be in a different block (as the block was filled, or there 7535# would have been 'Unknown' script code points in it and they wouldn't have 7536# been identical). So we can't rely on any two properties from Unicode 7537# always matching the same code points from release to release, and thus 7538# these tables are considered coincidentally equivalent--not related. When 7539# two tables are unrelated but equivalent, one is arbitrarily chosen as the 7540# 'leader', and the others are 'equivalents'. This concept is useful 7541# to minimize the number of tables written out. Only one file is used for 7542# any identical set of code points, with entries in UCD.pl mapping all 7543# the involved tables to it. 7544# 7545# Related tables will always be identical; we set them up to be so. Thus 7546# if the Unicode one is deprecated, the Perl one will be too. Not so for 7547# unrelated tables. Relatedness makes generating the documentation easier. 7548# 7549# 2) Complement. 7550# Like equivalents, two tables may be the inverses of each other, the 7551# intersection between them is null, and the union is every Unicode code 7552# point. The two tables that occupy a binary property are necessarily like 7553# this. By specifying one table as the complement of another, we can avoid 7554# storing it on disk (using the other table and performing a fast 7555# transform), and some memory and calculations. 7556# 7557# 3) Conflicting. It may be that there will eventually be name clashes, with 7558# the same name meaning different things. For a while, there actually were 7559# conflicts, but they have so far been resolved by changing Perl's or 7560# Unicode's definitions to match the other, but when this code was written, 7561# it wasn't clear that that was what was going to happen. (Unicode changed 7562# because of protests during their beta period.) Name clashes are warned 7563# about during compilation, and the documentation. The generated tables 7564# are sane, free of name clashes, because the code suppresses the Perl 7565# version. But manual intervention to decide what the actual behavior 7566# should be may be required should this happen. The introductory comments 7567# have more to say about this. 7568# 7569# 4) Definition. This is a string for human consumption that specifies the 7570# code points that this table matches. This is used only for the generated 7571# pod file. It may be specified explicitly, or automatically computed. 7572# Only the first portion of complicated definitions is computed and 7573# displayed. 7574 7575sub standardize { return main::standardize($_[0]); } 7576sub trace { return main::trace(@_); } 7577 7578 7579{ # Closure 7580 7581 main::setup_package(); 7582 7583 my %leader; 7584 # The leader table of this one; initially $self. 7585 main::set_access('leader', \%leader, 'r'); 7586 7587 my %equivalents; 7588 # An array of any tables that have this one as their leader 7589 main::set_access('equivalents', \%equivalents, 'readable_array'); 7590 7591 my %parent; 7592 # The parent table to this one, initially $self. This allows us to 7593 # distinguish between equivalent tables that are related (for which this 7594 # is set to), and those which may not be, but share the same output file 7595 # because they match the exact same set of code points in the current 7596 # Unicode release. 7597 main::set_access('parent', \%parent, 'r'); 7598 7599 my %children; 7600 # An array of any tables that have this one as their parent 7601 main::set_access('children', \%children, 'readable_array'); 7602 7603 my %conflicting; 7604 # Array of any tables that would have the same name as this one with 7605 # a different meaning. This is used for the generated documentation. 7606 main::set_access('conflicting', \%conflicting, 'readable_array'); 7607 7608 my %matches_all; 7609 # Set in the constructor for tables that are expected to match all code 7610 # points. 7611 main::set_access('matches_all', \%matches_all, 'r'); 7612 7613 my %complement; 7614 # Points to the complement that this table is expressed in terms of; 0 if 7615 # none. 7616 main::set_access('complement', \%complement, 'r'); 7617 7618 my %definition; 7619 # Human readable string of the first few ranges of code points matched by 7620 # this table 7621 main::set_access('definition', \%definition, 'r', 's'); 7622 7623 sub new { 7624 my $class = shift; 7625 7626 my %args = @_; 7627 7628 # The property for which this table is a listing of property values. 7629 my $property = delete $args{'_Property'}; 7630 7631 my $name = delete $args{'Name'}; 7632 my $full_name = delete $args{'Full_Name'}; 7633 $full_name = $name if ! defined $full_name; 7634 7635 # Optional 7636 my $initialize = delete $args{'Initialize'}; 7637 my $matches_all = delete $args{'Matches_All'} || 0; 7638 my $format = delete $args{'Format'}; 7639 my $definition = delete $args{'Definition'} // ""; 7640 # Rest of parameters passed on. 7641 7642 my $range_list = Range_List->new(Initialize => $initialize, 7643 Owner => $property); 7644 7645 my $complete = $full_name; 7646 $complete = '""' if $complete eq ""; # A null name shouldn't happen, 7647 # but this helps debug if it 7648 # does 7649 # The complete name for a match table includes it's property in a 7650 # compound form 'property=table', except if the property is the 7651 # pseudo-property, perl, in which case it is just the single form, 7652 # 'table' (If you change the '=' must also change the ':' in lots of 7653 # places in this program that assume an equal sign) 7654 $complete = $property->full_name . "=$complete" if $property != $perl; 7655 7656 my $self = $class->SUPER::new(%args, 7657 Name => $name, 7658 Complete_Name => $complete, 7659 Full_Name => $full_name, 7660 _Property => $property, 7661 _Range_List => $range_list, 7662 Format => $EMPTY_FORMAT, 7663 Write_As_Invlist => 1, 7664 ); 7665 my $addr = pack 'J', refaddr $self; 7666 7667 $conflicting{$addr} = [ ]; 7668 $equivalents{$addr} = [ ]; 7669 $children{$addr} = [ ]; 7670 $matches_all{$addr} = $matches_all; 7671 $leader{$addr} = $self; 7672 $parent{$addr} = $self; 7673 $complement{$addr} = 0; 7674 $definition{$addr} = $definition; 7675 7676 if (defined $format && $format ne $EMPTY_FORMAT) { 7677 Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'. Using '$EMPTY_FORMAT'"); 7678 } 7679 7680 return $self; 7681 } 7682 7683 # See this program's beginning comment block about overloading these. 7684 use overload 7685 fallback => 0, 7686 qw("") => "_operator_stringify", 7687 '=' => sub { 7688 my $self = shift; 7689 7690 return if $self->carp_if_locked; 7691 return $self; 7692 }, 7693 7694 '+' => sub { 7695 my $self = shift; 7696 my $other = shift; 7697 7698 return $self->_range_list + $other; 7699 }, 7700 '&' => sub { 7701 my $self = shift; 7702 my $other = shift; 7703 7704 return $self->_range_list & $other; 7705 }, 7706 '+=' => sub { 7707 my $self = shift; 7708 my $other = shift; 7709 my $reversed = shift; 7710 7711 if ($reversed) { 7712 Carp::my_carp_bug("Bad news. Can't cope with '" 7713 . ref($other) 7714 . ' += ' 7715 . ref($self) 7716 . "'. undef returned."); 7717 return; 7718 } 7719 7720 return if $self->carp_if_locked; 7721 7722 if (ref $other) { 7723 7724 # Change the range list of this table to be the 7725 # union of the two. 7726 $self->_set_range_list($self->_range_list 7727 + $other); 7728 } 7729 else { # $other is just a simple value 7730 $self->add_range($other, $other); 7731 } 7732 return $self; 7733 }, 7734 '&=' => sub { 7735 my $self = shift; 7736 my $other = shift; 7737 my $reversed = shift; 7738 7739 if ($reversed) { 7740 Carp::my_carp_bug("Bad news. Can't cope with '" 7741 . ref($other) 7742 . ' &= ' 7743 . ref($self) 7744 . "'. undef returned."); 7745 return; 7746 } 7747 7748 return if $self->carp_if_locked; 7749 $self->_set_range_list($self->_range_list & $other); 7750 return $self; 7751 }, 7752 '-' => sub { my $self = shift; 7753 my $other = shift; 7754 my $reversed = shift; 7755 if ($reversed) { 7756 Carp::my_carp_bug("Bad news. Can't cope with '" 7757 . ref($other) 7758 . ' - ' 7759 . ref($self) 7760 . "'. undef returned."); 7761 return; 7762 } 7763 7764 return $self->_range_list - $other; 7765 }, 7766 '~' => sub { my $self = shift; 7767 return ~ $self->_range_list; 7768 }, 7769 ; 7770 7771 sub _operator_stringify($self, $other="", $reversed=0) { 7772 7773 my $name = $self->complete_name; 7774 return "Table '$name'"; 7775 } 7776 7777 sub _range_list { 7778 # Returns the range list associated with this table, which will be the 7779 # complement's if it has one. 7780 7781 my $self = shift; 7782 my $complement = $self->complement; 7783 7784 # In order to avoid re-complementing on each access, only do the 7785 # complement the first time, and store the result in this table's 7786 # range list to use henceforth. However, this wouldn't work if the 7787 # controlling (complement) table changed after we do this, so lock it. 7788 # Currently, the value of the complement isn't needed until after it 7789 # is fully constructed, so this works. If this were to change, the 7790 # each_range iteration functionality would no longer work on this 7791 # complement. 7792 if ($complement != 0 && $self->SUPER::_range_list->count == 0) { 7793 $self->_set_range_list($self->SUPER::_range_list 7794 + ~ $complement->_range_list); 7795 $complement->lock; 7796 } 7797 7798 return $self->SUPER::_range_list; 7799 } 7800 7801 sub add_alias { 7802 # Add a synonym for this table. See the comments in the base class 7803 7804 my $self = shift; 7805 my $name = shift; 7806 # Rest of parameters passed on. 7807 7808 $self->SUPER::add_alias($name, $self, @_); 7809 return; 7810 } 7811 7812 sub add_conflicting { 7813 # Add the name of some other object to the list of ones that name 7814 # clash with this match table. 7815 7816 my $self = shift; 7817 my $conflicting_name = shift; # The name of the conflicting object 7818 my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ? 7819 my $conflicting_object = shift; # Optional, the conflicting object 7820 # itself. This is used to 7821 # disambiguate the text if the input 7822 # name is identical to any of the 7823 # aliases $self is known by. 7824 # Sometimes the conflicting object is 7825 # merely hypothetical, so this has to 7826 # be an optional parameter. 7827 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7828 7829 my $addr = pack 'J', refaddr $self; 7830 7831 # Check if the conflicting name is exactly the same as any existing 7832 # alias in this table (as long as there is a real object there to 7833 # disambiguate with). 7834 if (defined $conflicting_object) { 7835 foreach my $alias ($self->aliases) { 7836 if (standardize($alias->name) eq standardize($conflicting_name)) { 7837 7838 # Here, there is an exact match. This results in 7839 # ambiguous comments, so disambiguate by changing the 7840 # conflicting name to its object's complete equivalent. 7841 $conflicting_name = $conflicting_object->complete_name; 7842 last; 7843 } 7844 } 7845 } 7846 7847 # Convert to the \p{...} final name 7848 $conflicting_name = "\\$p" . "{$conflicting_name}"; 7849 7850 # Only add once 7851 return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}}; 7852 7853 push @{$conflicting{$addr}}, $conflicting_name; 7854 7855 return; 7856 } 7857 7858 sub is_set_equivalent_to($self, $other=undef) { 7859 # Return boolean of whether or not the other object is a table of this 7860 # type and has been marked equivalent to this one. 7861 7862 return 0 if ! defined $other; # Can happen for incomplete early 7863 # releases 7864 unless ($other->isa(__PACKAGE__)) { 7865 my $ref_other = ref $other; 7866 my $ref_self = ref $self; 7867 Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self."); 7868 return 0; 7869 } 7870 7871 # Two tables are equivalent if they have the same leader. 7872 return $leader{pack 'J', refaddr $self} == $leader{pack 'J', refaddr $other}; 7873 return; 7874 } 7875 7876 sub set_equivalent_to { 7877 # Set $self equivalent to the parameter table. 7878 # The required Related => 'x' parameter is a boolean indicating 7879 # whether these tables are related or not. If related, $other becomes 7880 # the 'parent' of $self; if unrelated it becomes the 'leader' 7881 # 7882 # Related tables share all characteristics except names; equivalents 7883 # not quite so many. 7884 # If they are related, one must be a perl extension. This is because 7885 # we can't guarantee that Unicode won't change one or the other in a 7886 # later release even if they are identical now. 7887 7888 my $self = shift; 7889 my $other = shift; 7890 7891 my %args = @_; 7892 my $related = delete $args{'Related'}; 7893 7894 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 7895 7896 return if ! defined $other; # Keep on going; happens in some early 7897 # Unicode releases. 7898 7899 if (! defined $related) { 7900 Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other"); 7901 $related = 0; 7902 } 7903 7904 # If already are equivalent, no need to re-do it; if subroutine 7905 # returns null, it found an error, also do nothing 7906 my $are_equivalent = $self->is_set_equivalent_to($other); 7907 return if ! defined $are_equivalent || $are_equivalent; 7908 7909 my $addr = pack 'J', refaddr $self; 7910 my $current_leader = ($related) ? $parent{$addr} : $leader{$addr}; 7911 7912 if ($related) { 7913 if ($current_leader->perl_extension) { 7914 if ($other->perl_extension) { 7915 Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent."); 7916 return; 7917 } 7918 } elsif ($self->property != $other->property # Depending on 7919 # situation, might 7920 # be better to use 7921 # add_alias() 7922 # instead for same 7923 # property 7924 && ! $other->perl_extension 7925 7926 # We allow the sc and scx properties to be marked as 7927 # related. They are in fact related, and this allows 7928 # the pod to show that better. This test isn't valid 7929 # if this is an early Unicode release without the scx 7930 # property (having that also implies the sc property 7931 # exists, so don't have to test for no 'sc') 7932 && ( ! defined $scx 7933 && ! ( ( $self->property == $script 7934 || $self->property == $scx) 7935 && ( $self->property == $script 7936 || $self->property == $scx)))) 7937 { 7938 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other"); 7939 $related = 0; 7940 } 7941 } 7942 7943 if (! $self->is_empty && ! $self->matches_identically_to($other)) { 7944 Carp::my_carp_bug("$self should be empty or match identically to $other. Not setting equivalent"); 7945 return; 7946 } 7947 7948 my $leader = pack 'J', refaddr $current_leader; 7949 my $other_addr = pack 'J', refaddr $other; 7950 7951 # Any tables that are equivalent to or children of this table must now 7952 # instead be equivalent to or (children) to the new leader (parent), 7953 # still equivalent. The equivalency includes their matches_all info, 7954 # and for related tables, their fate and status. 7955 # All related tables are of necessity equivalent, but the converse 7956 # isn't necessarily true 7957 my $status = $other->status; 7958 my $status_info = $other->status_info; 7959 my $fate = $other->fate; 7960 my $matches_all = $matches_all{other_addr}; 7961 my $caseless_equivalent = $other->caseless_equivalent; 7962 foreach my $table ($current_leader, @{$equivalents{$leader}}) { 7963 next if $table == $other; 7964 trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace; 7965 7966 my $table_addr = pack 'J', refaddr $table; 7967 $leader{$table_addr} = $other; 7968 $matches_all{$table_addr} = $matches_all; 7969 $self->_set_range_list($other->_range_list); 7970 push @{$equivalents{$other_addr}}, $table; 7971 if ($related) { 7972 $parent{$table_addr} = $other; 7973 push @{$children{$other_addr}}, $table; 7974 $table->set_status($status, $status_info); 7975 7976 # This reason currently doesn't get exposed outside; otherwise 7977 # would have to look up the parent's reason and use it instead. 7978 $table->set_fate($fate, "Parent's fate"); 7979 7980 $self->set_caseless_equivalent($caseless_equivalent); 7981 } 7982 } 7983 7984 # Now that we've declared these to be equivalent, any changes to one 7985 # of the tables would invalidate that equivalency. 7986 $self->lock; 7987 $other->lock; 7988 return; 7989 } 7990 7991 sub set_complement($self, $other) { 7992 # Set $self to be the complement of the parameter table. $self is 7993 # locked, as what it contains should all come from the other table. 7994 7995 if ($other->complement != 0) { 7996 Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement); 7997 return; 7998 } 7999 $complement{pack 'J', refaddr $self} = $other; 8000 8001 # Be sure the other property knows we are depending on them; or the 8002 # other table if it is one in the current property. 8003 if ($self->property != $other->property) { 8004 $other->property->set_has_dependency(1); 8005 } 8006 else { 8007 $other->set_has_dependency(1); 8008 } 8009 $self->lock; 8010 return; 8011 } 8012 8013 sub add_range($self, @range) { # Add a range to the list for this table. 8014 # Rest of parameters passed on 8015 8016 return if $self->carp_if_locked; 8017 return $self->_range_list->add_range(@range); 8018 } 8019 8020 sub header($self) { 8021 # All match tables are to be used only by the Perl core. 8022 return $self->SUPER::header() . $INTERNAL_ONLY_HEADER; 8023 } 8024 8025 sub pre_body { # Does nothing for match tables. 8026 return 8027 } 8028 8029 sub append_to_body { # Does nothing for match tables. 8030 return 8031 } 8032 8033 sub set_fate($self, $fate, $reason=undef) { 8034 $self->SUPER::set_fate($fate, $reason); 8035 8036 # All children share this fate 8037 foreach my $child ($self->children) { 8038 $child->set_fate($fate, $reason); 8039 } 8040 return; 8041 } 8042 8043 sub calculate_table_definition 8044 { 8045 # Returns a human-readable string showing some or all of the code 8046 # points matched by this table. The string will include a 8047 # bracketed-character class for all characters matched in the 00-FF 8048 # range, and the first few ranges matched beyond that. 8049 my $max_ranges = 6; 8050 8051 my $self = shift; 8052 my $definition = $self->definition || ""; 8053 8054 # Skip this if already have a definition. 8055 return $definition if $definition; 8056 8057 my $lows_string = ""; # The string representation of the 0-FF 8058 # characters 8059 my $string_range = ""; # The string rep. of the above FF ranges 8060 my $range_count = 0; # How many ranges in $string_rage 8061 8062 my @lows_invlist; # The inversion list of the 0-FF code points 8063 my $first_non_control = ord(" "); # Everything below this is a 8064 # control, on ASCII or EBCDIC 8065 my $max_table_code_point = $self->max; 8066 8067 # On ASCII platforms, the range 80-FF contains no printables. 8068 my $highest_printable = ((main::NON_ASCII_PLATFORM) ? 255 : 126); 8069 8070 8071 # Look through the first few ranges matched by this table. 8072 $self->reset_each_range; # Defensive programming 8073 while (defined (my $range = $self->each_range())) { 8074 my $start = $range->start; 8075 my $end = $range->end; 8076 8077 # Accumulate an inversion list of the 00-FF code points 8078 if ($start < 256 && ($start > 0 || $end < 256)) { 8079 push @lows_invlist, $start; 8080 push @lows_invlist, 1 + (($end < 256) ? $end : 255); 8081 8082 # Get next range if there are more ranges below 256 8083 next if $end < 256 && $end < $max_table_code_point; 8084 8085 # If the range straddles the 255/256 boundary, we split it 8086 # there. We already added above the low portion to the 8087 # inversion list 8088 $start = 256 if $end > 256; 8089 } 8090 8091 # Here, @lows_invlist contains the code points below 256, and 8092 # there is no other range, or the current one starts at or above 8093 # 256. Generate the [char class] for the 0-255 ones. 8094 while (@lows_invlist) { 8095 8096 # If this range (necessarily the first one, by the way) starts 8097 # at 0 ... 8098 if ($lows_invlist[0] == 0) { 8099 8100 # If it ends within the block of controls, that means that 8101 # some controls are in it and some aren't. Since Unicode 8102 # properties pretty much only know about a few of the 8103 # controls, like \n, \t, this means that its one of them 8104 # that isn't in the range. Complement the inversion list 8105 # which will likely cause these to be output using their 8106 # mnemonics, hence being clearer. 8107 if ($lows_invlist[1] < $first_non_control) { 8108 $lows_string .= '^'; 8109 shift @lows_invlist; 8110 push @lows_invlist, 256; 8111 } 8112 elsif ($lows_invlist[1] <= $highest_printable) { 8113 8114 # Here, it extends into the printables block. Split 8115 # into two ranges so that the controls are separate. 8116 $lows_string .= sprintf "\\x00-\\x%02x", 8117 $first_non_control - 1; 8118 $lows_invlist[0] = $first_non_control; 8119 } 8120 } 8121 8122 # If the range completely contains the printables, don't 8123 # individually spell out the printables. 8124 if ( $lows_invlist[0] <= $first_non_control 8125 && $lows_invlist[1] > $highest_printable) 8126 { 8127 $lows_string .= sprintf "\\x%02x-\\x%02x", 8128 $lows_invlist[0], $lows_invlist[1] - 1; 8129 shift @lows_invlist; 8130 shift @lows_invlist; 8131 next; 8132 } 8133 8134 # Here, the range may include some but not all printables. 8135 # Look at each one individually 8136 foreach my $ord (shift @lows_invlist .. shift(@lows_invlist) - 1) { 8137 my $char = chr $ord; 8138 8139 # If there is already something in the list, an 8140 # alphanumeric char could be the next in sequence. If so, 8141 # we start or extend a range. That is, we could have so 8142 # far something like 'a-c', and the next char is a 'd', so 8143 # we change it to 'a-d'. We use native_to_unicode() 8144 # because a-z on EBCDIC means 26 chars, and excludes the 8145 # gap ones. 8146 if ($lows_string ne "" && $char =~ /[[:alnum:]]/) { 8147 my $prev = substr($lows_string, -1); 8148 if ( $prev !~ /[[:alnum:]]/ 8149 || utf8::native_to_unicode(ord $prev) + 1 8150 != utf8::native_to_unicode(ord $char)) 8151 { 8152 # Not extending the range 8153 $lows_string .= $char; 8154 } 8155 elsif ( length $lows_string > 1 8156 && substr($lows_string, -2, 1) eq '-') 8157 { 8158 # We had a sequence like '-c' and the current 8159 # character is 'd'. Extend the range. 8160 substr($lows_string, -1, 1) = $char; 8161 } 8162 else { 8163 # We had something like 'd' and this is 'e'. 8164 # Start a range. 8165 $lows_string .= "-$char"; 8166 } 8167 } 8168 elsif ($char =~ /[[:graph:]]/) { 8169 8170 # We output a graphic char as-is, preceded by a 8171 # backslash if it is a metacharacter 8172 $lows_string .= '\\' 8173 if $char =~ /[\\\^\$\@\%\|()\[\]\{\}\-\/"']/; 8174 $lows_string .= $char; 8175 } # Otherwise use mnemonic for any that have them 8176 elsif ($char =~ /[\a]/) { 8177 $lows_string .= '\a'; 8178 } 8179 elsif ($char =~ /[\b]/) { 8180 $lows_string .= '\b'; 8181 } 8182 elsif ($char eq "\e") { 8183 $lows_string .= '\e'; 8184 } 8185 elsif ($char eq "\f") { 8186 $lows_string .= '\f'; 8187 } 8188 elsif ($char eq "\cK") { 8189 $lows_string .= '\cK'; 8190 } 8191 elsif ($char eq "\n") { 8192 $lows_string .= '\n'; 8193 } 8194 elsif ($char eq "\r") { 8195 $lows_string .= '\r'; 8196 } 8197 elsif ($char eq "\t") { 8198 $lows_string .= '\t'; 8199 } 8200 else { 8201 8202 # Here is a non-graphic without a mnemonic. We use \x 8203 # notation. But if the ordinal of this is one above 8204 # the previous, create or extend the range 8205 my $hex_representation = sprintf("%02x", ord $char); 8206 if ( length $lows_string >= 4 8207 && substr($lows_string, -4, 2) eq '\\x' 8208 && hex(substr($lows_string, -2)) + 1 == ord $char) 8209 { 8210 if ( length $lows_string >= 5 8211 && substr($lows_string, -5, 1) eq '-' 8212 && ( length $lows_string == 5 8213 || substr($lows_string, -6, 1) ne '\\')) 8214 { 8215 substr($lows_string, -2) = $hex_representation; 8216 } 8217 else { 8218 $lows_string .= '-\\x' . $hex_representation; 8219 } 8220 } 8221 else { 8222 $lows_string .= '\\x' . $hex_representation; 8223 } 8224 } 8225 } 8226 } 8227 8228 # Done with assembling the string of all lows. If there are only 8229 # lows in the property, are completely done. 8230 if ($max_table_code_point < 256) { 8231 $self->reset_each_range; 8232 last; 8233 } 8234 8235 # Otherwise, quit if reached max number of non-lows ranges. If 8236 # there are lows, count them as one unit towards the maximum. 8237 $range_count++; 8238 if ($range_count > (($lows_string eq "") ? $max_ranges : $max_ranges - 1)) { 8239 $string_range .= " ..."; 8240 $self->reset_each_range; 8241 last; 8242 } 8243 8244 # Otherwise add this range. 8245 $string_range .= ", " if $string_range ne ""; 8246 if ($start == $end) { 8247 $string_range .= sprintf("U+%04X", $start); 8248 } 8249 elsif ($end >= $MAX_WORKING_CODEPOINT) { 8250 $string_range .= sprintf("U+%04X..infinity", $start); 8251 } 8252 else { 8253 $string_range .= sprintf("U+%04X..%04X", 8254 $start, $end); 8255 } 8256 } 8257 8258 # Done with all the ranges we're going to look at. Assemble the 8259 # definition from the lows + non-lows. 8260 8261 if ($lows_string ne "" || $string_range ne "") { 8262 if ($lows_string ne "") { 8263 $definition .= "[$lows_string]"; 8264 $definition .= ", " if $string_range; 8265 } 8266 $definition .= $string_range; 8267 } 8268 8269 return $definition; 8270 } 8271 8272 sub write($self) { 8273 return $self->SUPER::write(0); # No adjustments 8274 } 8275 8276 # $leader - Should only be called on the leader table of an equivalent group 8277 sub set_final_comment($leader) { 8278 # This creates a comment for the file that is to hold the match table 8279 # $self. It is somewhat convoluted to make the English read nicely, 8280 # but, heh, it's just a comment. 8281 # This should be called only with the leader match table of all the 8282 # ones that share the same file. It lists all such tables, ordered so 8283 # that related ones are together. 8284 8285 return unless $debugging_build; 8286 8287 my $addr = pack 'J', refaddr $leader; 8288 8289 if ($leader{$addr} != $leader) { 8290 Carp::my_carp_bug(<<END 8291set_final_comment() must be called on a leader table, which $leader is not. 8292It is equivalent to $leader{$addr}. No comment created 8293END 8294 ); 8295 return; 8296 } 8297 8298 # Get the number of code points matched by each of the tables in this 8299 # file, and add underscores for clarity. 8300 my $count = $leader->count; 8301 my $unicode_count; 8302 my $non_unicode_string; 8303 if ($count > $MAX_UNICODE_CODEPOINTS) { 8304 $unicode_count = $count - ($MAX_WORKING_CODEPOINT 8305 - $MAX_UNICODE_CODEPOINT); 8306 $non_unicode_string = "All above-Unicode code points match as well, and are also returned"; 8307 } 8308 else { 8309 $unicode_count = $count; 8310 $non_unicode_string = ""; 8311 } 8312 my $string_count = main::clarify_code_point_count($unicode_count); 8313 8314 my $loose_count = 0; # how many aliases loosely matched 8315 my $compound_name = ""; # ? Are any names compound?, and if so, an 8316 # example 8317 my $properties_with_compound_names = 0; # count of these 8318 8319 8320 my %flags; # The status flags used in the file 8321 my $total_entries = 0; # number of entries written in the comment 8322 my $matches_comment = ""; # The portion of the comment about the 8323 # \p{}'s 8324 my @global_comments; # List of all the tables' comments that are 8325 # there before this routine was called. 8326 my $has_ucd_alias = 0; # If there is an alias that is accessible via 8327 # Unicode::UCD. If not, then don't say it is 8328 # in the comment 8329 8330 # Get list of all the parent tables that are equivalent to this one 8331 # (including itself). 8332 my @parents = grep { $parent{main::objaddr $_} == $_ } 8333 main::uniques($leader, @{$equivalents{$addr}}); 8334 my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated 8335 # tables 8336 for my $parent (@parents) { 8337 8338 my $property = $parent->property; 8339 8340 # Special case 'N' tables in properties with two match tables when 8341 # the other is a 'Y' one. These are likely to be binary tables, 8342 # but not necessarily. In either case, \P{} will match the 8343 # complement of \p{}, and so if something is a synonym of \p, the 8344 # complement of that something will be the synonym of \P. This 8345 # would be true of any property with just two match tables, not 8346 # just those whose values are Y and N; but that would require a 8347 # little extra work, and there are none such so far in Unicode. 8348 my $perl_p = 'p'; # which is it? \p{} or \P{} 8349 my @yes_perl_synonyms; # list of any synonyms for the 'Y' table 8350 8351 if (scalar $property->tables == 2 8352 && $parent == $property->table('N') 8353 && defined (my $yes = $property->table('Y'))) 8354 { 8355 my $yes_addr = pack 'J', refaddr $yes; 8356 @yes_perl_synonyms 8357 = grep { $_->property == $perl } 8358 main::uniques($yes, 8359 $parent{$yes_addr}, 8360 $parent{$yes_addr}->children); 8361 8362 # But these synonyms are \P{} ,not \p{} 8363 $perl_p = 'P'; 8364 } 8365 8366 my @description; # Will hold the table description 8367 my @note; # Will hold the table notes. 8368 my @conflicting; # Will hold the table conflicts. 8369 8370 # Look at the parent, any yes synonyms, and all the children 8371 my $parent_addr = pack 'J', refaddr $parent; 8372 for my $table ($parent, 8373 @yes_perl_synonyms, 8374 @{$children{$parent_addr}}) 8375 { 8376 my $table_addr = pack 'J', refaddr $table; 8377 my $table_property = $table->property; 8378 8379 # Tables are separated by a blank line to create a grouping. 8380 $matches_comment .= "\n" if $matches_comment; 8381 8382 # The table is named based on the property and value 8383 # combination it is for, like script=greek. But there may be 8384 # a number of synonyms for each side, like 'sc' for 'script', 8385 # and 'grek' for 'greek'. Any combination of these is a valid 8386 # name for this table. In this case, there are three more, 8387 # 'sc=grek', 'sc=greek', and 'script='grek'. Rather than 8388 # listing all possible combinations in the comment, we make 8389 # sure that each synonym occurs at least once, and add 8390 # commentary that the other combinations are possible. 8391 # Because regular expressions don't recognize things like 8392 # \p{jsn=}, only look at non-null right-hand-sides 8393 my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table_property->aliases; 8394 my @table_aliases = grep { $_->name ne "" } $table->aliases; 8395 8396 # The alias lists above are already ordered in the order we 8397 # want to output them. To ensure that each synonym is listed, 8398 # we must use the max of the two numbers. But if there are no 8399 # legal synonyms (nothing in @table_aliases), then we don't 8400 # list anything. 8401 my $listed_combos = (@table_aliases) 8402 ? main::max(scalar @table_aliases, 8403 scalar @property_aliases) 8404 : 0; 8405 trace "$listed_combos, tables=", scalar @table_aliases, "; property names=", scalar @property_aliases if main::DEBUG; 8406 8407 my $property_had_compound_name = 0; 8408 8409 for my $i (0 .. $listed_combos - 1) { 8410 $total_entries++; 8411 8412 # The current alias for the property is the next one on 8413 # the list, or if beyond the end, start over. Similarly 8414 # for the table (\p{prop=table}) 8415 my $property_alias = $property_aliases 8416 [$i % @property_aliases]->name; 8417 my $table_alias_object = $table_aliases 8418 [$i % @table_aliases]; 8419 my $table_alias = $table_alias_object->name; 8420 my $loose_match = $table_alias_object->loose_match; 8421 $has_ucd_alias |= $table_alias_object->ucd; 8422 8423 if ($table_alias !~ /\D/) { # Clarify large numbers. 8424 $table_alias = main::clarify_number($table_alias) 8425 } 8426 8427 # Add a comment for this alias combination 8428 my $current_match_comment; 8429 if ($table_property == $perl) { 8430 $current_match_comment = "\\$perl_p" 8431 . "{$table_alias}"; 8432 } 8433 else { 8434 $current_match_comment 8435 = "\\p{$property_alias=$table_alias}"; 8436 $property_had_compound_name = 1; 8437 } 8438 8439 # Flag any abnormal status for this table. 8440 my $flag = $property->status 8441 || $table->status 8442 || $table_alias_object->status; 8443 if ($flag && $flag ne $PLACEHOLDER) { 8444 $flags{$flag} = $status_past_participles{$flag}; 8445 } 8446 8447 $loose_count++; 8448 8449 # Pretty up the comment. Note the \b; it says don't make 8450 # this line a continuation. 8451 $matches_comment .= sprintf("\b%-1s%-s%s\n", 8452 $flag, 8453 " " x 7, 8454 $current_match_comment); 8455 } # End of generating the entries for this table. 8456 8457 # Save these for output after this group of related tables. 8458 push @description, $table->description; 8459 push @note, $table->note; 8460 push @conflicting, $table->conflicting; 8461 8462 # And this for output after all the tables. 8463 push @global_comments, $table->comment; 8464 8465 # Compute an alternate compound name using the final property 8466 # synonym and the first table synonym with a colon instead of 8467 # the equal sign used elsewhere. 8468 if ($property_had_compound_name) { 8469 $properties_with_compound_names ++; 8470 if (! $compound_name || @property_aliases > 1) { 8471 $compound_name = $property_aliases[-1]->name 8472 . ': ' 8473 . $table_aliases[0]->name; 8474 } 8475 } 8476 } # End of looping through all children of this table 8477 8478 # Here have assembled in $matches_comment all the related tables 8479 # to the current parent (preceded by the same info for all the 8480 # previous parents). Put out information that applies to all of 8481 # the current family. 8482 if (@conflicting) { 8483 8484 # But output the conflicting information now, as it applies to 8485 # just this table. 8486 my $conflicting = join ", ", @conflicting; 8487 if ($conflicting) { 8488 $matches_comment .= <<END; 8489 8490 Note that contrary to what you might expect, the above is NOT the same as 8491END 8492 $matches_comment .= "any of: " if @conflicting > 1; 8493 $matches_comment .= "$conflicting\n"; 8494 } 8495 } 8496 if (@description) { 8497 $matches_comment .= "\n Meaning: " 8498 . join('; ', @description) 8499 . "\n"; 8500 } 8501 if (@note) { 8502 $matches_comment .= "\n Note: " 8503 . join("\n ", @note) 8504 . "\n"; 8505 } 8506 } # End of looping through all tables 8507 8508 $matches_comment .= "\n$non_unicode_string\n" if $non_unicode_string; 8509 8510 8511 my $code_points; 8512 my $match; 8513 my $any_of_these; 8514 if ($unicode_count == 1) { 8515 $match = 'matches'; 8516 $code_points = 'single code point'; 8517 } 8518 else { 8519 $match = 'match'; 8520 $code_points = "$string_count code points"; 8521 } 8522 8523 my $synonyms; 8524 my $entries; 8525 if ($total_entries == 1) { 8526 $synonyms = ""; 8527 $entries = 'entry'; 8528 $any_of_these = 'this' 8529 } 8530 else { 8531 $synonyms = " any of the following regular expression constructs"; 8532 $entries = 'entries'; 8533 $any_of_these = 'any of these' 8534 } 8535 8536 my $comment = ""; 8537 if ($has_ucd_alias) { 8538 $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n"; 8539 } 8540 if ($has_unrelated) { 8541 $comment .= <<END; 8542This file is for tables that are not necessarily related: To conserve 8543resources, every table that matches the identical set of code points in this 8544version of Unicode uses this file. Each one is listed in a separate group 8545below. It could be that the tables will match the same set of code points in 8546other Unicode releases, or it could be purely coincidence that they happen to 8547be the same in Unicode $unicode_version, and hence may not in other versions. 8548 8549END 8550 } 8551 8552 if (%flags) { 8553 foreach my $flag (sort keys %flags) { 8554 $comment .= <<END; 8555'$flag' below means that this form is $flags{$flag}. 8556END 8557 if ($flag eq $INTERNAL_ALIAS) { 8558 $comment .= "DO NOT USE!!!"; 8559 } 8560 else { 8561 $comment .= "Consult $pod_file.pod"; 8562 } 8563 $comment .= "\n"; 8564 } 8565 $comment .= "\n"; 8566 } 8567 8568 if ($total_entries == 0) { 8569 Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string. Creating file anyway."); 8570 $comment .= <<END; 8571This file returns the $code_points in Unicode Version 8572$unicode_version for 8573$leader, but it is inaccessible through Perl regular expressions, as 8574"\\p{prop=}" is not recognized. 8575END 8576 8577 } else { 8578 $comment .= <<END; 8579This file returns the $code_points in Unicode Version 8580$unicode_version that 8581$match$synonyms: 8582 8583$matches_comment 8584$pod_file.pod should be consulted for the syntax rules for $any_of_these, 8585including if adding or subtracting white space, underscore, and hyphen 8586characters matters or doesn't matter, and other permissible syntactic 8587variants. Upper/lower case distinctions never matter. 8588END 8589 8590 } 8591 if ($compound_name) { 8592 $comment .= <<END; 8593 8594A colon can be substituted for the equals sign, and 8595END 8596 if ($properties_with_compound_names > 1) { 8597 $comment .= <<END; 8598within each group above, 8599END 8600 } 8601 $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name); 8602 8603 # Note the \b below, it says don't make that line a continuation. 8604 $comment .= <<END; 8605anything to the left of the equals (or colon) can be combined with anything to 8606the right. Thus, for example, 8607$compound_name 8608\bis also valid. 8609END 8610 } 8611 8612 # And append any comment(s) from the actual tables. They are all 8613 # gathered here, so may not read all that well. 8614 if (@global_comments) { 8615 $comment .= "\n" . join("\n\n", @global_comments) . "\n"; 8616 } 8617 8618 if ($count) { # The format differs if no code points, and needs no 8619 # explanation in that case 8620 if ($leader->write_as_invlist) { 8621 $comment.= <<END; 8622 8623The first data line of this file begins with the letter V to indicate it is in 8624inversion list format. The number following the V gives the number of lines 8625remaining. Each of those remaining lines is a single number representing the 8626starting code point of a range which goes up to but not including the number 8627on the next line; The 0th, 2nd, 4th... ranges are for code points that match 8628the property; the 1st, 3rd, 5th... are ranges of code points that don't match 8629the property. The final line's range extends to the platform's infinity. 8630END 8631 } 8632 else { 8633 $comment.= <<END; 8634The format of the lines of this file is: 8635START\\tSTOP\\twhere START is the starting code point of the range, in hex; 8636STOP is the ending point, or if omitted, the range has just one code point. 8637END 8638 } 8639 if ($leader->output_range_counts) { 8640 $comment .= <<END; 8641Numbers in comments in [brackets] indicate how many code points are in the 8642range. 8643END 8644 } 8645 } 8646 8647 $leader->set_comment(main::join_lines($comment)); 8648 return; 8649 } 8650 8651 # Accessors for the underlying list 8652 for my $sub (qw( 8653 get_valid_code_point 8654 get_invalid_code_point 8655 )) 8656 { 8657 no strict "refs"; 8658 *$sub = sub { 8659 use strict "refs"; 8660 my $self = shift; 8661 8662 return $self->_range_list->$sub(@_); 8663 } 8664 } 8665} # End closure for Match_Table 8666 8667package Property; 8668 8669# The Property class represents a Unicode property, or the $perl 8670# pseudo-property. It contains a map table initialized empty at construction 8671# time, and for properties accessible through regular expressions, various 8672# match tables, created through the add_match_table() method, and referenced 8673# by the table('NAME') or tables() methods, the latter returning a list of all 8674# of the match tables. Otherwise table operations implicitly are for the map 8675# table. 8676# 8677# Most of the data in the property is actually about its map table, so it 8678# mostly just uses that table's accessors for most methods. The two could 8679# have been combined into one object, but for clarity because of their 8680# differing semantics, they have been kept separate. It could be argued that 8681# the 'file' and 'directory' fields should be kept with the map table. 8682# 8683# Each property has a type. This can be set in the constructor, or in the 8684# set_type accessor, but mostly it is figured out by the data. Every property 8685# starts with unknown type, overridden by a parameter to the constructor, or 8686# as match tables are added, or ranges added to the map table, the data is 8687# inspected, and the type changed. After the table is mostly or entirely 8688# filled, compute_type() should be called to finalize the analysis. 8689# 8690# There are very few operations defined. One can safely remove a range from 8691# the map table, and property_add_or_replace_non_nulls() adds the maps from another 8692# table to this one, replacing any in the intersection of the two. 8693 8694sub standardize { return main::standardize($_[0]); } 8695sub trace { return main::trace(@_) if main::DEBUG && $to_trace } 8696 8697{ # Closure 8698 8699 # This hash will contain as keys, all the aliases of all properties, and 8700 # as values, pointers to their respective property objects. This allows 8701 # quick look-up of a property from any of its names. 8702 my %alias_to_property_of; 8703 8704 sub dump_alias_to_property_of { 8705 # For debugging 8706 8707 print "\n", main::simple_dumper (\%alias_to_property_of), "\n"; 8708 return; 8709 } 8710 8711 sub property_ref($name) { 8712 # This is a package subroutine, not called as a method. 8713 # If the single parameter is a literal '*' it returns a list of all 8714 # defined properties. 8715 # Otherwise, the single parameter is a name, and it returns a pointer 8716 # to the corresponding property object, or undef if none. 8717 # 8718 # Properties can have several different names. The 'standard' form of 8719 # each of them is stored in %alias_to_property_of as they are defined. 8720 # But it's possible that this subroutine will be called with some 8721 # variant, so if the initial lookup fails, it is repeated with the 8722 # standardized form of the input name. If found, besides returning the 8723 # result, the input name is added to the list so future calls won't 8724 # have to do the conversion again. 8725 8726 if (! defined $name) { 8727 Carp::my_carp_bug("Undefined input property. No action taken."); 8728 return; 8729 } 8730 8731 return main::uniques(values %alias_to_property_of) if $name eq '*'; 8732 8733 # Return cached result if have it. 8734 my $result = $alias_to_property_of{$name}; 8735 return $result if defined $result; 8736 8737 # Convert the input to standard form. 8738 my $standard_name = standardize($name); 8739 8740 $result = $alias_to_property_of{$standard_name}; 8741 return unless defined $result; # Don't cache undefs 8742 8743 # Cache the result before returning it. 8744 $alias_to_property_of{$name} = $result; 8745 return $result; 8746 } 8747 8748 8749 main::setup_package(); 8750 8751 my %map; 8752 # A pointer to the map table object for this property 8753 main::set_access('map', \%map); 8754 8755 my %full_name; 8756 # The property's full name. This is a duplicate of the copy kept in the 8757 # map table, but is needed because stringify needs it during 8758 # construction of the map table, and then would have a chicken before egg 8759 # problem. 8760 main::set_access('full_name', \%full_name, 'r'); 8761 8762 my %table_ref; 8763 # This hash will contain as keys, all the aliases of any match tables 8764 # attached to this property, and as values, the pointers to their 8765 # respective tables. This allows quick look-up of a table from any of its 8766 # names. 8767 main::set_access('table_ref', \%table_ref); 8768 8769 my %type; 8770 # The type of the property, $ENUM, $BINARY, etc 8771 main::set_access('type', \%type, 'r'); 8772 8773 my %file; 8774 # The filename where the map table will go (if actually written). 8775 # Normally defaulted, but can be overridden. 8776 main::set_access('file', \%file, 'r', 's'); 8777 8778 my %directory; 8779 # The directory where the map table will go (if actually written). 8780 # Normally defaulted, but can be overridden. 8781 main::set_access('directory', \%directory, 's'); 8782 8783 my %pseudo_map_type; 8784 # This is used to affect the calculation of the map types for all the 8785 # ranges in the table. It should be set to one of the values that signify 8786 # to alter the calculation. 8787 main::set_access('pseudo_map_type', \%pseudo_map_type, 'r'); 8788 8789 my %has_only_code_point_maps; 8790 # A boolean used to help in computing the type of data in the map table. 8791 main::set_access('has_only_code_point_maps', \%has_only_code_point_maps); 8792 8793 my %unique_maps; 8794 # A list of the first few distinct mappings this property has. This is 8795 # used to disambiguate between binary and enum property types, so don't 8796 # have to keep more than three. 8797 main::set_access('unique_maps', \%unique_maps); 8798 8799 my %pre_declared_maps; 8800 # A boolean that gives whether the input data should declare all the 8801 # tables used, or not. If the former, unknown ones raise a warning. 8802 main::set_access('pre_declared_maps', 8803 \%pre_declared_maps, 'r', 's'); 8804 8805 my %match_subdir; 8806 # For properties whose shortest names are too long for a DOS 8.3 8807 # filesystem to distinguish between, this is used to manually give short 8808 # names for the directory name immediately under $match_tables that the 8809 # match tables for this property should be placed in. 8810 main::set_access('match_subdir', \%match_subdir, 'r'); 8811 8812 my %has_dependency; 8813 # A boolean that gives whether some table somewhere is defined as the 8814 # complement of a table in this property. This is a crude, but currently 8815 # sufficient, mechanism to make this property not get destroyed before 8816 # what is dependent on it is. Other dependencies could be added, so the 8817 # name was chosen to reflect a more general situation than actually is 8818 # currently the case. 8819 main::set_access('has_dependency', \%has_dependency, 'r', 's'); 8820 8821 sub new { 8822 # The only required parameter is the positionally first, name. All 8823 # other parameters are key => value pairs. See the documentation just 8824 # above for the meanings of the ones not passed directly on to the map 8825 # table constructor. 8826 8827 my $class = shift; 8828 my $name = shift || ""; 8829 8830 my $self = property_ref($name); 8831 if (defined $self) { 8832 my $options_string = join ", ", @_; 8833 $options_string = ". Ignoring options $options_string" if $options_string; 8834 Carp::my_carp("$self is already in use. Using existing one$options_string;"); 8835 return $self; 8836 } 8837 8838 my %args = @_; 8839 8840 $self = bless \do { my $anonymous_scalar }, $class; 8841 my $addr = pack 'J', refaddr $self; 8842 8843 $directory{$addr} = delete $args{'Directory'}; 8844 $file{$addr} = delete $args{'File'}; 8845 $full_name{$addr} = delete $args{'Full_Name'} || $name; 8846 $type{$addr} = delete $args{'Type'} || $UNKNOWN; 8847 $pseudo_map_type{$addr} = delete $args{'Map_Type'}; 8848 $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'} 8849 # Starting in this release, property 8850 # values should be defined for all 8851 # properties, except those overriding this 8852 // $v_version ge v5.1.0; 8853 $match_subdir{$addr} = delete $args{'Match_SubDir'}; 8854 8855 # Rest of parameters passed on. 8856 8857 $has_only_code_point_maps{$addr} = 1; 8858 $table_ref{$addr} = { }; 8859 $unique_maps{$addr} = { }; 8860 $has_dependency{$addr} = 0; 8861 8862 $map{$addr} = Map_Table->new($name, 8863 Full_Name => $full_name{$addr}, 8864 _Alias_Hash => \%alias_to_property_of, 8865 _Property => $self, 8866 %args); 8867 return $self; 8868 } 8869 8870 # See this program's beginning comment block about overloading the copy 8871 # constructor. Few operations are defined on properties, but a couple are 8872 # useful. It is safe to take the inverse of a property, and to remove a 8873 # single code point from it. 8874 use overload 8875 fallback => 0, 8876 qw("") => "_operator_stringify", 8877 "." => \&main::_operator_dot, 8878 ".=" => \&main::_operator_dot_equal, 8879 '==' => \&main::_operator_equal, 8880 '!=' => \&main::_operator_not_equal, 8881 '=' => sub { return shift }, 8882 '-=' => "_minus_and_equal", 8883 ; 8884 8885 sub _operator_stringify($self, $other="", $reversed=0) { 8886 return "Property '" . $self->full_name . "'"; 8887 } 8888 8889 sub _minus_and_equal($self, $other, $reversed=0) { 8890 # Remove a single code point from the map table of a property. 8891 if (ref $other) { 8892 Carp::my_carp_bug("Bad news. Can't cope with a " 8893 . ref($other) 8894 . " argument to '-='. Subtraction ignored."); 8895 return $self; 8896 } 8897 elsif ($reversed) { # Shouldn't happen in a -=, but just in case 8898 Carp::my_carp_bug("Bad news. Can't cope with subtracting a " 8899 . ref $self 8900 . " from a non-object. undef returned."); 8901 return; 8902 } 8903 else { 8904 $map{pack 'J', refaddr $self}->delete_range($other, $other); 8905 } 8906 return $self; 8907 } 8908 8909 sub add_match_table { 8910 # Add a new match table for this property, with name given by the 8911 # parameter. It returns a pointer to the table. 8912 8913 my $self = shift; 8914 my $name = shift; 8915 my %args = @_; 8916 8917 my $addr = pack 'J', refaddr $self; 8918 8919 my $table = $table_ref{$addr}{$name}; 8920 my $standard_name = main::standardize($name); 8921 if (defined $table 8922 || (defined ($table = $table_ref{$addr}{$standard_name}))) 8923 { 8924 Carp::my_carp("Table '$name' in $self is already in use. Using existing one"); 8925 $table_ref{$addr}{$name} = $table; 8926 return $table; 8927 } 8928 else { 8929 8930 # See if this is a perl extension, if not passed in. 8931 my $perl_extension = delete $args{'Perl_Extension'}; 8932 $perl_extension 8933 = $self->perl_extension if ! defined $perl_extension; 8934 8935 my $fate; 8936 my $suppression_reason = ""; 8937 if ($self->name =~ /^_/) { 8938 $fate = $SUPPRESSED; 8939 $suppression_reason = "Parent property is internal only"; 8940 } 8941 elsif ($self->fate >= $SUPPRESSED) { 8942 $fate = $self->fate; 8943 $suppression_reason = $why_suppressed{$self->complete_name}; 8944 8945 } 8946 elsif ($name =~ /^_/) { 8947 $fate = $INTERNAL_ONLY; 8948 } 8949 $table = Match_Table->new( 8950 Name => $name, 8951 Perl_Extension => $perl_extension, 8952 _Alias_Hash => $table_ref{$addr}, 8953 _Property => $self, 8954 Fate => $fate, 8955 Suppression_Reason => $suppression_reason, 8956 Status => $self->status, 8957 _Status_Info => $self->status_info, 8958 %args); 8959 return unless defined $table; 8960 } 8961 8962 # Save the names for quick look up 8963 $table_ref{$addr}{$standard_name} = $table; 8964 $table_ref{$addr}{$name} = $table; 8965 8966 # Perhaps we can figure out the type of this property based on the 8967 # fact of adding this match table. First, string properties don't 8968 # have match tables; second, a binary property can't have 3 match 8969 # tables 8970 if ($type{$addr} == $UNKNOWN) { 8971 $type{$addr} = $NON_STRING; 8972 } 8973 elsif ($type{$addr} == $STRING) { 8974 Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News."); 8975 $type{$addr} = $NON_STRING; 8976 } 8977 elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) { 8978 if (scalar main::uniques(values %{$table_ref{$addr}}) > 2) { 8979 if ($type{$addr} == $BINARY) { 8980 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News."); 8981 } 8982 $type{$addr} = $ENUM; 8983 } 8984 } 8985 8986 return $table; 8987 } 8988 8989 sub delete_match_table($self, $table_to_remove) { 8990 # Delete the table referred to by $2 from the property $1. 8991 my $addr = pack 'J', refaddr $self; 8992 8993 # Remove all names that refer to it. 8994 foreach my $key (keys %{$table_ref{$addr}}) { 8995 delete $table_ref{$addr}{$key} 8996 if $table_ref{$addr}{$key} == $table_to_remove; 8997 } 8998 8999 $table_to_remove->DESTROY; 9000 return; 9001 } 9002 9003 sub table($self, $name) { 9004 # Return a pointer to the match table (with name given by the 9005 # parameter) associated with this property; undef if none. 9006 my $addr = pack 'J', refaddr $self; 9007 9008 return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name}; 9009 9010 # If quick look-up failed, try again using the standard form of the 9011 # input name. If that succeeds, cache the result before returning so 9012 # won't have to standardize this input name again. 9013 my $standard_name = main::standardize($name); 9014 return unless defined $table_ref{$addr}{$standard_name}; 9015 9016 $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name}; 9017 return $table_ref{$addr}{$name}; 9018 } 9019 9020 sub tables { 9021 # Return a list of pointers to all the match tables attached to this 9022 # property 9023 9024 return main::uniques(values %{$table_ref{pack 'J', refaddr shift}}); 9025 } 9026 9027 sub directory { 9028 # Returns the directory the map table for this property should be 9029 # output in. If a specific directory has been specified, that has 9030 # priority; 'undef' is returned if the type isn't defined; 9031 # or $map_directory for everything else. 9032 9033 my $addr = pack 'J', refaddr shift; 9034 9035 return $directory{$addr} if defined $directory{$addr}; 9036 return undef if $type{$addr} == $UNKNOWN; 9037 return $map_directory; 9038 } 9039 9040 sub swash_name($self) { 9041 # Return the name that is used to both: 9042 # 1) Name the file that the map table is written to. 9043 # 2) The name of swash related stuff inside that file. 9044 # The reason for this is that the Perl core historically has used 9045 # certain names that aren't the same as the Unicode property names. 9046 # To continue using these, $file is hard-coded in this file for those, 9047 # but otherwise the standard name is used. This is different from the 9048 # external_name, so that the rest of the files, like in lib can use 9049 # the standard name always, without regard to historical precedent. 9050 my $addr = pack 'J', refaddr $self; 9051 9052 # Swash names are used only on either 9053 # 1) regular or internal-only map tables 9054 # 2) otherwise there should be no access to the 9055 # property map table from other parts of Perl. 9056 return if $map{$addr}->fate != $ORDINARY 9057 && ! ($map{$addr}->name =~ /^_/ 9058 && $map{$addr}->fate == $INTERNAL_ONLY); 9059 9060 return $file{$addr} if defined $file{$addr}; 9061 return $map{$addr}->external_name; 9062 } 9063 9064 sub to_create_match_tables($self) { 9065 # Returns a boolean as to whether or not match tables should be 9066 # created for this property. 9067 9068 # The whole point of this pseudo property is match tables. 9069 return 1 if $self == $perl; 9070 9071 my $addr = pack 'J', refaddr $self; 9072 9073 # Don't generate tables of code points that match the property values 9074 # of a string property. Such a list would most likely have many 9075 # property values, each with just one or very few code points mapping 9076 # to it. 9077 return 0 if $type{$addr} == $STRING; 9078 9079 # Otherwise, do. 9080 return 1; 9081 } 9082 9083 sub property_add_or_replace_non_nulls($self, $other) { 9084 # This adds the mappings in the property $other to $self. Non-null 9085 # mappings from $other override those in $self. It essentially merges 9086 # the two properties, with the second having priority except for null 9087 # mappings. 9088 9089 if (! $other->isa(__PACKAGE__)) { 9090 Carp::my_carp_bug("$other should be a " 9091 . __PACKAGE__ 9092 . ". Not a '" 9093 . ref($other) 9094 . "'. Not added;"); 9095 return; 9096 } 9097 9098 return $map{pack 'J', refaddr $self}->map_add_or_replace_non_nulls($map{pack 'J', refaddr $other}); 9099 } 9100 9101 sub set_proxy_for { 9102 # Certain tables are not generally written out to files, but 9103 # Unicode::UCD has the intelligence to know that the file for $self 9104 # can be used to reconstruct those tables. This routine just changes 9105 # things so that UCD pod entries for those suppressed tables are 9106 # generated, so the fact that a proxy is used is invisible to the 9107 # user. 9108 9109 my $self = shift; 9110 9111 foreach my $property_name (@_) { 9112 my $ref = property_ref($property_name); 9113 next if $ref->to_output_map; 9114 $ref->set_fate($MAP_PROXIED); 9115 } 9116 } 9117 9118 sub set_type($self, $type) { 9119 # Set the type of the property. Mostly this is figured out by the 9120 # data in the table. But this is used to set it explicitly. The 9121 # reason it is not a standard accessor is that when setting a binary 9122 # property, we need to make sure that all the true/false aliases are 9123 # present, as they were omitted in early Unicode releases. 9124 9125 if ($type != $ENUM 9126 && $type != $BINARY 9127 && $type != $FORCED_BINARY 9128 && $type != $STRING) 9129 { 9130 Carp::my_carp("Unrecognized type '$type'. Type not set"); 9131 return; 9132 } 9133 9134 $type{pack 'J', refaddr $self} = $type; 9135 return if $type != $BINARY && $type != $FORCED_BINARY; 9136 9137 my $yes = $self->table('Y'); 9138 $yes = $self->table('Yes') if ! defined $yes; 9139 $yes = $self->add_match_table('Y', Full_Name => 'Yes') 9140 if ! defined $yes; 9141 9142 # Add aliases in order wanted, duplicates will be ignored. We use a 9143 # binary property present in all releases for its ordered lists of 9144 # true/false aliases. Note, that could run into problems in 9145 # outputting things in that we don't distinguish between the name and 9146 # full name of these. Hopefully, if the table was already created 9147 # before this code is executed, it was done with these set properly. 9148 my $bm = property_ref("Bidi_Mirrored"); 9149 foreach my $alias ($bm->table("Y")->aliases) { 9150 $yes->add_alias($alias->name); 9151 } 9152 my $no = $self->table('N'); 9153 $no = $self->table('No') if ! defined $no; 9154 $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no; 9155 foreach my $alias ($bm->table("N")->aliases) { 9156 $no->add_alias($alias->name); 9157 } 9158 9159 return; 9160 } 9161 9162 sub add_map { 9163 # Add a map to the property's map table. This also keeps 9164 # track of the maps so that the property type can be determined from 9165 # its data. 9166 9167 my $self = shift; 9168 my $start = shift; # First code point in range 9169 my $end = shift; # Final code point in range 9170 my $map = shift; # What the range maps to. 9171 # Rest of parameters passed on. 9172 9173 my $addr = pack 'J', refaddr $self; 9174 9175 # If haven't the type of the property, gather information to figure it 9176 # out. 9177 if ($type{$addr} == $UNKNOWN) { 9178 9179 # If the map contains an interior blank or dash, or most other 9180 # nonword characters, it will be a string property. This 9181 # heuristic may actually miss some string properties. If so, they 9182 # may need to have explicit set_types called for them. This 9183 # happens in the Unihan properties. 9184 if ($map =~ / (?<= . ) [ -] (?= . ) /x 9185 || $map =~ / [^\w.\/\ -] /x) 9186 { 9187 $self->set_type($STRING); 9188 9189 # $unique_maps is used for disambiguating between ENUM and 9190 # BINARY later; since we know the property is not going to be 9191 # one of those, no point in keeping the data around 9192 undef $unique_maps{$addr}; 9193 } 9194 else { 9195 9196 # Not necessarily a string. The final decision has to be 9197 # deferred until all the data are in. We keep track of if all 9198 # the values are code points for that eventual decision. 9199 $has_only_code_point_maps{$addr} &= 9200 $map =~ / ^ $code_point_re $/x; 9201 9202 # For the purposes of disambiguating between binary and other 9203 # enumerations at the end, we keep track of the first three 9204 # distinct property values. Once we get to three, we know 9205 # it's not going to be binary, so no need to track more. 9206 if (scalar keys %{$unique_maps{$addr}} < 3) { 9207 $unique_maps{$addr}{main::standardize($map)} = 1; 9208 } 9209 } 9210 } 9211 9212 # Add the mapping by calling our map table's method 9213 return $map{$addr}->add_map($start, $end, $map, @_); 9214 } 9215 9216 sub compute_type($self) { 9217 # Compute the type of the property: $ENUM, $STRING, or $BINARY. This 9218 # should be called after the property is mostly filled with its maps. 9219 # We have been keeping track of what the property values have been, 9220 # and now have the necessary information to figure out the type. 9221 9222 my $addr = pack 'J', refaddr $self; 9223 9224 my $type = $type{$addr}; 9225 9226 # If already have figured these out, no need to do so again, but we do 9227 # a double check on ENUMS to make sure that a string property hasn't 9228 # improperly been classified as an ENUM, so continue on with those. 9229 return if $type == $STRING 9230 || $type == $BINARY 9231 || $type == $FORCED_BINARY; 9232 9233 # If every map is to a code point, is a string property. 9234 if ($type == $UNKNOWN 9235 && ($has_only_code_point_maps{$addr} 9236 || (defined $map{$addr}->default_map 9237 && $map{$addr}->default_map eq ""))) 9238 { 9239 $self->set_type($STRING); 9240 } 9241 else { 9242 9243 # Otherwise, it is to some sort of enumeration. (The case where 9244 # it is a Unicode miscellaneous property, and treated like a 9245 # string in this program is handled in add_map()). Distinguish 9246 # between binary and some other enumeration type. Of course, if 9247 # there are more than two values, it's not binary. But more 9248 # subtle is the test that the default mapping is defined means it 9249 # isn't binary. This in fact may change in the future if Unicode 9250 # changes the way its data is structured. But so far, no binary 9251 # properties ever have @missing lines for them, so the default map 9252 # isn't defined for them. The few properties that are two-valued 9253 # and aren't considered binary have the default map defined 9254 # starting in Unicode 5.0, when the @missing lines appeared; and 9255 # this program has special code to put in a default map for them 9256 # for earlier than 5.0 releases. 9257 if ($type == $ENUM 9258 || scalar keys %{$unique_maps{$addr}} > 2 9259 || defined $self->default_map) 9260 { 9261 my $tables = $self->tables; 9262 my $count = $self->count; 9263 if ($verbosity && $tables > 500 && $tables/$count > .1) { 9264 Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $tables\n"); 9265 } 9266 $self->set_type($ENUM); 9267 } 9268 else { 9269 $self->set_type($BINARY); 9270 } 9271 } 9272 undef $unique_maps{$addr}; # Garbage collect 9273 return; 9274 } 9275 9276 # $reaons - Ignored unless suppressing 9277 sub set_fate($self, $fate, $reason=undef) { 9278 my $addr = pack 'J', refaddr $self; 9279 if ($fate >= $SUPPRESSED) { 9280 $why_suppressed{$self->complete_name} = $reason; 9281 } 9282 9283 # Each table shares the property's fate, except that MAP_PROXIED 9284 # doesn't affect match tables 9285 $map{$addr}->set_fate($fate, $reason); 9286 if ($fate != $MAP_PROXIED) { 9287 foreach my $table ($map{$addr}, $self->tables) { 9288 $table->set_fate($fate, $reason); 9289 } 9290 } 9291 return; 9292 } 9293 9294 9295 # Most of the accessors for a property actually apply to its map table. 9296 # Setup up accessor functions for those, referring to %map 9297 for my $sub (qw( 9298 add_alias 9299 add_anomalous_entry 9300 add_comment 9301 add_conflicting 9302 add_description 9303 add_duplicate 9304 add_note 9305 aliases 9306 comment 9307 complete_name 9308 containing_range 9309 count 9310 default_map 9311 definition 9312 delete_range 9313 description 9314 each_range 9315 external_name 9316 fate 9317 file_path 9318 format 9319 initialize 9320 inverse_list 9321 is_empty 9322 name 9323 note 9324 perl_extension 9325 property 9326 range_count 9327 ranges 9328 range_size_1 9329 replace_map 9330 reset_each_range 9331 set_comment 9332 set_default_map 9333 set_file_path 9334 set_final_comment 9335 _set_format 9336 set_range_size_1 9337 set_status 9338 set_to_output_map 9339 short_name 9340 status 9341 status_info 9342 to_output_map 9343 type_of 9344 value_of 9345 write 9346 )) 9347 # 'property' above is for symmetry, so that one can take 9348 # the property of a property and get itself, and so don't 9349 # have to distinguish between properties and tables in 9350 # calling code 9351 { 9352 no strict "refs"; 9353 *$sub = sub { 9354 use strict "refs"; 9355 my $self = shift; 9356 return $map{pack 'J', refaddr $self}->$sub(@_); 9357 } 9358 } 9359 9360 9361} # End closure 9362 9363package main; 9364 9365sub display_chr { 9366 # Converts an ordinal printable character value to a displayable string, 9367 # using a dotted circle to hold combining characters. 9368 9369 my $ord = shift; 9370 my $chr = chr $ord; 9371 return $chr if $ccc->table(0)->contains($ord); 9372 return "\x{25CC}$chr"; 9373} 9374 9375sub join_lines($input) { 9376 # Returns lines of the input joined together, so that they can be folded 9377 # properly. 9378 # This causes continuation lines to be joined together into one long line 9379 # for folding. A continuation line is any line that doesn't begin with a 9380 # space or "\b" (the latter is stripped from the output). This is so 9381 # lines can be in a HERE document so as to fit nicely in the terminal 9382 # width, but be joined together in one long line, and then folded with 9383 # indents, '#' prefixes, etc, properly handled. 9384 # A blank separates the joined lines except if there is a break; an extra 9385 # blank is inserted after a period ending a line. 9386 9387 # Initialize the return with the first line. 9388 my ($return, @lines) = split "\n", $input; 9389 9390 # If the first line is null, it was an empty line, add the \n back in 9391 $return = "\n" if $return eq ""; 9392 9393 # Now join the remainder of the physical lines. 9394 for my $line (@lines) { 9395 9396 # An empty line means wanted a blank line, so add two \n's to get that 9397 # effect, and go to the next line. 9398 if (length $line == 0) { 9399 $return .= "\n\n"; 9400 next; 9401 } 9402 9403 # Look at the last character of what we have so far. 9404 my $previous_char = substr($return, -1, 1); 9405 9406 # And at the next char to be output. 9407 my $next_char = substr($line, 0, 1); 9408 9409 if ($previous_char ne "\n") { 9410 9411 # Here didn't end wth a nl. If the next char a blank or \b, it 9412 # means that here there is a break anyway. So add a nl to the 9413 # output. 9414 if ($next_char eq " " || $next_char eq "\b") { 9415 $previous_char = "\n"; 9416 $return .= $previous_char; 9417 } 9418 9419 # Add an extra space after periods. 9420 $return .= " " if $previous_char eq '.'; 9421 } 9422 9423 # Here $previous_char is still the latest character to be output. If 9424 # it isn't a nl, it means that the next line is to be a continuation 9425 # line, with a blank inserted between them. 9426 $return .= " " if $previous_char ne "\n"; 9427 9428 # Get rid of any \b 9429 substr($line, 0, 1) = "" if $next_char eq "\b"; 9430 9431 # And append this next line. 9432 $return .= $line; 9433 } 9434 9435 return $return; 9436} 9437 9438sub simple_fold( $line, $prefix="", $hanging_indent=0, $right_margin=0) { 9439 # Returns a string of the input (string or an array of strings) folded 9440 # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus 9441 # a \n 9442 # This is tailored for the kind of text written by this program, 9443 # especially the pod file, which can have very long names with 9444 # underscores in the middle, or words like AbcDefgHij.... We allow 9445 # breaking in the middle of such constructs if the line won't fit 9446 # otherwise. The break in such cases will come either just after an 9447 # underscore, or just before one of the Capital letters. 9448 9449 local $to_trace = 0 if main::DEBUG; 9450 9451 # $prefix Optional string to prepend to each output line 9452 # $hanging_indent Optional number of spaces to indent 9453 # continuation lines 9454 # $right_margin Optional number of spaces to narrow the 9455 # total width by. 9456 9457 # The space available doesn't include what's automatically prepended 9458 # to each line, or what's reserved on the right. 9459 my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin; 9460 # XXX Instead of using the 'nofold' perhaps better to look up the stack 9461 9462 if (DEBUG && $hanging_indent >= $max) { 9463 Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold'); 9464 $hanging_indent = 0; 9465 } 9466 9467 # First, split into the current physical lines. 9468 my @line; 9469 if (ref $line) { # Better be an array, because not bothering to 9470 # test 9471 foreach my $line (@{$line}) { 9472 push @line, split /\n/, $line; 9473 } 9474 } 9475 else { 9476 @line = split /\n/, $line; 9477 } 9478 9479 #local $to_trace = 1 if main::DEBUG; 9480 trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace; 9481 9482 # Look at each current physical line. 9483 for (my $i = 0; $i < @line; $i++) { 9484 Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/; 9485 #local $to_trace = 1 if main::DEBUG; 9486 trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace; 9487 9488 # Remove prefix, because will be added back anyway, don't want 9489 # doubled prefix 9490 $line[$i] =~ s/^$prefix//; 9491 9492 # Remove trailing space 9493 $line[$i] =~ s/\s+\Z//; 9494 9495 # If the line is too long, fold it. 9496 if (length $line[$i] > $max) { 9497 my $remainder; 9498 9499 # Here needs to fold. Save the leading space in the line for 9500 # later. 9501 $line[$i] =~ /^ ( \s* )/x; 9502 my $leading_space = $1; 9503 trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace; 9504 9505 # If character at final permissible position is white space, 9506 # fold there, which will delete that white space 9507 if (substr($line[$i], $max - 1, 1) =~ /\s/) { 9508 $remainder = substr($line[$i], $max); 9509 $line[$i] = substr($line[$i], 0, $max - 1); 9510 } 9511 else { 9512 9513 # Otherwise fold at an acceptable break char closest to 9514 # the max length. Look at just the maximal initial 9515 # segment of the line 9516 my $segment = substr($line[$i], 0, $max - 1); 9517 if ($segment =~ 9518 /^ ( .{$hanging_indent} # Don't look before the 9519 # indent. 9520 \ * # Don't look in leading 9521 # blanks past the indent 9522 [^ ] .* # Find the right-most 9523 (?: # acceptable break: 9524 [ \s = ] # space or equal 9525 | - (?! [.0-9] ) # or non-unary minus. 9526 | [^\\[(] (?= \\ )# break before single backslash 9527 # not immediately after opening 9528 # punctuation 9529 ) # $1 includes the character 9530 )/x) 9531 { 9532 # Split into the initial part that fits, and remaining 9533 # part of the input 9534 $remainder = substr($line[$i], length $1); 9535 $line[$i] = $1; 9536 trace $line[$i] if DEBUG && $to_trace; 9537 trace $remainder if DEBUG && $to_trace; 9538 } 9539 9540 # If didn't find a good breaking spot, see if there is a 9541 # not-so-good breaking spot. These are just after 9542 # underscores or where the case changes from lower to 9543 # upper. Use \a as a soft hyphen, but give up 9544 # and don't break the line if there is actually a \a 9545 # already in the input. We use an ascii character for the 9546 # soft-hyphen to avoid any attempt by miniperl to try to 9547 # access the files that this program is creating. 9548 elsif ($segment !~ /\a/ 9549 && ($segment =~ s/_/_\a/g 9550 || $segment =~ s/ ( (?!\\) [a-z] ) (?= [A-Z] )/$1\a/xg)) 9551 { 9552 # Here were able to find at least one place to insert 9553 # our substitute soft hyphen. Find the right-most one 9554 # and replace it by a real hyphen. 9555 trace $segment if DEBUG && $to_trace; 9556 substr($segment, 9557 rindex($segment, "\a"), 9558 1) = '-'; 9559 9560 # Then remove the soft hyphen substitutes. 9561 $segment =~ s/\a//g; 9562 trace $segment if DEBUG && $to_trace; 9563 9564 # And split into the initial part that fits, and 9565 # remainder of the line 9566 my $pos = rindex($segment, '-'); 9567 $remainder = substr($line[$i], $pos); 9568 trace $remainder if DEBUG && $to_trace; 9569 $line[$i] = substr($segment, 0, $pos + 1); 9570 } 9571 } 9572 9573 # Here we know if we can fold or not. If we can, $remainder 9574 # is what remains to be processed in the next iteration. 9575 if (defined $remainder) { 9576 trace "folded='$line[$i]'" if main::DEBUG && $to_trace; 9577 9578 # Insert the folded remainder of the line as a new element 9579 # of the array. (It may still be too long, but we will 9580 # deal with that next time through the loop.) Omit any 9581 # leading space in the remainder. 9582 $remainder =~ s/^\s+//; 9583 trace "remainder='$remainder'" if main::DEBUG && $to_trace; 9584 9585 # But then indent by whichever is larger of: 9586 # 1) the leading space on the input line; 9587 # 2) the hanging indent. 9588 # This preserves indentation in the original line. 9589 my $lead = ($leading_space) 9590 ? length $leading_space 9591 : $hanging_indent; 9592 $lead = max($lead, $hanging_indent); 9593 splice @line, $i+1, 0, (" " x $lead) . $remainder; 9594 } 9595 } 9596 9597 # Ready to output the line. Get rid of any trailing space 9598 # And prefix by the required $prefix passed in. 9599 $line[$i] =~ s/\s+$//; 9600 $line[$i] = "$prefix$line[$i]\n"; 9601 } # End of looping through all the lines. 9602 9603 return join "", @line; 9604} 9605 9606sub property_ref { # Returns a reference to a property object. 9607 return Property::property_ref(@_); 9608} 9609 9610sub force_unlink ($filename) { 9611 return unless file_exists($filename); 9612 return if CORE::unlink($filename); 9613 9614 # We might need write permission 9615 chmod 0777, $filename; 9616 CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!"); 9617 return; 9618} 9619 9620sub write ($file, $use_utf8, @lines) { 9621 # Given a filename and references to arrays of lines, write the lines of 9622 # each array to the file 9623 # Filename can be given as an arrayref of directory names 9624 9625 # Get into a single string if an array, and get rid of, in Unix terms, any 9626 # leading '.' 9627 $file= File::Spec->join(@$file) if ref $file eq 'ARRAY'; 9628 $file = File::Spec->canonpath($file); 9629 9630 # If has directories, make sure that they all exist 9631 (undef, my $directories, undef) = File::Spec->splitpath($file); 9632 File::Path::mkpath($directories) if $directories && ! -d $directories; 9633 9634 push @files_actually_output, $file; 9635 9636 force_unlink ($file); 9637 9638 my $OUT; 9639 if (not open $OUT, ">", $file) { 9640 Carp::my_carp("can't open $file for output. Skipping this file: $!"); 9641 return; 9642 } 9643 9644 binmode $OUT, ":utf8" if $use_utf8; 9645 9646 foreach my $lines_ref (@lines) { 9647 unless (@$lines_ref) { 9648 Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;"); 9649 } 9650 9651 print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!"); 9652 } 9653 close $OUT or die Carp::my_carp("close '$file' failed: $!"); 9654 9655 print "$file written.\n" if $verbosity >= $VERBOSE; 9656 9657 return; 9658} 9659 9660 9661sub Standardize($name=undef) { 9662 # This converts the input name string into a standardized equivalent to 9663 # use internally. 9664 9665 unless (defined $name) { 9666 Carp::my_carp_bug("Standardize() called with undef. Returning undef."); 9667 return; 9668 } 9669 9670 # Remove any leading or trailing white space 9671 $name =~ s/^\s+//g; 9672 $name =~ s/\s+$//g; 9673 9674 # Convert interior white space and hyphens into underscores. 9675 $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg; 9676 9677 # Capitalize the letter following an underscore, and convert a sequence of 9678 # multiple underscores to a single one 9679 $name =~ s/ (?<= .) _+ (.) /_\u$1/xg; 9680 9681 # And capitalize the first letter, but not for the special cjk ones. 9682 $name = ucfirst($name) unless $name =~ /^k[A-Z]/; 9683 return $name; 9684} 9685 9686sub standardize ($str=undef) { 9687 # Returns a lower-cased standardized name, without underscores. This form 9688 # is chosen so that it can distinguish between any real versus superficial 9689 # Unicode name differences. It relies on the fact that Unicode doesn't 9690 # have interior underscores, white space, nor dashes in any 9691 # stricter-matched name. It should not be used on Unicode code point 9692 # names (the Name property), as they mostly, but not always follow these 9693 # rules. 9694 9695 my $name = Standardize($str); 9696 return if !defined $name; 9697 9698 $name =~ s/ (?<= .) _ (?= . ) //xg; 9699 return lc $name; 9700} 9701 9702sub UCD_name ($table, $alias) { 9703 # Returns the name that Unicode::UCD will use to find a table. XXX 9704 # perhaps this function should be placed somewhere, like UCD.pm so that 9705 # Unicode::UCD can use it directly without duplicating code that can get 9706 # out-of sync. 9707 9708 my $property = $table->property; 9709 $property = ($property == $perl) 9710 ? "" # 'perl' is never explicitly stated 9711 : standardize($property->name) . '='; 9712 if ($alias->loose_match) { 9713 return $property . standardize($alias->name); 9714 } 9715 else { 9716 return lc ($property . $alias->name); 9717 } 9718 9719 return; 9720} 9721 9722{ # Closure 9723 9724 my $indent_increment = " " x ( $debugging_build ? 2 : 0); 9725 %main::already_output = (); 9726 9727 $main::simple_dumper_nesting = 0; 9728 9729 sub simple_dumper( $item, $indent = "" ) { 9730 # Like Simple Data::Dumper. Good enough for our needs. We can't use 9731 # the real thing as we have to run under miniperl. 9732 9733 # It is designed so that on input it is at the beginning of a line, 9734 # and the final thing output in any call is a trailing ",\n". 9735 9736 $indent = "" if ! $debugging_build; 9737 9738 # nesting level is localized, so that as the call stack pops, it goes 9739 # back to the prior value. 9740 local $main::simple_dumper_nesting = $main::simple_dumper_nesting; 9741 local %main::already_output = %main::already_output; 9742 $main::simple_dumper_nesting++; 9743 #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n"; 9744 9745 # Determine the indent for recursive calls. 9746 my $next_indent = $indent . $indent_increment; 9747 9748 my $output; 9749 if (! ref $item) { 9750 9751 # Dump of scalar: just output it in quotes if not a number. To do 9752 # so we must escape certain characters, and therefore need to 9753 # operate on a copy to avoid changing the original 9754 my $copy = $item; 9755 $copy = $UNDEF unless defined $copy; 9756 9757 # Quote non-integers (integers also have optional leading '-') 9758 if ($copy eq "" || $copy !~ /^ -? \d+ $/x) { 9759 9760 # Escape apostrophe and backslash 9761 $copy =~ s/ ( ['\\] ) /\\$1/xg; 9762 $copy = "'$copy'"; 9763 } 9764 $output = "$indent$copy,\n"; 9765 } 9766 else { 9767 9768 # Keep track of cycles in the input, and refuse to infinitely loop 9769 my $addr = pack 'J', refaddr $item; 9770 if (defined $main::already_output{$addr}) { 9771 return "${indent}ALREADY OUTPUT: $item\n"; 9772 } 9773 $main::already_output{$addr} = $item; 9774 9775 if (ref $item eq 'ARRAY') { 9776 my $using_brackets; 9777 $output = $indent; 9778 if ($main::simple_dumper_nesting > 1) { 9779 $output .= '['; 9780 $using_brackets = 1; 9781 } 9782 else { 9783 $using_brackets = 0; 9784 } 9785 9786 # If the array is empty, put the closing bracket on the same 9787 # line. Otherwise, recursively add each array element 9788 if (@$item == 0) { 9789 $output .= " "; 9790 } 9791 else { 9792 $output .= "\n"; 9793 for (my $i = 0; $i < @$item; $i++) { 9794 9795 # Indent array elements one level 9796 $output .= &simple_dumper($item->[$i], $next_indent); 9797 next if ! $debugging_build; 9798 $output =~ s/\n$//; # Remove any trailing nl so 9799 $output .= " # [$i]\n"; # as to add a comment giving 9800 # the array index 9801 } 9802 $output .= $indent; # Indent closing ']' to orig level 9803 } 9804 $output .= ']' if $using_brackets; 9805 $output .= ",\n"; 9806 } 9807 elsif (ref $item eq 'HASH') { 9808 my $is_first_line; 9809 my $using_braces; 9810 my $body_indent; 9811 9812 # No surrounding braces at top level 9813 $output .= $indent; 9814 if ($main::simple_dumper_nesting > 1) { 9815 $output .= "{\n"; 9816 $is_first_line = 0; 9817 $body_indent = $next_indent; 9818 $next_indent .= $indent_increment; 9819 $using_braces = 1; 9820 } 9821 else { 9822 $is_first_line = 1; 9823 $body_indent = $indent; 9824 $using_braces = 0; 9825 } 9826 9827 # Output hashes sorted alphabetically instead of apparently 9828 # random. Use caseless alphabetic sort 9829 foreach my $key (sort { lc $a cmp lc $b } keys %$item) 9830 { 9831 if ($is_first_line) { 9832 $is_first_line = 0; 9833 } 9834 else { 9835 $output .= "$body_indent"; 9836 } 9837 9838 # The key must be a scalar, but this recursive call quotes 9839 # it 9840 $output .= &simple_dumper($key); 9841 9842 # And change the trailing comma and nl to the hash fat 9843 # comma for clarity, and so the value can be on the same 9844 # line 9845 $output =~ s/,\n$/ => /; 9846 9847 # Recursively call to get the value's dump. 9848 my $next = &simple_dumper($item->{$key}, $next_indent); 9849 9850 # If the value is all on one line, remove its indent, so 9851 # will follow the => immediately. If it takes more than 9852 # one line, start it on a new line. 9853 if ($next !~ /\n.*\n/) { 9854 $next =~ s/^ *//; 9855 } 9856 else { 9857 $output .= "\n"; 9858 } 9859 $output .= $next; 9860 } 9861 9862 $output .= "$indent},\n" if $using_braces; 9863 } 9864 elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') { 9865 $output = $indent . ref($item) . "\n"; 9866 # XXX see if blessed 9867 } 9868 elsif ($item->can('dump')) { 9869 9870 # By convention in this program, objects furnish a 'dump' 9871 # method. Since not doing any output at this level, just pass 9872 # on the input indent 9873 $output = $item->dump($indent); 9874 } 9875 else { 9876 Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping."); 9877 } 9878 } 9879 return $output; 9880 } 9881} 9882 9883sub dump_inside_out( $object, $fields_ref, @args ) { 9884 # Dump inside-out hashes in an object's state by converting them to a 9885 # regular hash and then calling simple_dumper on that. 9886 9887 my $addr = pack 'J', refaddr $object; 9888 9889 my %hash; 9890 foreach my $key (keys %$fields_ref) { 9891 $hash{$key} = $fields_ref->{$key}{$addr}; 9892 } 9893 9894 return simple_dumper(\%hash, @args); 9895} 9896 9897sub _operator_dot($self, $other="", $reversed=0) { 9898 # Overloaded '.' method that is common to all packages. It uses the 9899 # package's stringify method. 9900 9901 foreach my $which (\$self, \$other) { 9902 next unless ref $$which; 9903 if ($$which->can('_operator_stringify')) { 9904 $$which = $$which->_operator_stringify; 9905 } 9906 else { 9907 my $ref = ref $$which; 9908 my $addr = pack 'J', refaddr $$which; 9909 $$which = "$ref ($addr)"; 9910 } 9911 } 9912 return ($reversed) 9913 ? "$other$self" 9914 : "$self$other"; 9915} 9916 9917sub _operator_dot_equal($self, $other="", $reversed=0) { 9918 # Overloaded '.=' method that is common to all packages. 9919 9920 if ($reversed) { 9921 return $other .= "$self"; 9922 } 9923 else { 9924 return "$self" . "$other"; 9925 } 9926} 9927 9928sub _operator_equal($self, $other, @) { 9929 # Generic overloaded '==' routine. To be equal, they must be the exact 9930 # same object 9931 9932 return 0 unless defined $other; 9933 return 0 unless ref $other; 9934 no overloading; 9935 return $self == $other; 9936} 9937 9938sub _operator_not_equal($self, $other, @) { 9939 return ! _operator_equal($self, $other); 9940} 9941 9942sub substitute_PropertyAliases($file_object) { 9943 # Deal with early releases that don't have the crucial PropertyAliases.txt 9944 # file. 9945 9946 $file_object->insert_lines(get_old_property_aliases()); 9947 9948 process_PropertyAliases($file_object); 9949} 9950 9951 9952sub process_PropertyAliases($file) { 9953 # This reads in the PropertyAliases.txt file, which contains almost all 9954 # the character properties in Unicode and their equivalent aliases: 9955 # scf ; Simple_Case_Folding ; sfc 9956 # 9957 # Field 0 is the preferred short name for the property. 9958 # Field 1 is the full name. 9959 # Any succeeding ones are other accepted names. 9960 9961 # Add any cjk properties that may have been defined. 9962 $file->insert_lines(@cjk_properties); 9963 9964 while ($file->next_line) { 9965 9966 my @data = split /\s*;\s*/; 9967 9968 my $full = $data[1]; 9969 9970 # This line is defective in early Perls. The property in Unihan.txt 9971 # is kRSUnicode. 9972 if ($full eq 'Unicode_Radical_Stroke' && @data < 3) { 9973 push @data, qw(cjkRSUnicode kRSUnicode); 9974 } 9975 9976 my $this = Property->new($data[0], Full_Name => $full); 9977 9978 $this->set_fate($SUPPRESSED, $why_suppressed{$full}) 9979 if $why_suppressed{$full}; 9980 9981 # Start looking for more aliases after these two. 9982 for my $i (2 .. @data - 1) { 9983 $this->add_alias($data[$i]); 9984 } 9985 9986 } 9987 9988 my $scf = property_ref("Simple_Case_Folding"); 9989 $scf->add_alias("scf"); 9990 $scf->add_alias("sfc"); 9991 9992 return; 9993} 9994 9995sub finish_property_setup($file) { 9996 # Finishes setting up after PropertyAliases. 9997 9998 # This entry was missing from this file in earlier Unicode versions 9999 if (-e 'Jamo.txt' && ! defined property_ref('JSN')) { 10000 Property->new('JSN', Full_Name => 'Jamo_Short_Name'); 10001 } 10002 10003 # These are used so much, that we set globals for them. 10004 $gc = property_ref('General_Category'); 10005 $block = property_ref('Block'); 10006 $script = property_ref('Script'); 10007 $age = property_ref('Age'); 10008 10009 # Perl adds this alias. 10010 $gc->add_alias('Category'); 10011 10012 # Unicode::Normalize expects this file with this name and directory. 10013 $ccc = property_ref('Canonical_Combining_Class'); 10014 if (defined $ccc) { 10015 $ccc->set_file('CombiningClass'); 10016 $ccc->set_directory(File::Spec->curdir()); 10017 } 10018 10019 # These two properties aren't actually used in the core, but unfortunately 10020 # the names just above that are in the core interfere with these, so 10021 # choose different names. These aren't a problem unless the map tables 10022 # for these files get written out. 10023 my $lowercase = property_ref('Lowercase'); 10024 $lowercase->set_file('IsLower') if defined $lowercase; 10025 my $uppercase = property_ref('Uppercase'); 10026 $uppercase->set_file('IsUpper') if defined $uppercase; 10027 10028 # Set up the hard-coded default mappings, but only on properties defined 10029 # for this release 10030 foreach my $property (keys %default_mapping) { 10031 my $property_object = property_ref($property); 10032 next if ! defined $property_object; 10033 my $default_map = $default_mapping{$property}; 10034 $property_object->set_default_map($default_map); 10035 10036 # A map of <code point> implies the property is string. 10037 if ($property_object->type == $UNKNOWN 10038 && $default_map eq $CODE_POINT) 10039 { 10040 $property_object->set_type($STRING); 10041 } 10042 } 10043 10044 # For backwards compatibility with applications that may read the mapping 10045 # file directly (it was documented in 5.12 and 5.14 as being thusly 10046 # usable), keep it from being adjusted. (range_size_1 is 10047 # used to force the traditional format.) 10048 if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) { 10049 $nfkc_cf->set_to_output_map($EXTERNAL_MAP); 10050 $nfkc_cf->set_range_size_1(1); 10051 } 10052 if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) { 10053 $bmg->set_to_output_map($EXTERNAL_MAP); 10054 $bmg->set_range_size_1(1); 10055 } 10056 10057 property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED); 10058 10059 # The rest of this sub is for properties that need the Multi_Default class 10060 # to create objects for defaults. As of v15.0, this is no longer needed. 10061 10062 return if $v_version ge v15.0.0; 10063 10064 # Bidi class has a complicated default, but the derived file takes care of 10065 # the complications, leaving just 'L'. 10066 if (file_exists("${EXTRACTED}DBidiClass.txt")) { 10067 property_ref('Bidi_Class')->set_default_map('L'); 10068 } 10069 else { 10070 my $default; 10071 10072 # The derived file was introduced in 3.1.1. The values below are 10073 # taken from table 3-8, TUS 3.0 10074 my $default_R = 10075 'my $default = Range_List->new; 10076 $default->add_range(0x0590, 0x05FF); 10077 $default->add_range(0xFB1D, 0xFB4F);' 10078 ; 10079 10080 # The defaults apply only to unassigned characters 10081 $default_R .= '$gc->table("Unassigned") & $default;'; 10082 10083 if ($v_version lt v3.0.0) { 10084 $default = Multi_Default->new(R => $default_R, 'L'); 10085 } 10086 else { 10087 10088 # AL apparently not introduced until 3.0: TUS 2.x references are 10089 # not on-line to check it out 10090 my $default_AL = 10091 'my $default = Range_List->new; 10092 $default->add_range(0x0600, 0x07BF); 10093 $default->add_range(0xFB50, 0xFDFF); 10094 $default->add_range(0xFE70, 0xFEFF);' 10095 ; 10096 10097 # Non-character code points introduced in this release; aren't AL 10098 if ($v_version ge 3.1.0) { 10099 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);'; 10100 } 10101 $default_AL .= '$gc->table("Unassigned") & $default'; 10102 $default = Multi_Default->new(AL => $default_AL, 10103 R => $default_R, 10104 'L'); 10105 } 10106 property_ref('Bidi_Class')->set_default_map($default); 10107 } 10108 10109 # Joining type has a complicated default, but the derived file takes care 10110 # of the complications, leaving just 'U' (or Non_Joining), except the file 10111 # is bad in 3.1.0 10112 if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') { 10113 if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) { 10114 property_ref('Joining_Type')->set_default_map('Non_Joining'); 10115 } 10116 else { 10117 10118 # Otherwise, there are not one, but two possibilities for the 10119 # missing defaults: T and U. 10120 # The missing defaults that evaluate to T are given by: 10121 # T = Mn + Cf - ZWNJ - ZWJ 10122 # where Mn and Cf are the general category values. In other words, 10123 # any non-spacing mark or any format control character, except 10124 # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO 10125 # WIDTH JOINER (joining type C). 10126 my $default = Multi_Default->new( 10127 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D', 10128 'Non_Joining'); 10129 property_ref('Joining_Type')->set_default_map($default); 10130 } 10131 } 10132 10133 # Line break has a complicated default in early releases. It is 'Unknown' 10134 # for non-assigned code points; 'AL' for assigned. 10135 if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') { 10136 my $lb = property_ref('Line_Break'); 10137 if (file_exists("${EXTRACTED}DLineBreak.txt")) { 10138 $lb->set_default_map('Unknown'); 10139 } 10140 else { 10141 my $default = Multi_Default->new('AL' => '~ $gc->table("Cn")', 10142 'Unknown', 10143 ); 10144 $lb->set_default_map($default); 10145 } 10146 } 10147 10148 return; 10149} 10150 10151sub get_old_property_aliases() { 10152 # Returns what would be in PropertyAliases.txt if it existed in very old 10153 # versions of Unicode. It was derived from the one in 3.2, and pared 10154 # down based on the data that was actually in the older releases. 10155 # An attempt was made to use the existence of files to mean inclusion or 10156 # not of various aliases, but if this was not sufficient, using version 10157 # numbers was resorted to. 10158 10159 my @return; 10160 10161 # These are to be used in all versions (though some are constructed by 10162 # this program if missing) 10163 push @return, split /\n/, <<'END'; 10164bc ; Bidi_Class 10165Bidi_M ; Bidi_Mirrored 10166cf ; Case_Folding 10167ccc ; Canonical_Combining_Class 10168dm ; Decomposition_Mapping 10169dt ; Decomposition_Type 10170gc ; General_Category 10171isc ; ISO_Comment 10172lc ; Lowercase_Mapping 10173na ; Name 10174na1 ; Unicode_1_Name 10175nt ; Numeric_Type 10176nv ; Numeric_Value 10177scf ; Simple_Case_Folding 10178slc ; Simple_Lowercase_Mapping 10179stc ; Simple_Titlecase_Mapping 10180suc ; Simple_Uppercase_Mapping 10181tc ; Titlecase_Mapping 10182uc ; Uppercase_Mapping 10183END 10184 10185 if (-e 'Blocks.txt') { 10186 push @return, "blk ; Block\n"; 10187 } 10188 if (-e 'ArabicShaping.txt') { 10189 push @return, split /\n/, <<'END'; 10190jg ; Joining_Group 10191jt ; Joining_Type 10192END 10193 } 10194 if (-e 'PropList.txt') { 10195 10196 # This first set is in the original old-style proplist. 10197 push @return, split /\n/, <<'END'; 10198Bidi_C ; Bidi_Control 10199Dash ; Dash 10200Dia ; Diacritic 10201Ext ; Extender 10202Hex ; Hex_Digit 10203Hyphen ; Hyphen 10204IDC ; ID_Continue 10205Ideo ; Ideographic 10206Join_C ; Join_Control 10207Math ; Math 10208QMark ; Quotation_Mark 10209Term ; Terminal_Punctuation 10210WSpace ; White_Space 10211END 10212 # The next sets were added later 10213 if ($v_version ge v3.0.0) { 10214 push @return, split /\n/, <<'END'; 10215Upper ; Uppercase 10216Lower ; Lowercase 10217END 10218 } 10219 if ($v_version ge v3.0.1) { 10220 push @return, split /\n/, <<'END'; 10221NChar ; Noncharacter_Code_Point 10222END 10223 } 10224 # The next sets were added in the new-style 10225 if ($v_version ge v3.1.0) { 10226 push @return, split /\n/, <<'END'; 10227OAlpha ; Other_Alphabetic 10228OLower ; Other_Lowercase 10229OMath ; Other_Math 10230OUpper ; Other_Uppercase 10231END 10232 } 10233 if ($v_version ge v3.1.1) { 10234 push @return, "AHex ; ASCII_Hex_Digit\n"; 10235 } 10236 } 10237 if (-e 'EastAsianWidth.txt') { 10238 push @return, "ea ; East_Asian_Width\n"; 10239 } 10240 if (-e 'CompositionExclusions.txt') { 10241 push @return, "CE ; Composition_Exclusion\n"; 10242 } 10243 if (-e 'LineBreak.txt') { 10244 push @return, "lb ; Line_Break\n"; 10245 } 10246 if (-e 'BidiMirroring.txt') { 10247 push @return, "bmg ; Bidi_Mirroring_Glyph\n"; 10248 } 10249 if (-e 'Scripts.txt') { 10250 push @return, "sc ; Script\n"; 10251 } 10252 if (-e 'DNormalizationProps.txt') { 10253 push @return, split /\n/, <<'END'; 10254Comp_Ex ; Full_Composition_Exclusion 10255FC_NFKC ; FC_NFKC_Closure 10256NFC_QC ; NFC_Quick_Check 10257NFD_QC ; NFD_Quick_Check 10258NFKC_QC ; NFKC_Quick_Check 10259NFKD_QC ; NFKD_Quick_Check 10260XO_NFC ; Expands_On_NFC 10261XO_NFD ; Expands_On_NFD 10262XO_NFKC ; Expands_On_NFKC 10263XO_NFKD ; Expands_On_NFKD 10264END 10265 } 10266 if (-e 'DCoreProperties.txt') { 10267 push @return, split /\n/, <<'END'; 10268Alpha ; Alphabetic 10269IDS ; ID_Start 10270XIDC ; XID_Continue 10271XIDS ; XID_Start 10272END 10273 # These can also appear in some versions of PropList.txt 10274 push @return, "Lower ; Lowercase\n" 10275 unless grep { $_ =~ /^Lower\b/} @return; 10276 push @return, "Upper ; Uppercase\n" 10277 unless grep { $_ =~ /^Upper\b/} @return; 10278 } 10279 10280 # This flag requires the DAge.txt file to be copied into the directory. 10281 if (DEBUG && $compare_versions) { 10282 push @return, 'age ; Age'; 10283 } 10284 10285 return @return; 10286} 10287 10288sub substitute_PropValueAliases($file_object) { 10289 # Deal with early releases that don't have the crucial 10290 # PropValueAliases.txt file. 10291 10292 $file_object->insert_lines(get_old_property_value_aliases()); 10293 10294 process_PropValueAliases($file_object); 10295} 10296 10297sub process_PropValueAliases($file) { 10298 # This file contains values that properties look like: 10299 # bc ; AL ; Arabic_Letter 10300 # blk; n/a ; Greek_And_Coptic ; Greek 10301 # 10302 # Field 0 is the property. 10303 # Field 1 is the short name of a property value or 'n/a' if no 10304 # short name exists; 10305 # Field 2 is the full property value name; 10306 # Any other fields are more synonyms for the property value. 10307 # Purely numeric property values are omitted from the file; as are some 10308 # others, fewer and fewer in later releases 10309 10310 # Entries for the ccc property have an extra field before the 10311 # abbreviation: 10312 # ccc; 0; NR ; Not_Reordered 10313 # It is the numeric value that the names are synonyms for. 10314 10315 # There are comment entries for values missing from this file: 10316 # # @missing: 0000..10FFFF; ISO_Comment; <none> 10317 # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point> 10318 10319 if ($v_version lt 4.0.0) { 10320 $file->insert_lines(split /\n/, <<'END' 10321Hangul_Syllable_Type; L ; Leading_Jamo 10322Hangul_Syllable_Type; LV ; LV_Syllable 10323Hangul_Syllable_Type; LVT ; LVT_Syllable 10324Hangul_Syllable_Type; NA ; Not_Applicable 10325Hangul_Syllable_Type; T ; Trailing_Jamo 10326Hangul_Syllable_Type; V ; Vowel_Jamo 10327END 10328 ); 10329 } 10330 if ($v_version lt 4.1.0) { 10331 $file->insert_lines(split /\n/, <<'END' 10332_Perl_GCB; CN ; Control 10333_Perl_GCB; CR ; CR 10334_Perl_GCB; EX ; Extend 10335_Perl_GCB; L ; L 10336_Perl_GCB; LF ; LF 10337_Perl_GCB; LV ; LV 10338_Perl_GCB; LVT ; LVT 10339_Perl_GCB; T ; T 10340_Perl_GCB; V ; V 10341_Perl_GCB; XX ; Other 10342END 10343 ); 10344 } 10345 10346 # Add any explicit cjk values 10347 $file->insert_lines(@cjk_property_values); 10348 10349 # This line is used only for testing the code that checks for name 10350 # conflicts. There is a script Inherited, and when this line is executed 10351 # it causes there to be a name conflict with the 'Inherited' that this 10352 # program generates for this block property value 10353 #$file->insert_lines('blk; n/a; Herited'); 10354 10355 # Process each line of the file ... 10356 while ($file->next_line) { 10357 10358 # Fix typo in input file 10359 s/CCC133/CCC132/g if $v_version eq v6.1.0; 10360 10361 my ($property, @data) = split /\s*;\s*/; 10362 10363 # The ccc property has an extra field at the beginning, which is the 10364 # numeric value. Move it to be after the other two, mnemonic, fields, 10365 # so that those will be used as the property value's names, and the 10366 # number will be an extra alias. (Rightmost splice removes field 1-2, 10367 # returning them in a slice; left splice inserts that before anything, 10368 # thus shifting the former field 0 to after them.) 10369 splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc'; 10370 10371 if ($v_version le v5.0.0 && $property eq 'blk' && $data[1] =~ /-/) { 10372 my $new_style = $data[1] =~ s/-/_/gr; 10373 splice @data, 1, 0, $new_style; 10374 } 10375 10376 # Field 0 is a short name unless "n/a"; field 1 is the full name. If 10377 # there is no short name, use the full one in element 1 10378 if ($data[0] eq "n/a") { 10379 $data[0] = $data[1]; 10380 } 10381 elsif ($data[0] ne $data[1] 10382 && standardize($data[0]) eq standardize($data[1]) 10383 && $data[1] !~ /[[:upper:]]/) 10384 { 10385 # Also, there is a bug in the file in which "n/a" is omitted, and 10386 # the two fields are identical except for case, and the full name 10387 # is all lower case. Copy the "short" name unto the full one to 10388 # give it some upper case. 10389 10390 $data[1] = $data[0]; 10391 } 10392 10393 # Earlier releases had the pseudo property 'qc' that should expand to 10394 # the ones that replace it below. 10395 if ($property eq 'qc') { 10396 if (lc $data[0] eq 'y') { 10397 $file->insert_lines('NFC_QC; Y ; Yes', 10398 'NFD_QC; Y ; Yes', 10399 'NFKC_QC; Y ; Yes', 10400 'NFKD_QC; Y ; Yes', 10401 ); 10402 } 10403 elsif (lc $data[0] eq 'n') { 10404 $file->insert_lines('NFC_QC; N ; No', 10405 'NFD_QC; N ; No', 10406 'NFKC_QC; N ; No', 10407 'NFKD_QC; N ; No', 10408 ); 10409 } 10410 elsif (lc $data[0] eq 'm') { 10411 $file->insert_lines('NFC_QC; M ; Maybe', 10412 'NFKC_QC; M ; Maybe', 10413 ); 10414 } 10415 else { 10416 $file->carp_bad_line("qc followed by unexpected '$data[0]"); 10417 } 10418 next; 10419 } 10420 10421 # The first field is the short name, 2nd is the full one. 10422 my $property_object = property_ref($property); 10423 my $table = $property_object->add_match_table($data[0], 10424 Full_Name => $data[1]); 10425 10426 # Start looking for more aliases after these two. 10427 for my $i (2 .. @data - 1) { 10428 $table->add_alias($data[$i]); 10429 } 10430 } # End of looping through the file 10431 10432 # As noted in the comments early in the program, it generates tables for 10433 # the default values for all releases, even those for which the concept 10434 # didn't exist at the time. Here we add those if missing. 10435 if (defined $age && ! defined $age->table('Unassigned')) { 10436 $age->add_match_table('Unassigned'); 10437 } 10438 $block->add_match_table('No_Block') if -e 'Blocks.txt' 10439 && ! defined $block->table('No_Block'); 10440 10441 10442 # Now set the default mappings of the properties from the file. This is 10443 # done after the loop because a number of properties have only @missings 10444 # entries in the file, and may not show up until the end. 10445 my @defaults = $file->get_missings; 10446 foreach my $default_ref (@defaults) { 10447 my $default = $default_ref->{default}; 10448 my $property = property_ref($default_ref->{property}); 10449 $property->set_default_map($default); 10450 } 10451 10452 return; 10453} 10454 10455sub get_old_property_value_aliases () { 10456 # Returns what would be in PropValueAliases.txt if it existed in very old 10457 # versions of Unicode. It was derived from the one in 3.2, and pared 10458 # down. An attempt was made to use the existence of files to mean 10459 # inclusion or not of various aliases, but if this was not sufficient, 10460 # using version numbers was resorted to. 10461 10462 my @return = split /\n/, <<'END'; 10463bc ; AN ; Arabic_Number 10464bc ; B ; Paragraph_Separator 10465bc ; CS ; Common_Separator 10466bc ; EN ; European_Number 10467bc ; ES ; European_Separator 10468bc ; ET ; European_Terminator 10469bc ; L ; Left_To_Right 10470bc ; ON ; Other_Neutral 10471bc ; R ; Right_To_Left 10472bc ; WS ; White_Space 10473 10474Bidi_M; N; No; F; False 10475Bidi_M; Y; Yes; T; True 10476 10477# The standard combining classes are very much different in v1, so only use 10478# ones that look right (not checked thoroughly) 10479ccc; 0; NR ; Not_Reordered 10480ccc; 1; OV ; Overlay 10481ccc; 7; NK ; Nukta 10482ccc; 8; KV ; Kana_Voicing 10483ccc; 9; VR ; Virama 10484ccc; 202; ATBL ; Attached_Below_Left 10485ccc; 216; ATAR ; Attached_Above_Right 10486ccc; 218; BL ; Below_Left 10487ccc; 220; B ; Below 10488ccc; 222; BR ; Below_Right 10489ccc; 224; L ; Left 10490ccc; 228; AL ; Above_Left 10491ccc; 230; A ; Above 10492ccc; 232; AR ; Above_Right 10493ccc; 234; DA ; Double_Above 10494 10495dt ; can ; canonical 10496dt ; enc ; circle 10497dt ; fin ; final 10498dt ; font ; font 10499dt ; fra ; fraction 10500dt ; init ; initial 10501dt ; iso ; isolated 10502dt ; med ; medial 10503dt ; n/a ; none 10504dt ; nb ; noBreak 10505dt ; sqr ; square 10506dt ; sub ; sub 10507dt ; sup ; super 10508 10509gc ; C ; Other # Cc | Cf | Cn | Co | Cs 10510gc ; Cc ; Control 10511gc ; Cn ; Unassigned 10512gc ; Co ; Private_Use 10513gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu 10514gc ; LC ; Cased_Letter # Ll | Lt | Lu 10515gc ; Ll ; Lowercase_Letter 10516gc ; Lm ; Modifier_Letter 10517gc ; Lo ; Other_Letter 10518gc ; Lu ; Uppercase_Letter 10519gc ; M ; Mark # Mc | Me | Mn 10520gc ; Mc ; Spacing_Mark 10521gc ; Mn ; Nonspacing_Mark 10522gc ; N ; Number # Nd | Nl | No 10523gc ; Nd ; Decimal_Number 10524gc ; No ; Other_Number 10525gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps 10526gc ; Pd ; Dash_Punctuation 10527gc ; Pe ; Close_Punctuation 10528gc ; Po ; Other_Punctuation 10529gc ; Ps ; Open_Punctuation 10530gc ; S ; Symbol # Sc | Sk | Sm | So 10531gc ; Sc ; Currency_Symbol 10532gc ; Sm ; Math_Symbol 10533gc ; So ; Other_Symbol 10534gc ; Z ; Separator # Zl | Zp | Zs 10535gc ; Zl ; Line_Separator 10536gc ; Zp ; Paragraph_Separator 10537gc ; Zs ; Space_Separator 10538 10539nt ; de ; Decimal 10540nt ; di ; Digit 10541nt ; n/a ; None 10542nt ; nu ; Numeric 10543END 10544 10545 if (-e 'ArabicShaping.txt') { 10546 push @return, split /\n/, <<'END'; 10547jg ; n/a ; AIN 10548jg ; n/a ; ALEF 10549jg ; n/a ; DAL 10550jg ; n/a ; GAF 10551jg ; n/a ; LAM 10552jg ; n/a ; MEEM 10553jg ; n/a ; NO_JOINING_GROUP 10554jg ; n/a ; NOON 10555jg ; n/a ; QAF 10556jg ; n/a ; SAD 10557jg ; n/a ; SEEN 10558jg ; n/a ; TAH 10559jg ; n/a ; WAW 10560 10561jt ; C ; Join_Causing 10562jt ; D ; Dual_Joining 10563jt ; L ; Left_Joining 10564jt ; R ; Right_Joining 10565jt ; U ; Non_Joining 10566jt ; T ; Transparent 10567END 10568 if ($v_version ge v3.0.0) { 10569 push @return, split /\n/, <<'END'; 10570jg ; n/a ; ALAPH 10571jg ; n/a ; BEH 10572jg ; n/a ; BETH 10573jg ; n/a ; DALATH_RISH 10574jg ; n/a ; E 10575jg ; n/a ; FEH 10576jg ; n/a ; FINAL_SEMKATH 10577jg ; n/a ; GAMAL 10578jg ; n/a ; HAH 10579jg ; n/a ; HAMZA_ON_HEH_GOAL 10580jg ; n/a ; HE 10581jg ; n/a ; HEH 10582jg ; n/a ; HEH_GOAL 10583jg ; n/a ; HETH 10584jg ; n/a ; KAF 10585jg ; n/a ; KAPH 10586jg ; n/a ; KNOTTED_HEH 10587jg ; n/a ; LAMADH 10588jg ; n/a ; MIM 10589jg ; n/a ; NUN 10590jg ; n/a ; PE 10591jg ; n/a ; QAPH 10592jg ; n/a ; REH 10593jg ; n/a ; REVERSED_PE 10594jg ; n/a ; SADHE 10595jg ; n/a ; SEMKATH 10596jg ; n/a ; SHIN 10597jg ; n/a ; SWASH_KAF 10598jg ; n/a ; TAW 10599jg ; n/a ; TEH_MARBUTA 10600jg ; n/a ; TETH 10601jg ; n/a ; YEH 10602jg ; n/a ; YEH_BARREE 10603jg ; n/a ; YEH_WITH_TAIL 10604jg ; n/a ; YUDH 10605jg ; n/a ; YUDH_HE 10606jg ; n/a ; ZAIN 10607END 10608 } 10609 } 10610 10611 10612 if (-e 'EastAsianWidth.txt') { 10613 push @return, split /\n/, <<'END'; 10614ea ; A ; Ambiguous 10615ea ; F ; Fullwidth 10616ea ; H ; Halfwidth 10617ea ; N ; Neutral 10618ea ; Na ; Narrow 10619ea ; W ; Wide 10620END 10621 } 10622 10623 if (-e 'LineBreak.txt' || -e 'LBsubst.txt') { 10624 my @lb = split /\n/, <<'END'; 10625lb ; AI ; Ambiguous 10626lb ; AL ; Alphabetic 10627lb ; B2 ; Break_Both 10628lb ; BA ; Break_After 10629lb ; BB ; Break_Before 10630lb ; BK ; Mandatory_Break 10631lb ; CB ; Contingent_Break 10632lb ; CL ; Close_Punctuation 10633lb ; CM ; Combining_Mark 10634lb ; CR ; Carriage_Return 10635lb ; EX ; Exclamation 10636lb ; GL ; Glue 10637lb ; HY ; Hyphen 10638lb ; ID ; Ideographic 10639lb ; IN ; Inseperable 10640lb ; IS ; Infix_Numeric 10641lb ; LF ; Line_Feed 10642lb ; NS ; Nonstarter 10643lb ; NU ; Numeric 10644lb ; OP ; Open_Punctuation 10645lb ; PO ; Postfix_Numeric 10646lb ; PR ; Prefix_Numeric 10647lb ; QU ; Quotation 10648lb ; SA ; Complex_Context 10649lb ; SG ; Surrogate 10650lb ; SP ; Space 10651lb ; SY ; Break_Symbols 10652lb ; XX ; Unknown 10653lb ; ZW ; ZWSpace 10654END 10655 # If this Unicode version predates the lb property, we use our 10656 # substitute one 10657 if (-e 'LBsubst.txt') { 10658 $_ = s/^lb/_Perl_LB/r for @lb; 10659 } 10660 push @return, @lb; 10661 } 10662 10663 if (-e 'DNormalizationProps.txt') { 10664 push @return, split /\n/, <<'END'; 10665qc ; M ; Maybe 10666qc ; N ; No 10667qc ; Y ; Yes 10668END 10669 } 10670 10671 if (-e 'Scripts.txt') { 10672 push @return, split /\n/, <<'END'; 10673sc ; Arab ; Arabic 10674sc ; Armn ; Armenian 10675sc ; Beng ; Bengali 10676sc ; Bopo ; Bopomofo 10677sc ; Cans ; Canadian_Aboriginal 10678sc ; Cher ; Cherokee 10679sc ; Cyrl ; Cyrillic 10680sc ; Deva ; Devanagari 10681sc ; Dsrt ; Deseret 10682sc ; Ethi ; Ethiopic 10683sc ; Geor ; Georgian 10684sc ; Goth ; Gothic 10685sc ; Grek ; Greek 10686sc ; Gujr ; Gujarati 10687sc ; Guru ; Gurmukhi 10688sc ; Hang ; Hangul 10689sc ; Hani ; Han 10690sc ; Hebr ; Hebrew 10691sc ; Hira ; Hiragana 10692sc ; Ital ; Old_Italic 10693sc ; Kana ; Katakana 10694sc ; Khmr ; Khmer 10695sc ; Knda ; Kannada 10696sc ; Laoo ; Lao 10697sc ; Latn ; Latin 10698sc ; Mlym ; Malayalam 10699sc ; Mong ; Mongolian 10700sc ; Mymr ; Myanmar 10701sc ; Ogam ; Ogham 10702sc ; Orya ; Oriya 10703sc ; Qaai ; Inherited 10704sc ; Runr ; Runic 10705sc ; Sinh ; Sinhala 10706sc ; Syrc ; Syriac 10707sc ; Taml ; Tamil 10708sc ; Telu ; Telugu 10709sc ; Thaa ; Thaana 10710sc ; Thai ; Thai 10711sc ; Tibt ; Tibetan 10712sc ; Yiii ; Yi 10713sc ; Zyyy ; Common 10714END 10715 } 10716 10717 if ($v_version ge v2.0.0) { 10718 push @return, split /\n/, <<'END'; 10719dt ; com ; compat 10720dt ; nar ; narrow 10721dt ; sml ; small 10722dt ; vert ; vertical 10723dt ; wide ; wide 10724 10725gc ; Cf ; Format 10726gc ; Cs ; Surrogate 10727gc ; Lt ; Titlecase_Letter 10728gc ; Me ; Enclosing_Mark 10729gc ; Nl ; Letter_Number 10730gc ; Pc ; Connector_Punctuation 10731gc ; Sk ; Modifier_Symbol 10732END 10733 } 10734 if ($v_version ge v2.1.2) { 10735 push @return, "bc ; S ; Segment_Separator\n"; 10736 } 10737 if ($v_version ge v2.1.5) { 10738 push @return, split /\n/, <<'END'; 10739gc ; Pf ; Final_Punctuation 10740gc ; Pi ; Initial_Punctuation 10741END 10742 } 10743 if ($v_version ge v2.1.8) { 10744 push @return, "ccc; 240; IS ; Iota_Subscript\n"; 10745 } 10746 10747 if ($v_version ge v3.0.0) { 10748 push @return, split /\n/, <<'END'; 10749bc ; AL ; Arabic_Letter 10750bc ; BN ; Boundary_Neutral 10751bc ; LRE ; Left_To_Right_Embedding 10752bc ; LRO ; Left_To_Right_Override 10753bc ; NSM ; Nonspacing_Mark 10754bc ; PDF ; Pop_Directional_Format 10755bc ; RLE ; Right_To_Left_Embedding 10756bc ; RLO ; Right_To_Left_Override 10757 10758ccc; 233; DB ; Double_Below 10759END 10760 } 10761 10762 if ($v_version ge v3.1.0) { 10763 push @return, "ccc; 226; R ; Right\n"; 10764 } 10765 10766 return @return; 10767} 10768 10769sub process_NormalizationsTest($file) { 10770 10771 # Each line looks like: 10772 # source code point; NFC; NFD; NFKC; NFKD 10773 # e.g. 10774 # 1E0A;1E0A;0044 0307;1E0A;0044 0307; 10775 10776 # Process each line of the file ... 10777 while ($file->next_line) { 10778 10779 next if /^@/; 10780 10781 my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/; 10782 10783 foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) { 10784 $$var = pack "U0U*", map { hex } split " ", $$var; 10785 $$var =~ s/(\\)/$1$1/g; 10786 } 10787 10788 push @normalization_tests, 10789 "Test_N(q$c1, q$c2, q$c3, q$c4, q$c5);\n"; 10790 } # End of looping through the file 10791} 10792 10793sub output_perl_charnames_line ($code_point, $name) { 10794 10795 # Output the entries in Perl_charnames specially, using 5 digits instead 10796 # of four. This makes the entries a constant length, and simplifies 10797 # charnames.pm which this table is for. Unicode can have 6 digit 10798 # ordinals, but they are all private use or noncharacters which do not 10799 # have names, so won't be in this table. 10800 10801 return sprintf "%05X\n%s\n\n", $code_point, $name; 10802} 10803 10804{ # Closure 10805 10806 # These are constants to the $property_info hash in this subroutine, to 10807 # avoid using a quoted-string which might have a typo. 10808 my $TYPE = 'type'; 10809 my $DEFAULT_MAP = 'default_map'; 10810 my $DEFAULT_TABLE = 'default_table'; 10811 my $PSEUDO_MAP_TYPE = 'pseudo_map_type'; 10812 my $MISSINGS = 'missings'; 10813 10814 sub process_generic_property_file($file) { 10815 # This processes a file containing property mappings and puts them 10816 # into internal map tables. It should be used to handle any property 10817 # files that have mappings from a code point or range thereof to 10818 # something else. This means almost all the UCD .txt files. 10819 # each_line_handlers() should be set to adjust the lines of these 10820 # files, if necessary, to what this routine understands: 10821 # 10822 # 0374 ; NFD_QC; N 10823 # 003C..003E ; Math 10824 # 10825 # the fields are: "codepoint-range ; property; map" 10826 # 10827 # meaning the codepoints in the range all have the value 'map' under 10828 # 'property'. 10829 # Beginning and trailing white space in each field are not significant. 10830 # Note there is not a trailing semi-colon in the above. A trailing 10831 # semi-colon means the map is a null-string. An omitted map, as 10832 # opposed to a null-string, is assumed to be 'Y', based on Unicode 10833 # table syntax. (This could have been hidden from this routine by 10834 # doing it in the $file object, but that would require parsing of the 10835 # line there, so would have to parse it twice, or change the interface 10836 # to pass this an array. So not done.) 10837 # 10838 # The map field may begin with a sequence of commands that apply to 10839 # this range. Each such command begins and ends with $CMD_DELIM. 10840 # These are used to indicate, for example, that the mapping for a 10841 # range has a non-default type. 10842 # 10843 # This loops through the file, calling its next_line() method, and 10844 # then taking the map and adding it to the property's table. 10845 # Complications arise because any number of properties can be in the 10846 # file, in any order, interspersed in any way. The first time a 10847 # property is seen, it gets information about that property and 10848 # caches it for quick retrieval later. It also normalizes the maps 10849 # so that only one of many synonyms is stored. The Unicode input 10850 # files do use some multiple synonyms. 10851 10852 my %property_info; # To keep track of what properties 10853 # have already had entries in the 10854 # current file, and info about each, 10855 # so don't have to recompute. 10856 my $property_name; # property currently being worked on 10857 my $property_type; # and its type 10858 my $previous_property_name = ""; # name from last time through loop 10859 my $property_object; # pointer to the current property's 10860 # object 10861 my $property_addr; # the address of that object 10862 my $default_map; # the string that code points missing 10863 # from the file map to 10864 my $default_table; # For non-string properties, a 10865 # reference to the match table that 10866 # will contain the list of code 10867 # points that map to $default_map. 10868 10869 # Get the next real non-comment line 10870 LINE: 10871 while ($file->next_line) { 10872 10873 # Default replacement type; means that if parts of the range have 10874 # already been stored in our tables, the new map overrides them if 10875 # they differ more than cosmetically 10876 my $replace = $IF_NOT_EQUIVALENT; 10877 my $map_type; # Default type for the map of this range 10878 10879 #local $to_trace = 1 if main::DEBUG; 10880 trace $_ if main::DEBUG && $to_trace; 10881 10882 # Split the line into components 10883 my ($range, $property_name, $map, @remainder) 10884 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 10885 10886 # If more or less on the line than we are expecting, warn and skip 10887 # the line 10888 if (@remainder) { 10889 $file->carp_bad_line('Extra fields'); 10890 next LINE; 10891 } 10892 elsif ( ! defined $property_name) { 10893 $file->carp_bad_line('Missing property'); 10894 next LINE; 10895 } 10896 10897 # Examine the range. 10898 if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x) 10899 { 10900 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)"); 10901 next LINE; 10902 } 10903 my $low = hex $1; 10904 my $high = (defined $2) ? hex $2 : $low; 10905 10906 # If changing to a new property, get the things constant per 10907 # property 10908 if ($previous_property_name ne $property_name) { 10909 10910 $property_object = property_ref($property_name); 10911 if (! defined $property_object) { 10912 $file->carp_bad_line("Unexpected property '$property_name'. Skipped"); 10913 next LINE; 10914 } 10915 $property_addr = pack 'J', refaddr $property_object; 10916 10917 # Defer changing names until have a line that is acceptable 10918 # (the 'next' statement above means is unacceptable) 10919 $previous_property_name = $property_name; 10920 10921 # If not the first time for this property, retrieve info about 10922 # it from the cache 10923 my $this_property_info = $property_info{$property_addr}; 10924 if (defined ($this_property_info->{$TYPE})) { 10925 $property_type = $this_property_info->{$TYPE}; 10926 $default_map = $this_property_info->{$DEFAULT_MAP}; 10927 $map_type = $this_property_info->{$PSEUDO_MAP_TYPE}; 10928 $default_table = $this_property_info->{$DEFAULT_TABLE}; 10929 } 10930 else { 10931 10932 # Here, is the first time for this property. Set up the 10933 # cache. 10934 $property_type = $this_property_info->{$TYPE} 10935 = $property_object->type; 10936 $map_type 10937 = $this_property_info->{$PSEUDO_MAP_TYPE} 10938 = $property_object->pseudo_map_type; 10939 10940 # The Unicode files are set up so that if the map is not 10941 # defined, it is a binary property 10942 if (! defined $map && $property_type != $BINARY) { 10943 if ($property_type != $UNKNOWN 10944 && $property_type != $NON_STRING) 10945 { 10946 $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map"); 10947 } 10948 else { 10949 $property_object->set_type($BINARY); 10950 $property_type = $this_property_info->{$TYPE} 10951 = $BINARY; 10952 } 10953 } 10954 10955 # Get any @missings default for this property. This 10956 # should precede the first entry for the property in the 10957 # input file, and is located in a comment that has been 10958 # stored by the Input_file class until we access it here. 10959 # It's possible that there is more than one such line 10960 # waiting for us; collect them all, and parse 10961 my @missings_list; 10962 @missings_list = $file->get_missings 10963 if $file->has_missings_defaults; 10964 10965 foreach my $default_ref (@missings_list) { 10966 10967 # For now, we are only interested in the fallback 10968 # default for the entire property. i.e., an @missing 10969 # line that is for the whole Unicode range. 10970 next if $default_ref->{start} != 0 10971 || $default_ref->{end} != $MAX_UNICODE_CODEPOINT; 10972 10973 $default_map = $default_ref->{default}; 10974 10975 # For string properties, the default is just what the 10976 # file says, but non-string properties should already 10977 # have set up a table for the default property value; 10978 # use the table for these, so can resolve synonyms 10979 # later to a single standard one. 10980 if ($property_type == $STRING 10981 || $property_type == $UNKNOWN) 10982 { 10983 $this_property_info->{$MISSINGS} = $default_map; 10984 } 10985 else { 10986 $default_map = 10987 $property_object->table($default_map)->full_name; 10988 $this_property_info->{$MISSINGS} = $default_map; 10989 $this_property_info->{$DEFAULT_MAP} = $default_map; 10990 if (! defined $property_object->default_map) { 10991 $property_object->set_default_map($default_map); 10992 } 10993 } 10994 } 10995 10996 # For later Unicode versions, multiple @missing lines for 10997 # a single property can appear in the files. The first 10998 # always applies to the entire Unicode range, and was 10999 # handled above. The subsequent ones are for smaller 11000 # ranges, and can be read as "But for this range, the 11001 # default is ...". So each overrides all the preceding 11002 # ones for the range it applies to. Typically they apply 11003 # to disjoint ranges, but don't have to. What we do is to 11004 # set them up to work in reverse order, so that after the 11005 # rest of the table is filled, the highest priority 11006 # default range fills in any code points that haven't been 11007 # specified; then the next highest priority one is 11008 # applied, and so forth. 11009 if (@missings_list > 1 && $v_version ge v15.0.0) { 11010 if ($property_type != $ENUM) { 11011 Carp::my_carp_bug("Multiple \@missings lines only" 11012 . " make sense for ENUM-type" 11013 . " properties. Changing type to" 11014 . " that"); 11015 $property_type = $this_property_info->{$TYPE} 11016 = $ENUM; 11017 $property_object->set_type($ENUM); 11018 } 11019 11020 my $multi = Multi_Default->new(); 11021 11022 # The overall default should be first on this list, 11023 # and is handled differently than the rest. 11024 $default_map = shift @missings_list; 11025 Carp::my_carp_bug("\@missings needs to be entire range") 11026 if $default_map->{start} != 0 11027 || $default_map->{end} != $MAX_UNICODE_CODEPOINT; 11028 11029 # We already have looked at this line above. Use that 11030 # result 11031 $multi->set_final_default($this_property_info-> 11032 {$MISSINGS}); 11033 11034 # Now get the individual range elements, and add them 11035 # to Multi_Default object 11036 while (@missings_list) { 11037 my $this_entry = pop @missings_list; 11038 my $subrange_default = $this_entry->{default}; 11039 11040 # Use the short name as a standard 11041 $subrange_default = $property_object-> 11042 table($subrange_default)->short_name; 11043 $multi->append_default($subrange_default, 11044 "Range_List->new(Initialize => Range->new(" 11045 . "$this_entry->{start}, $this_entry->{end}))"); 11046 } 11047 11048 # Override the property's simple default with this. 11049 $property_object->set_default_map($multi); 11050 } 11051 11052 if (! $default_map || $property_type != $ENUM) { 11053 11054 # Finished storing all the @missings defaults in the 11055 # input file so far. Get the one for the current 11056 # property. 11057 my $missings = $this_property_info->{$MISSINGS}; 11058 11059 # But we likely have separately stored what the 11060 # default should be. (This is to accommodate versions 11061 # of the standard where the @missings lines are absent 11062 # or incomplete.) Hopefully the two will match. But 11063 # check it out. 11064 $default_map = $property_object->default_map; 11065 11066 # If the map is a ref, it means that the default won't 11067 # be processed until later, so undef it, so next few 11068 # lines will redefine it to something that nothing 11069 # will match 11070 undef $default_map if ref $default_map; 11071 11072 # Create a $default_map if don't have one; maybe a 11073 # dummy that won't match anything. 11074 if (! defined $default_map) { 11075 11076 # Use any @missings line in the file. 11077 if (defined $missings) { 11078 if (ref $missings) { 11079 $default_map = $missings->full_name; 11080 $default_table = $missings; 11081 } 11082 else { 11083 $default_map = $missings; 11084 } 11085 11086 # And store it with the property for outside 11087 # use. 11088 $property_object->set_default_map($default_map); 11089 } 11090 else { 11091 11092 # Neither an @missings nor a default map. 11093 # Create a dummy one, so won't have to test 11094 # definedness in the main loop. 11095 $default_map = '_Perl This will never be in a' 11096 . ' file from Unicode'; 11097 } 11098 } 11099 11100 # Here, we have $default_map defined, possibly in 11101 # terms of $missings, but maybe not, and possibly is a 11102 # dummy one. 11103 if (defined $missings) { 11104 11105 # Make sure there is no conflict between the two. 11106 # $missings has priority. 11107 if (ref $missings) { 11108 $default_table 11109 = $property_object->table($default_map); 11110 if ( ! defined $default_table 11111 || $default_table != $missings) 11112 { 11113 if (! defined $default_table) { 11114 $default_table = $UNDEF; 11115 } 11116 $file->carp_bad_line(<<END 11117The \@missings line for $property_name in $file says that missings default to 11118$missings, but we expect it to be $default_table. $missings used. 11119END 11120 ); 11121 $default_table = $missings; 11122 $default_map = $missings->full_name; 11123 } 11124 $this_property_info->{$DEFAULT_TABLE} 11125 = $default_table; 11126 } 11127 elsif ($default_map ne $missings) { 11128 $file->carp_bad_line(<<END 11129The \@missings line for $property_name in $file says that missings default to 11130$missings, but we expect it to be $default_map. $missings used. 11131END 11132 ); 11133 $default_map = $missings; 11134 } 11135 } 11136 11137 $this_property_info->{$DEFAULT_MAP} = $default_map; 11138 11139 # If haven't done so already, find the table 11140 # corresponding to this map for non-string properties. 11141 if (! defined $default_table 11142 && $property_type != $STRING 11143 && $property_type != $UNKNOWN) 11144 { 11145 $default_table 11146 = $this_property_info->{$DEFAULT_TABLE} 11147 = $property_object->table($default_map); 11148 } 11149 } 11150 } # End of is first time for this property 11151 } # End of switching properties. 11152 11153 # Ready to process the line. 11154 # The Unicode files are set up so that if the map is not defined, 11155 # it is a binary property with value 'Y' 11156 if (! defined $map) { 11157 $map = 'Y'; 11158 } 11159 else { 11160 11161 # If the map begins with a special command to us (enclosed in 11162 # delimiters), extract the command(s). 11163 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) { 11164 my $command = $1; 11165 if ($command =~ / ^ $REPLACE_CMD= (.*) /x) { 11166 $replace = $1; 11167 } 11168 elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) { 11169 $map_type = $1; 11170 } 11171 else { 11172 $file->carp_bad_line("Unknown command line: '$1'"); 11173 next LINE; 11174 } 11175 } 11176 } 11177 11178 if ( $default_map eq $CODE_POINT 11179 && $map =~ / ^ $code_point_re $/x) 11180 { 11181 11182 # Here, we have a map to a particular code point, and the 11183 # default map is to a code point itself. If the range 11184 # includes the particular code point, change that portion of 11185 # the range to the default. This makes sure that in the final 11186 # table only the non-defaults are listed. 11187 my $decimal_map = hex $map; 11188 if ($low <= $decimal_map && $decimal_map <= $high) { 11189 11190 # If the range includes stuff before or after the map 11191 # we're changing, split it and process the split-off parts 11192 # later. 11193 if ($low < $decimal_map) { 11194 $file->insert_adjusted_lines( 11195 sprintf("%04X..%04X; %s; %s", 11196 $low, 11197 $decimal_map - 1, 11198 $property_name, 11199 $map)); 11200 } 11201 if ($high > $decimal_map) { 11202 $file->insert_adjusted_lines( 11203 sprintf("%04X..%04X; %s; %s", 11204 $decimal_map + 1, 11205 $high, 11206 $property_name, 11207 $map)); 11208 } 11209 $low = $high = $decimal_map; 11210 $map = $CODE_POINT; 11211 } 11212 } 11213 11214 if ($property_type != $STRING && $property_type != $UNKNOWN) { 11215 my $table = $property_object->table($map); 11216 if (defined $table) { 11217 11218 # Unicode isn't very consistent about which synonym they 11219 # use in their .txt files, even within the same file, or 11220 # two files that are for the same property. For enum 11221 # properties, we know already what all the synonyms are 11222 # (because we processed PropValueAliases already). 11223 # Therefore we can take the input and map it to a uniform 11224 # value now, saving us trouble later. 11225 # 11226 # Only if the map is well-behaved do we try this: 11227 # non-empty, all non-blank. 11228 if ($property_type == $ENUM && $map =~ / ^ \S+ $ /x) { 11229 11230 # Use existing practice as much as easily practicable, 11231 # so that code that has assumptions about spelling 11232 # doesn't have to change 11233 my $short_name = $property_object->short_name; 11234 if ($short_name =~ / ^ (BC | EA | GC |HST | JT | 11235 Lb | BT | BPT | NFCQC | 11236 NFKCQC) $ /ix) 11237 { 11238 $map = $table->short_name; 11239 } 11240 elsif ($short_name !~ / ^ ( Ccc | Age | InSC | JG | 11241 SB) $ /ix) 11242 { 11243 $map = $table->full_name; 11244 } 11245 } 11246 elsif ($table == $default_table) { 11247 11248 # When it isn't an ENUM, we we can still tell if 11249 # this is a synonym for the default map. If so, use 11250 # the default one instead. 11251 $map = $default_map; 11252 } 11253 } 11254 } 11255 11256 # And figure out the map type if not known. 11257 if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) { 11258 if ($map eq "") { # Nulls are always $NULL map type 11259 $map_type = $NULL; 11260 } # Otherwise, non-strings, and those that don't allow 11261 # $MULTI_CP, and those that aren't multiple code points are 11262 # 0 11263 elsif 11264 (($property_type != $STRING && $property_type != $UNKNOWN) 11265 || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP) 11266 || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x) 11267 { 11268 $map_type = 0; 11269 } 11270 else { 11271 $map_type = $MULTI_CP; 11272 } 11273 } 11274 11275 $property_object->add_map($low, $high, 11276 $map, 11277 Type => $map_type, 11278 Replace => $replace); 11279 } # End of loop through file's lines 11280 11281 return; 11282 } 11283} 11284 11285{ # Closure for UnicodeData.txt handling 11286 11287 # This file was the first one in the UCD; its design leads to some 11288 # awkwardness in processing. Here is a sample line: 11289 # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061; 11290 # The fields in order are: 11291 my $i = 0; # The code point is in field 0, and is shifted off. 11292 my $CHARNAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A") 11293 my $CATEGORY = $i++; # category (e.g. "Lu") 11294 my $CCC = $i++; # Canonical combining class (e.g. "230") 11295 my $BIDI = $i++; # directional class (e.g. "L") 11296 my $PERL_DECOMPOSITION = $i++; # decomposition mapping 11297 my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value 11298 my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript 11299 # Dual-use in this program; see below 11300 my $NUMERIC = $i++; # numeric value 11301 my $MIRRORED = $i++; # ? mirrored 11302 my $UNICODE_1_NAME = $i++; # name in Unicode 1.0 11303 my $COMMENT = $i++; # iso comment 11304 my $UPPER = $i++; # simple uppercase mapping 11305 my $LOWER = $i++; # simple lowercase mapping 11306 my $TITLE = $i++; # simple titlecase mapping 11307 my $input_field_count = $i; 11308 11309 # This routine in addition outputs these extra fields: 11310 11311 my $DECOMP_TYPE = $i++; # Decomposition type 11312 11313 # These fields are modifications of ones above, and are usually 11314 # suppressed; they must come last, as for speed, the loop upper bound is 11315 # normally set to ignore them 11316 my $NAME = $i++; # This is the strict name field, not the one that 11317 # charnames uses. 11318 my $DECOMP_MAP = $i++; # Strict decomposition mapping; not the one used 11319 # by Unicode::Normalize 11320 my $last_field = $i - 1; 11321 11322 # All these are read into an array for each line, with the indices defined 11323 # above. The empty fields in the example line above indicate that the 11324 # value is defaulted. The handler called for each line of the input 11325 # changes these to their defaults. 11326 11327 # Here are the official names of the properties, in a parallel array: 11328 my @field_names; 11329 $field_names[$BIDI] = 'Bidi_Class'; 11330 $field_names[$CATEGORY] = 'General_Category'; 11331 $field_names[$CCC] = 'Canonical_Combining_Class'; 11332 $field_names[$CHARNAME] = 'Perl_Charnames'; 11333 $field_names[$COMMENT] = 'ISO_Comment'; 11334 $field_names[$DECOMP_MAP] = 'Decomposition_Mapping'; 11335 $field_names[$DECOMP_TYPE] = 'Decomposition_Type'; 11336 $field_names[$LOWER] = 'Lowercase_Mapping'; 11337 $field_names[$MIRRORED] = 'Bidi_Mirrored'; 11338 $field_names[$NAME] = 'Name'; 11339 $field_names[$NUMERIC] = 'Numeric_Value'; 11340 $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type'; 11341 $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit'; 11342 $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping'; 11343 $field_names[$TITLE] = 'Titlecase_Mapping'; 11344 $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name'; 11345 $field_names[$UPPER] = 'Uppercase_Mapping'; 11346 11347 # Some of these need a little more explanation: 11348 # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode 11349 # property, but is used in calculating the Numeric_Type. Perl however, 11350 # creates a file from this field, so a Perl property is created from it. 11351 # Similarly, the Other_Digit field is used only for calculating the 11352 # Numeric_Type, and so it can be safely re-used as the place to store 11353 # the value for Numeric_Type; hence it is referred to as 11354 # $NUMERIC_TYPE_OTHER_DIGIT. 11355 # The input field named $PERL_DECOMPOSITION is a combination of both the 11356 # decomposition mapping and its type. Perl creates a file containing 11357 # exactly this field, so it is used for that. The two properties are 11358 # separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE. 11359 # $DECOMP_MAP is usually suppressed (unless the lists are changed to 11360 # output it), as Perl doesn't use it directly. 11361 # The input field named here $CHARNAME is used to construct the 11362 # Perl_Charnames property, which is a combination of the Name property 11363 # (which the input field contains), and the Unicode_1_Name property, and 11364 # others from other files. Since, the strict Name property is not used 11365 # by Perl, this field is used for the table that Perl does use. The 11366 # strict Name property table is usually suppressed (unless the lists are 11367 # changed to output it), so it is accumulated in a separate field, 11368 # $NAME, which to save time is discarded unless the table is actually to 11369 # be output 11370 11371 # This file is processed like most in this program. Control is passed to 11372 # process_generic_property_file() which calls filter_UnicodeData_line() 11373 # for each input line. This filter converts the input into line(s) that 11374 # process_generic_property_file() understands. There is also a setup 11375 # routine called before any of the file is processed, and a handler for 11376 # EOF processing, all in this closure. 11377 11378 # A huge speed-up occurred at the cost of some added complexity when these 11379 # routines were altered to buffer the outputs into ranges. Almost all the 11380 # lines of the input file apply to just one code point, and for most 11381 # properties, the map for the next code point up is the same as the 11382 # current one. So instead of creating a line for each property for each 11383 # input line, filter_UnicodeData_line() remembers what the previous map 11384 # of a property was, and doesn't generate a line to pass on until it has 11385 # to, as when the map changes; and that passed-on line encompasses the 11386 # whole contiguous range of code points that have the same map for that 11387 # property. This means a slight amount of extra setup, and having to 11388 # flush these buffers on EOF, testing if the maps have changed, plus 11389 # remembering state information in the closure. But it means a lot less 11390 # real time in not having to change the data base for each property on 11391 # each line. 11392 11393 # Another complication is that there are already a few ranges designated 11394 # in the input. There are two lines for each, with the same maps except 11395 # the code point and name on each line. This was actually the hardest 11396 # thing to design around. The code points in those ranges may actually 11397 # have real maps not given by these two lines. These maps will either 11398 # be algorithmically determinable, or be in the extracted files furnished 11399 # with the UCD. In the event of conflicts between these extracted files, 11400 # and this one, Unicode says that this one prevails. But it shouldn't 11401 # prevail for conflicts that occur in these ranges. The data from the 11402 # extracted files prevails in those cases. So, this program is structured 11403 # so that those files are processed first, storing maps. Then the other 11404 # files are processed, generally overwriting what the extracted files 11405 # stored. But just the range lines in this input file are processed 11406 # without overwriting. This is accomplished by adding a special string to 11407 # the lines output to tell process_generic_property_file() to turn off the 11408 # overwriting for just this one line. 11409 # A similar mechanism is used to tell it that the map is of a non-default 11410 # type. 11411 11412 sub setup_UnicodeData($file) { # Called before any lines of the input are read 11413 11414 # Create a new property specially located that is a combination of 11415 # various Name properties: Name, Unicode_1_Name, Named Sequences, and 11416 # _Perl_Name_Alias properties. (The final one duplicates elements of the 11417 # first, and starting in v6.1, is the same as the 'Name_Alias 11418 # property.) A comment for the new property will later be constructed 11419 # based on the actual properties present and used 11420 $perl_charname = Property->new('Perl_Charnames', 11421 Default_Map => "", 11422 Directory => File::Spec->curdir(), 11423 File => 'Name', 11424 Fate => $INTERNAL_ONLY, 11425 Perl_Extension => 1, 11426 Range_Size_1 => \&output_perl_charnames_line, 11427 Type => $STRING, 11428 ); 11429 $perl_charname->set_proxy_for('Name'); 11430 11431 my $Perl_decomp = Property->new('Perl_Decomposition_Mapping', 11432 Directory => File::Spec->curdir(), 11433 File => 'Decomposition', 11434 Format => $DECOMP_STRING_FORMAT, 11435 Fate => $INTERNAL_ONLY, 11436 Perl_Extension => 1, 11437 Default_Map => $CODE_POINT, 11438 11439 # normalize.pm can't cope with these 11440 Output_Range_Counts => 0, 11441 11442 # This is a specially formatted table 11443 # explicitly for normalize.pm, which 11444 # is expecting a particular format, 11445 # which means that mappings containing 11446 # multiple code points are in the main 11447 # body of the table 11448 Map_Type => $COMPUTE_NO_MULTI_CP, 11449 Type => $STRING, 11450 To_Output_Map => $INTERNAL_MAP, 11451 ); 11452 $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type'); 11453 $Perl_decomp->add_comment(join_lines(<<END 11454This mapping is a combination of the Unicode 'Decomposition_Type' and 11455'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is 11456identical to the official Unicode 'Decomposition_Mapping' property except for 11457two things: 11458 1) It omits the algorithmically determinable Hangul syllable decompositions, 11459which normalize.pm handles algorithmically. 11460 2) It contains the decomposition type as well. Non-canonical decompositions 11461begin with a word in angle brackets, like <super>, which denotes the 11462compatible decomposition type. If the map does not begin with the <angle 11463brackets>, the decomposition is canonical. 11464END 11465 )); 11466 11467 my $Decimal_Digit = Property->new("Perl_Decimal_Digit", 11468 Default_Map => "", 11469 Perl_Extension => 1, 11470 Directory => $map_directory, 11471 Type => $STRING, 11472 To_Output_Map => $OUTPUT_ADJUSTED, 11473 ); 11474 $Decimal_Digit->add_comment(join_lines(<<END 11475This file gives the mapping of all code points which represent a single 11476decimal digit [0-9] to their respective digits, but it has ranges of 10 code 11477points, and the mapping of each non-initial element of each range is actually 11478not to "0", but to the offset that element has from its corresponding DIGIT 0. 11479These code points are those that have Numeric_Type=Decimal; not special 11480things, like subscripts nor Roman numerals. 11481END 11482 )); 11483 11484 # These properties are not used for generating anything else, and are 11485 # usually not output. By making them last in the list, we can just 11486 # change the high end of the loop downwards to avoid the work of 11487 # generating a table(s) that is/are just going to get thrown away. 11488 if (! property_ref('Decomposition_Mapping')->to_output_map 11489 && ! property_ref('Name')->to_output_map) 11490 { 11491 $last_field = min($NAME, $DECOMP_MAP) - 1; 11492 } elsif (property_ref('Decomposition_Mapping')->to_output_map) { 11493 $last_field = $DECOMP_MAP; 11494 } elsif (property_ref('Name')->to_output_map) { 11495 $last_field = $NAME; 11496 } 11497 return; 11498 } 11499 11500 my $first_time = 1; # ? Is this the first line of the file 11501 my $in_range = 0; # ? Are we in one of the file's ranges 11502 my $previous_cp; # hex code point of previous line 11503 my $decimal_previous_cp = -1; # And its decimal equivalent 11504 my @start; # For each field, the current starting 11505 # code point in hex for the range 11506 # being accumulated. 11507 my @fields; # The input fields; 11508 my @previous_fields; # And those from the previous call 11509 11510 sub filter_UnicodeData_line($file) { 11511 # Handle a single input line from UnicodeData.txt; see comments above 11512 # Conceptually this takes a single line from the file containing N 11513 # properties, and converts it into N lines with one property per line, 11514 # which is what the final handler expects. But there are 11515 # complications due to the quirkiness of the input file, and to save 11516 # time, it accumulates ranges where the property values don't change 11517 # and only emits lines when necessary. This is about an order of 11518 # magnitude fewer lines emitted. 11519 11520 # $_ contains the input line. 11521 # -1 in split means retain trailing null fields 11522 (my $cp, @fields) = split /\s*;\s*/, $_, -1; 11523 11524 #local $to_trace = 1 if main::DEBUG; 11525 trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace; 11526 if (@fields > $input_field_count) { 11527 $file->carp_bad_line('Extra fields'); 11528 $_ = ""; 11529 return; 11530 } 11531 11532 my $decimal_cp = hex $cp; 11533 11534 # We have to output all the buffered ranges when the next code point 11535 # is not exactly one after the previous one, which means there is a 11536 # gap in the ranges. 11537 my $force_output = ($decimal_cp != $decimal_previous_cp + 1); 11538 11539 # The decomposition mapping field requires special handling. It looks 11540 # like either: 11541 # 11542 # <compat> 0032 0020 11543 # 0041 0300 11544 # 11545 # The decomposition type is enclosed in <brackets>; if missing, it 11546 # means the type is canonical. There are two decomposition mapping 11547 # tables: the one for use by Perl's normalize.pm has a special format 11548 # which is this field intact; the other, for general use is of 11549 # standard format. In either case we have to find the decomposition 11550 # type. Empty fields have None as their type, and map to the code 11551 # point itself 11552 if ($fields[$PERL_DECOMPOSITION] eq "") { 11553 $fields[$DECOMP_TYPE] = 'None'; 11554 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT; 11555 } 11556 else { 11557 ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION] 11558 =~ / < ( .+? ) > \s* ( .+ ) /x; 11559 if (! defined $fields[$DECOMP_TYPE]) { 11560 $fields[$DECOMP_TYPE] = 'Canonical'; 11561 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION]; 11562 } 11563 else { 11564 $fields[$DECOMP_MAP] = $map; 11565 } 11566 } 11567 11568 # The 3 numeric fields also require special handling. The 2 digit 11569 # fields must be either empty or match the number field. This means 11570 # that if it is empty, they must be as well, and the numeric type is 11571 # None, and the numeric value is 'Nan'. 11572 # The decimal digit field must be empty or match the other digit 11573 # field. If the decimal digit field is non-empty, the code point is 11574 # a decimal digit, and the other two fields will have the same value. 11575 # If it is empty, but the other digit field is non-empty, the code 11576 # point is an 'other digit', and the number field will have the same 11577 # value as the other digit field. If the other digit field is empty, 11578 # but the number field is non-empty, the code point is a generic 11579 # numeric type. 11580 if ($fields[$NUMERIC] eq "") { 11581 if ($fields[$PERL_DECIMAL_DIGIT] ne "" 11582 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "" 11583 ) { 11584 $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway"); 11585 } 11586 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None'; 11587 $fields[$NUMERIC] = 'NaN'; 11588 } 11589 else { 11590 $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x; 11591 if ($fields[$PERL_DECIMAL_DIGIT] ne "") { 11592 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC]; 11593 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'. Processing as Decimal") if $fields[$CATEGORY] ne "Nd"; 11594 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal'; 11595 } 11596 elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") { 11597 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC]; 11598 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit'; 11599 } 11600 else { 11601 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric'; 11602 11603 # Rationals require extra effort. 11604 if ($fields[$NUMERIC] =~ qr{/}) { 11605 reduce_fraction(\$fields[$NUMERIC]); 11606 register_fraction($fields[$NUMERIC]) 11607 } 11608 } 11609 } 11610 11611 # For the properties that have empty fields in the file, and which 11612 # mean something different from empty, change them to that default. 11613 # Certain fields just haven't been empty so far in any Unicode 11614 # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC, 11615 # $CATEGORY. This leaves just the two fields, and so we hard-code in 11616 # the defaults; which are very unlikely to ever change. 11617 $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq ""; 11618 $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq ""; 11619 11620 # UAX44 says that if title is empty, it is the same as whatever upper 11621 # is, 11622 $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq ""; 11623 11624 # There are a few pairs of lines like: 11625 # AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; 11626 # D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; 11627 # that define ranges. These should be processed after the fields are 11628 # adjusted above, as they may override some of them; but mostly what 11629 # is left is to possibly adjust the $CHARNAME field. The names of all the 11630 # paired lines start with a '<', but this is also true of '<control>, 11631 # which isn't one of these special ones. 11632 if ($fields[$CHARNAME] eq '<control>') { 11633 11634 # Some code points in this file have the pseudo-name 11635 # '<control>', but the official name for such ones is the null 11636 # string. 11637 $fields[$NAME] = $fields[$CHARNAME] = ""; 11638 11639 # We had better not be in between range lines. 11640 if ($in_range) { 11641 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway"); 11642 $in_range = 0; 11643 } 11644 } 11645 elsif (substr($fields[$CHARNAME], 0, 1) ne '<') { 11646 11647 # Here is a non-range line. We had better not be in between range 11648 # lines. 11649 if ($in_range) { 11650 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway"); 11651 $in_range = 0; 11652 } 11653 if ($fields[$CHARNAME] =~ s/- $cp $//x) { 11654 11655 # These are code points whose names end in their code points, 11656 # which means the names are algorithmically derivable from the 11657 # code points. To shorten the output Name file, the algorithm 11658 # for deriving these is placed in the file instead of each 11659 # code point, so they have map type $CP_IN_NAME 11660 $fields[$CHARNAME] = $CMD_DELIM 11661 . $MAP_TYPE_CMD 11662 . '=' 11663 . $CP_IN_NAME 11664 . $CMD_DELIM 11665 . $fields[$CHARNAME]; 11666 } 11667 $fields[$NAME] = $fields[$CHARNAME]; 11668 } 11669 elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) { 11670 $fields[$CHARNAME] = $fields[$NAME] = $1; 11671 11672 # Here we are at the beginning of a range pair. 11673 if ($in_range) { 11674 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'. Trying anyway"); 11675 } 11676 $in_range = 1; 11677 11678 # Because the properties in the range do not overwrite any already 11679 # in the db, we must flush the buffers of what's already there, so 11680 # they get handled in the normal scheme. 11681 $force_output = 1; 11682 11683 } 11684 elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) { 11685 $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME]. Ignoring this line."); 11686 $_ = ""; 11687 return; 11688 } 11689 else { # Here, we are at the last line of a range pair. 11690 11691 if (! $in_range) { 11692 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one. Ignoring this line."); 11693 $_ = ""; 11694 return; 11695 } 11696 $in_range = 0; 11697 11698 $fields[$NAME] = $fields[$CHARNAME]; 11699 11700 # Check that the input is valid: that the closing of the range is 11701 # the same as the beginning. 11702 foreach my $i (0 .. $last_field) { 11703 next if $fields[$i] eq $previous_fields[$i]; 11704 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway"); 11705 } 11706 11707 # The processing differs depending on the type of range, 11708 # determined by its $CHARNAME 11709 if ($fields[$CHARNAME] =~ /^Hangul Syllable/) { 11710 11711 # Check that the data looks right. 11712 if ($decimal_previous_cp != $SBase) { 11713 $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong"); 11714 } 11715 if ($decimal_cp != $SBase + $SCount - 1) { 11716 $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong"); 11717 } 11718 11719 # The Hangul syllable range has a somewhat complicated name 11720 # generation algorithm. Each code point in it has a canonical 11721 # decomposition also computable by an algorithm. The 11722 # perl decomposition map table built from these is used only 11723 # by normalize.pm, which has the algorithm built in it, so the 11724 # decomposition maps are not needed, and are large, so are 11725 # omitted from it. If the full decomposition map table is to 11726 # be output, the decompositions are generated for it, in the 11727 # EOF handling code for this input file. 11728 11729 $previous_fields[$DECOMP_TYPE] = 'Canonical'; 11730 11731 # This range is stored in our internal structure with its 11732 # own map type, different from all others. 11733 $previous_fields[$CHARNAME] = $previous_fields[$NAME] 11734 = $CMD_DELIM 11735 . $MAP_TYPE_CMD 11736 . '=' 11737 . $HANGUL_SYLLABLE 11738 . $CMD_DELIM 11739 . $fields[$CHARNAME]; 11740 } 11741 elsif ($fields[$CATEGORY] eq 'Lo') { # Is a letter 11742 11743 # All the CJK ranges like this have the name given as a 11744 # special case in the next code line. And for the others, we 11745 # hope that Unicode continues to use the correct name in 11746 # future releases, so we don't have to make further special 11747 # cases. 11748 my $name = ($fields[$CHARNAME] =~ /^CJK/) 11749 ? 'CJK UNIFIED IDEOGRAPH' 11750 : uc $fields[$CHARNAME]; 11751 11752 # The name for these contains the code point itself, and all 11753 # are defined to have the same base name, regardless of what 11754 # is in the file. They are stored in our internal structure 11755 # with a map type of $CP_IN_NAME 11756 $previous_fields[$CHARNAME] = $previous_fields[$NAME] 11757 = $CMD_DELIM 11758 . $MAP_TYPE_CMD 11759 . '=' 11760 . $CP_IN_NAME 11761 . $CMD_DELIM 11762 . $name; 11763 11764 } 11765 elsif ($fields[$CATEGORY] eq 'Co' 11766 || $fields[$CATEGORY] eq 'Cs') 11767 { 11768 # The names of all the code points in these ranges are set to 11769 # null, as there are no names for the private use and 11770 # surrogate code points. 11771 11772 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = ""; 11773 } 11774 else { 11775 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY]. Attempting to process it."); 11776 } 11777 11778 # The first line of the range caused everything else to be output, 11779 # and then its values were stored as the beginning values for the 11780 # next set of ranges, which this one ends. Now, for each value, 11781 # add a command to tell the handler that these values should not 11782 # replace any existing ones in our database. 11783 foreach my $i (0 .. $last_field) { 11784 $previous_fields[$i] = $CMD_DELIM 11785 . $REPLACE_CMD 11786 . '=' 11787 . $NO 11788 . $CMD_DELIM 11789 . $previous_fields[$i]; 11790 } 11791 11792 # And change things so it looks like the entire range has been 11793 # gone through with this being the final part of it. Adding the 11794 # command above to each field will cause this range to be flushed 11795 # during the next iteration, as it guaranteed that the stored 11796 # field won't match whatever value the next one has. 11797 $previous_cp = $cp; 11798 $decimal_previous_cp = $decimal_cp; 11799 11800 # We are now set up for the next iteration; so skip the remaining 11801 # code in this subroutine that does the same thing, but doesn't 11802 # know about these ranges. 11803 $_ = ""; 11804 11805 return; 11806 } 11807 11808 # On the very first line, we fake it so the code below thinks there is 11809 # nothing to output, and initialize so that when it does get output it 11810 # uses the first line's values for the lowest part of the range. 11811 # (One could avoid this by using peek(), but then one would need to 11812 # know the adjustments done above and do the same ones in the setup 11813 # routine; not worth it) 11814 if ($first_time) { 11815 $first_time = 0; 11816 @previous_fields = @fields; 11817 @start = ($cp) x scalar @fields; 11818 $decimal_previous_cp = $decimal_cp - 1; 11819 } 11820 11821 # For each field, output the stored up ranges that this code point 11822 # doesn't fit in. Earlier we figured out if all ranges should be 11823 # terminated because of changing the replace or map type styles, or if 11824 # there is a gap between this new code point and the previous one, and 11825 # that is stored in $force_output. But even if those aren't true, we 11826 # need to output the range if this new code point's value for the 11827 # given property doesn't match the stored range's. 11828 #local $to_trace = 1 if main::DEBUG; 11829 foreach my $i (0 .. $last_field) { 11830 my $field = $fields[$i]; 11831 if ($force_output || $field ne $previous_fields[$i]) { 11832 11833 # Flush the buffer of stored values. 11834 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]"); 11835 11836 # Start a new range with this code point and its value 11837 $start[$i] = $cp; 11838 $previous_fields[$i] = $field; 11839 } 11840 } 11841 11842 # Set the values for the next time. 11843 $previous_cp = $cp; 11844 $decimal_previous_cp = $decimal_cp; 11845 11846 # The input line has generated whatever adjusted lines are needed, and 11847 # should not be looked at further. 11848 $_ = ""; 11849 return; 11850 } 11851 11852 sub EOF_UnicodeData($file) { 11853 # Called upon EOF to flush the buffers, and create the Hangul 11854 # decomposition mappings if needed. 11855 11856 # Flush the buffers. 11857 foreach my $i (0 .. $last_field) { 11858 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]"); 11859 } 11860 11861 if (-e 'Jamo.txt') { 11862 11863 # The algorithm is published by Unicode, based on values in 11864 # Jamo.txt, (which should have been processed before this 11865 # subroutine), and the results left in %Jamo 11866 unless (%Jamo) { 11867 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated."); 11868 return; 11869 } 11870 11871 # If the full decomposition map table is being output, insert 11872 # into it the Hangul syllable mappings. This is to avoid having 11873 # to publish a subroutine in it to compute them. (which would 11874 # essentially be this code.) This uses the algorithm published by 11875 # Unicode. (No hangul syllables in version 1) 11876 if ($v_version ge v2.0.0 11877 && property_ref('Decomposition_Mapping')->to_output_map) { 11878 for (my $S = $SBase; $S < $SBase + $SCount; $S++) { 11879 use integer; 11880 my $SIndex = $S - $SBase; 11881 my $L = $LBase + $SIndex / $NCount; 11882 my $V = $VBase + ($SIndex % $NCount) / $TCount; 11883 my $T = $TBase + $SIndex % $TCount; 11884 11885 trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace; 11886 my $decomposition = sprintf("%04X %04X", $L, $V); 11887 $decomposition .= sprintf(" %04X", $T) if $T != $TBase; 11888 $file->insert_adjusted_lines( 11889 sprintf("%04X; Decomposition_Mapping; %s", 11890 $S, 11891 $decomposition)); 11892 } 11893 } 11894 } 11895 11896 return; 11897 } 11898 11899 sub filter_v1_ucd($file) { 11900 # Fix UCD lines in version 1. This is probably overkill, but this 11901 # fixes some glaring errors in Version 1 UnicodeData.txt. That file: 11902 # 1) had many Hangul (U+3400 - U+4DFF) code points that were later 11903 # removed. This program retains them 11904 # 2) didn't include ranges, which it should have, and which are now 11905 # added in @corrected_lines below. It was hand populated by 11906 # taking the data from Version 2, verified by analyzing 11907 # DAge.txt. 11908 # 3) There is a syntax error in the entry for U+09F8 which could 11909 # cause problems for Unicode::UCD, and so is changed. It's 11910 # numeric value was simply a minus sign, without any number. 11911 # (Eventually Unicode changed the code point to non-numeric.) 11912 # 4) The decomposition types often don't match later versions 11913 # exactly, and the whole syntax of that field is different; so 11914 # the syntax is changed as well as the types to their later 11915 # terminology. Otherwise normalize.pm would be very unhappy 11916 # 5) Many ccc classes are different. These are left intact. 11917 # 6) U+FF10..U+FF19 are missing their numeric values in all three 11918 # fields. These are unchanged because it doesn't really cause 11919 # problems for Perl. 11920 # 7) A number of code points, such as controls, don't have their 11921 # Unicode Version 1 Names in this file. These are added. 11922 # 8) A number of Symbols were marked as Lm. This changes those in 11923 # the Latin1 range, so that regexes work. 11924 # 9) The odd characters U+03DB .. U+03E1 weren't encoded but are 11925 # referred to by their lc equivalents. Not fixed. 11926 11927 my @corrected_lines = split /\n/, <<'END'; 119284E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;; 119299FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; 11930E000;<Private Use, First>;Co;0;L;;;;;N;;;;; 11931F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;; 11932F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;; 11933FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;; 11934END 11935 11936 #local $to_trace = 1 if main::DEBUG; 11937 trace $_ if main::DEBUG && $to_trace; 11938 11939 # -1 => retain trailing null fields 11940 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 11941 11942 # At the first place that is wrong in the input, insert all the 11943 # corrections, replacing the wrong line. 11944 if ($code_point eq '4E00') { 11945 my @copy = @corrected_lines; 11946 $_ = shift @copy; 11947 ($code_point, @fields) = split /\s*;\s*/, $_, -1; 11948 11949 $file->insert_lines(@copy); 11950 } 11951 elsif ($code_point =~ /^00/ && $fields[$CATEGORY] eq 'Lm') { 11952 11953 # There are no Lm characters in Latin1; these should be 'Sk', but 11954 # there isn't that in V1. 11955 $fields[$CATEGORY] = 'So'; 11956 } 11957 11958 if ($fields[$NUMERIC] eq '-') { 11959 $fields[$NUMERIC] = '-1'; # This is what 2.0 made it. 11960 } 11961 11962 if ($fields[$PERL_DECOMPOSITION] ne "") { 11963 11964 # Several entries have this change to superscript 2 or 3 in the 11965 # middle. Convert these to the modern version, which is to use 11966 # the actual U+00B2 and U+00B3 (the superscript forms) instead. 11967 # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes 11968 # 'HHHH HHHH 00B3 HHHH'. 11969 # It turns out that all of these that don't have another 11970 # decomposition defined at the beginning of the line have the 11971 # <square> decomposition in later releases. 11972 if ($code_point ne '00B2' && $code_point ne '00B3') { 11973 if ($fields[$PERL_DECOMPOSITION] 11974 =~ s/<\+sup> 003([23]) <-sup>/00B$1/) 11975 { 11976 if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') { 11977 $fields[$PERL_DECOMPOSITION] = '<square> ' 11978 . $fields[$PERL_DECOMPOSITION]; 11979 } 11980 } 11981 } 11982 11983 # If is like '<+circled> 0052 <-circled>', convert to 11984 # '<circled> 0052' 11985 $fields[$PERL_DECOMPOSITION] =~ 11986 s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/xg; 11987 11988 # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc. 11989 $fields[$PERL_DECOMPOSITION] =~ 11990 s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x 11991 or $fields[$PERL_DECOMPOSITION] =~ 11992 s/ <join> \s* (.*?) \s* <join> /<medial> $1/x 11993 or $fields[$PERL_DECOMPOSITION] =~ 11994 s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x 11995 or $fields[$PERL_DECOMPOSITION] =~ 11996 s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x; 11997 11998 # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc. 11999 $fields[$PERL_DECOMPOSITION] =~ 12000 s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x; 12001 12002 # Change names to modern form. 12003 $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g; 12004 $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g; 12005 $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g; 12006 $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g; 12007 12008 # One entry has weird braces 12009 $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g; 12010 12011 # One entry at U+2116 has an extra <sup> 12012 $fields[$PERL_DECOMPOSITION] =~ s/( < .*? > .* ) < .*? > \ * /$1/x; 12013 } 12014 12015 $_ = join ';', $code_point, @fields; 12016 trace $_ if main::DEBUG && $to_trace; 12017 return; 12018 } 12019 12020 sub filter_bad_Nd_ucd { 12021 # Early versions specified a value in the decimal digit field even 12022 # though the code point wasn't a decimal digit. Clear the field in 12023 # that situation, so that the main code doesn't think it is a decimal 12024 # digit. 12025 12026 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 12027 if ($fields[$PERL_DECIMAL_DIGIT] ne "" && $fields[$CATEGORY] ne 'Nd') { 12028 $fields[$PERL_DECIMAL_DIGIT] = ""; 12029 $_ = join ';', $code_point, @fields; 12030 } 12031 return; 12032 } 12033 12034 my @U1_control_names = split /\n/, <<'END'; 12035NULL 12036START OF HEADING 12037START OF TEXT 12038END OF TEXT 12039END OF TRANSMISSION 12040ENQUIRY 12041ACKNOWLEDGE 12042BELL 12043BACKSPACE 12044HORIZONTAL TABULATION 12045LINE FEED 12046VERTICAL TABULATION 12047FORM FEED 12048CARRIAGE RETURN 12049SHIFT OUT 12050SHIFT IN 12051DATA LINK ESCAPE 12052DEVICE CONTROL ONE 12053DEVICE CONTROL TWO 12054DEVICE CONTROL THREE 12055DEVICE CONTROL FOUR 12056NEGATIVE ACKNOWLEDGE 12057SYNCHRONOUS IDLE 12058END OF TRANSMISSION BLOCK 12059CANCEL 12060END OF MEDIUM 12061SUBSTITUTE 12062ESCAPE 12063FILE SEPARATOR 12064GROUP SEPARATOR 12065RECORD SEPARATOR 12066UNIT SEPARATOR 12067DELETE 12068BREAK PERMITTED HERE 12069NO BREAK HERE 12070INDEX 12071NEXT LINE 12072START OF SELECTED AREA 12073END OF SELECTED AREA 12074CHARACTER TABULATION SET 12075CHARACTER TABULATION WITH JUSTIFICATION 12076LINE TABULATION SET 12077PARTIAL LINE DOWN 12078PARTIAL LINE UP 12079REVERSE LINE FEED 12080SINGLE SHIFT TWO 12081SINGLE SHIFT THREE 12082DEVICE CONTROL STRING 12083PRIVATE USE ONE 12084PRIVATE USE TWO 12085SET TRANSMIT STATE 12086CANCEL CHARACTER 12087MESSAGE WAITING 12088START OF GUARDED AREA 12089END OF GUARDED AREA 12090START OF STRING 12091SINGLE CHARACTER INTRODUCER 12092CONTROL SEQUENCE INTRODUCER 12093STRING TERMINATOR 12094OPERATING SYSTEM COMMAND 12095PRIVACY MESSAGE 12096APPLICATION PROGRAM COMMAND 12097END 12098 12099 sub filter_early_U1_names { 12100 # Very early versions did not have the Unicode_1_name field specified. 12101 # They differed in which ones were present; make sure a U1 name 12102 # exists, so that Unicode::UCD::charinfo will work 12103 12104 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 12105 12106 12107 # @U1_control names above are entirely positional, so we pull them out 12108 # in the exact order required, with gaps for the ones that don't have 12109 # names. 12110 if ($code_point =~ /^00[01]/ 12111 || $code_point eq '007F' 12112 || $code_point =~ /^008[2-9A-F]/ 12113 || $code_point =~ /^009[0-8A-F]/) 12114 { 12115 my $u1_name = shift @U1_control_names; 12116 $fields[$UNICODE_1_NAME] = $u1_name unless $fields[$UNICODE_1_NAME]; 12117 $_ = join ';', $code_point, @fields; 12118 } 12119 return; 12120 } 12121 12122 sub filter_v2_1_5_ucd { 12123 # A dozen entries in this 2.1.5 file had the mirrored and numeric 12124 # columns swapped; These all had mirrored be 'N'. So if the numeric 12125 # column appears to be N, swap it back. 12126 12127 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 12128 if ($fields[$NUMERIC] eq 'N') { 12129 $fields[$NUMERIC] = $fields[$MIRRORED]; 12130 $fields[$MIRRORED] = 'N'; 12131 $_ = join ';', $code_point, @fields; 12132 } 12133 return; 12134 } 12135 12136 sub filter_v6_ucd { 12137 12138 # Unicode 6.0 co-opted the name BELL for U+1F514, but until 5.17, 12139 # it wasn't accepted, to allow for some deprecation cycles. This 12140 # function is not called after 5.16 12141 12142 return if $_ !~ /^(?:0007|1F514|070F);/; 12143 12144 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 12145 if ($code_point eq '0007') { 12146 $fields[$CHARNAME] = ""; 12147 } 12148 elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see 12149 # http://www.unicode.org/versions/corrigendum8.html 12150 $fields[$BIDI] = "AL"; 12151 } 12152 elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name 12153 $fields[$CHARNAME] = ""; 12154 } 12155 12156 $_ = join ';', $code_point, @fields; 12157 12158 return; 12159 } 12160} # End closure for UnicodeData 12161 12162sub process_GCB_test($file) { 12163 12164 while ($file->next_line) { 12165 push @backslash_X_tests, $_; 12166 } 12167 12168 return; 12169} 12170 12171sub process_LB_test($file) { 12172 12173 while ($file->next_line) { 12174 push @LB_tests, $_; 12175 } 12176 12177 return; 12178} 12179 12180sub process_SB_test($file) { 12181 12182 while ($file->next_line) { 12183 push @SB_tests, $_; 12184 } 12185 12186 return; 12187} 12188 12189sub process_WB_test($file) { 12190 12191 while ($file->next_line) { 12192 push @WB_tests, $_; 12193 } 12194 12195 return; 12196} 12197 12198sub process_NamedSequences($file) { 12199 # NamedSequences.txt entries are just added to an array. Because these 12200 # don't look like the other tables, they have their own handler. 12201 # An example: 12202 # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300 12203 # 12204 # This just adds the sequence to an array for later handling 12205 12206 while ($file->next_line) { 12207 my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1; 12208 if (@remainder) { 12209 $file->carp_bad_line( 12210 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'"); 12211 next; 12212 } 12213 12214 # Code points need to be 5 digits long like the other entries in 12215 # Name.pl, for regcomp.c parsing; and the ones below 0x0100 need to be 12216 # converted to native 12217 $sequence = join " ", map { sprintf("%05X", 12218 utf8::unicode_to_native(hex $_)) 12219 } split / /, $sequence; 12220 push @named_sequences, "$sequence\n$name\n"; 12221 } 12222 return; 12223} 12224 12225{ # Closure 12226 12227 my $first_range; 12228 12229 sub filter_early_ea_lb { 12230 # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a 12231 # third field be the name of the code point, which can be ignored in 12232 # most cases. But it can be meaningful if it marks a range: 12233 # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE 12234 # 3400;W;<CJK Ideograph Extension A, First> 12235 # 12236 # We need to see the First in the example above to know it's a range. 12237 # They did not use the later range syntaxes. This routine changes it 12238 # to use the modern syntax. 12239 # $1 is the Input_file object. 12240 12241 my @fields = split /\s*;\s*/; 12242 if ($fields[2] =~ /^<.*, First>/) { 12243 $first_range = $fields[0]; 12244 $_ = ""; 12245 } 12246 elsif ($fields[2] =~ /^<.*, Last>/) { 12247 $_ = $_ = "$first_range..$fields[0]; $fields[1]"; 12248 } 12249 else { 12250 undef $first_range; 12251 $_ = "$fields[0]; $fields[1]"; 12252 } 12253 12254 return; 12255 } 12256} 12257 12258sub filter_substitute_lb { 12259 # Used on Unicodes that predate the LB property, where there is a 12260 # substitute file. This just does the regular ea_lb handling for such 12261 # files, and then substitutes the long property value name for the short 12262 # one that comes with the file. (The other break files have the long 12263 # names in them, so this is the odd one out.) The reason for doing this 12264 # kludge is that regen/mk_invlists.pl is expecting the long name. This 12265 # also fixes the typo 'Inseperable' that leads to problems. 12266 12267 filter_early_ea_lb; 12268 return unless $_; 12269 12270 my @fields = split /\s*;\s*/; 12271 $fields[1] = property_ref('_Perl_LB')->table($fields[1])->full_name; 12272 $fields[1] = 'Inseparable' if lc $fields[1] eq 'inseperable'; 12273 $_ = join '; ', @fields; 12274} 12275 12276sub filter_old_style_arabic_shaping { 12277 # Early versions used a different term for the later one. 12278 12279 my @fields = split /\s*;\s*/; 12280 $fields[3] =~ s/<no shaping>/No_Joining_Group/; 12281 $fields[3] =~ s/\s+/_/g; # Change spaces to underscores 12282 $_ = join ';', @fields; 12283 return; 12284} 12285 12286{ # Closure 12287 my $lc; # Table for lowercase mapping 12288 my $tc; 12289 my $uc; 12290 my %special_casing_code_points; 12291 12292 sub setup_special_casing($file) { 12293 # SpecialCasing.txt contains the non-simple case change mappings. The 12294 # simple ones are in UnicodeData.txt, which should already have been 12295 # read in to the full property data structures, so as to initialize 12296 # these with the simple ones. Then the SpecialCasing.txt entries 12297 # add or overwrite the ones which have different full mappings. 12298 12299 # This routine sees if the simple mappings are to be output, and if 12300 # so, copies what has already been put into the full mapping tables, 12301 # while they still contain only the simple mappings. 12302 12303 # The reason it is done this way is that the simple mappings are 12304 # probably not going to be output, so it saves work to initialize the 12305 # full tables with the simple mappings, and then overwrite those 12306 # relatively few entries in them that have different full mappings, 12307 # and thus skip the simple mapping tables altogether. 12308 12309 $lc = property_ref('lc'); 12310 $tc = property_ref('tc'); 12311 $uc = property_ref('uc'); 12312 12313 # For each of the case change mappings... 12314 foreach my $full_casing_table ($lc, $tc, $uc) { 12315 my $full_casing_name = $full_casing_table->name; 12316 my $full_casing_full_name = $full_casing_table->full_name; 12317 unless (defined $full_casing_table 12318 && ! $full_casing_table->is_empty) 12319 { 12320 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated."); 12321 } 12322 12323 $full_casing_table->add_comment(join_lines( <<END 12324This file includes both the simple and full case changing maps. The simple 12325ones are in the main body of the table below, and the full ones adding to or 12326overriding them are in the hash. 12327END 12328 )); 12329 12330 # The simple version's name in each mapping merely has an 's' in 12331 # front of the full one's 12332 my $simple_name = 's' . $full_casing_name; 12333 my $simple = property_ref($simple_name); 12334 $simple->initialize($full_casing_table) if $simple->to_output_map(); 12335 } 12336 12337 return; 12338 } 12339 12340 sub filter_2_1_8_special_casing_line { 12341 12342 # This version had duplicate entries in this file. Delete all but the 12343 # first one 12344 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null 12345 # fields 12346 if (exists $special_casing_code_points{$fields[0]}) { 12347 $_ = ""; 12348 return; 12349 } 12350 12351 $special_casing_code_points{$fields[0]} = 1; 12352 filter_special_casing_line(@_); 12353 } 12354 12355 sub filter_special_casing_line($file) { 12356 # Change the format of $_ from SpecialCasing.txt into something that 12357 # the generic handler understands. Each input line contains three 12358 # case mappings. This will generate three lines to pass to the 12359 # generic handler for each of those. 12360 12361 # The input syntax (after stripping comments and trailing white space 12362 # is like one of the following (with the final two being entries that 12363 # we ignore): 12364 # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S 12365 # 03A3; 03C2; 03A3; 03A3; Final_Sigma; 12366 # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE 12367 # Note the trailing semi-colon, unlike many of the input files. That 12368 # means that there will be an extra null field generated by the split 12369 12370 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null 12371 # fields 12372 12373 # field #4 is when this mapping is conditional. If any of these get 12374 # implemented, it would be by hard-coding in the casing functions in 12375 # the Perl core, not through tables. But if there is a new condition 12376 # we don't know about, output a warning. We know about all the 12377 # conditions through 6.0 12378 if ($fields[4] ne "") { 12379 my @conditions = split ' ', $fields[4]; 12380 if ($conditions[0] ne 'tr' # We know that these languages have 12381 # conditions, and some are multiple 12382 && $conditions[0] ne 'az' 12383 && $conditions[0] ne 'lt' 12384 12385 # And, we know about a single condition Final_Sigma, but 12386 # nothing else. 12387 && ($v_version gt v5.2.0 12388 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma'))) 12389 { 12390 $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore"); 12391 } 12392 elsif ($conditions[0] ne 'Final_Sigma') { 12393 12394 # Don't print out a message for Final_Sigma, because we 12395 # have hard-coded handling for it. (But the standard 12396 # could change what the rule should be, but it wouldn't 12397 # show up here anyway. 12398 12399 print "# SKIPPING Special Casing: $_\n" 12400 if $verbosity >= $VERBOSE; 12401 } 12402 $_ = ""; 12403 return; 12404 } 12405 elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) { 12406 $file->carp_bad_line('Extra fields'); 12407 $_ = ""; 12408 return; 12409 } 12410 12411 my $decimal_code_point = hex $fields[0]; 12412 12413 # Loop to handle each of the three mappings in the input line, in 12414 # order, with $i indicating the current field number. 12415 my $i = 0; 12416 for my $object ($lc, $tc, $uc) { 12417 $i++; # First time through, $i = 0 ... 3rd time = 3 12418 12419 my $value = $object->value_of($decimal_code_point); 12420 $value = ($value eq $CODE_POINT) 12421 ? $decimal_code_point 12422 : hex $value; 12423 12424 # If this isn't a multi-character mapping, it should already have 12425 # been read in. 12426 if ($fields[$i] !~ / /) { 12427 if ($value != hex $fields[$i]) { 12428 Carp::my_carp("Bad news. UnicodeData.txt thinks " 12429 . $object->name 12430 . "(0x$fields[0]) is $value" 12431 . " and SpecialCasing.txt thinks it is " 12432 . hex($fields[$i]) 12433 . ". Good luck. Retaining UnicodeData value, and proceeding anyway."); 12434 } 12435 } 12436 else { 12437 12438 # The mapping is additional, beyond the simple mapping. 12439 $file->insert_adjusted_lines("$fields[0]; " 12440 . $object->name 12441 . "; " 12442 . $CMD_DELIM 12443 . "$REPLACE_CMD=$MULTIPLE_BEFORE" 12444 . $CMD_DELIM 12445 . $fields[$i]); 12446 } 12447 } 12448 12449 # Everything has been handled by the insert_adjusted_lines() 12450 $_ = ""; 12451 12452 return; 12453 } 12454} 12455 12456sub filter_old_style_case_folding($file) { 12457 # This transforms $_ containing the case folding style of 3.0.1, to 3.1 12458 # and later style. Different letters were used in the earlier. 12459 12460 my @fields = split /\s*;\s*/; 12461 12462 if ($fields[1] eq 'L') { 12463 $fields[1] = 'C'; # L => C always 12464 } 12465 elsif ($fields[1] eq 'E') { 12466 if ($fields[2] =~ / /) { # E => C if one code point; F otherwise 12467 $fields[1] = 'F' 12468 } 12469 else { 12470 $fields[1] = 'C' 12471 } 12472 } 12473 else { 12474 $file->carp_bad_line("Expecting L or E in second field"); 12475 $_ = ""; 12476 return; 12477 } 12478 $_ = join("; ", @fields) . ';'; 12479 return; 12480} 12481 12482{ # Closure for case folding 12483 12484 # Create the map for simple only if are going to output it, for otherwise 12485 # it takes no part in anything we do. 12486 my $to_output_simple; 12487 12488 sub setup_case_folding { 12489 # Read in the case foldings in CaseFolding.txt. This handles both 12490 # simple and full case folding. 12491 12492 $to_output_simple 12493 = property_ref('Simple_Case_Folding')->to_output_map; 12494 12495 if (! $to_output_simple) { 12496 property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding'); 12497 } 12498 12499 # If we ever wanted to show that these tables were combined, a new 12500 # property method could be created, like set_combined_props() 12501 property_ref('Case_Folding')->add_comment(join_lines( <<END 12502This file includes both the simple and full case folding maps. The simple 12503ones are in the main body of the table below, and the full ones adding to or 12504overriding them are in the hash. 12505END 12506 )); 12507 return; 12508 } 12509 12510 sub filter_case_folding_line($file) { 12511 # Called for each line in CaseFolding.txt 12512 # Input lines look like: 12513 # 0041; C; 0061; # LATIN CAPITAL LETTER A 12514 # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S 12515 # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S 12516 # 12517 # 'C' means that folding is the same for both simple and full 12518 # 'F' that it is only for full folding 12519 # 'S' that it is only for simple folding 12520 # 'T' is locale-dependent, and ignored 12521 # 'I' is a type of 'F' used in some early releases. 12522 # Note the trailing semi-colon, unlike many of the input files. That 12523 # means that there will be an extra null field generated by the split 12524 # below, which we ignore and hence is not an error. 12525 12526 my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1; 12527 if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) { 12528 $file->carp_bad_line('Extra fields'); 12529 $_ = ""; 12530 return; 12531 } 12532 12533 if ($type =~ / ^ [IT] $/x) { # Skip Turkic case folding, is locale dependent 12534 $_ = ""; 12535 return; 12536 } 12537 12538 # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase 12539 # I are all full foldings; S is single-char. For S, there is always 12540 # an F entry, so we must allow multiple values for the same code 12541 # point. Fortunately this table doesn't need further manipulation 12542 # which would preclude using multiple-values. The S is now included 12543 # so that _swash_inversion_hash() is able to construct closures 12544 # without having to worry about F mappings. 12545 if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') { 12546 $_ = "$range; Case_Folding; " 12547 . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map"; 12548 } 12549 else { 12550 $_ = ""; 12551 $file->carp_bad_line('Expecting C F I S or T in second field'); 12552 } 12553 12554 # C and S are simple foldings, but simple case folding is not needed 12555 # unless we explicitly want its map table output. 12556 if ($to_output_simple && $type eq 'C' || $type eq 'S') { 12557 $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map"); 12558 } 12559 12560 return; 12561 } 12562 12563} # End case fold closure 12564 12565sub filter_jamo_line { 12566 # Filter Jamo.txt lines. This routine mainly is used to populate hashes 12567 # from this file that is used in generating the Name property for Jamo 12568 # code points. But, it also is used to convert early versions' syntax 12569 # into the modern form. Here are two examples: 12570 # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax 12571 # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax 12572 # 12573 # The input is $_, the output is $_ filtered. 12574 12575 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 12576 12577 # Let the caller handle unexpected input. In earlier versions, there was 12578 # a third field which is supposed to be a comment, but did not have a '#' 12579 # before it. 12580 return if @fields > (($v_version gt v3.0.0) ? 2 : 3); 12581 12582 $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous 12583 # beginning. 12584 12585 # Some 2.1 versions had this wrong. Causes havoc with the algorithm. 12586 $fields[1] = 'R' if $fields[0] eq '1105'; 12587 12588 # Add to structure so can generate Names from it. 12589 my $cp = hex $fields[0]; 12590 my $short_name = $fields[1]; 12591 $Jamo{$cp} = $short_name; 12592 if ($cp <= $LBase + $LCount) { 12593 $Jamo_L{$short_name} = $cp - $LBase; 12594 } 12595 elsif ($cp <= $VBase + $VCount) { 12596 $Jamo_V{$short_name} = $cp - $VBase; 12597 } 12598 elsif ($cp <= $TBase + $TCount) { 12599 $Jamo_T{$short_name} = $cp - $TBase; 12600 } 12601 else { 12602 Carp::my_carp_bug("Unexpected Jamo code point in $_"); 12603 } 12604 12605 12606 # Reassemble using just the first two fields to look like a typical 12607 # property file line 12608 $_ = "$fields[0]; $fields[1]"; 12609 12610 return; 12611} 12612 12613sub register_fraction($rational) { 12614 # This registers the input rational number so that it can be passed on to 12615 # Unicode::UCD, both in rational and floating forms. 12616 12617 my $floating = eval $rational; 12618 12619 my @floats = sprintf "%.*e", $E_FLOAT_PRECISION, $floating; 12620 12621 # See if the denominator is a power of 2. 12622 $rational =~ m!.*/(.*)!; 12623 my $denominator = $1; 12624 if (defined $denominator && (($denominator & ($denominator - 1)) == 0)) { 12625 12626 # Here the denominator is a power of 2. This means it has an exact 12627 # representation in binary, so rounding could go either way. It turns 12628 # out that Windows doesn't necessarily round towards even, so output 12629 # an extra entry. This happens when the final digit we output is even 12630 # and the next digits would be 50* to the precision of the machine. 12631 my $extra_digit_float = sprintf "%e", $floating; 12632 my $q = $E_FLOAT_PRECISION - 1; 12633 if ($extra_digit_float =~ / ( .* \. \d{$q} ) 12634 ( [02468] ) 5 0* ( e .*) 12635 /ix) 12636 { 12637 push @floats, $1 . ($2 + 1) . $3; 12638 } 12639 } 12640 12641 foreach my $float (@floats) { 12642 # Strip off any leading zeros beyond 2 digits to make it C99 12643 # compliant. (Windows has 3 digit exponents, contrary to C99) 12644 $float =~ s/ ( .* e [-+] ) 0* ( \d{2,}? ) /$1$2/x; 12645 12646 if ( defined $nv_floating_to_rational{$float} 12647 && $nv_floating_to_rational{$float} ne $rational) 12648 { 12649 die Carp::my_carp_bug("Both '$rational' and" 12650 . " '$nv_floating_to_rational{$float}' evaluate to" 12651 . " the same floating point number." 12652 . " \$E_FLOAT_PRECISION must be increased"); 12653 } 12654 $nv_floating_to_rational{$float} = $rational; 12655 } 12656 return; 12657} 12658 12659sub gcd($a, $b) { # Greatest-common-divisor; from 12660 # http://en.wikipedia.org/wiki/Euclidean_algorithm 12661 use integer; 12662 12663 while ($b != 0) { 12664 my $temp = $b; 12665 $b = $a % $b; 12666 $a = $temp; 12667 } 12668 return $a; 12669} 12670 12671sub reduce_fraction($fraction_ref) { 12672 # Reduce a fraction to lowest terms. The Unicode data may be reducible, 12673 # hence this is needed. The argument is a reference to the 12674 # string denoting the fraction, which must be of the form: 12675 if ($$fraction_ref !~ / ^ (-?) (\d+) \/ (\d+) $ /ax) { 12676 Carp::my_carp_bug("Non-fraction input '$$fraction_ref'. Unchanged"); 12677 return; 12678 } 12679 12680 my $sign = $1; 12681 my $numerator = $2; 12682 my $denominator = $3; 12683 12684 use integer; 12685 12686 # Find greatest common divisor 12687 my $gcd = gcd($numerator, $denominator); 12688 12689 # And reduce using the gcd. 12690 if ($gcd != 1) { 12691 $numerator /= $gcd; 12692 $denominator /= $gcd; 12693 $$fraction_ref = "$sign$numerator/$denominator"; 12694 } 12695 12696 return; 12697} 12698 12699sub filter_numeric_value_line($file) { 12700 # DNumValues contains lines of a different syntax than the typical 12701 # property file: 12702 # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO 12703 # 12704 # This routine transforms $_ containing the anomalous syntax to the 12705 # typical, by filtering out the extra columns, and convert early version 12706 # decimal numbers to strings that look like rational numbers. 12707 12708 # Starting in 5.1, there is a rational field. Just use that, omitting the 12709 # extra columns. Otherwise convert the decimal number in the second field 12710 # to a rational, and omit extraneous columns. 12711 my @fields = split /\s*;\s*/, $_, -1; 12712 my $rational; 12713 12714 if ($v_version ge v5.1.0) { 12715 if (@fields != 4) { 12716 $file->carp_bad_line('Not 4 semi-colon separated fields'); 12717 $_ = ""; 12718 return; 12719 } 12720 reduce_fraction(\$fields[3]) if $fields[3] =~ qr{/}; 12721 $rational = $fields[3]; 12722 12723 $_ = join '; ', @fields[ 0, 3 ]; 12724 } 12725 else { 12726 12727 # Here, is an older Unicode file, which has decimal numbers instead of 12728 # rationals in it. Use the fraction to calculate the denominator and 12729 # convert to rational. 12730 12731 if (@fields != 2 && @fields != 3) { 12732 $file->carp_bad_line('Not 2 or 3 semi-colon separated fields'); 12733 $_ = ""; 12734 return; 12735 } 12736 12737 my $codepoints = $fields[0]; 12738 my $decimal = $fields[1]; 12739 if ($decimal =~ s/\.0+$//) { 12740 12741 # Anything ending with a decimal followed by nothing but 0's is an 12742 # integer 12743 $_ = "$codepoints; $decimal"; 12744 $rational = $decimal; 12745 } 12746 else { 12747 12748 my $denominator; 12749 if ($decimal =~ /\.50*$/) { 12750 $denominator = 2; 12751 } 12752 12753 # Here have the hardcoded repeating decimals in the fraction, and 12754 # the denominator they imply. There were only a few denominators 12755 # in the older Unicode versions of this file which this code 12756 # handles, so it is easy to convert them. 12757 12758 # The 4 is because of a round-off error in the Unicode 3.2 files 12759 elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) { 12760 $denominator = 3; 12761 } 12762 elsif ($decimal =~ /\.[27]50*$/) { 12763 $denominator = 4; 12764 } 12765 elsif ($decimal =~ /\.[2468]0*$/) { 12766 $denominator = 5; 12767 } 12768 elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) { 12769 $denominator = 6; 12770 } 12771 elsif ($decimal =~ /\.(12|37|62|87)50*$/) { 12772 $denominator = 8; 12773 } 12774 if ($denominator) { 12775 my $sign = ($decimal < 0) ? "-" : ""; 12776 my $numerator = int((abs($decimal) * $denominator) + .5); 12777 $rational = "$sign$numerator/$denominator"; 12778 $_ = "$codepoints; $rational"; 12779 } 12780 else { 12781 $file->carp_bad_line("Can't cope with number '$decimal'."); 12782 $_ = ""; 12783 return; 12784 } 12785 } 12786 } 12787 12788 register_fraction($rational) if $rational =~ qr{/}; 12789 return; 12790} 12791 12792{ # Closure 12793 my %unihan_properties; 12794 12795 sub construct_unihan($file_object) { 12796 12797 return unless file_exists($file_object->file); 12798 12799 if ($v_version lt v4.0.0) { 12800 push @cjk_properties, 'URS ; Unicode_Radical_Stroke'; 12801 push @cjk_property_values, split "\n", <<'END'; 12802# @missing: 0000..10FFFF; Unicode_Radical_Stroke; <none> 12803END 12804 } 12805 12806 if ($v_version ge v3.0.0) { 12807 push @cjk_properties, split "\n", <<'END'; 12808cjkIRG_GSource; kIRG_GSource 12809cjkIRG_JSource; kIRG_JSource 12810cjkIRG_KSource; kIRG_KSource 12811cjkIRG_TSource; kIRG_TSource 12812cjkIRG_VSource; kIRG_VSource 12813END 12814 push @cjk_property_values, split "\n", <<'END'; 12815# @missing: 0000..10FFFF; cjkIRG_GSource; <none> 12816# @missing: 0000..10FFFF; cjkIRG_JSource; <none> 12817# @missing: 0000..10FFFF; cjkIRG_KSource; <none> 12818# @missing: 0000..10FFFF; cjkIRG_TSource; <none> 12819# @missing: 0000..10FFFF; cjkIRG_VSource; <none> 12820END 12821 } 12822 if ($v_version ge v3.1.0) { 12823 push @cjk_properties, 'cjkIRG_HSource; kIRG_HSource'; 12824 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_HSource; <none>'; 12825 } 12826 if ($v_version ge v3.1.1) { 12827 push @cjk_properties, 'cjkIRG_KPSource; kIRG_KPSource'; 12828 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_KPSource; <none>'; 12829 } 12830 if ($v_version ge v3.2.0) { 12831 push @cjk_properties, split "\n", <<'END'; 12832cjkAccountingNumeric; kAccountingNumeric 12833cjkCompatibilityVariant; kCompatibilityVariant 12834cjkOtherNumeric; kOtherNumeric 12835cjkPrimaryNumeric; kPrimaryNumeric 12836END 12837 push @cjk_property_values, split "\n", <<'END'; 12838# @missing: 0000..10FFFF; cjkAccountingNumeric; NaN 12839# @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point> 12840# @missing: 0000..10FFFF; cjkOtherNumeric; NaN 12841# @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN 12842END 12843 } 12844 if ($v_version gt v4.0.0) { 12845 push @cjk_properties, 'cjkIRG_USource; kIRG_USource'; 12846 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_USource; <none>'; 12847 } 12848 12849 if ($v_version ge v4.1.0) { 12850 push @cjk_properties, 'cjkIICore ; kIICore'; 12851 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIICore; <none>'; 12852 } 12853 } 12854 12855 sub setup_unihan { 12856 # Do any special setup for Unihan properties. 12857 12858 # This property gives the wrong computed type, so override. 12859 my $usource = property_ref('kIRG_USource'); 12860 $usource->set_type($STRING) if defined $usource; 12861 12862 # This property is to be considered binary (it says so in 12863 # http://www.unicode.org/reports/tr38/) 12864 my $iicore = property_ref('kIICore'); 12865 if (defined $iicore) { 12866 $iicore->set_type($FORCED_BINARY); 12867 $iicore->table("Y")->add_note("Matches any code point which has a non-null value for this property; see unicode.org UAX #38."); 12868 12869 # Unicode doesn't include the maps for this property, so don't 12870 # warn that they are missing. 12871 $iicore->set_pre_declared_maps(0); 12872 $iicore->add_comment(join_lines( <<END 12873This property contains string values, but any non-empty ones are considered to 12874be 'core', so Perl creates tables for both: 1) its string values, plus 2) 12875tables so that \\p{kIICore} matches any code point which has a non-empty 12876value for this property. 12877END 12878 )); 12879 } 12880 12881 return; 12882 } 12883 12884 sub filter_unihan_line { 12885 # Change unihan db lines to look like the others in the db. Here is 12886 # an input sample: 12887 # U+341C kCangjie IEKN 12888 12889 # Tabs are used instead of semi-colons to separate fields; therefore 12890 # they may have semi-colons embedded in them. Change these to periods 12891 # so won't screw up the rest of the code. 12892 s/;/./g; 12893 12894 # Remove lines that don't look like ones we accept. 12895 if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) { 12896 $_ = ""; 12897 return; 12898 } 12899 12900 # Extract the property, and save a reference to its object. 12901 my $property = $1; 12902 if (! exists $unihan_properties{$property}) { 12903 $unihan_properties{$property} = property_ref($property); 12904 } 12905 12906 # Don't do anything unless the property is one we're handling, which 12907 # we determine by seeing if there is an object defined for it or not 12908 if (! defined $unihan_properties{$property}) { 12909 $_ = ""; 12910 return; 12911 } 12912 12913 # Convert the tab separators to our standard semi-colons, and convert 12914 # the U+HHHH notation to the rest of the standard's HHHH 12915 s/\t/;/g; 12916 s/\b U \+ (?= $code_point_re )//xg; 12917 12918 #local $to_trace = 1 if main::DEBUG; 12919 trace $_ if main::DEBUG && $to_trace; 12920 12921 return; 12922 } 12923} 12924 12925sub filter_blocks_lines($file) { 12926 # In the Blocks.txt file, the names of the blocks don't quite match the 12927 # names given in PropertyValueAliases.txt, so this changes them so they 12928 # do match: Blanks and hyphens are changed into underscores. Also makes 12929 # early release versions look like later ones 12930 # 12931 # $_ is transformed to the correct value. 12932 12933 if ($v_version lt v3.2.0) { 12934 if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted 12935 $_ = ""; 12936 return; 12937 } 12938 12939 # Old versions used a different syntax to mark the range. 12940 $_ =~ s/;\s+/../ if $v_version lt v3.1.0; 12941 } 12942 12943 my @fields = split /\s*;\s*/, $_, -1; 12944 if (@fields != 2) { 12945 $file->carp_bad_line("Expecting exactly two fields"); 12946 $_ = ""; 12947 return; 12948 } 12949 12950 # Change hyphens and blanks in the block name field only 12951 $fields[1] =~ s/[ -]/_/g; 12952 $fields[1] =~ s/_ ( [a-z] ) /_\u$1/xg; # Capitalize first letter of word 12953 12954 $_ = join("; ", @fields); 12955 return; 12956} 12957 12958{ # Closure 12959 my $current_property; 12960 12961 sub filter_old_style_proplist { 12962 # PropList.txt has been in Unicode since version 2.0. Until 3.1, it 12963 # was in a completely different syntax. Ken Whistler of Unicode says 12964 # that it was something he used as an aid for his own purposes, but 12965 # was never an official part of the standard. Many of the properties 12966 # in it were incorporated into the later PropList.txt, but some were 12967 # not. This program uses this early file to generate property tables 12968 # that are otherwise not accessible in the early UCD's. It does this 12969 # for the ones that eventually became official, and don't appear to be 12970 # too different in their contents from the later official version, and 12971 # throws away the rest. It could be argued that the ones it generates 12972 # were probably not really official at that time, so should be 12973 # ignored. You can easily modify things to skip all of them by 12974 # changing this function to just set $_ to "", and return; and to skip 12975 # certain of them by simply removing their declarations from 12976 # get_old_property_aliases(). 12977 # 12978 # Here is a list of all the ones that are thrown away: 12979 # Alphabetic The definitions for this are very 12980 # defective, so better to not mislead 12981 # people into thinking it works. 12982 # Instead the Perl extension of the 12983 # same name is constructed from first 12984 # principles. 12985 # Bidi=* duplicates UnicodeData.txt 12986 # Combining never made into official property; 12987 # is \P{ccc=0} 12988 # Composite never made into official property. 12989 # Currency Symbol duplicates UnicodeData.txt: gc=sc 12990 # Decimal Digit duplicates UnicodeData.txt: gc=nd 12991 # Delimiter never made into official property; 12992 # removed in 3.0.1 12993 # Format Control never made into official property; 12994 # similar to gc=cf 12995 # High Surrogate duplicates Blocks.txt 12996 # Ignorable Control never made into official property; 12997 # similar to di=y 12998 # ISO Control duplicates UnicodeData.txt: gc=cc 12999 # Left of Pair never made into official property; 13000 # Line Separator duplicates UnicodeData.txt: gc=zl 13001 # Low Surrogate duplicates Blocks.txt 13002 # Non-break was actually listed as a property 13003 # in 3.2, but without any code 13004 # points. Unicode denies that this 13005 # was ever an official property 13006 # Non-spacing duplicate UnicodeData.txt: gc=mn 13007 # Numeric duplicates UnicodeData.txt: gc=cc 13008 # Paired Punctuation never made into official property; 13009 # appears to be gc=ps + gc=pe 13010 # Paragraph Separator duplicates UnicodeData.txt: gc=cc 13011 # Private Use duplicates UnicodeData.txt: gc=co 13012 # Private Use High Surrogate duplicates Blocks.txt 13013 # Punctuation duplicates UnicodeData.txt: gc=p 13014 # Space different definition than eventual 13015 # one. 13016 # Titlecase duplicates UnicodeData.txt: gc=lt 13017 # Unassigned Code Value duplicates UnicodeData.txt: gc=cn 13018 # Zero-width never made into official property; 13019 # subset of gc=cf 13020 # Most of the properties have the same names in this file as in later 13021 # versions, but a couple do not. 13022 # 13023 # This subroutine filters $_, converting it from the old style into 13024 # the new style. Here's a sample of the old-style 13025 # 13026 # ******************************************* 13027 # 13028 # Property dump for: 0x100000A0 (Join Control) 13029 # 13030 # 200C..200D (2 chars) 13031 # 13032 # In the example, the property is "Join Control". It is kept in this 13033 # closure between calls to the subroutine. The numbers beginning with 13034 # 0x were internal to Ken's program that generated this file. 13035 13036 # If this line contains the property name, extract it. 13037 if (/^Property dump for: [^(]*\((.*)\)/) { 13038 $_ = $1; 13039 13040 # Convert white space to underscores. 13041 s/ /_/g; 13042 13043 # Convert the few properties that don't have the same name as 13044 # their modern counterparts 13045 s/Identifier_Part/ID_Continue/ 13046 or s/Not_a_Character/NChar/; 13047 13048 # If the name matches an existing property, use it. 13049 if (defined property_ref($_)) { 13050 trace "new property=", $_ if main::DEBUG && $to_trace; 13051 $current_property = $_; 13052 } 13053 else { # Otherwise discard it 13054 trace "rejected property=", $_ if main::DEBUG && $to_trace; 13055 undef $current_property; 13056 } 13057 $_ = ""; # The property is saved for the next lines of the 13058 # file, but this defining line is of no further use, 13059 # so clear it so that the caller won't process it 13060 # further. 13061 } 13062 elsif (! defined $current_property || $_ !~ /^$code_point_re/) { 13063 13064 # Here, the input line isn't a header defining a property for the 13065 # following section, and either we aren't in such a section, or 13066 # the line doesn't look like one that defines the code points in 13067 # such a section. Ignore this line. 13068 $_ = ""; 13069 } 13070 else { 13071 13072 # Here, we have a line defining the code points for the current 13073 # stashed property. Anything starting with the first blank is 13074 # extraneous. Otherwise, it should look like a normal range to 13075 # the caller. Append the property name so that it looks just like 13076 # a modern PropList entry. 13077 13078 $_ =~ s/\s.*//; 13079 $_ .= "; $current_property"; 13080 } 13081 trace $_ if main::DEBUG && $to_trace; 13082 return; 13083 } 13084} # End closure for old style proplist 13085 13086sub filter_old_style_normalization_lines { 13087 # For early releases of Unicode, the lines were like: 13088 # 74..2A76 ; NFKD_NO 13089 # For later releases this became: 13090 # 74..2A76 ; NFKD_QC; N 13091 # Filter $_ to look like those in later releases. 13092 # Similarly for MAYBEs 13093 13094 s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x; 13095 13096 # Also, the property FC_NFKC was abbreviated to FNC 13097 s/FNC/FC_NFKC/; 13098 return; 13099} 13100 13101sub setup_script_extensions { 13102 # The Script_Extensions property starts out with a clone of the Script 13103 # property. 13104 13105 $scx = property_ref("Script_Extensions"); 13106 return unless defined $scx; 13107 13108 $scx->_set_format($STRING_WHITE_SPACE_LIST); 13109 $scx->initialize($script); 13110 $scx->set_default_map($script->default_map); 13111 $scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these 13112 $scx->add_comment(join_lines( <<END 13113The values for code points that appear in one script are just the same as for 13114the 'Script' property. Likewise the values for those that appear in many 13115scripts are either 'Common' or 'Inherited', same as with 'Script'. But the 13116values of code points that appear in a few scripts are a space separated list 13117of those scripts. 13118END 13119 )); 13120 13121 # Initialize scx's tables and the aliases for them to be the same as sc's 13122 foreach my $table ($script->tables) { 13123 my $scx_table = $scx->add_match_table($table->name, 13124 Full_Name => $table->full_name); 13125 foreach my $alias ($table->aliases) { 13126 $scx_table->add_alias($alias->name); 13127 } 13128 } 13129} 13130 13131sub filter_script_extensions_line { 13132 # The Scripts file comes with the full name for the scripts; the 13133 # ScriptExtensions, with the short name. The final mapping file is a 13134 # combination of these, and without adjustment, would have inconsistent 13135 # entries. This filters the latter file to convert to full names. 13136 # Entries look like this: 13137 # 064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW 13138 13139 my @fields = split /\s*;\s*/; 13140 13141 # This script was erroneously omitted in this Unicode version. 13142 $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/; 13143 13144 my @full_names; 13145 foreach my $short_name (split " ", $fields[1]) { 13146 push @full_names, $script->table($short_name)->full_name; 13147 } 13148 $fields[1] = join " ", @full_names; 13149 $_ = join "; ", @fields; 13150 13151 return; 13152} 13153 13154sub setup_emojidata { 13155 my $prop_ref = Property->new('ExtPict', 13156 Full_Name => 'Extended_Pictographic', 13157 ); 13158 $prop_ref->set_fate($PLACEHOLDER, 13159 "Not part of the Unicode Character Database"); 13160} 13161 13162sub filter_emojidata_line { 13163 # We only are interested in this single property from this non-UCD data 13164 # file, and we turn it into a Perl property, so that it isn't accessible 13165 # to the users 13166 13167 $_ = "" unless /\bExtended_Pictographic\b/; 13168 13169 return; 13170} 13171 13172sub setup_IdStatus { 13173 my $ids = Property->new('Identifier_Status', 13174 Match_SubDir => 'IdStatus', 13175 Default_Map => 'Restricted', 13176 ); 13177 $ids->add_match_table('Allowed'); 13178} 13179 13180sub setup_IdType { 13181 $idt = Property->new('Identifier_Type', 13182 Match_SubDir => 'IdType', 13183 Default_Map => 'Not_Character', 13184 Format => $STRING_WHITE_SPACE_LIST, 13185 ); 13186} 13187 13188sub filter_IdType_line { 13189 13190 # Some code points have more than one type, separated by spaces on the 13191 # input. For now, we just add everything as a property value. Later when 13192 # we look for properties with format $STRING_WHITE_SPACE_LIST, we resolve 13193 # things 13194 13195 my @fields = split /\s*;\s*/; 13196 my $types = $fields[1]; 13197 $idt->add_match_table($types) unless defined $idt->table($types); 13198 13199 return; 13200} 13201 13202sub generate_hst($file) { 13203 13204 # Populates the Hangul Syllable Type property from first principles 13205 13206 # These few ranges are hard-coded in. 13207 $file->insert_lines(split /\n/, <<'END' 132081100..1159 ; L 13209115F ; L 132101160..11A2 ; V 1321111A8..11F9 ; T 13212END 13213); 13214 13215 # The Hangul syllables in version 1 are at different code points than 13216 # those that came along starting in version 2, and have different names; 13217 # they comprise about 60% of the code points of the later version. 13218 # From my (khw) research on them (see <558493EB.4000807@att.net>), the 13219 # initial set is a subset of the later version, with different English 13220 # transliterations. I did not see an easy mapping between them. The 13221 # later set includes essentially all possibilities, even ones that aren't 13222 # in modern use (if they ever were), and over 96% of the new ones are type 13223 # LVT. Mathematically, the early set must also contain a preponderance of 13224 # LVT values. In lieu of doing nothing, we just set them all to LVT, and 13225 # expect that this will be right most of the time, which is better than 13226 # not being right at all. 13227 if ($v_version lt v2.0.0) { 13228 my $property = property_ref($file->property); 13229 $file->insert_lines(sprintf("%04X..%04X; LVT\n", 13230 $FIRST_REMOVED_HANGUL_SYLLABLE, 13231 $FINAL_REMOVED_HANGUL_SYLLABLE)); 13232 push @tables_that_may_be_empty, $property->table('LV')->complete_name; 13233 return; 13234 } 13235 13236 # The algorithmically derived syllables are almost all LVT ones, so 13237 # initialize the whole range with that. 13238 $file->insert_lines(sprintf "%04X..%04X; LVT\n", 13239 $SBase, $SBase + $SCount -1); 13240 13241 # Those ones that aren't LVT are LV, and they occur at intervals of 13242 # $TCount code points, starting with the first code point, at $SBase. 13243 for (my $i = $SBase; $i < $SBase + $SCount; $i += $TCount) { 13244 $file->insert_lines(sprintf "%04X..%04X; LV\n", $i, $i); 13245 } 13246 13247 return; 13248} 13249 13250sub generate_GCB($file) { 13251 13252 # Populates the Grapheme Cluster Break property from first principles 13253 13254 # All these definitions are from 13255 # http://www.unicode.org/reports/tr29/tr29-3.html with confirmation 13256 # from http://www.unicode.org/reports/tr29/tr29-4.html 13257 13258 foreach my $range ($gc->ranges) { 13259 13260 # Extend includes gc=Me and gc=Mn, while Control includes gc=Cc 13261 # and gc=Cf 13262 if ($range->value =~ / ^ M [en] $ /x) { 13263 $file->insert_lines(sprintf "%04X..%04X; Extend", 13264 $range->start, $range->end); 13265 } 13266 elsif ($range->value =~ / ^ C [cf] $ /x) { 13267 $file->insert_lines(sprintf "%04X..%04X; Control", 13268 $range->start, $range->end); 13269 } 13270 } 13271 $file->insert_lines("2028; Control"); # Line Separator 13272 $file->insert_lines("2029; Control"); # Paragraph Separator 13273 13274 $file->insert_lines("000D; CR"); 13275 $file->insert_lines("000A; LF"); 13276 13277 # Also from http://www.unicode.org/reports/tr29/tr29-3.html. 13278 foreach my $code_point ( qw{ 13279 09BE 09D7 0B3E 0B57 0BBE 0BD7 0CC2 0CD5 0CD6 13280 0D3E 0D57 0DCF 0DDF FF9E FF9F 1D165 1D16E 1D16F 13281 } 13282 ) { 13283 my $category = $gc->value_of(hex $code_point); 13284 next if ! defined $category || $category eq 'Cn'; # But not if 13285 # unassigned in this 13286 # release 13287 $file->insert_lines("$code_point; Extend"); 13288 } 13289 13290 my $hst = property_ref('Hangul_Syllable_Type'); 13291 if ($hst->count > 0) { 13292 foreach my $range ($hst->ranges) { 13293 $file->insert_lines(sprintf "%04X..%04X; %s", 13294 $range->start, $range->end, $range->value); 13295 } 13296 } 13297 else { 13298 generate_hst($file); 13299 } 13300 13301 main::process_generic_property_file($file); 13302} 13303 13304 13305sub fixup_early_perl_name_alias($file) { 13306 13307 # Different versions of Unicode have varying support for the name synonyms 13308 # below. Just include everything. As of 6.1, all these are correct in 13309 # the Unicode-supplied file. 13310 13311 # ALERT did not come along until 6.0, at which point it became preferred 13312 # over BELL. By inserting it last in early releases, BELL is preferred 13313 # over it; and vice-vers in 6.0 13314 my $type_for_bell = ($v_version lt v6.0.0) 13315 ? 'correction' 13316 : 'alternate'; 13317 $file->insert_lines(split /\n/, <<END 133180007;BELL; $type_for_bell 13319000A;LINE FEED (LF);alternate 13320000C;FORM FEED (FF);alternate 13321000D;CARRIAGE RETURN (CR);alternate 133220085;NEXT LINE (NEL);alternate 13323END 13324 13325 ); 13326 13327 # One might think that the 'Unicode_1_Name' field, could work for most 13328 # of the above names, but sadly that field varies depending on the 13329 # release. Version 1.1.5 had no names for any of the controls; Version 13330 # 2.0 introduced names for the C0 controls, and 3.0 introduced C1 names. 13331 # 3.0.1 removed the name INDEX; and 3.2 changed some names: 13332 # changed to parenthesized versions like "NEXT LINE" to 13333 # "NEXT LINE (NEL)"; 13334 # changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD 13335 # changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;; 13336 # changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR 13337 # 13338 # All these are present in the 6.1 NameAliases.txt 13339 13340 return; 13341} 13342 13343sub filter_later_version_name_alias_line { 13344 13345 # This file has an extra entry per line for the alias type. This is 13346 # handled by creating a compound entry: "$alias: $type"; First, split 13347 # the line into components. 13348 my ($range, $alias, $type, @remainder) 13349 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 13350 13351 # This file contains multiple entries for some components, so tell the 13352 # downstream code to allow this in our internal tables; the 13353 # $MULTIPLE_AFTER preserves the input ordering. 13354 $_ = join ";", $range, $CMD_DELIM 13355 . $REPLACE_CMD 13356 . '=' 13357 . $MULTIPLE_AFTER 13358 . $CMD_DELIM 13359 . "$alias: $type", 13360 @remainder; 13361 return; 13362} 13363 13364sub filter_early_version_name_alias_line { 13365 13366 # Early versions did not have the trailing alias type field; implicitly it 13367 # was 'correction'. 13368 $_ .= "; correction"; 13369 13370 filter_later_version_name_alias_line; 13371 return; 13372} 13373 13374sub filter_all_caps_script_names { 13375 13376 # Some early Unicode releases had the script names in all CAPS. This 13377 # converts them to just the first letter of each word being capital. 13378 13379 my ($range, $script, @remainder) 13380 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 13381 my @words = split /[_-]/, $script; 13382 for my $word (@words) { 13383 $word = 13384 ucfirst(lc($word)) if $word ne 'CJK'; 13385 } 13386 $script = join "_", @words; 13387 $_ = join ";", $range, $script, @remainder; 13388} 13389 13390sub finish_Unicode() { 13391 # This routine should be called after all the Unicode files have been read 13392 # in. It: 13393 # 1) Creates properties that are missing from the version of Unicode being 13394 # compiled, and which, for whatever reason, are needed for the Perl 13395 # core to function properly. These are minimally populated as 13396 # necessary. 13397 # 2) Adds the mappings for code points missing from the files which have 13398 # defaults specified for them. 13399 # 3) At this point all mappings are known, so it computes the type of 13400 # each property whose type hasn't been determined yet. 13401 # 4) Calculates all the regular expression match tables based on the 13402 # mappings. 13403 # 5) Calculates and adds the tables which are defined by Unicode, but 13404 # which aren't derived by them, and certain derived tables that Perl 13405 # uses. 13406 13407 # Folding information was introduced later into Unicode data. To get 13408 # Perl's case ignore (/i) to work at all in releases that don't have 13409 # folding, use the best available alternative, which is lower casing. 13410 my $fold = property_ref('Case_Folding'); 13411 if ($fold->is_empty) { 13412 $fold->initialize(property_ref('Lowercase_Mapping')); 13413 $fold->add_note(join_lines(<<END 13414WARNING: This table uses lower case as a substitute for missing fold 13415information 13416END 13417 )); 13418 } 13419 13420 # Multiple-character mapping was introduced later into Unicode data, so it 13421 # is by default the simple version. If to output the simple versions and 13422 # not present, just use the regular (which in these Unicode versions is 13423 # the simple as well). 13424 foreach my $map (qw { Uppercase_Mapping 13425 Lowercase_Mapping 13426 Titlecase_Mapping 13427 Case_Folding 13428 } ) 13429 { 13430 my $comment = <<END; 13431 13432Note that although the Perl core uses this file, it has the standard values 13433for code points from U+0000 to U+00FF compiled in, so changing this table will 13434not change the core's behavior with respect to these code points. Use 13435Unicode::Casing to override this table. 13436END 13437 if ($map eq 'Case_Folding') { 13438 $comment .= <<END; 13439(/i regex matching is not overridable except by using a custom regex engine) 13440END 13441 } 13442 property_ref($map)->add_comment(join_lines($comment)); 13443 my $simple = property_ref("Simple_$map"); 13444 next if ! $simple->is_empty; 13445 if ($simple->to_output_map) { 13446 $simple->initialize(property_ref($map)); 13447 } 13448 else { 13449 property_ref($map)->set_proxy_for($simple->name); 13450 } 13451 } 13452 13453 # For each property, fill in any missing mappings, and calculate the re 13454 # match tables. If a property has more than one missing mapping, the 13455 # default is a reference to a data structure, and may require data from 13456 # other properties to resolve. The sort is used to cause these to be 13457 # processed last, after all the other properties have been calculated. 13458 # (Fortunately, the missing properties so far don't depend on each other.) 13459 foreach my $property 13460 (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 } 13461 property_ref('*')) 13462 { 13463 # $perl has been defined, but isn't one of the Unicode properties that 13464 # need to be finished up. 13465 next if $property == $perl; 13466 13467 # Nor do we need to do anything with properties that aren't going to 13468 # be output. 13469 next if $property->fate == $SUPPRESSED; 13470 13471 # Handle the properties that have more than one possible default 13472 if (ref $property->default_map) { 13473 my $default_map = $property->default_map; 13474 13475 # These properties have stored in the default_map: 13476 # One or more of: 13477 # 1) A default map which applies to all code points in a 13478 # certain class 13479 # 2) an expression which will evaluate to the list of code 13480 # points in that class 13481 # And 13482 # 3) the default map which applies to every other missing code 13483 # point. 13484 # 13485 # Go through each list. 13486 while (my ($default, $eval) = $default_map->get_next_defaults) { 13487 last unless defined $eval; 13488 13489 # Get the class list, and intersect it with all the so-far 13490 # unspecified code points yielding all the code points 13491 # in the class that haven't been specified. 13492 my $list = eval $eval; 13493 if ($@) { 13494 Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'"); 13495 last; 13496 } 13497 13498 # Narrow down the list to just those code points we don't have 13499 # maps for yet. 13500 $list = $list & $property->inverse_list; 13501 13502 # Add mappings to the property for each code point in the list 13503 foreach my $range ($list->ranges) { 13504 $property->add_map($range->start, $range->end, $default, 13505 Replace => $NO); 13506 } 13507 } 13508 13509 # All remaining code points have the other mapping. Set that up 13510 # so the normal single-default mapping code will work on them 13511 $property->set_default_map($default_map->other_default); 13512 13513 # And fall through to do that 13514 } 13515 13516 # We should have enough data now to compute the type of the property. 13517 my $property_name = $property->name; 13518 $property->compute_type; 13519 my $property_type = $property->type; 13520 13521 next if ! $property->to_create_match_tables; 13522 13523 # Here want to create match tables for this property 13524 13525 # The Unicode db always (so far, and they claim into the future) have 13526 # the default for missing entries in binary properties be 'N' (unless 13527 # there is a '@missing' line that specifies otherwise) 13528 if (! defined $property->default_map) { 13529 if ($property_type == $BINARY) { 13530 $property->set_default_map('N'); 13531 } 13532 elsif ($property_type == $ENUM) { 13533 Carp::my_carp("Property '$property_name doesn't have a default mapping. Using a fake one"); 13534 $property->set_default_map('XXX This makes sure there is a default map'); 13535 } 13536 } 13537 13538 # Add any remaining code points to the mapping, using the default for 13539 # missing code points. 13540 my $default_table; 13541 my $default_map = $property->default_map; 13542 if ($property_type == $FORCED_BINARY) { 13543 13544 # A forced binary property creates a 'Y' table that matches all 13545 # non-default values. The actual string values are also written out 13546 # as a map table. (The default value will almost certainly be the 13547 # empty string, so the pod glosses over the distinction, and just 13548 # talks about empty vs non-empty.) 13549 my $yes = $property->table("Y"); 13550 foreach my $range ($property->ranges) { 13551 next if $range->value eq $default_map; 13552 $yes->add_range($range->start, $range->end); 13553 } 13554 $property->table("N")->set_complement($yes); 13555 } 13556 else { 13557 if (defined $default_map) { 13558 13559 # Make sure there is a match table for the default 13560 if (! defined ($default_table = $property->table($default_map))) 13561 { 13562 $default_table = $property->add_match_table($default_map); 13563 } 13564 13565 # And, if the property is binary, the default table will just 13566 # be the complement of the other table. 13567 if ($property_type == $BINARY) { 13568 my $non_default_table; 13569 13570 # Find the non-default table. 13571 for my $table ($property->tables) { 13572 if ($table == $default_table) { 13573 if ($v_version le v5.0.0) { 13574 $table->add_alias($_) for qw(N No F False); 13575 } 13576 next; 13577 } elsif ($v_version le v5.0.0) { 13578 $table->add_alias($_) for qw(Y Yes T True); 13579 } 13580 $non_default_table = $table; 13581 } 13582 $default_table->set_complement($non_default_table); 13583 } 13584 else { 13585 13586 # This fills in any missing values with the default. It's 13587 # not necessary to do this with binary properties, as the 13588 # default is defined completely in terms of the Y table. 13589 $property->add_map(0, $MAX_WORKING_CODEPOINT, 13590 $default_map, Replace => $NO); 13591 } 13592 } 13593 13594 # Have all we need to populate the match tables. 13595 my $maps_should_be_defined = $property->pre_declared_maps; 13596 foreach my $range ($property->ranges) { 13597 my $map = $range->value; 13598 my $table = $property->table($map); 13599 if (! defined $table) { 13600 13601 # Integral and rational property values are not 13602 # necessarily defined in PropValueAliases, but whether all 13603 # the other ones should be depends on the property. 13604 if ($maps_should_be_defined 13605 && $map !~ /^ -? \d+ ( \/ \d+ )? $/x) 13606 { 13607 Carp::my_carp("Table '$property_name=$map' should " 13608 . "have been defined. Defining it now.") 13609 } 13610 $table = $property->add_match_table($map); 13611 } 13612 13613 next if $table->complement != 0; # Don't need to populate these 13614 $table->add_range($range->start, $range->end); 13615 } 13616 } 13617 13618 # For Perl 5.6 compatibility, all properties matchable in regexes can 13619 # have an optional 'Is_' prefix. This is now done in Unicode::UCD. 13620 # But warn if this creates a conflict with a (new) Unicode property 13621 # name, although it appears that Unicode has made a decision never to 13622 # begin a property name with 'Is_', so this shouldn't happen. 13623 foreach my $alias ($property->aliases) { 13624 my $Is_name = 'Is_' . $alias->name; 13625 if (defined (my $pre_existing = property_ref($Is_name))) { 13626 Carp::my_carp(<<END 13627There is already an alias named $Is_name (from " . $pre_existing . "), so 13628creating one for $property won't work. This is bad news. If it is not too 13629late, get Unicode to back off. Otherwise go back to the old scheme (findable 13630from the git blame log for this area of the code that suppressed individual 13631aliases that conflict with the new Unicode names. Proceeding anyway. 13632END 13633 ); 13634 } 13635 } # End of loop through aliases for this property 13636 13637 13638 # Properties that have sets of values for some characters are now 13639 # converted. For example, the Script_Extensions property started out 13640 # as a clone of the Script property. But processing its data file 13641 # caused some elements to be replaced with different data. (These 13642 # elements were for the Common and Inherited properties.) This data 13643 # is a qw() list of all the scripts that the code points in the given 13644 # range are in. An example line is: 13645 # 13646 # 060C ; Arab Syrc Thaa # Po ARABIC COMMA 13647 # 13648 # Code executed earlier has created a new match table named "Arab Syrc 13649 # Thaa" which contains 060C. (The cloned table started out with this 13650 # code point mapping to "Common".) Now we add 060C to each of the 13651 # Arab, Syrc, and Thaa match tables. Then we delete the now spurious 13652 # "Arab Syrc Thaa" match table. This is repeated for all these tables 13653 # and ranges. The map data is retained in the map table for 13654 # reference, but the spurious match tables are deleted. 13655 my $format = $property->format; 13656 if (defined $format && $format eq $STRING_WHITE_SPACE_LIST) { 13657 foreach my $table ($property->tables) { 13658 13659 # Space separates the entries which should go in multiple 13660 # tables 13661 next unless $table->name =~ /\s/; 13662 13663 # The list of the entries, hence the names of the tables that 13664 # everything in this combo table should be added to. 13665 my @list = split /\s+/, $table->name; 13666 13667 # Add the entries from the combo table to each individual 13668 # table 13669 foreach my $individual (@list) { 13670 my $existing_table = $property->table($individual); 13671 13672 # This should only be necessary if this particular entry 13673 # occurs only in combo with others. 13674 $existing_table = $property->add_match_table($individual) 13675 unless defined $existing_table; 13676 $existing_table += $table; 13677 } 13678 $property->delete_match_table($table); 13679 } 13680 } 13681 } # End of loop through all Unicode properties. 13682 13683 # Fill in the mappings that Unicode doesn't completely furnish. First the 13684 # single letter major general categories. If Unicode were to start 13685 # delivering the values, this would be redundant, but better that than to 13686 # try to figure out if should skip and not get it right. Ths could happen 13687 # if a new major category were to be introduced, and the hard-coded test 13688 # wouldn't know about it. 13689 # This routine depends on the standard names for the general categories 13690 # being what it thinks they are, like 'Cn'. The major categories are the 13691 # union of all the general category tables which have the same first 13692 # letters. eg. L = Lu + Lt + Ll + Lo + Lm 13693 foreach my $minor_table ($gc->tables) { 13694 my $minor_name = $minor_table->name; 13695 next if length $minor_name == 1; 13696 if (length $minor_name != 2) { 13697 Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped."); 13698 next; 13699 } 13700 13701 my $major_name = uc(substr($minor_name, 0, 1)); 13702 my $major_table = $gc->table($major_name); 13703 $major_table += $minor_table; 13704 } 13705 13706 # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt 13707 # defines it as LC) 13708 my $LC = $gc->table('LC'); 13709 $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards... 13710 $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility. 13711 13712 13713 if ($LC->is_empty) { # Assume if not empty that Unicode has started to 13714 # deliver the correct values in it 13715 $LC->initialize($gc->table('Ll') + $gc->table('Lu')); 13716 13717 # Lt not in release 1. 13718 if (defined $gc->table('Lt')) { 13719 $LC += $gc->table('Lt'); 13720 $gc->table('Lt')->set_caseless_equivalent($LC); 13721 } 13722 } 13723 $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]'); 13724 13725 $gc->table('Ll')->set_caseless_equivalent($LC); 13726 $gc->table('Lu')->set_caseless_equivalent($LC); 13727 13728 # Make sure this assumption in perl core code is valid in this Unicode 13729 # release, with known exceptions 13730 foreach my $range (property_ref('Numeric-Type')->table('Decimal')->ranges) { 13731 next if $range->end - $range->start == 9; 13732 next if $range->start == 0x1D7CE; # This whole range was added in 3.1 13733 next if $range->end == 0x19DA && $v_version eq v5.2.0; 13734 next if $range->end - $range->start < 9 && $v_version le 4.0.0; 13735 Carp::my_carp("Range $range unexpectedly doesn't contain 10" 13736 . " decimal digits. Code in regcomp.c assumes it does," 13737 . " and will have to be fixed. Proceeding anyway."); 13738 } 13739 13740 # Mark the scx table as the parent of the corresponding sc table for those 13741 # which are identical. This causes the pod for the script table to refer 13742 # to the corresponding scx one. This is done after everything, so as to 13743 # wait until the tables are stabilized before checking for equivalency. 13744 if (defined $scx) { 13745 if (defined $pod_directory) { 13746 foreach my $table ($scx->tables) { 13747 my $plain_sc_equiv = $script->table($table->name); 13748 if ($table->matches_identically_to($plain_sc_equiv)) { 13749 $plain_sc_equiv->set_equivalent_to($table, Related => 1); 13750 } 13751 } 13752 } 13753 } 13754 13755 return; 13756} 13757 13758sub pre_3_dot_1_Nl () { 13759 13760 # Return a range list for gc=nl for Unicode versions prior to 3.1, which 13761 # is when Unicode's became fully usable. These code points were 13762 # determined by inspection and experimentation. gc=nl is important for 13763 # certain Perl-extension properties that should be available in all 13764 # releases. 13765 13766 my $Nl = Range_List->new(); 13767 if (defined (my $official = $gc->table('Nl'))) { 13768 $Nl += $official; 13769 } 13770 else { 13771 $Nl->add_range(0x2160, 0x2182); 13772 $Nl->add_range(0x3007, 0x3007); 13773 $Nl->add_range(0x3021, 0x3029); 13774 } 13775 $Nl->add_range(0xFE20, 0xFE23); 13776 $Nl->add_range(0x16EE, 0x16F0) if $v_version ge v3.0.0; # 3.0 was when 13777 # these were added 13778 return $Nl; 13779} 13780 13781sub calculate_Assigned() { # Set $Assigned to the gc != Cn code points; may be 13782 # called before the Cn's are completely filled. 13783 # Works on Unicodes earlier than ones that 13784 # explicitly specify Cn. 13785 return if defined $Assigned; 13786 13787 if (! defined $gc || $gc->is_empty()) { 13788 Carp::my_carp_bug("calculate_Assigned() called before $gc is populated"); 13789 } 13790 13791 $Assigned = $perl->add_match_table('Assigned', 13792 Description => "All assigned code points", 13793 ); 13794 while (defined (my $range = $gc->each_range())) { 13795 my $standard_value = standardize($range->value); 13796 next if $standard_value eq 'cn' || $standard_value eq 'unassigned'; 13797 $Assigned->add_range($range->start, $range->end); 13798 } 13799} 13800 13801sub calculate_DI() { # Set $DI to a Range_List equivalent to the 13802 # Default_Ignorable_Code_Point property. Works on 13803 # Unicodes earlier than ones that explicitly specify 13804 # DI. 13805 return if defined $DI; 13806 13807 if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) { 13808 $DI = $di->table('Y'); 13809 } 13810 else { 13811 $DI = Range_List->new(Initialize => [ 0x180B .. 0x180D, 13812 0x2060 .. 0x206F, 13813 0xFE00 .. 0xFE0F, 13814 0xFFF0 .. 0xFFFB, 13815 ]); 13816 if ($v_version ge v2.0) { 13817 $DI += $gc->table('Cf') 13818 + $gc->table('Cs'); 13819 13820 # These are above the Unicode version 1 max 13821 $DI->add_range(0xE0000, 0xE0FFF); 13822 } 13823 $DI += $gc->table('Cc') 13824 - ord("\t") 13825 - utf8::unicode_to_native(0x0A) # LINE FEED 13826 - utf8::unicode_to_native(0x0B) # VERTICAL TAB 13827 - ord("\f") 13828 - utf8::unicode_to_native(0x0D) # CARRIAGE RETURN 13829 - utf8::unicode_to_native(0x85); # NEL 13830 } 13831} 13832 13833sub calculate_NChar() { # Create a Perl extension match table which is the 13834 # same as the Noncharacter_Code_Point property, and 13835 # set $NChar to point to it. Works on Unicodes 13836 # earlier than ones that explicitly specify NChar 13837 return if defined $NChar; 13838 13839 $NChar = $perl->add_match_table('_Perl_Nchar', 13840 Perl_Extension => 1, 13841 Fate => $INTERNAL_ONLY); 13842 if (defined (my $off_nchar = property_ref('NChar'))) { 13843 $NChar->initialize($off_nchar->table('Y')); 13844 } 13845 else { 13846 $NChar->initialize([ 0xFFFE .. 0xFFFF ]); 13847 if ($v_version ge v2.0) { # First release with these nchars 13848 for (my $i = 0x1FFFE; $i <= 0x10FFFE; $i += 0x10000) { 13849 $NChar += [ $i .. $i+1 ]; 13850 } 13851 } 13852 } 13853} 13854 13855sub handle_compare_versions () { 13856 # This fixes things up for the $compare_versions capability, where we 13857 # compare Unicode version X with version Y (with Y > X), and we are 13858 # running it on the Unicode Data for version Y. 13859 # 13860 # It works by calculating the code points whose meaning has been specified 13861 # after release X, by using the Age property. The complement of this set 13862 # is the set of code points whose meaning is unchanged between the 13863 # releases. This is the set the program restricts itself to. It includes 13864 # everything whose meaning has been specified by the time version X came 13865 # along, plus those still unassigned by the time of version Y. (We will 13866 # continue to use the word 'assigned' to mean 'meaning has been 13867 # specified', as it's shorter and is accurate in all cases except the 13868 # Noncharacter code points.) 13869 # 13870 # This function is run after all the properties specified by Unicode have 13871 # been calculated for release Y. This makes sure we get all the nuances 13872 # of Y's rules. (It is done before the Perl extensions are calculated, as 13873 # those are based entirely on the Unicode ones.) But doing it after the 13874 # Unicode table calculations means we have to fix up the Unicode tables. 13875 # We do this by subtracting the code points that have been assigned since 13876 # X (which is actually done by ANDing each table of assigned code points 13877 # with the set of unchanged code points). Most Unicode properties are of 13878 # the form such that all unassigned code points have a default, grab-bag, 13879 # property value which is changed when the code point gets assigned. For 13880 # these, we just remove the changed code points from the table for the 13881 # latter property value, and add them back in to the grab-bag one. A few 13882 # other properties are not entirely of this form and have values for some 13883 # or all unassigned code points that are not the grab-bag one. These have 13884 # to be handled specially, and are hard-coded in to this routine based on 13885 # manual inspection of the Unicode character database. A list of the 13886 # outlier code points is made for each of these properties, and those 13887 # outliers are excluded from adding and removing from tables. 13888 # 13889 # Note that there are glitches when comparing against Unicode 1.1, as some 13890 # Hangul syllables in it were later ripped out and eventually replaced 13891 # with other things. 13892 13893 print "Fixing up for version comparison\n" if $verbosity >= $PROGRESS; 13894 13895 my $after_first_version = "All matching code points were added after " 13896 . "Unicode $string_compare_versions"; 13897 13898 # Calculate the delta as those code points that have been newly assigned 13899 # since the first compare version. 13900 my $delta = Range_List->new(); 13901 foreach my $table ($age->tables) { 13902 use version; 13903 next if $table == $age->table('Unassigned'); 13904 next if version->parse($table->name) 13905 le version->parse($string_compare_versions); 13906 $delta += $table; 13907 } 13908 if ($delta->is_empty) { 13909 die ("No changes; perhaps you need a 'DAge.txt' file?"); 13910 } 13911 13912 my $unchanged = ~ $delta; 13913 13914 calculate_Assigned() if ! defined $Assigned; 13915 $Assigned &= $unchanged; 13916 13917 # $Assigned now contains the code points that were assigned as of Unicode 13918 # version X. 13919 13920 # A block is all or nothing. If nothing is assigned in it, it all goes 13921 # back to the No_Block pool; but if even one code point is assigned, the 13922 # block is retained. 13923 my $no_block = $block->table('No_Block'); 13924 foreach my $this_block ($block->tables) { 13925 next if $this_block == $no_block 13926 || ! ($this_block & $Assigned)->is_empty; 13927 $this_block->set_fate($SUPPRESSED, $after_first_version); 13928 foreach my $range ($this_block->ranges) { 13929 $block->replace_map($range->start, $range->end, 'No_Block') 13930 } 13931 $no_block += $this_block; 13932 } 13933 13934 my @special_delta_properties; # List of properties that have to be 13935 # handled specially. 13936 my %restricted_delta; # Keys are the entries in 13937 # @special_delta_properties; values 13938 # are the range list of the code points 13939 # that behave normally when they get 13940 # assigned. 13941 13942 # In the next three properties, the Default Ignorable code points are 13943 # outliers. 13944 calculate_DI(); 13945 $DI &= $unchanged; 13946 13947 push @special_delta_properties, property_ref('_Perl_GCB'); 13948 $restricted_delta{$special_delta_properties[-1]} = ~ $DI; 13949 13950 if (defined (my $cwnfkcc = property_ref('Changes_When_NFKC_Casefolded'))) 13951 { 13952 push @special_delta_properties, $cwnfkcc; 13953 $restricted_delta{$special_delta_properties[-1]} = ~ $DI; 13954 } 13955 13956 calculate_NChar(); # Non-character code points 13957 $NChar &= $unchanged; 13958 13959 # This may have to be updated from time-to-time to get the most accurate 13960 # results. 13961 my $default_BC_non_LtoR = Range_List->new(Initialize => 13962 # These came from the comments in v8.0 DBidiClass.txt 13963 [ # AL 13964 0x0600 .. 0x07BF, 13965 0x08A0 .. 0x08FF, 13966 0xFB50 .. 0xFDCF, 13967 0xFDF0 .. 0xFDFF, 13968 0xFE70 .. 0xFEFF, 13969 0x1EE00 .. 0x1EEFF, 13970 # R 13971 0x0590 .. 0x05FF, 13972 0x07C0 .. 0x089F, 13973 0xFB1D .. 0xFB4F, 13974 0x10800 .. 0x10FFF, 13975 0x1E800 .. 0x1EDFF, 13976 0x1EF00 .. 0x1EFFF, 13977 # ET 13978 0x20A0 .. 0x20CF, 13979 ] 13980 ); 13981 $default_BC_non_LtoR += $DI + $NChar; 13982 push @special_delta_properties, property_ref('BidiClass'); 13983 $restricted_delta{$special_delta_properties[-1]} = ~ $default_BC_non_LtoR; 13984 13985 if (defined (my $eaw = property_ref('East_Asian_Width'))) { 13986 13987 my $default_EA_width_W = Range_List->new(Initialize => 13988 # From comments in v8.0 EastAsianWidth.txt 13989 [ 13990 0x3400 .. 0x4DBF, 13991 0x4E00 .. 0x9FFF, 13992 0xF900 .. 0xFAFF, 13993 0x20000 .. 0x2A6DF, 13994 0x2A700 .. 0x2B73F, 13995 0x2B740 .. 0x2B81F, 13996 0x2B820 .. 0x2CEAF, 13997 0x2F800 .. 0x2FA1F, 13998 0x20000 .. 0x2FFFD, 13999 0x30000 .. 0x3FFFD, 14000 ] 14001 ); 14002 push @special_delta_properties, $eaw; 14003 $restricted_delta{$special_delta_properties[-1]} 14004 = ~ $default_EA_width_W; 14005 14006 # Line break came along in the same release as East_Asian_Width, and 14007 # the non-grab-bag default set is a superset of the EAW one. 14008 if (defined (my $lb = property_ref('Line_Break'))) { 14009 my $default_LB_non_XX = Range_List->new(Initialize => 14010 # From comments in v8.0 LineBreak.txt 14011 [ 0x20A0 .. 0x20CF ]); 14012 $default_LB_non_XX += $default_EA_width_W; 14013 push @special_delta_properties, $lb; 14014 $restricted_delta{$special_delta_properties[-1]} 14015 = ~ $default_LB_non_XX; 14016 } 14017 } 14018 14019 # Go through every property, skipping those we've already worked on, those 14020 # that are immutable, and the perl ones that will be calculated after this 14021 # routine has done its fixup. 14022 foreach my $property (property_ref('*')) { 14023 next if $property == $perl # Done later in the program 14024 || $property == $block # Done just above 14025 || $property == $DI # Done just above 14026 || $property == $NChar # Done just above 14027 14028 # The next two are invariant across Unicode versions 14029 || $property == property_ref('Pattern_Syntax') 14030 || $property == property_ref('Pattern_White_Space'); 14031 14032 # Find the grab-bag value. 14033 my $default_map = $property->default_map; 14034 14035 if (! $property->to_create_match_tables) { 14036 14037 # Here there aren't any match tables. So far, all such properties 14038 # have a default map, and don't require special handling. Just 14039 # change each newly assigned code point back to the default map, 14040 # as if they were unassigned. 14041 foreach my $range ($delta->ranges) { 14042 $property->add_map($range->start, 14043 $range->end, 14044 $default_map, 14045 Replace => $UNCONDITIONALLY); 14046 } 14047 } 14048 else { # Here there are match tables. Find the one (if any) for the 14049 # grab-bag value that unassigned code points go to. 14050 my $default_table; 14051 if (defined $default_map) { 14052 $default_table = $property->table($default_map); 14053 } 14054 14055 # If some code points don't go back to the grab-bag when they 14056 # are considered unassigned, exclude them from the list that does 14057 # that. 14058 my $this_delta = $delta; 14059 my $this_unchanged = $unchanged; 14060 if (grep { $_ == $property } @special_delta_properties) { 14061 $this_delta = $delta & $restricted_delta{$property}; 14062 $this_unchanged = ~ $this_delta; 14063 } 14064 14065 # Fix up each match table for this property. 14066 foreach my $table ($property->tables) { 14067 if (defined $default_table && $table == $default_table) { 14068 14069 # The code points assigned after release X (the ones we 14070 # are excluding in this routine) go back on to the default 14071 # (grab-bag) table. However, some of these tables don't 14072 # actually exist, but are specified solely by the other 14073 # tables. (In a binary property, we don't need to 14074 # actually have an 'N' table, as it's just the complement 14075 # of the 'Y' table.) Such tables will be locked, so just 14076 # skip those. 14077 $table += $this_delta unless $table->locked; 14078 } 14079 else { 14080 14081 # Here the table is not for the default value. We need to 14082 # subtract the code points we are ignoring for this 14083 # comparison (the deltas) from it. But if the table 14084 # started out with nothing, no need to exclude anything, 14085 # and want to skip it here anyway, so it gets listed 14086 # properly in the pod. 14087 next if $table->is_empty; 14088 14089 # Save the deltas for later, before we do the subtraction 14090 my $deltas = $table & $this_delta; 14091 14092 $table &= $this_unchanged; 14093 14094 # Suppress the table if the subtraction left it with 14095 # nothing in it 14096 if ($table->is_empty) { 14097 if ($property->type == $BINARY) { 14098 push @tables_that_may_be_empty, $table->complete_name; 14099 } 14100 else { 14101 $table->set_fate($SUPPRESSED, $after_first_version); 14102 } 14103 } 14104 14105 # Now we add the removed code points to the property's 14106 # map, as they should now map to the grab-bag default 14107 # property (which they did in the first comparison 14108 # version). But we don't have to do this if the map is 14109 # only for internal use. 14110 if (defined $default_map && $property->to_output_map) { 14111 14112 # The gc property has pseudo property values whose names 14113 # have length 1. These are the union of all the 14114 # property values whose name is longer than 1 and 14115 # whose first letter is all the same. The replacement 14116 # is done once for the longer-named tables. 14117 next if $property == $gc && length $table->name == 1; 14118 14119 foreach my $range ($deltas->ranges) { 14120 $property->add_map($range->start, 14121 $range->end, 14122 $default_map, 14123 Replace => $UNCONDITIONALLY); 14124 } 14125 } 14126 } 14127 } 14128 } 14129 } 14130 14131 # The above code doesn't work on 'gc=C', as it is a superset of the default 14132 # ('Cn') table. It's easiest to just special case it here. 14133 my $C = $gc->table('C'); 14134 $C += $gc->table('Cn'); 14135 14136 return; 14137} 14138 14139sub compile_perl() { 14140 # Create perl-defined tables. Almost all are part of the pseudo-property 14141 # named 'perl' internally to this program. Many of these are recommended 14142 # in UTS#18 "Unicode Regular Expressions", and their derivations are based 14143 # on those found there. 14144 # Almost all of these are equivalent to some Unicode property. 14145 # A number of these properties have equivalents restricted to the ASCII 14146 # range, with their names prefaced by 'Posix', to signify that these match 14147 # what the Posix standard says they should match. A couple are 14148 # effectively this, but the name doesn't have 'Posix' in it because there 14149 # just isn't any Posix equivalent. 'XPosix' are the Posix tables extended 14150 # to the full Unicode range, by our guesses as to what is appropriate. 14151 14152 # 'All' is all code points. As an error check, instead of just setting it 14153 # to be that, construct it to be the union of all the major categories 14154 $All = $perl->add_match_table('All', 14155 Description 14156 => "All code points, including those above Unicode. Same as qr/./s", 14157 Matches_All => 1); 14158 14159 foreach my $major_table ($gc->tables) { 14160 14161 # Major categories are the ones with single letter names. 14162 next if length($major_table->name) != 1; 14163 14164 $All += $major_table; 14165 } 14166 14167 if ($All->max != $MAX_WORKING_CODEPOINT) { 14168 Carp::my_carp_bug("Generated highest code point (" 14169 . sprintf("%X", $All->max) 14170 . ") doesn't match expected value $MAX_WORKING_CODEPOINT_STRING.") 14171 } 14172 if ($All->range_count != 1 || $All->min != 0) { 14173 Carp::my_carp_bug("Generated table 'All' doesn't match all code points.") 14174 } 14175 14176 my $Any = $perl->add_match_table('Any', 14177 Description => "All Unicode code points"); 14178 $Any->add_range(0, $MAX_UNICODE_CODEPOINT); 14179 $Any->add_alias('Unicode'); 14180 14181 calculate_Assigned(); 14182 14183 my $ASCII = $perl->add_match_table('ASCII'); 14184 if (defined $block) { # This is equivalent to the block if have it. 14185 my $Unicode_ASCII = $block->table('Basic_Latin'); 14186 if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) { 14187 $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1); 14188 } 14189 } 14190 14191 # Very early releases didn't have blocks, so initialize ASCII ourselves if 14192 # necessary 14193 if ($ASCII->is_empty) { 14194 if (! NON_ASCII_PLATFORM) { 14195 $ASCII->add_range(0, 127); 14196 } 14197 else { 14198 for my $i (0 .. 127) { 14199 $ASCII->add_range(utf8::unicode_to_native($i), 14200 utf8::unicode_to_native($i)); 14201 } 14202 } 14203 } 14204 14205 # Get the best available case definitions. Early Unicode versions didn't 14206 # have Uppercase and Lowercase defined, so use the general category 14207 # instead for them, modified by hard-coding in the code points each is 14208 # missing. 14209 my $Lower = $perl->add_match_table('XPosixLower'); 14210 my $Unicode_Lower = property_ref('Lowercase'); 14211 if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) { 14212 $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1); 14213 14214 } 14215 else { 14216 $Lower += $gc->table('Lowercase_Letter'); 14217 14218 # There are quite a few code points in Lower, that aren't in gc=lc, 14219 # and not all are in all releases. 14220 my $temp = Range_List->new(Initialize => [ 14221 utf8::unicode_to_native(0xAA), 14222 utf8::unicode_to_native(0xBA), 14223 0x02B0 .. 0x02B8, 14224 0x02C0 .. 0x02C1, 14225 0x02E0 .. 0x02E4, 14226 0x0345, 14227 0x037A, 14228 0x1D2C .. 0x1D6A, 14229 0x1D78, 14230 0x1D9B .. 0x1DBF, 14231 0x2071, 14232 0x207F, 14233 0x2090 .. 0x209C, 14234 0x2170 .. 0x217F, 14235 0x24D0 .. 0x24E9, 14236 0x2C7C .. 0x2C7D, 14237 0xA770, 14238 0xA7F8 .. 0xA7F9, 14239 ]); 14240 $Lower += $temp & $Assigned; 14241 } 14242 my $Posix_Lower = $perl->add_match_table("PosixLower", 14243 Initialize => $Lower & $ASCII, 14244 ); 14245 14246 my $Upper = $perl->add_match_table("XPosixUpper"); 14247 my $Unicode_Upper = property_ref('Uppercase'); 14248 if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) { 14249 $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1); 14250 } 14251 else { 14252 14253 # Unlike Lower, there are only two ranges in Upper that aren't in 14254 # gc=Lu, and all code points were assigned in all releases. 14255 $Upper += $gc->table('Uppercase_Letter'); 14256 $Upper->add_range(0x2160, 0x216F); # Uppercase Roman numerals 14257 $Upper->add_range(0x24B6, 0x24CF); # Circled Latin upper case letters 14258 } 14259 my $Posix_Upper = $perl->add_match_table("PosixUpper", 14260 Initialize => $Upper & $ASCII, 14261 ); 14262 14263 # Earliest releases didn't have title case. Initialize it to empty if not 14264 # otherwise present 14265 my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase', 14266 Description => '(= \p{Gc=Lt})'); 14267 my $lt = $gc->table('Lt'); 14268 14269 # Earlier versions of mktables had this related to $lt since they have 14270 # identical code points, but their caseless equivalents are not the same, 14271 # one being 'Cased' and the other being 'LC', and so now must be kept as 14272 # separate entities. 14273 if (defined $lt) { 14274 $Title += $lt; 14275 } 14276 else { 14277 push @tables_that_may_be_empty, $Title->complete_name; 14278 } 14279 14280 my $Unicode_Cased = property_ref('Cased'); 14281 if (defined $Unicode_Cased) { 14282 my $yes = $Unicode_Cased->table('Y'); 14283 my $no = $Unicode_Cased->table('N'); 14284 $Title->set_caseless_equivalent($yes); 14285 if (defined $Unicode_Upper) { 14286 $Unicode_Upper->table('Y')->set_caseless_equivalent($yes); 14287 $Unicode_Upper->table('N')->set_caseless_equivalent($no); 14288 } 14289 $Upper->set_caseless_equivalent($yes); 14290 if (defined $Unicode_Lower) { 14291 $Unicode_Lower->table('Y')->set_caseless_equivalent($yes); 14292 $Unicode_Lower->table('N')->set_caseless_equivalent($no); 14293 } 14294 $Lower->set_caseless_equivalent($yes); 14295 } 14296 else { 14297 # If this Unicode version doesn't have Cased, set up the Perl 14298 # extension from first principles. From Unicode 5.1: Definition D120: 14299 # A character C is defined to be cased if and only if C has the 14300 # Lowercase or Uppercase property or has a General_Category value of 14301 # Titlecase_Letter. 14302 my $cased = $perl->add_match_table('Cased', 14303 Initialize => $Lower + $Upper + $Title, 14304 Description => 'Uppercase or Lowercase or Titlecase', 14305 ); 14306 # $notcased is purely for the caseless equivalents below 14307 my $notcased = $perl->add_match_table('_Not_Cased', 14308 Initialize => ~ $cased, 14309 Fate => $INTERNAL_ONLY, 14310 Description => 'All not-cased code points'); 14311 $Title->set_caseless_equivalent($cased); 14312 if (defined $Unicode_Upper) { 14313 $Unicode_Upper->table('Y')->set_caseless_equivalent($cased); 14314 $Unicode_Upper->table('N')->set_caseless_equivalent($notcased); 14315 } 14316 $Upper->set_caseless_equivalent($cased); 14317 if (defined $Unicode_Lower) { 14318 $Unicode_Lower->table('Y')->set_caseless_equivalent($cased); 14319 $Unicode_Lower->table('N')->set_caseless_equivalent($notcased); 14320 } 14321 $Lower->set_caseless_equivalent($cased); 14322 } 14323 14324 # The remaining perl defined tables are mostly based on Unicode TR 18, 14325 # "Annex C: Compatibility Properties". All of these have two versions, 14326 # one whose name generally begins with Posix that is posix-compliant, and 14327 # one that matches Unicode characters beyond the Posix, ASCII range 14328 14329 my $Alpha = $perl->add_match_table('XPosixAlpha'); 14330 14331 # Alphabetic was not present in early releases 14332 my $Alphabetic = property_ref('Alphabetic'); 14333 if (defined $Alphabetic && ! $Alphabetic->is_empty) { 14334 $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1); 14335 } 14336 else { 14337 14338 # The Alphabetic property doesn't exist for early releases, so 14339 # generate it. The actual definition, in 5.2 terms is: 14340 # 14341 # gc=L + gc=Nl + Other_Alphabetic 14342 # 14343 # Other_Alphabetic is also not defined in these early releases, but it 14344 # contains one gc=So range plus most of gc=Mn and gc=Mc, so we add 14345 # those last two as well, then subtract the relatively few of them that 14346 # shouldn't have been added. (The gc=So range is the circled capital 14347 # Latin characters. Early releases mistakenly didn't also include the 14348 # lower-case versions of these characters, and so we don't either, to 14349 # maintain consistency with those releases that first had this 14350 # property. 14351 $Alpha->initialize($gc->table('Letter') 14352 + pre_3_dot_1_Nl() 14353 + $gc->table('Mn') 14354 + $gc->table('Mc') 14355 ); 14356 $Alpha->add_range(0x24D0, 0x24E9); # gc=So 14357 foreach my $range ( [ 0x0300, 0x0344 ], 14358 [ 0x0346, 0x034E ], 14359 [ 0x0360, 0x0362 ], 14360 [ 0x0483, 0x0486 ], 14361 [ 0x0591, 0x05AF ], 14362 [ 0x06DF, 0x06E0 ], 14363 [ 0x06EA, 0x06EC ], 14364 [ 0x0740, 0x074A ], 14365 0x093C, 14366 0x094D, 14367 [ 0x0951, 0x0954 ], 14368 0x09BC, 14369 0x09CD, 14370 0x0A3C, 14371 0x0A4D, 14372 0x0ABC, 14373 0x0ACD, 14374 0x0B3C, 14375 0x0B4D, 14376 0x0BCD, 14377 0x0C4D, 14378 0x0CCD, 14379 0x0D4D, 14380 0x0DCA, 14381 [ 0x0E47, 0x0E4C ], 14382 0x0E4E, 14383 [ 0x0EC8, 0x0ECC ], 14384 [ 0x0F18, 0x0F19 ], 14385 0x0F35, 14386 0x0F37, 14387 0x0F39, 14388 [ 0x0F3E, 0x0F3F ], 14389 [ 0x0F82, 0x0F84 ], 14390 [ 0x0F86, 0x0F87 ], 14391 0x0FC6, 14392 0x1037, 14393 0x1039, 14394 [ 0x17C9, 0x17D3 ], 14395 [ 0x20D0, 0x20DC ], 14396 0x20E1, 14397 [ 0x302A, 0x302F ], 14398 [ 0x3099, 0x309A ], 14399 [ 0xFE20, 0xFE23 ], 14400 [ 0x1D165, 0x1D169 ], 14401 [ 0x1D16D, 0x1D172 ], 14402 [ 0x1D17B, 0x1D182 ], 14403 [ 0x1D185, 0x1D18B ], 14404 [ 0x1D1AA, 0x1D1AD ], 14405 ) { 14406 if (ref $range) { 14407 $Alpha->delete_range($range->[0], $range->[1]); 14408 } 14409 else { 14410 $Alpha->delete_range($range, $range); 14411 } 14412 } 14413 $Alpha->add_description('Alphabetic'); 14414 $Alpha->add_alias('Alphabetic'); 14415 } 14416 my $Posix_Alpha = $perl->add_match_table("PosixAlpha", 14417 Initialize => $Alpha & $ASCII, 14418 ); 14419 $Posix_Upper->set_caseless_equivalent($Posix_Alpha); 14420 $Posix_Lower->set_caseless_equivalent($Posix_Alpha); 14421 14422 my $Alnum = $perl->add_match_table('Alnum', Full_Name => 'XPosixAlnum', 14423 Description => 'Alphabetic and (decimal) Numeric', 14424 Initialize => $Alpha + $gc->table('Decimal_Number'), 14425 ); 14426 $perl->add_match_table("PosixAlnum", 14427 Initialize => $Alnum & $ASCII, 14428 ); 14429 14430 my $Word = $perl->add_match_table('Word', Full_Name => 'XPosixWord', 14431 Description => '\w, including beyond ASCII;' 14432 . ' = \p{Alnum} + \pM + \p{Pc}' 14433 . ' + \p{Join_Control}', 14434 Initialize => $Alnum + $gc->table('Mark'), 14435 ); 14436 my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1 14437 if (defined $Pc) { 14438 $Word += $Pc; 14439 } 14440 else { 14441 $Word += ord('_'); # Make sure this is a $Word 14442 } 14443 my $JC = property_ref('Join_Control'); # Wasn't in release 1 14444 if (defined $JC) { 14445 $Word += $JC->table('Y'); 14446 } 14447 else { 14448 $Word += 0x200C + 0x200D; 14449 } 14450 14451 # This is a Perl extension, so the name doesn't begin with Posix. 14452 my $PerlWord = $perl->add_match_table('PosixWord', 14453 Description => '\w, restricted to ASCII', 14454 Initialize => $Word & $ASCII, 14455 ); 14456 $PerlWord->add_alias('PerlWord'); 14457 14458 my $Blank = $perl->add_match_table('Blank', Full_Name => 'XPosixBlank', 14459 Description => '\h, Horizontal white space', 14460 14461 # 200B is Zero Width Space which is for line 14462 # break control, and was listed as 14463 # Space_Separator in early releases 14464 Initialize => $gc->table('Space_Separator') 14465 + ord("\t") 14466 - 0x200B, # ZWSP 14467 ); 14468 $Blank->add_alias('HorizSpace'); # Another name for it. 14469 $perl->add_match_table("PosixBlank", 14470 Initialize => $Blank & $ASCII, 14471 ); 14472 14473 my $VertSpace = $perl->add_match_table('VertSpace', 14474 Description => '\v', 14475 Initialize => 14476 $gc->table('Line_Separator') 14477 + $gc->table('Paragraph_Separator') 14478 + utf8::unicode_to_native(0x0A) # LINE FEED 14479 + utf8::unicode_to_native(0x0B) # VERTICAL TAB 14480 + ord("\f") 14481 + utf8::unicode_to_native(0x0D) # CARRIAGE RETURN 14482 + utf8::unicode_to_native(0x85) # NEL 14483 ); 14484 # No Posix equivalent for vertical space 14485 14486 my $Space = $perl->add_match_table('XPosixSpace', 14487 Description => '\s including beyond ASCII and vertical tab', 14488 Initialize => $Blank + $VertSpace, 14489 ); 14490 $Space->add_alias('XPerlSpace'); # Pre-existing synonyms 14491 $Space->add_alias('SpacePerl'); 14492 $Space->add_alias('Space') if $v_version lt v4.1.0; 14493 14494 my $Posix_space = $perl->add_match_table("PosixSpace", 14495 Initialize => $Space & $ASCII, 14496 ); 14497 $Posix_space->add_alias('PerlSpace'); # A pre-existing synonym 14498 14499 my $Cntrl = $perl->add_match_table('Cntrl', Full_Name => 'XPosixCntrl', 14500 Description => 'Control characters'); 14501 $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1); 14502 $perl->add_match_table("PosixCntrl", 14503 Description => "ASCII control characters", 14504 Definition => "ACK, BEL, BS, CAN, CR, DC1, DC2," 14505 . " DC3, DC4, DEL, DLE, ENQ, EOM," 14506 . " EOT, ESC, ETB, ETX, FF, FS, GS," 14507 . " HT, LF, NAK, NUL, RS, SI, SO," 14508 . " SOH, STX, SUB, SYN, US, VT", 14509 Initialize => $Cntrl & $ASCII, 14510 ); 14511 14512 my $perl_surrogate = $perl->add_match_table('_Perl_Surrogate'); 14513 my $Cs = $gc->table('Cs'); 14514 if (defined $Cs && ! $Cs->is_empty) { 14515 $perl_surrogate += $Cs; 14516 } 14517 else { 14518 push @tables_that_may_be_empty, '_Perl_Surrogate'; 14519 } 14520 14521 # $controls is a temporary used to construct Graph. 14522 my $controls = Range_List->new(Initialize => $gc->table('Unassigned') 14523 + $gc->table('Control') 14524 + $perl_surrogate); 14525 14526 # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls) 14527 my $Graph = $perl->add_match_table('Graph', Full_Name => 'XPosixGraph', 14528 Description => 'Characters that are graphical', 14529 Initialize => ~ ($Space + $controls), 14530 ); 14531 $perl->add_match_table("PosixGraph", 14532 Initialize => $Graph & $ASCII, 14533 ); 14534 14535 $print = $perl->add_match_table('Print', Full_Name => 'XPosixPrint', 14536 Description => 'Characters that are graphical plus space characters (but no controls)', 14537 Initialize => $Blank + $Graph - $gc->table('Control'), 14538 ); 14539 $perl->add_match_table("PosixPrint", 14540 Initialize => $print & $ASCII, 14541 ); 14542 14543 my $Punct = $perl->add_match_table('Punct'); 14544 $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1); 14545 14546 # \p{punct} doesn't include the symbols, which posix does 14547 my $XPosixPunct = $perl->add_match_table('XPosixPunct', 14548 Description => '\p{Punct} + ASCII-range \p{Symbol}', 14549 Initialize => $gc->table('Punctuation') 14550 + ($ASCII & $gc->table('Symbol')), 14551 Perl_Extension => 1 14552 ); 14553 $perl->add_match_table('PosixPunct', Perl_Extension => 1, 14554 Initialize => $ASCII & $XPosixPunct, 14555 ); 14556 14557 my $Digit = $perl->add_match_table('Digit', Full_Name => 'XPosixDigit', 14558 Description => '[0-9] + all other decimal digits'); 14559 $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1); 14560 my $PosixDigit = $perl->add_match_table("PosixDigit", 14561 Initialize => $Digit & $ASCII, 14562 ); 14563 14564 # Hex_Digit was not present in first release 14565 my $Xdigit = $perl->add_match_table('XDigit', Full_Name => 'XPosixXDigit'); 14566 my $Hex = property_ref('Hex_Digit'); 14567 if (defined $Hex && ! $Hex->is_empty) { 14568 $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1); 14569 } 14570 else { 14571 $Xdigit->initialize([ ord('0') .. ord('9'), 14572 ord('A') .. ord('F'), 14573 ord('a') .. ord('f'), 14574 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]); 14575 } 14576 14577 # AHex was not present in early releases 14578 my $PosixXDigit = $perl->add_match_table('PosixXDigit'); 14579 my $AHex = property_ref('ASCII_Hex_Digit'); 14580 if (defined $AHex && ! $AHex->is_empty) { 14581 $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1); 14582 } 14583 else { 14584 $PosixXDigit->initialize($Xdigit & $ASCII); 14585 $PosixXDigit->add_alias('AHex'); 14586 $PosixXDigit->add_alias('Ascii_Hex_Digit'); 14587 } 14588 14589 my $any_folds = $perl->add_match_table("_Perl_Any_Folds", 14590 Description => "Code points that particpate in some fold", 14591 ); 14592 my $loc_problem_folds = $perl->add_match_table( 14593 "_Perl_Problematic_Locale_Folds", 14594 Description => 14595 "Code points that are in some way problematic under locale", 14596 ); 14597 14598 # This allows regexec.c to skip some work when appropriate. Some of the 14599 # entries in _Perl_Problematic_Locale_Folds are multi-character folds, 14600 my $loc_problem_folds_start = $perl->add_match_table( 14601 "_Perl_Problematic_Locale_Foldeds_Start", 14602 Description => 14603 "The first character of every sequence in _Perl_Problematic_Locale_Folds", 14604 ); 14605 14606 my $cf = property_ref('Case_Folding'); 14607 14608 # Every character 0-255 is problematic because what each folds to depends 14609 # on the current locale 14610 $loc_problem_folds->add_range(0, 255); 14611 $loc_problem_folds->add_range(0x130, 0x131); # These are problematic in 14612 # Turkic locales 14613 $loc_problem_folds_start += $loc_problem_folds; 14614 14615 # Also problematic are anything these fold to outside the range. Likely 14616 # forever the only thing folded to by these outside the 0-255 range is the 14617 # GREEK SMALL MU (from the MICRO SIGN), but it's easy to make the code 14618 # completely general, which should catch any unexpected changes or errors. 14619 # We look at each code point 0-255, and add its fold (including each part 14620 # of a multi-char fold) to the list. See commit message 14621 # 31f05a37c4e9c37a7263491f2fc0237d836e1a80 for a more complete description 14622 # of the MU issue. 14623 foreach my $range ($loc_problem_folds->ranges) { 14624 foreach my $code_point ($range->start .. $range->end) { 14625 my $fold_range = $cf->containing_range($code_point); 14626 next unless defined $fold_range; 14627 14628 # Skip if folds to itself 14629 next if $fold_range->value eq $CODE_POINT; 14630 14631 my @hex_folds = split " ", $fold_range->value; 14632 my $start_cp = $hex_folds[0]; 14633 next if $start_cp eq $CODE_POINT; 14634 $start_cp = hex $start_cp; 14635 foreach my $i (0 .. @hex_folds - 1) { 14636 my $cp = $hex_folds[$i]; 14637 next if $cp eq $CODE_POINT; 14638 $cp = hex $cp; 14639 next unless $cp > 255; # Already have the < 256 ones 14640 14641 $loc_problem_folds->add_range($cp, $cp); 14642 $loc_problem_folds_start->add_range($start_cp, $start_cp); 14643 } 14644 } 14645 } 14646 14647 my $folds_to_multi_char = $perl->add_match_table( 14648 "_Perl_Folds_To_Multi_Char", 14649 Description => 14650 "Code points whose fold is a string of more than one character", 14651 ); 14652 my $in_multi_fold = $perl->add_match_table( 14653 "_Perl_Is_In_Multi_Char_Fold", 14654 Description => 14655 "Code points that are in some multiple character fold", 14656 ); 14657 if ($v_version lt v3.0.1) { 14658 push @tables_that_may_be_empty, '_Perl_Folds_To_Multi_Char', 14659 '_Perl_Is_In_Multi_Char_Fold', 14660 '_Perl_Non_Final_Folds'; 14661 } 14662 14663 # Look through all the known folds to populate these tables. 14664 foreach my $range ($cf->ranges) { 14665 next if $range->value eq $CODE_POINT; 14666 my $start = $range->start; 14667 my $end = $range->end; 14668 $any_folds->add_range($start, $end); 14669 14670 my @hex_folds = split " ", $range->value; 14671 if (@hex_folds > 1) { # Is multi-char fold 14672 $folds_to_multi_char->add_range($start, $end); 14673 } 14674 14675 my $found_locale_problematic = 0; 14676 14677 my $folded_count = @hex_folds; 14678 if ($folded_count > 3) { 14679 die Carp::my_carp("Maximum number of characters in a fold should be 3: Instead, it's $folded_count for U+" . sprintf "%04X", $range->start); 14680 } 14681 14682 # Look at each of the folded-to characters... 14683 foreach my $i (1 .. $folded_count) { 14684 my $cp = hex $hex_folds[$i-1]; 14685 $any_folds->add_range($cp, $cp); 14686 14687 # The fold is problematic if any of the folded-to characters is 14688 # already considered problematic. 14689 if ($loc_problem_folds->contains($cp)) { 14690 $loc_problem_folds->add_range($start, $end); 14691 $found_locale_problematic = 1; 14692 } 14693 14694 if ($folded_count > 1) { 14695 $in_multi_fold->add_range($cp, $cp); 14696 } 14697 } 14698 14699 # If this is a problematic fold, add to the start chars the 14700 # folding-from characters and first folded-to character. 14701 if ($found_locale_problematic) { 14702 $loc_problem_folds_start->add_range($start, $end); 14703 my $cp = hex $hex_folds[0]; 14704 $loc_problem_folds_start->add_range($cp, $cp); 14705 } 14706 } 14707 14708 my $dt = property_ref('Decomposition_Type'); 14709 $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical', 14710 Initialize => ~ ($dt->table('None') + $dt->table('Canonical')), 14711 Perl_Extension => 1, 14712 Note => 'Union of all non-canonical decompositions', 14713 ); 14714 14715 # For backward compatibility, Perl has its own definition for IDStart. 14716 # It is regular XID_Start plus the underscore, but all characters must be 14717 # Word characters as well 14718 my $XID_Start = property_ref('XID_Start'); 14719 my $perl_xids = $perl->add_match_table('_Perl_IDStart', 14720 Perl_Extension => 1, 14721 Fate => $INTERNAL_ONLY, 14722 Initialize => ord('_') 14723 ); 14724 if (defined $XID_Start 14725 || defined ($XID_Start = property_ref('ID_Start'))) 14726 { 14727 $perl_xids += $XID_Start->table('Y'); 14728 } 14729 else { 14730 # For Unicode versions that don't have the property, construct our own 14731 # from first principles. The actual definition is: 14732 # Letters 14733 # + letter numbers (Nl) 14734 # - Pattern_Syntax 14735 # - Pattern_White_Space 14736 # + stability extensions 14737 # - NKFC modifications 14738 # 14739 # What we do in the code below is to include the identical code points 14740 # that are in the first release that had Unicode's version of this 14741 # property, essentially extrapolating backwards. There were no 14742 # stability extensions until v4.1, so none are included; likewise in 14743 # no Unicode version so far do subtracting PatSyn and PatWS make any 14744 # difference, so those also are ignored. 14745 $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl(); 14746 14747 # We do subtract the NFKC modifications that are in the first version 14748 # that had this property. We don't bother to test if they are in the 14749 # version in question, because if they aren't, the operation is a 14750 # no-op. The NKFC modifications are discussed in 14751 # http://www.unicode.org/reports/tr31/#NFKC_Modifications 14752 foreach my $range ( 0x037A, 14753 0x0E33, 14754 0x0EB3, 14755 [ 0xFC5E, 0xFC63 ], 14756 [ 0xFDFA, 0xFE70 ], 14757 [ 0xFE72, 0xFE76 ], 14758 0xFE78, 14759 0xFE7A, 14760 0xFE7C, 14761 0xFE7E, 14762 [ 0xFF9E, 0xFF9F ], 14763 ) { 14764 if (ref $range) { 14765 $perl_xids->delete_range($range->[0], $range->[1]); 14766 } 14767 else { 14768 $perl_xids->delete_range($range, $range); 14769 } 14770 } 14771 } 14772 14773 $perl_xids &= $Word; 14774 14775 my $perl_xidc = $perl->add_match_table('_Perl_IDCont', 14776 Perl_Extension => 1, 14777 Fate => $INTERNAL_ONLY); 14778 my $XIDC = property_ref('XID_Continue'); 14779 if (defined $XIDC 14780 || defined ($XIDC = property_ref('ID_Continue'))) 14781 { 14782 $perl_xidc += $XIDC->table('Y'); 14783 } 14784 else { 14785 # Similarly, we construct our own XIDC if necessary for early Unicode 14786 # versions. The definition is: 14787 # everything in XIDS 14788 # + Gc=Mn 14789 # + Gc=Mc 14790 # + Gc=Nd 14791 # + Gc=Pc 14792 # - Pattern_Syntax 14793 # - Pattern_White_Space 14794 # + stability extensions 14795 # - NFKC modifications 14796 # 14797 # The same thing applies to this as with XIDS for the PatSyn, PatWS, 14798 # and stability extensions. There is a somewhat different set of NFKC 14799 # mods to remove (and add in this case). The ones below make this 14800 # have identical code points as in the first release that defined it. 14801 $perl_xidc += $perl_xids 14802 + $gc->table('L') 14803 + $gc->table('Mn') 14804 + $gc->table('Mc') 14805 + $gc->table('Nd') 14806 + utf8::unicode_to_native(0xB7) 14807 ; 14808 if (defined (my $pc = $gc->table('Pc'))) { 14809 $perl_xidc += $pc; 14810 } 14811 else { # 1.1.5 didn't have Pc, but these should have been in it 14812 $perl_xidc += 0xFF3F; 14813 $perl_xidc->add_range(0x203F, 0x2040); 14814 $perl_xidc->add_range(0xFE33, 0xFE34); 14815 $perl_xidc->add_range(0xFE4D, 0xFE4F); 14816 } 14817 14818 # Subtract the NFKC mods 14819 foreach my $range ( 0x037A, 14820 [ 0xFC5E, 0xFC63 ], 14821 [ 0xFDFA, 0xFE1F ], 14822 0xFE70, 14823 [ 0xFE72, 0xFE76 ], 14824 0xFE78, 14825 0xFE7A, 14826 0xFE7C, 14827 0xFE7E, 14828 ) { 14829 if (ref $range) { 14830 $perl_xidc->delete_range($range->[0], $range->[1]); 14831 } 14832 else { 14833 $perl_xidc->delete_range($range, $range); 14834 } 14835 } 14836 } 14837 14838 $perl_xidc &= $Word; 14839 14840 my $charname_begin = $perl->add_match_table('_Perl_Charname_Begin', 14841 Perl_Extension => 1, 14842 Fate => $INTERNAL_ONLY, 14843 Initialize => $gc->table('Letter') & $Alpha & $perl_xids, 14844 ); 14845 14846 my $charname_continue = $perl->add_match_table('_Perl_Charname_Continue', 14847 Perl_Extension => 1, 14848 Fate => $INTERNAL_ONLY, 14849 Initialize => $perl_xidc 14850 + ord(" ") 14851 + ord("(") 14852 + ord(")") 14853 + ord("-") 14854 ); 14855 14856 my @composition = ('Name', 'Unicode_1_Name', '_Perl_Name_Alias'); 14857 14858 if (@named_sequences) { 14859 push @composition, 'Named_Sequence'; 14860 foreach my $sequence (@named_sequences) { 14861 $perl_charname->add_anomalous_entry($sequence); 14862 } 14863 } 14864 14865 my $alias_sentence = ""; 14866 my %abbreviations; 14867 my $alias = property_ref('_Perl_Name_Alias'); 14868 $perl_charname->set_proxy_for('_Perl_Name_Alias'); 14869 14870 # Add each entry in _Perl_Name_Alias to Perl_Charnames. Where these go 14871 # with respect to any existing entry depends on the entry type. 14872 # Corrections go before said entry, as they should be returned in 14873 # preference over the existing entry. (A correction to a correction 14874 # should be later in the _Perl_Name_Alias table, so it will correctly 14875 # precede the erroneous correction in Perl_Charnames.) 14876 # 14877 # Abbreviations go after everything else, so they are saved temporarily in 14878 # a hash for later. 14879 # 14880 # Everything else is added afterwards, which preserves the input 14881 # ordering 14882 14883 foreach my $range ($alias->ranges) { 14884 next if $range->value eq ""; 14885 my $code_point = $range->start; 14886 if ($code_point != $range->end) { 14887 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;"); 14888 } 14889 my ($value, $type) = split ': ', $range->value; 14890 my $replace_type; 14891 if ($type eq 'correction') { 14892 $replace_type = $MULTIPLE_BEFORE; 14893 } 14894 elsif ($type eq 'abbreviation') { 14895 14896 # Save for later 14897 $abbreviations{$value} = $code_point; 14898 next; 14899 } 14900 else { 14901 $replace_type = $MULTIPLE_AFTER; 14902 } 14903 14904 # Actually add; before or after current entry(ies) as determined 14905 # above. 14906 14907 $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type); 14908 } 14909 $alias_sentence = <<END; 14910The _Perl_Name_Alias property adds duplicate code point entries that are 14911alternatives to the original name. If an addition is a corrected 14912name, it will be physically first in the table. The original (less correct, 14913but still valid) name will be next; then any alternatives, in no particular 14914order; and finally any abbreviations, again in no particular order. 14915END 14916 14917 # Now add the Unicode_1 names for the controls. The Unicode_1 names had 14918 # precedence before 6.1, including the awful ones like "LINE FEED (LF)", 14919 # so should be first in the file; the other names have precedence starting 14920 # in 6.1, 14921 my $before_or_after = ($v_version lt v6.1.0) 14922 ? $MULTIPLE_BEFORE 14923 : $MULTIPLE_AFTER; 14924 14925 foreach my $range (property_ref('Unicode_1_Name')->ranges) { 14926 my $code_point = $range->start; 14927 my $unicode_1_value = $range->value; 14928 next if $unicode_1_value eq ""; # Skip if name doesn't exist. 14929 14930 if ($code_point != $range->end) { 14931 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;"); 14932 } 14933 14934 # To handle EBCDIC, we don't hard code in the code points of the 14935 # controls; instead realizing that all of them are below 256. 14936 last if $code_point > 255; 14937 14938 # We only add in the controls. 14939 next if $gc->value_of($code_point) ne 'Cc'; 14940 14941 # We reject this Unicode1 name for later Perls, as it is used for 14942 # another code point 14943 next if $unicode_1_value eq 'BELL' && $^V ge v5.17.0; 14944 14945 # This won't add an exact duplicate. 14946 $perl_charname->add_duplicate($code_point, $unicode_1_value, 14947 Replace => $before_or_after); 14948 } 14949 14950 # Now that have everything added, add in abbreviations after 14951 # everything else. Sort so results don't change between runs of this 14952 # program 14953 foreach my $value (sort keys %abbreviations) { 14954 $perl_charname->add_duplicate($abbreviations{$value}, $value, 14955 Replace => $MULTIPLE_AFTER); 14956 } 14957 14958 my $comment; 14959 if (@composition <= 2) { # Always at least 2 14960 $comment = join " and ", @composition; 14961 } 14962 else { 14963 $comment = join ", ", @composition[0 .. scalar @composition - 2]; 14964 $comment .= ", and $composition[-1]"; 14965 } 14966 14967 $perl_charname->add_comment(join_lines( <<END 14968This file is for charnames.pm. It is the union of the $comment properties. 14969Unicode_1_Name entries are used only for nameless code points in the Name 14970property. 14971$alias_sentence 14972This file doesn't include the algorithmically determinable names. For those, 14973use 'unicore/Name.pm' 14974END 14975 )); 14976 property_ref('Name')->add_comment(join_lines( <<END 14977This file doesn't include the algorithmically determinable names. For those, 14978use 'unicore/Name.pm' 14979END 14980 )); 14981 14982 # Construct the Present_In property from the Age property. 14983 if (-e 'DAge.txt' && defined $age) { 14984 my $default_map = $age->default_map; 14985 my $in = Property->new('In', 14986 Default_Map => $default_map, 14987 Full_Name => "Present_In", 14988 Perl_Extension => 1, 14989 Type => $ENUM, 14990 Initialize => $age, 14991 ); 14992 $in->add_comment(join_lines(<<END 14993THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE. The values in this file are the 14994same as for $age, and not for what $in really means. This is because anything 14995defined in a given release should have multiple values: that release and all 14996higher ones. But only one value per code point can be represented in a table 14997like this. 14998END 14999 )); 15000 15001 # The Age tables are named like 1.5, 2.0, 2.1, .... Sort so that the 15002 # lowest numbered (earliest) come first, with the non-numeric one 15003 # last. 15004 my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/) 15005 ? 1 15006 : ($b->name !~ /^[\d.]*$/) 15007 ? -1 15008 : $a->name <=> $b->name 15009 } $age->tables; 15010 15011 # The Present_In property is the cumulative age properties. The first 15012 # one hence is identical to the first age one. 15013 my $first_in = $in->add_match_table($first_age->name); 15014 $first_in->set_equivalent_to($first_age, Related => 1); 15015 15016 my $description_start = "Code point's usage introduced in version "; 15017 $first_age->add_description($description_start . $first_age->name); 15018 foreach my $alias ($first_age->aliases) { # Include its aliases 15019 $first_in->add_alias($alias->name); 15020 } 15021 15022 # To construct the accumulated values, for each of the age tables 15023 # starting with the 2nd earliest, merge the earliest with it, to get 15024 # all those code points existing in the 2nd earliest. Repeat merging 15025 # the new 2nd earliest with the 3rd earliest to get all those existing 15026 # in the 3rd earliest, and so on. 15027 my $previous_in = $first_in; 15028 foreach my $current_age (@rest_ages) { 15029 next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric 15030 15031 my $current_in = $in->add_match_table( 15032 $current_age->name, 15033 Initialize => $current_age + $previous_in, 15034 Description => $description_start 15035 . $current_age->name 15036 . ' or earlier', 15037 ); 15038 foreach my $alias ($current_age->aliases) { 15039 $current_in->add_alias($alias->name); 15040 } 15041 $previous_in = $current_in; 15042 15043 # Add clarifying material for the corresponding age file. This is 15044 # in part because of the confusing and contradictory information 15045 # given in the Standard's documentation itself, as of 5.2. 15046 $current_age->add_description( 15047 "Code point's usage was introduced in version " 15048 . $current_age->name); 15049 $current_age->add_note("See also $in"); 15050 15051 } 15052 15053 # And finally the code points whose usages have yet to be decided are 15054 # the same in both properties. Note that permanently unassigned code 15055 # points actually have their usage assigned (as being permanently 15056 # unassigned), so that these tables are not the same as gc=cn. 15057 my $unassigned = $in->add_match_table($default_map); 15058 my $age_default = $age->table($default_map); 15059 $age_default->add_description(<<END 15060Code point's usage has not been assigned in any Unicode release thus far. 15061END 15062 ); 15063 $unassigned->set_equivalent_to($age_default, Related => 1); 15064 foreach my $alias ($age_default->aliases) { 15065 $unassigned->add_alias($alias->name); 15066 } 15067 } 15068 15069 my $patws = $perl->add_match_table('_Perl_PatWS', 15070 Perl_Extension => 1, 15071 Fate => $INTERNAL_ONLY); 15072 if (defined (my $off_patws = property_ref('Pattern_White_Space'))) { 15073 $patws->initialize($off_patws->table('Y')); 15074 } 15075 else { 15076 $patws->initialize([ ord("\t"), 15077 ord("\n"), 15078 utf8::unicode_to_native(0x0B), # VT 15079 ord("\f"), 15080 ord("\r"), 15081 ord(" "), 15082 utf8::unicode_to_native(0x85), # NEL 15083 0x200E..0x200F, # Left, Right marks 15084 0x2028..0x2029 # Line, Paragraph seps 15085 ] ); 15086 } 15087 15088 # See L<perlfunc/quotemeta> 15089 my $quotemeta = $perl->add_match_table('_Perl_Quotemeta', 15090 Perl_Extension => 1, 15091 Fate => $INTERNAL_ONLY, 15092 15093 # Initialize to what's common in 15094 # all Unicode releases. 15095 Initialize => 15096 $gc->table('Control') 15097 + $Space 15098 + $patws 15099 + ((~ $Word) & $ASCII) 15100 ); 15101 15102 if (defined (my $patsyn = property_ref('Pattern_Syntax'))) { 15103 $quotemeta += $patsyn->table('Y'); 15104 } 15105 else { 15106 $quotemeta += ((~ $Word) & Range->new(0, 255)) 15107 - utf8::unicode_to_native(0xA8) 15108 - utf8::unicode_to_native(0xAF) 15109 - utf8::unicode_to_native(0xB2) 15110 - utf8::unicode_to_native(0xB3) 15111 - utf8::unicode_to_native(0xB4) 15112 - utf8::unicode_to_native(0xB7) 15113 - utf8::unicode_to_native(0xB8) 15114 - utf8::unicode_to_native(0xB9) 15115 - utf8::unicode_to_native(0xBC) 15116 - utf8::unicode_to_native(0xBD) 15117 - utf8::unicode_to_native(0xBE); 15118 $quotemeta += [ # These are above-Latin1 patsyn; hence should be the 15119 # same in all releases 15120 0x2010 .. 0x2027, 15121 0x2030 .. 0x203E, 15122 0x2041 .. 0x2053, 15123 0x2055 .. 0x205E, 15124 0x2190 .. 0x245F, 15125 0x2500 .. 0x2775, 15126 0x2794 .. 0x2BFF, 15127 0x2E00 .. 0x2E7F, 15128 0x3001 .. 0x3003, 15129 0x3008 .. 0x3020, 15130 0x3030 .. 0x3030, 15131 0xFD3E .. 0xFD3F, 15132 0xFE45 .. 0xFE46 15133 ]; 15134 } 15135 15136 if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) { 15137 $quotemeta += $di->table('Y') 15138 } 15139 else { 15140 if ($v_version ge v2.0) { 15141 $quotemeta += $gc->table('Cf') 15142 + $gc->table('Cs'); 15143 15144 # These are above the Unicode version 1 max 15145 $quotemeta->add_range(0xE0000, 0xE0FFF); 15146 } 15147 $quotemeta += $gc->table('Cc') 15148 - $Space; 15149 my $temp = Range_List->new(Initialize => [ 0x180B .. 0x180D, 15150 0x2060 .. 0x206F, 15151 0xFE00 .. 0xFE0F, 15152 0xFFF0 .. 0xFFFB, 15153 ]); 15154 $temp->add_range(0xE0000, 0xE0FFF) if $v_version ge v2.0; 15155 $quotemeta += $temp; 15156 } 15157 calculate_DI(); 15158 $quotemeta += $DI; 15159 15160 calculate_NChar(); 15161 15162 # Finished creating all the perl properties. All non-internal non-string 15163 # ones have a synonym of 'Is_' prefixed. (Internal properties begin with 15164 # an underscore.) These do not get a separate entry in the pod file 15165 foreach my $table ($perl->tables) { 15166 foreach my $alias ($table->aliases) { 15167 next if $alias->name =~ /^_/; 15168 $table->add_alias('Is_' . $alias->name, 15169 Re_Pod_Entry => 0, 15170 UCD => 0, 15171 Status => $alias->status, 15172 OK_as_Filename => 0); 15173 } 15174 } 15175 15176 # Perl tailors the WordBreak property so that \b{wb} doesn't split 15177 # adjacent spaces into separate words. Unicode 11.0 moved in that 15178 # direction, but left TAB, FIGURE SPACE (U+2007), and (ironically) NO 15179 # BREAK SPACE as breaking, so we retained the original Perl customization. 15180 # To do this, in the Perl copy of WB, simply replace the mappings of 15181 # horizontal space characters that otherwise would map to the default or 15182 # the 11.0 'WSegSpace' to instead map to our tailoring. 15183 my $perl_wb = property_ref('_Perl_WB'); 15184 my $default = $perl_wb->default_map; 15185 for my $range ($Blank->ranges) { 15186 for my $i ($range->start .. $range->end) { 15187 my $value = $perl_wb->value_of($i); 15188 15189 next unless $value eq $default || $value eq 'WSegSpace'; 15190 $perl_wb->add_map($i, $i, 'Perl_Tailored_HSpace', 15191 Replace => $UNCONDITIONALLY); 15192 } 15193 } 15194 15195 # Also starting in Unicode 11.0, rules for some of the boundary types are 15196 # based on a non-UCD property (which we have read in if it exists). 15197 # Recall that these boundary properties partition the code points into 15198 # equivalence classes (represented as enums). 15199 # 15200 # The loop below goes through each code point that matches the non-UCD 15201 # property, and for each current equivalence class containing such a code 15202 # point, splits it so that those that are in both are now in a newly 15203 # created equivalence class whose name is a combination of the property 15204 # and the old class name, leaving unchanged everything that doesn't match 15205 # the non-UCD property. 15206 my $ep = property_ref('ExtPict'); 15207 $ep = $ep->table('Y') if defined $ep; 15208 if (defined $ep) { 15209 foreach my $base_property (property_ref('GCB'), 15210 property_ref('WB')) 15211 { 15212 my $property = property_ref('_Perl_' . $base_property->name); 15213 foreach my $range ($ep->ranges) { 15214 foreach my $i ($range->start .. $range->end) { 15215 my $current = $property->value_of($i); 15216 $current = $property->table($current)->short_name; 15217 $property->add_map($i, $i, 'ExtPict_' . $current, 15218 Replace => $UNCONDITIONALLY); 15219 } 15220 } 15221 } 15222 } 15223 15224 # Create a version of the LineBreak property with the mappings that are 15225 # omitted in the default algorithm remapped to what 15226 # http://www.unicode.org/reports/tr14 says they should be. 15227 # 15228 # First, create a plain copy, but with all property values written out in 15229 # their long form, as regen/mk_invlist.pl expects that, and also fix 15230 # occurrences of the typo in early Unicode versions: 'inseperable'. 15231 my $perl_lb = property_ref('_Perl_LB'); 15232 if (! defined $perl_lb) { 15233 $perl_lb = Property->new('_Perl_LB', 15234 Fate => $INTERNAL_ONLY, 15235 Perl_Extension => 1, 15236 Directory => $map_directory, 15237 Type => $STRING); 15238 my $lb = property_ref('Line_Break'); 15239 15240 # Populate from $lb, but use full name and fix typo. 15241 foreach my $range ($lb->ranges) { 15242 my $full_name = $lb->table($range->value)->full_name; 15243 $full_name = 'Inseparable' 15244 if standardize($full_name) eq 'inseperable'; 15245 $perl_lb->add_map($range->start, $range->end, $full_name); 15246 } 15247 } 15248 15249 # What tr14 says is this: 15250 15251 # Original Resolved General_Category 15252 # AI, SG, XX AL Any 15253 # SA CM Only Mn or Mc 15254 # SA AL Any except Mn and Mc 15255 # CJ NS Any 15256 15257 $perl_lb->set_default_map('Alphabetic', 'full_name'); # XX -> AL 15258 15259 my $ea = property_ref('East_Asian_Width'); 15260 my $Cn_EP; 15261 $Cn_EP = $ep & $gc->table('Unassigned') if defined $ep; 15262 15263 for my $range ($perl_lb->ranges) { 15264 my $value = standardize($range->value); 15265 if ( $value eq standardize('Unknown') 15266 || $value eq standardize('Ambiguous') 15267 || $value eq standardize('Surrogate')) 15268 { 15269 $perl_lb->add_map($range->start, $range->end, 'Alphabetic', 15270 Replace => $UNCONDITIONALLY); 15271 } 15272 elsif ($value eq standardize('Conditional_Japanese_Starter')) { 15273 $perl_lb->add_map($range->start, $range->end, 'Nonstarter', 15274 Replace => $UNCONDITIONALLY); 15275 } 15276 elsif ($value eq standardize('Complex_Context')) { 15277 for my $i ($range->start .. $range->end) { 15278 my $gc_val = $gc->value_of($i); 15279 if ($gc_val eq 'Mn' || $gc_val eq 'Mc') { 15280 $perl_lb->add_map($i, $i, 'Combining_Mark', 15281 Replace => $UNCONDITIONALLY); 15282 } 15283 else { 15284 $perl_lb->add_map($i, $i, 'Alphabetic', 15285 Replace => $UNCONDITIONALLY); 15286 } 15287 } 15288 } 15289 elsif (defined $ep && $value eq standardize('Ideographic')) { 15290 15291 # Unicode 14 adds a rule to not break lines before any potential 15292 # EBase, They say that any unassigned code point that is ExtPict, 15293 # is potentially an EBase. In 14.0, all such ones are in the 15294 # ExtPict=ID category. We must split that category for the 15295 # pairwise rule table to work. 15296 for my $i ($range->start .. $range->end) { 15297 if ($Cn_EP->contains($i)) { 15298 $perl_lb->add_map($i, $i, 15299 'Unassigned_Extended_Pictographic_Ideographic', 15300 Replace => $UNCONDITIONALLY); 15301 } 15302 } 15303 } 15304 elsif ( defined $ea 15305 && ( $value eq standardize('Close_Parenthesis') 15306 || $value eq standardize('Open_Punctuation'))) 15307 { 15308 # Unicode 13 splits the OP and CP properties each into East Asian, 15309 # and non-. We retain the (now somewhat misleading) names OP and 15310 # CP for the non-East Asian variety, as there are very few East 15311 # Asian ones. 15312 my $replace = ($value eq standardize('Open_Punctuation')) 15313 ? 'East_Asian_OP' 15314 : 'East_Asian_CP'; 15315 for my $i ($range->start .. $range->end) { 15316 my $ea_val = $ea->value_of($i); 15317 if ($ea_val eq 'F' || $ea_val eq 'W' || $ea_val eq 'H') { 15318 $perl_lb->add_map($i, $i, $replace, 15319 Replace => $UNCONDITIONALLY); 15320 } 15321 } 15322 } 15323 } 15324 15325 # This property is a modification of the scx property 15326 my $perl_scx = Property->new('_Perl_SCX', 15327 Fate => $INTERNAL_ONLY, 15328 Perl_Extension => 1, 15329 Directory => $map_directory, 15330 Type => $ENUM); 15331 my $source; 15332 15333 # Use scx if available; otherwise sc; if neither is there (a very old 15334 # Unicode version, just say that everything is 'Common' 15335 if (defined $scx) { 15336 $source = $scx; 15337 $perl_scx->set_default_map('Unknown'); 15338 } 15339 elsif (defined $script) { 15340 $source = $script; 15341 15342 # Early versions of 'sc', had everything be 'Common' 15343 if (defined $script->table('Unknown')) { 15344 $perl_scx->set_default_map('Unknown'); 15345 } 15346 else { 15347 $perl_scx->set_default_map('Common'); 15348 } 15349 } else { 15350 $perl_scx->add_match_table('Common'); 15351 $perl_scx->add_map(0, $MAX_UNICODE_CODEPOINT, 'Common'); 15352 15353 $perl_scx->add_match_table('Unknown'); 15354 $perl_scx->set_default_map('Unknown'); 15355 } 15356 15357 $perl_scx->_set_format($STRING_WHITE_SPACE_LIST); 15358 $perl_scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these 15359 15360 if (defined $source) { 15361 $perl_scx->initialize($source); 15362 15363 # UTS 39 says that the scx property should be modified for these 15364 # countries where certain mixed scripts are commonly used. 15365 for my $range ($perl_scx->ranges) { 15366 my $value = $range->value; 15367 my $changed = $value =~ s/ ( \b Han i? \b ) /$1 Hanb Jpan Kore/xi; 15368 $changed |= $value =~ s/ ( \b Hira (gana)? \b ) /$1 Jpan/xi; 15369 $changed |= $value =~ s/ ( \b Kata (kana)? \b ) /$1 Jpan/xi; 15370 $changed |= $value =~ s{ ( \b Katakana_or_Hiragana \b ) } 15371 {$1 Katakana Hiragana Jpan}xi; 15372 $changed |= $value =~ s/ ( \b Hang (ul)? \b ) /$1 Kore/xi; 15373 $changed |= $value =~ s/ ( \b Bopo (mofo)? \b ) /$1 Hanb/xi; 15374 15375 if ($changed) { 15376 $value = join " ", uniques split " ", $value; 15377 $range->set_value($value) 15378 } 15379 } 15380 15381 foreach my $table ($source->tables) { 15382 my $scx_table = $perl_scx->add_match_table($table->name, 15383 Full_Name => $table->full_name); 15384 foreach my $alias ($table->aliases) { 15385 $scx_table->add_alias($alias->name); 15386 } 15387 } 15388 } 15389 15390 # Here done with all the basic stuff. Ready to populate the information 15391 # about each character if annotating them. 15392 if ($annotate) { 15393 15394 # See comments at its declaration 15395 $annotate_ranges = Range_Map->new; 15396 15397 # This separates out the non-characters from the other unassigneds, so 15398 # can give different annotations for each. 15399 $unassigned_sans_noncharacters = Range_List->new( 15400 Initialize => $gc->table('Unassigned')); 15401 $unassigned_sans_noncharacters &= (~ $NChar); 15402 15403 for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT + 1; $i++ ) { 15404 $i = populate_char_info($i); # Note sets $i so may cause skips 15405 15406 } 15407 } 15408 15409 return; 15410} 15411 15412sub add_perl_synonyms() { 15413 # A number of Unicode tables have Perl synonyms that are expressed in 15414 # the single-form, \p{name}. These are: 15415 # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and 15416 # \p{Is_Name} as synonyms 15417 # \p{Script_Extensions=Value} gets \p{Value}, \p{Is_Value} as synonyms 15418 # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms 15419 # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no 15420 # conflict, \p{Value} and \p{Is_Value} as well 15421 # 15422 # This routine generates these synonyms, warning of any unexpected 15423 # conflicts. 15424 15425 # Construct the list of tables to get synonyms for. Start with all the 15426 # binary and the General_Category ones. 15427 my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY } 15428 property_ref('*'); 15429 push @tables, $gc->tables; 15430 15431 # If the version of Unicode includes the Script Extensions (preferably), 15432 # or Script property, add its tables 15433 if (defined $scx) { 15434 push @tables, $scx->tables; 15435 } 15436 else { 15437 push @tables, $script->tables if defined $script; 15438 } 15439 15440 # The Block tables are kept separate because they are treated differently. 15441 # And the earliest versions of Unicode didn't include them, so add only if 15442 # there are some. 15443 my @blocks; 15444 push @blocks, $block->tables if defined $block; 15445 15446 # Here, have the lists of tables constructed. Process blocks last so that 15447 # if there are name collisions with them, blocks have lowest priority. 15448 # Should there ever be other collisions, manual intervention would be 15449 # required. See the comments at the beginning of the program for a 15450 # possible way to handle those semi-automatically. 15451 foreach my $table (@tables, @blocks) { 15452 15453 # For non-binary properties, the synonym is just the name of the 15454 # table, like Greek, but for binary properties the synonym is the name 15455 # of the property, and means the code points in its 'Y' table. 15456 my $nominal = $table; 15457 my $nominal_property = $nominal->property; 15458 my $actual; 15459 if (! $nominal->isa('Property')) { 15460 $actual = $table; 15461 } 15462 else { 15463 15464 # Here is a binary property. Use the 'Y' table. Verify that is 15465 # there 15466 my $yes = $nominal->table('Y'); 15467 unless (defined $yes) { # Must be defined, but is permissible to 15468 # be empty. 15469 Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping."); 15470 next; 15471 } 15472 $actual = $yes; 15473 } 15474 15475 foreach my $alias ($nominal->aliases) { 15476 15477 # Attempt to create a table in the perl directory for the 15478 # candidate table, using whatever aliases in it that don't 15479 # conflict. Also add non-conflicting aliases for all these 15480 # prefixed by 'Is_' (and/or 'In_' for Block property tables) 15481 PREFIX: 15482 foreach my $prefix ("", 'Is_', 'In_') { 15483 15484 # Only Block properties can have added 'In_' aliases. 15485 next if $prefix eq 'In_' and $nominal_property != $block; 15486 15487 my $proposed_name = $prefix . $alias->name; 15488 15489 # No Is_Is, In_In, nor combinations thereof 15490 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x; 15491 next if $proposed_name =~ /^ I [ns] _I [ns] _/x; 15492 15493 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace; 15494 15495 # Get a reference to any existing table in the perl 15496 # directory with the desired name. 15497 my $pre_existing = $perl->table($proposed_name); 15498 15499 if (! defined $pre_existing) { 15500 15501 # No name collision, so OK to add the perl synonym. 15502 15503 my $make_re_pod_entry; 15504 my $ok_as_filename; 15505 my $status = $alias->status; 15506 if ($nominal_property == $block) { 15507 15508 # For block properties, only the compound form is 15509 # preferred for external use; the others are 15510 # discouraged. The pod file contains wild cards for 15511 # the 'In' and 'Is' forms so no entries for those; and 15512 # we don't want people using the name without any 15513 # prefix, so discourage that. 15514 if ($prefix eq "") { 15515 $make_re_pod_entry = 1; 15516 $status = $status || $DISCOURAGED; 15517 $ok_as_filename = 0; 15518 } 15519 elsif ($prefix eq 'In_') { 15520 $make_re_pod_entry = 0; 15521 $status = $status || $DISCOURAGED; 15522 $ok_as_filename = 1; 15523 } 15524 else { 15525 $make_re_pod_entry = 0; 15526 $status = $status || $DISCOURAGED; 15527 $ok_as_filename = 0; 15528 } 15529 } 15530 elsif ($prefix ne "") { 15531 15532 # The 'Is' prefix is handled in the pod by a wild 15533 # card, and we won't use it for an external name 15534 $make_re_pod_entry = 0; 15535 $status = $status || $NORMAL; 15536 $ok_as_filename = 0; 15537 } 15538 else { 15539 15540 # Here, is an empty prefix, non block. This gets its 15541 # own pod entry and can be used for an external name. 15542 $make_re_pod_entry = 1; 15543 $status = $status || $NORMAL; 15544 $ok_as_filename = 1; 15545 } 15546 15547 # Here, there isn't a perl pre-existing table with the 15548 # name. Look through the list of equivalents of this 15549 # table to see if one is a perl table. 15550 foreach my $equivalent ($actual->leader->equivalents) { 15551 next if $equivalent->property != $perl; 15552 15553 # Here, have found a table for $perl. Add this alias 15554 # to it, and are done with this prefix. 15555 $equivalent->add_alias($proposed_name, 15556 Re_Pod_Entry => $make_re_pod_entry, 15557 15558 # Currently don't output these in the 15559 # ucd pod, as are strongly discouraged 15560 # from being used 15561 UCD => 0, 15562 15563 Status => $status, 15564 OK_as_Filename => $ok_as_filename); 15565 trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace; 15566 next PREFIX; 15567 } 15568 15569 # Here, $perl doesn't already have a table that is a 15570 # synonym for this property, add one. 15571 my $added_table = $perl->add_match_table($proposed_name, 15572 Re_Pod_Entry => $make_re_pod_entry, 15573 15574 # See UCD comment just above 15575 UCD => 0, 15576 15577 Status => $status, 15578 OK_as_Filename => $ok_as_filename); 15579 # And it will be related to the actual table, since it is 15580 # based on it. 15581 $added_table->set_equivalent_to($actual, Related => 1); 15582 trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace; 15583 next; 15584 } # End of no pre-existing. 15585 15586 # Here, there is a pre-existing table that has the proposed 15587 # name. We could be in trouble, but not if this is just a 15588 # synonym for another table that we have already made a child 15589 # of the pre-existing one. 15590 if ($pre_existing->is_set_equivalent_to($actual)) { 15591 trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace; 15592 $pre_existing->add_alias($proposed_name); 15593 next; 15594 } 15595 15596 # Here, there is a name collision, but it still could be OK if 15597 # the tables match the identical set of code points, in which 15598 # case, we can combine the names. Compare each table's code 15599 # point list to see if they are identical. 15600 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace; 15601 if ($pre_existing->matches_identically_to($actual)) { 15602 15603 # Here, they do match identically. Not a real conflict. 15604 # Make the perl version a child of the Unicode one, except 15605 # in the non-obvious case of where the perl name is 15606 # already a synonym of another Unicode property. (This is 15607 # excluded by the test for it being its own parent.) The 15608 # reason for this exclusion is that then the two Unicode 15609 # properties become related; and we don't really know if 15610 # they are or not. We generate documentation based on 15611 # relatedness, and this would be misleading. Code 15612 # later executed in the process will cause the tables to 15613 # be represented by a single file anyway, without making 15614 # it look in the pod like they are necessarily related. 15615 if ($pre_existing->parent == $pre_existing 15616 && ($pre_existing->property == $perl 15617 || $actual->property == $perl)) 15618 { 15619 trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace; 15620 $pre_existing->set_equivalent_to($actual, Related => 1); 15621 } 15622 elsif (main::DEBUG && $to_trace) { 15623 trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases"; 15624 trace $pre_existing->parent; 15625 } 15626 next PREFIX; 15627 } 15628 15629 # Here they didn't match identically, there is a real conflict 15630 # between our new name and a pre-existing property. 15631 $actual->add_conflicting($proposed_name, 'p', $pre_existing); 15632 $pre_existing->add_conflicting($nominal->full_name, 15633 'p', 15634 $actual); 15635 15636 # Don't output a warning for aliases for the block 15637 # properties (unless they start with 'In_') as it is 15638 # expected that there will be conflicts and the block 15639 # form loses. 15640 if ($verbosity >= $NORMAL_VERBOSITY 15641 && ($actual->property != $block || $prefix eq 'In_')) 15642 { 15643 print simple_fold(join_lines(<<END 15644There is already an alias named $proposed_name (from $pre_existing), 15645so not creating this alias for $actual 15646END 15647 ), "", 4); 15648 } 15649 15650 # Keep track for documentation purposes. 15651 $has_In_conflicts++ if $prefix eq 'In_'; 15652 $has_Is_conflicts++ if $prefix eq 'Is_'; 15653 } 15654 } 15655 } 15656 15657 # There are some properties which have No and Yes (and N and Y) as 15658 # property values, but aren't binary, and could possibly be confused with 15659 # binary ones. So create caveats for them. There are tables that are 15660 # named 'No', and tables that are named 'N', but confusion is not likely 15661 # unless they are the same table. For example, N meaning Number or 15662 # Neutral is not likely to cause confusion, so don't add caveats to things 15663 # like them. 15664 foreach my $property (grep { $_->type != $BINARY 15665 && $_->type != $FORCED_BINARY } 15666 property_ref('*')) 15667 { 15668 my $yes = $property->table('Yes'); 15669 if (defined $yes) { 15670 my $y = $property->table('Y'); 15671 if (defined $y && $yes == $y) { 15672 foreach my $alias ($property->aliases) { 15673 $yes->add_conflicting($alias->name); 15674 } 15675 } 15676 } 15677 my $no = $property->table('No'); 15678 if (defined $no) { 15679 my $n = $property->table('N'); 15680 if (defined $n && $no == $n) { 15681 foreach my $alias ($property->aliases) { 15682 $no->add_conflicting($alias->name, 'P'); 15683 } 15684 } 15685 } 15686 } 15687 15688 return; 15689} 15690 15691sub register_file_for_name($table, $directory_ref, $file) { 15692 # Given info about a table and a datafile that it should be associated 15693 # with, register that association 15694 15695 # $directory_ref # Array of the directory path for the file 15696 # $file # The file name in the final directory. 15697 15698 trace "table=$table, file=$file, directory=@$directory_ref, fate=", $table->fate if main::DEBUG && $to_trace; 15699 15700 if ($table->isa('Property')) { 15701 $table->set_file_path(@$directory_ref, $file); 15702 push @map_properties, $table; 15703 15704 # No swash means don't do the rest of this. 15705 return if $table->fate != $ORDINARY 15706 && ! ($table->name =~ /^_/ && $table->fate == $INTERNAL_ONLY); 15707 15708 # Get the path to the file 15709 my @path = $table->file_path; 15710 15711 # Use just the file name if no subdirectory. 15712 shift @path if $path[0] eq File::Spec->curdir(); 15713 15714 my $file = join '/', @path; 15715 15716 # Create a hash entry for Unicode::UCD to get the file that stores this 15717 # property's map table 15718 foreach my $alias ($table->aliases) { 15719 my $name = $alias->name; 15720 if ($name =~ /^_/) { 15721 $strict_property_to_file_of{lc $name} = $file; 15722 } 15723 else { 15724 $loose_property_to_file_of{standardize($name)} = $file; 15725 } 15726 } 15727 15728 # And a way for Unicode::UCD to find the proper key in the SwashInfo 15729 # hash for this property. 15730 $file_to_swash_name{$file} = "To" . $table->swash_name; 15731 return; 15732 } 15733 15734 # Do all of the work for all equivalent tables when called with the leader 15735 # table, so skip if isn't the leader. 15736 return if $table->leader != $table; 15737 15738 # If this is a complement of another file, use that other file instead, 15739 # with a ! prepended to it. 15740 my $complement; 15741 if (($complement = $table->complement) != 0) { 15742 my @directories = $complement->file_path; 15743 15744 # This assumes that the 0th element is something like 'lib', 15745 # the 1th element the property name (in its own directory), like 15746 # 'AHex', and the 2th element the file like 'Y' which will have a .pl 15747 # appended to it later. 15748 $directories[1] =~ s/^/!/; 15749 $file = pop @directories; 15750 $directory_ref =\@directories; 15751 } 15752 15753 # Join all the file path components together, using slashes. 15754 my $full_filename = join('/', @$directory_ref, $file); 15755 15756 # All go in the same subdirectory of unicore, or the special 15757 # pseudo-directory '#' 15758 if ($directory_ref->[0] !~ / ^ $matches_directory | \# $ /x) { 15759 Carp::my_carp("Unexpected directory in " 15760 . join('/', @{$directory_ref}, $file)); 15761 } 15762 15763 # For this table and all its equivalents ... 15764 foreach my $table ($table, $table->equivalents) { 15765 15766 # Associate it with its file internally. Don't include the 15767 # $matches_directory first component 15768 $table->set_file_path(@$directory_ref, $file); 15769 15770 # No swash means don't do the rest of this. 15771 next if $table->isa('Map_Table') && $table->fate != $ORDINARY; 15772 15773 my $sub_filename = join('/', $directory_ref->[1, -1], $file); 15774 15775 my $property = $table->property; 15776 my $property_name = ($property == $perl) 15777 ? "" # 'perl' is never explicitly stated 15778 : standardize($property->name) . '='; 15779 15780 my $is_default = 0; # Is this table the default one for the property? 15781 15782 # To calculate $is_default, we find if this table is the same as the 15783 # default one for the property. But this is complicated by the 15784 # possibility that there is a master table for this one, and the 15785 # information is stored there instead of here. 15786 my $parent = $table->parent; 15787 my $leader_prop = $parent->property; 15788 my $default_map = $leader_prop->default_map; 15789 if (defined $default_map) { 15790 my $default_table = $leader_prop->table($default_map); 15791 $is_default = 1 if defined $default_table && $parent == $default_table; 15792 } 15793 15794 # Calculate the loose name for this table. Mostly it's just its name, 15795 # standardized. But in the case of Perl tables that are single-form 15796 # equivalents to Unicode properties, it is the latter's name. 15797 my $loose_table_name = 15798 ($property != $perl || $leader_prop == $perl) 15799 ? standardize($table->name) 15800 : standardize($parent->name); 15801 15802 my $deprecated = ($table->status eq $DEPRECATED) 15803 ? $table->status_info 15804 : ""; 15805 my $caseless_equivalent = $table->caseless_equivalent; 15806 15807 # And for each of the table's aliases... This inner loop eventually 15808 # goes through all aliases in the UCD that we generate regex match 15809 # files for 15810 foreach my $alias ($table->aliases) { 15811 my $standard = UCD_name($table, $alias); 15812 15813 # Generate an entry in either the loose or strict hashes, which 15814 # will translate the property and alias names combination into the 15815 # file where the table for them is stored. 15816 if ($alias->loose_match) { 15817 if (exists $loose_to_file_of{$standard}) { 15818 Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'."); 15819 } 15820 else { 15821 $loose_to_file_of{$standard} = $sub_filename; 15822 } 15823 } 15824 else { 15825 if (exists $stricter_to_file_of{$standard}) { 15826 Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'."); 15827 } 15828 else { 15829 $stricter_to_file_of{$standard} = $sub_filename; 15830 15831 # Tightly coupled with how Unicode::UCD works, for a 15832 # floating point number that is a whole number, get rid of 15833 # the trailing decimal point and 0's, so that Unicode::UCD 15834 # will work. Also note that this assumes that such a 15835 # number is matched strictly; so if that were to change, 15836 # this would be wrong. 15837 if ((my $integer_name = $alias->name) 15838 =~ s/^ ( -? \d+ ) \.0+ $ /$1/x) 15839 { 15840 $stricter_to_file_of{$property_name . $integer_name} 15841 = $sub_filename; 15842 } 15843 } 15844 } 15845 15846 # For Unicode::UCD, create a mapping of the prop=value to the 15847 # canonical =value for that property. 15848 if ($standard =~ /=/) { 15849 15850 # This could happen if a strict name mapped into an existing 15851 # loose name. In that event, the strict names would have to 15852 # be moved to a new hash. 15853 if (exists($loose_to_standard_value{$standard})) { 15854 Carp::my_carp_bug("'$standard' conflicts with a pre-existing use. Bad News. Continuing anyway"); 15855 } 15856 $loose_to_standard_value{$standard} = $loose_table_name; 15857 } 15858 15859 # Keep a list of the deprecated properties and their filenames 15860 if ($deprecated && $complement == 0) { 15861 $Unicode::UCD::why_deprecated{$sub_filename} = $deprecated; 15862 } 15863 15864 # And a substitute table, if any, for case-insensitive matching 15865 if ($caseless_equivalent != 0) { 15866 $caseless_equivalent_to{$standard} = $caseless_equivalent; 15867 } 15868 15869 # Add to defaults list if the table this alias belongs to is the 15870 # default one 15871 $loose_defaults{$standard} = 1 if $is_default; 15872 } 15873 } 15874 15875 return; 15876} 15877 15878{ # Closure 15879 my %base_names; # Names already used for avoiding DOS 8.3 filesystem 15880 # conflicts 15881 my %full_dir_name_of; # Full length names of directories used. 15882 15883 sub construct_filename($name, $mutable, $directories_ref) { 15884 # Return a file name for a table, based on the table name, but perhaps 15885 # changed to get rid of non-portable characters in it, and to make 15886 # sure that it is unique on a file system that allows the names before 15887 # any period to be at most 8 characters (DOS). While we're at it 15888 # check and complain if there are any directory conflicts. 15889 15890 # $name # The name to start with 15891 # $mutable # Boolean: can it be changed? If no, but 15892 # yet it must be to work properly, a warning 15893 # is given 15894 # $directories_ref # A reference to an array containing the 15895 # path to the file, with each element one path 15896 # component. This is used because the same 15897 # name can be used in different directories. 15898 15899 my $warn = ! defined wantarray; # If true, then if the name is 15900 # changed, a warning is issued as well. 15901 15902 if (! defined $name) { 15903 Carp::my_carp("Undefined name in directory " 15904 . File::Spec->join(@$directories_ref) 15905 . ". '_' used"); 15906 return '_'; 15907 } 15908 15909 # Make sure that no directory names conflict with each other. Look at 15910 # each directory in the input file's path. If it is already in use, 15911 # assume it is correct, and is merely being re-used, but if we 15912 # truncate it to 8 characters, and find that there are two directories 15913 # that are the same for the first 8 characters, but differ after that, 15914 # then that is a problem. 15915 foreach my $directory (@$directories_ref) { 15916 my $short_dir = substr($directory, 0, 8); 15917 if (defined $full_dir_name_of{$short_dir}) { 15918 next if $full_dir_name_of{$short_dir} eq $directory; 15919 Carp::my_carp("Directory $directory conflicts with directory $full_dir_name_of{$short_dir}. Bad News. Continuing anyway"); 15920 } 15921 else { 15922 $full_dir_name_of{$short_dir} = $directory; 15923 } 15924 } 15925 15926 my $path = join '/', @$directories_ref; 15927 $path .= '/' if $path; 15928 15929 # Remove interior underscores. 15930 (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg; 15931 15932 # Convert the dot in floating point numbers to an underscore 15933 $filename =~ s/\./_/ if $filename =~ / ^ \d+ \. \d+ $ /x; 15934 15935 my $suffix = ""; 15936 15937 # Extract any suffix, delete any non-word character, and truncate to 3 15938 # after the dot 15939 if ($filename =~ m/ ( .*? ) ( \. .* ) /x) { 15940 $filename = $1; 15941 $suffix = $2; 15942 $suffix =~ s/\W+//g; 15943 substr($suffix, 4) = "" if length($suffix) > 4; 15944 } 15945 15946 # Change any non-word character outside the suffix into an underscore, 15947 # and truncate to 8. 15948 $filename =~ s/\W+/_/g; # eg., "L&" -> "L_" 15949 substr($filename, 8) = "" if length($filename) > 8; 15950 15951 # Make sure the basename doesn't conflict with something we 15952 # might have already written. If we have, say, 15953 # InGreekExtended1 15954 # InGreekExtended2 15955 # they become 15956 # InGreekE 15957 # InGreek2 15958 my $warned = 0; 15959 while (my $num = $base_names{$path}{lc "$filename$suffix"}++) { 15960 $num++; # so basenames with numbers start with '2', which 15961 # just looks more natural. 15962 15963 # Want to append $num, but if it'll make the basename longer 15964 # than 8 characters, pre-truncate $filename so that the result 15965 # is acceptable. 15966 my $delta = length($filename) + length($num) - 8; 15967 if ($delta > 0) { 15968 substr($filename, -$delta) = $num; 15969 } 15970 else { 15971 $filename .= $num; 15972 } 15973 if ($warn && ! $warned) { 15974 $warned = 1; 15975 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway."); 15976 } 15977 } 15978 15979 return $filename if $mutable; 15980 15981 # If not changeable, must return the input name, but warn if needed to 15982 # change it beyond shortening it. 15983 if ($name ne $filename 15984 && substr($name, 0, length($filename)) ne $filename) { 15985 Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway."); 15986 } 15987 return $name; 15988 } 15989} 15990 15991# The pod file contains a very large table. Many of the lines in that table 15992# would exceed a typical output window's size, and so need to be wrapped with 15993# a hanging indent to make them look good. The pod language is really 15994# insufficient here. There is no general construct to do that in pod, so it 15995# is done here by beginning each such line with a space to cause the result to 15996# be output without formatting, and doing all the formatting here. This leads 15997# to the result that if the eventual display window is too narrow it won't 15998# look good, and if the window is too wide, no advantage is taken of that 15999# extra width. A further complication is that the output may be indented by 16000# the formatter so that there is less space than expected. What I (khw) have 16001# done is to assume that that indent is a particular number of spaces based on 16002# what it is in my Linux system; people can always resize their windows if 16003# necessary, but this is obviously less than desirable, but the best that can 16004# be expected. 16005my $automatic_pod_indent = 8; 16006 16007# Try to format so that uses fewest lines, but few long left column entries 16008# slide into the right column. An experiment on 5.1 data yielded the 16009# following percentages that didn't cut into the other side along with the 16010# associated first-column widths 16011# 69% = 24 16012# 80% not too bad except for a few blocks 16013# 90% = 33; # , cuts 353/3053 lines from 37 = 12% 16014# 95% = 37; 16015my $indent_info_column = 27; # 75% of lines didn't have overlap 16016 16017my $FILLER = 3; # Length of initial boiler-plate columns in a pod line 16018 # The 3 is because of: 16019 # 1 for the leading space to tell the pod formatter to 16020 # output as-is 16021 # 1 for the flag 16022 # 1 for the space between the flag and the main data 16023 16024sub format_pod_line($first_column_width, $entry, $info, $status = "", $loose_match = 1 ) { 16025 # Take a pod line and return it, formatted properly 16026 16027 # $entry Contents of left column 16028 # $info Contents of right column 16029 16030 my $flags = ""; 16031 $flags .= $STRICTER if ! $loose_match; 16032 16033 $flags .= $status if $status; 16034 16035 # There is a blank in the left column to cause the pod formatter to 16036 # output the line as-is. 16037 return sprintf " %-*s%-*s %s\n", 16038 # The first * in the format is replaced by this, the -1 is 16039 # to account for the leading blank. There isn't a 16040 # hard-coded blank after this to separate the flags from 16041 # the rest of the line, so that in the unlikely event that 16042 # multiple flags are shown on the same line, they both 16043 # will get displayed at the expense of that separation, 16044 # but since they are left justified, a blank will be 16045 # inserted in the normal case. 16046 $FILLER - 1, 16047 $flags, 16048 16049 # The other * in the format is replaced by this number to 16050 # cause the first main column to right fill with blanks. 16051 # The -1 is for the guaranteed blank following it. 16052 $first_column_width - $FILLER - 1, 16053 $entry, 16054 $info; 16055} 16056 16057my @zero_match_tables; # List of tables that have no matches in this release 16058 16059sub make_re_pod_entries($input_table) { 16060 # This generates the entries for the pod file for a given table. 16061 # Also done at this time are any children tables. The output looks like: 16062 # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178) 16063 16064 # Generate parent and all its children at the same time. 16065 return if $input_table->parent != $input_table; 16066 16067 my $property = $input_table->property; 16068 my $type = $property->type; 16069 my $full_name = $property->full_name; 16070 16071 my $count = $input_table->count; 16072 my $unicode_count; 16073 my $non_unicode_string; 16074 if ($count > $MAX_UNICODE_CODEPOINTS) { 16075 $unicode_count = $count - ($MAX_WORKING_CODEPOINT 16076 - $MAX_UNICODE_CODEPOINT); 16077 $non_unicode_string = " plus all above-Unicode code points"; 16078 } 16079 else { 16080 $unicode_count = $count; 16081 $non_unicode_string = ""; 16082 } 16083 16084 my $string_count = clarify_number($unicode_count) . $non_unicode_string; 16085 16086 my $definition = $input_table->calculate_table_definition; 16087 if ($definition) { 16088 16089 # Save the definition for later use. 16090 $input_table->set_definition($definition); 16091 16092 $definition = ": $definition"; 16093 } 16094 16095 my $status = $input_table->status; 16096 my $status_info = $input_table->status_info; 16097 my $caseless_equivalent = $input_table->caseless_equivalent; 16098 16099 # Don't mention a placeholder equivalent as it isn't to be listed in the 16100 # pod 16101 $caseless_equivalent = 0 if $caseless_equivalent != 0 16102 && $caseless_equivalent->fate > $ORDINARY; 16103 16104 my $entry_for_first_table; # The entry for the first table output. 16105 # Almost certainly, it is the parent. 16106 16107 # For each related table (including itself), we will generate a pod entry 16108 # for each name each table goes by 16109 foreach my $table ($input_table, $input_table->children) { 16110 16111 # Unicode::UCD cannot deal with null string property values, so skip 16112 # any tables that have no non-null names. 16113 next if ! grep { $_->name ne "" } $table->aliases; 16114 16115 # First, gather all the info that applies to this table as a whole. 16116 16117 push @zero_match_tables, $table if $count == 0 16118 # Don't mention special tables 16119 # as being zero length 16120 && $table->fate == $ORDINARY; 16121 16122 my $table_property = $table->property; 16123 16124 # The short name has all the underscores removed, while the full name 16125 # retains them. Later, we decide whether to output a short synonym 16126 # for the full one, we need to compare apples to apples, so we use the 16127 # short name's length including underscores. 16128 my $table_property_short_name_length; 16129 my $table_property_short_name 16130 = $table_property->short_name(\$table_property_short_name_length); 16131 my $table_property_full_name = $table_property->full_name; 16132 16133 # Get how much savings there is in the short name over the full one 16134 # (delta will always be <= 0) 16135 my $table_property_short_delta = $table_property_short_name_length 16136 - length($table_property_full_name); 16137 my @table_description = $table->description; 16138 my @table_note = $table->note; 16139 16140 # Generate an entry for each alias in this table. 16141 my $entry_for_first_alias; # saves the first one encountered. 16142 foreach my $alias ($table->aliases) { 16143 16144 # Skip if not to go in pod. 16145 next unless $alias->make_re_pod_entry; 16146 16147 # Start gathering all the components for the entry 16148 my $name = $alias->name; 16149 16150 # Skip if name is empty, as can't be accessed by regexes. 16151 next if $name eq ""; 16152 16153 my $entry; # Holds the left column, may include extras 16154 my $entry_ref; # To refer to the left column's contents from 16155 # another entry; has no extras 16156 16157 # First the left column of the pod entry. Tables for the $perl 16158 # property always use the single form. 16159 if ($table_property == $perl) { 16160 $entry = "\\p{$name}"; 16161 $entry .= " \\p$name" if length $name == 1; # Show non-braced 16162 # form too 16163 $entry_ref = "\\p{$name}"; 16164 } 16165 else { # Compound form. 16166 16167 # Only generate one entry for all the aliases that mean true 16168 # or false in binary properties. Append a '*' to indicate 16169 # some are missing. (The heading comment notes this.) 16170 my $rhs; 16171 if ($type == $BINARY) { 16172 next if $name ne 'N' && $name ne 'Y'; 16173 $rhs = "$name*"; 16174 } 16175 elsif ($type != $FORCED_BINARY) { 16176 $rhs = $name; 16177 } 16178 else { 16179 16180 # Forced binary properties require special handling. It 16181 # has two sets of tables, one set is true/false; and the 16182 # other set is everything else. Entries are generated for 16183 # each set. Use the Bidi_Mirrored property (which appears 16184 # in all Unicode versions) to get a list of the aliases 16185 # for the true/false tables. Of these, only output the N 16186 # and Y ones, the same as, a regular binary property. And 16187 # output all the rest, same as a non-binary property. 16188 my $bm = property_ref("Bidi_Mirrored"); 16189 if ($name eq 'N' || $name eq 'Y') { 16190 $rhs = "$name*"; 16191 } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases, 16192 $bm->table("N")->aliases) 16193 { 16194 next; 16195 } 16196 else { 16197 $rhs = $name; 16198 } 16199 } 16200 16201 # Colon-space is used to give a little more space to be easier 16202 # to read; 16203 $entry = "\\p{" 16204 . $table_property_full_name 16205 . ": $rhs}"; 16206 16207 # But for the reference to this entry, which will go in the 16208 # right column, where space is at a premium, use equals 16209 # without a space 16210 $entry_ref = "\\p{" . $table_property_full_name . "=$name}"; 16211 } 16212 16213 # Then the right (info) column. This is stored as components of 16214 # an array for the moment, then joined into a string later. For 16215 # non-internal only properties, begin the info with the entry for 16216 # the first table we encountered (if any), as things are ordered 16217 # so that that one is the most descriptive. This leads to the 16218 # info column of an entry being a more descriptive version of the 16219 # name column 16220 my @info; 16221 if ($name =~ /^_/) { 16222 push @info, 16223 '(For internal use by Perl, not necessarily stable)'; 16224 } 16225 elsif ($entry_for_first_alias) { 16226 push @info, $entry_for_first_alias; 16227 } 16228 16229 # If this entry is equivalent to another, add that to the info, 16230 # using the first such table we encountered 16231 if ($entry_for_first_table) { 16232 if (@info) { 16233 push @info, "(= $entry_for_first_table)"; 16234 } 16235 else { 16236 push @info, $entry_for_first_table; 16237 } 16238 } 16239 16240 # If the name is a large integer, add an equivalent with an 16241 # exponent for better readability 16242 if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) { 16243 push @info, sprintf "(= %.1e)", $name 16244 } 16245 16246 my $parenthesized = ""; 16247 if (! $entry_for_first_alias) { 16248 16249 # This is the first alias for the current table. The alias 16250 # array is ordered so that this is the fullest, most 16251 # descriptive alias, so it gets the fullest info. The other 16252 # aliases are mostly merely pointers to this one, using the 16253 # information already added above. 16254 16255 # Display any status message, but only on the parent table 16256 if ($status && ! $entry_for_first_table) { 16257 push @info, $status_info; 16258 } 16259 16260 # Put out any descriptive info 16261 if (@table_description || @table_note) { 16262 push @info, join "; ", @table_description, @table_note; 16263 } 16264 16265 # Look to see if there is a shorter name we can point people 16266 # at 16267 my $standard_name = standardize($name); 16268 my $short_name; 16269 my $proposed_short = $table->short_name; 16270 if (defined $proposed_short) { 16271 my $standard_short = standardize($proposed_short); 16272 16273 # If the short name is shorter than the standard one, or 16274 # even if it's not, but the combination of it and its 16275 # short property name (as in \p{prop=short} ($perl doesn't 16276 # have this form)) saves at least two characters, then, 16277 # cause it to be listed as a shorter synonym. 16278 if (length $standard_short < length $standard_name 16279 || ($table_property != $perl 16280 && (length($standard_short) 16281 - length($standard_name) 16282 + $table_property_short_delta) # (<= 0) 16283 < -2)) 16284 { 16285 $short_name = $proposed_short; 16286 if ($table_property != $perl) { 16287 $short_name = $table_property_short_name 16288 . "=$short_name"; 16289 } 16290 $short_name = "\\p{$short_name}"; 16291 } 16292 } 16293 16294 # And if this is a compound form name, see if there is a 16295 # single form equivalent 16296 my $single_form; 16297 if ($table_property != $perl && $table_property != $block) { 16298 16299 # Special case the binary N tables, so that will print 16300 # \P{single}, but use the Y table values to populate 16301 # 'single', as we haven't likewise populated the N table. 16302 # For forced binary tables, we can't just look at the N 16303 # table, but must see if this table is equivalent to the N 16304 # one, as there are two equivalent beasts in these 16305 # properties. 16306 my $test_table; 16307 my $p; 16308 if ( ($type == $BINARY 16309 && $input_table == $property->table('No')) 16310 || ($type == $FORCED_BINARY 16311 && $property->table('No')-> 16312 is_set_equivalent_to($input_table))) 16313 { 16314 $test_table = $property->table('Yes'); 16315 $p = 'P'; 16316 } 16317 else { 16318 $test_table = $input_table; 16319 $p = 'p'; 16320 } 16321 16322 # Look for a single form amongst all the children. 16323 foreach my $table ($test_table->children) { 16324 next if $table->property != $perl; 16325 my $proposed_name = $table->short_name; 16326 next if ! defined $proposed_name; 16327 16328 # Don't mention internal-only properties as a possible 16329 # single form synonym 16330 next if substr($proposed_name, 0, 1) eq '_'; 16331 16332 $proposed_name = "\\$p\{$proposed_name}"; 16333 if (! defined $single_form 16334 || length($proposed_name) < length $single_form) 16335 { 16336 $single_form = $proposed_name; 16337 16338 # The goal here is to find a single form; not the 16339 # shortest possible one. We've already found a 16340 # short name. So, stop at the first single form 16341 # found, which is likely to be closer to the 16342 # original. 16343 last; 16344 } 16345 } 16346 } 16347 16348 # Output both short and single in the same parenthesized 16349 # expression, but with only one of 'Single', 'Short' if there 16350 # are both items. 16351 if ($short_name || $single_form || $table->conflicting) { 16352 $parenthesized .= "Short: $short_name" if $short_name; 16353 if ($short_name && $single_form) { 16354 $parenthesized .= ', '; 16355 } 16356 elsif ($single_form) { 16357 $parenthesized .= 'Single: '; 16358 } 16359 $parenthesized .= $single_form if $single_form; 16360 } 16361 } 16362 16363 if ($caseless_equivalent != 0) { 16364 $parenthesized .= '; ' if $parenthesized ne ""; 16365 $parenthesized .= "/i= " . $caseless_equivalent->complete_name; 16366 } 16367 16368 16369 # Warn if this property isn't the same as one that a 16370 # semi-casual user might expect. The other components of this 16371 # parenthesized structure are calculated only for the first entry 16372 # for this table, but the conflicting is deemed important enough 16373 # to go on every entry. 16374 my $conflicting = join " NOR ", $table->conflicting; 16375 if ($conflicting) { 16376 $parenthesized .= '; ' if $parenthesized ne ""; 16377 $parenthesized .= "NOT $conflicting"; 16378 } 16379 16380 push @info, "($parenthesized)" if $parenthesized; 16381 16382 if ($name =~ /_$/ && $alias->loose_match) { 16383 push @info, "Note the trailing '_' matters in spite of loose matching rules."; 16384 } 16385 16386 if ($table_property != $perl && $table->perl_extension) { 16387 push @info, '(Perl extension)'; 16388 } 16389 my $definition = $table->definition // ""; 16390 $definition = "" if $entry_for_first_alias; 16391 $definition = ": $definition" if $definition; 16392 push @info, "($string_count$definition)"; 16393 16394 # Now, we have both the entry and info so add them to the 16395 # list of all the properties. 16396 push @match_properties, 16397 format_pod_line($indent_info_column, 16398 $entry, 16399 join( " ", @info), 16400 $alias->status, 16401 $alias->loose_match); 16402 16403 $entry_for_first_alias = $entry_ref unless $entry_for_first_alias; 16404 } # End of looping through the aliases for this table. 16405 16406 if (! $entry_for_first_table) { 16407 $entry_for_first_table = $entry_for_first_alias; 16408 } 16409 } # End of looping through all the related tables 16410 return; 16411} 16412 16413sub make_ucd_table_pod_entries($table) { 16414 # Generate the entries for the UCD section of the pod for $table. This 16415 # also calculates if names are ambiguous, so has to be called even if the 16416 # pod is not being output 16417 16418 my $short_name = $table->name; 16419 my $standard_short_name = standardize($short_name); 16420 my $full_name = $table->full_name; 16421 my $standard_full_name = standardize($full_name); 16422 16423 my $full_info = ""; # Text of info column for full-name entries 16424 my $other_info = ""; # Text of info column for short-name entries 16425 my $short_info = ""; # Text of info column for other entries 16426 my $meaning = ""; # Synonym of this table 16427 16428 my $property = ($table->isa('Property')) 16429 ? $table 16430 : $table->parent->property; 16431 16432 my $perl_extension = $table->perl_extension; 16433 my $is_perl_extension_match_table_but_not_dollar_perl 16434 = $property != $perl 16435 && $perl_extension 16436 && $property != $table; 16437 16438 # Get the more official name for perl extensions that aren't 16439 # stand-alone properties 16440 if ($is_perl_extension_match_table_but_not_dollar_perl) { 16441 if ($property->type == $BINARY) { 16442 $meaning = $property->full_name; 16443 } 16444 else { 16445 $meaning = $table->parent->complete_name; 16446 } 16447 } 16448 16449 # There are three types of info column. One for the short name, one for 16450 # the full name, and one for everything else. They mostly are the same, 16451 # so initialize in the same loop. 16452 16453 foreach my $info_ref (\$full_info, \$short_info, \$other_info) { 16454 if ($info_ref != \$full_info) { 16455 16456 # The non-full name columns include the full name 16457 $$info_ref .= $full_name; 16458 } 16459 16460 16461 if ($is_perl_extension_match_table_but_not_dollar_perl) { 16462 16463 # Add the synonymous name for the non-full name entries; and to 16464 # the full-name entry if it adds extra information 16465 if ( standardize($meaning) ne $standard_full_name 16466 || $info_ref == \$other_info 16467 || $info_ref == \$short_info) 16468 { 16469 my $parenthesized = $info_ref != \$full_info; 16470 $$info_ref .= " " if $$info_ref && $parenthesized; 16471 $$info_ref .= "(=" if $parenthesized; 16472 $$info_ref .= "$meaning"; 16473 $$info_ref .= ")" if $parenthesized; 16474 $$info_ref .= "."; 16475 } 16476 } 16477 16478 # And the full-name entry includes the short name, if shorter 16479 if ($info_ref == \$full_info 16480 && length $standard_short_name < length $standard_full_name) 16481 { 16482 $full_info =~ s/\.\Z//; 16483 $full_info .= " " if $full_info; 16484 $full_info .= "(Short: $short_name)"; 16485 } 16486 16487 if ($table->perl_extension) { 16488 $$info_ref =~ s/\.\Z//; 16489 $$info_ref .= ". " if $$info_ref; 16490 $$info_ref .= "(Perl extension)"; 16491 } 16492 } 16493 16494 my $definition; 16495 my $definition_table; 16496 my $type = $table->property->type; 16497 if ($type == $BINARY || $type == $FORCED_BINARY) { 16498 $definition_table = $table->property->table('Y'); 16499 } 16500 elsif ($table->isa('Match_Table')) { 16501 $definition_table = $table; 16502 } 16503 16504 $definition = $definition_table->calculate_table_definition 16505 if defined $definition_table 16506 && $definition_table != 0; 16507 16508 # Add any extra annotations to the full name entry 16509 foreach my $more_info ($table->description, 16510 $definition, 16511 $table->note, 16512 $table->status_info) 16513 { 16514 next unless $more_info; 16515 $full_info =~ s/\.\Z//; 16516 $full_info .= ". " if $full_info; 16517 $full_info .= $more_info; 16518 } 16519 if ($table->property->type == $FORCED_BINARY) { 16520 if ($full_info) { 16521 $full_info =~ s/\.\Z//; 16522 $full_info .= ". "; 16523 } 16524 $full_info .= "This is a combination property which has both:" 16525 . " 1) a map to various string values; and" 16526 . " 2) a map to boolean Y/N, where 'Y' means the" 16527 . " string value is non-empty. Add the prefix 'is'" 16528 . " to the prop_invmap() call to get the latter"; 16529 } 16530 16531 # These keep track if have created full and short name pod entries for the 16532 # property 16533 my $done_full = 0; 16534 my $done_short = 0; 16535 16536 # Every possible name is kept track of, even those that aren't going to be 16537 # output. This way we can be sure to find the ambiguities. 16538 foreach my $alias ($table->aliases) { 16539 my $name = $alias->name; 16540 my $standard = standardize($name); 16541 my $info; 16542 my $output_this = $alias->ucd; 16543 16544 # If the full and short names are the same, we want to output the full 16545 # one's entry, so it has priority. 16546 if ($standard eq $standard_full_name) { 16547 next if $done_full; 16548 $done_full = 1; 16549 $info = $full_info; 16550 } 16551 elsif ($standard eq $standard_short_name) { 16552 next if $done_short; 16553 $done_short = 1; 16554 next if $standard_short_name eq $standard_full_name; 16555 $info = $short_info; 16556 } 16557 else { 16558 $info = $other_info; 16559 } 16560 16561 $combination_property{$standard} = 1 16562 if $table->property->type == $FORCED_BINARY; 16563 16564 # Here, we have set up the two columns for this entry. But if an 16565 # entry already exists for this name, we have to decide which one 16566 # we're going to later output. 16567 if (exists $ucd_pod{$standard}) { 16568 16569 # If the two entries refer to the same property, it's not going to 16570 # be ambiguous. (Likely it's because the names when standardized 16571 # are the same.) But that means if they are different properties, 16572 # there is ambiguity. 16573 if ($ucd_pod{$standard}->{'property'} != $property) { 16574 16575 # Here, we have an ambiguity. This code assumes that one is 16576 # scheduled to be output and one not and that one is a perl 16577 # extension (which is not to be output) and the other isn't. 16578 # If those assumptions are wrong, things have to be rethought. 16579 if ($ucd_pod{$standard}{'output_this'} == $output_this 16580 || $ucd_pod{$standard}{'perl_extension'} == $perl_extension 16581 || $output_this == $perl_extension) 16582 { 16583 Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations. Proceeding anyway."); 16584 } 16585 16586 # We modify the info column of the one being output to 16587 # indicate the ambiguity. Set $which to point to that one's 16588 # info. 16589 my $which; 16590 if ($ucd_pod{$standard}{'output_this'}) { 16591 $which = \$ucd_pod{$standard}->{'info'}; 16592 } 16593 else { 16594 $which = \$info; 16595 $meaning = $ucd_pod{$standard}{'meaning'}; 16596 } 16597 16598 chomp $$which; 16599 $$which =~ s/\.\Z//; 16600 $$which .= "; NOT '$standard' meaning '$meaning'"; 16601 16602 $ambiguous_names{$standard} = 1; 16603 } 16604 16605 # Use the non-perl-extension variant 16606 next unless $ucd_pod{$standard}{'perl_extension'}; 16607 } 16608 16609 # Store enough information about this entry that we can later look for 16610 # ambiguities, and output it properly. 16611 $ucd_pod{$standard} = { 'name' => $name, 16612 'info' => $info, 16613 'meaning' => $meaning, 16614 'output_this' => $output_this, 16615 'perl_extension' => $perl_extension, 16616 'property' => $property, 16617 'status' => $alias->status, 16618 }; 16619 } # End of looping through all this table's aliases 16620 16621 return; 16622} 16623 16624sub pod_alphanumeric_sort { 16625 # Sort pod entries alphanumerically. 16626 16627 # The first few character columns are filler, plus the '\p{'; and get rid 16628 # of all the trailing stuff, starting with the trailing '}', so as to sort 16629 # on just 'Name=Value' 16630 (my $a = lc $a) =~ s/^ .*? \{ //x; 16631 $a =~ s/}.*//; 16632 (my $b = lc $b) =~ s/^ .*? \{ //x; 16633 $b =~ s/}.*//; 16634 16635 # Determine if the two operands are both internal only or both not. 16636 # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3 16637 # should be the underscore that begins internal only 16638 my $a_is_internal = (substr($a, 0, 1) eq '_'); 16639 my $b_is_internal = (substr($b, 0, 1) eq '_'); 16640 16641 # Sort so the internals come last in the table instead of first (which the 16642 # leading underscore would otherwise indicate). 16643 if ($a_is_internal != $b_is_internal) { 16644 return 1 if $a_is_internal; 16645 return -1 16646 } 16647 16648 # Determine if the two operands are compound or not, and if so if are 16649 # "numeric" property values or not, like \p{Age: 3.0}. But there are also 16650 # things like \p{Canonical_Combining_Class: CCC133} and \p{Age: V10_0}, 16651 # all of which this considers numeric, and for sorting, looks just at the 16652 # numeric parts. It can also be a rational like \p{Numeric Value=-1/2}. 16653 my $split_re = qr/ 16654 ^ ( [^:=]+ ) # $1 is undef if not a compound form, otherwise is the 16655 # property name 16656 [:=] \s* # The syntax for the compound form 16657 (?: # followed by ... 16658 ( # $2 gets defined if what follows is a "numeric" 16659 # expression, which is ... 16660 ( -? \d+ (?: [.\/] \d+)? # An integer, float, or rational 16661 # number, optionally signed 16662 | [[:alpha:]]{2,} \d+ $ ) # or something like CCC131. Either 16663 # of these go into $3 16664 | ( V \d+ _ \d+ ) # or a Unicode's Age property version 16665 # number, into $4 16666 ) 16667 | .* $ # If not "numeric", accept anything so that $1 gets 16668 # defined if it is any compound form 16669 ) /ix; 16670 my ($a_initial, $a_numeric, $a_number, $a_version) = ($a =~ $split_re); 16671 my ($b_initial, $b_numeric, $b_number, $b_version) = ($b =~ $split_re); 16672 16673 # Sort alphabeticlly on the whole property name if either operand isn't 16674 # compound, or they differ. 16675 return $a cmp $b if ! defined $a_initial 16676 || ! defined $b_initial 16677 || $a_initial ne $b_initial; 16678 16679 if (! defined $a_numeric) { 16680 16681 # If neither is numeric, use alpha sort 16682 return $a cmp $b if ! defined $b_numeric; 16683 return 1; # Sort numeric ahead of alpha 16684 } 16685 16686 # Here $a is numeric 16687 return -1 if ! defined $b_numeric; # Numeric sorts before alpha 16688 16689 # Here they are both numeric in the same property. 16690 # Convert version numbers into regular numbers 16691 if (defined $a_version) { 16692 ($a_number = $a_version) =~ s/^V//i; 16693 $a_number =~ s/_/./; 16694 } 16695 else { # Otherwise get rid of the, e.g., CCC in CCC9 */ 16696 $a_number =~ s/ ^ [[:alpha:]]+ //x; 16697 } 16698 if (defined $b_version) { 16699 ($b_number = $b_version) =~ s/^V//i; 16700 $b_number =~ s/_/./; 16701 } 16702 else { 16703 $b_number =~ s/ ^ [[:alpha:]]+ //x; 16704 } 16705 16706 # Convert rationals to floating for the comparison. 16707 $a_number = eval $a_number if $a_number =~ qr{/}; 16708 $b_number = eval $b_number if $b_number =~ qr{/}; 16709 16710 return $a_number <=> $b_number || $a cmp $b; 16711} 16712 16713sub make_pod () { 16714 # Create the .pod file. This generates the various subsections and then 16715 # combines them in one big HERE document. 16716 16717 my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)"; 16718 16719 return unless defined $pod_directory; 16720 print "Making pod file\n" if $verbosity >= $PROGRESS; 16721 16722 my $exception_message = 16723 '(Any exceptions are individually noted beginning with the word NOT.)'; 16724 my @block_warning; 16725 if (-e 'Blocks.txt') { 16726 16727 # Add the line: '\p{In_*} \p{Block: *}', with the warning message 16728 # if the global $has_In_conflicts indicates we have them. 16729 push @match_properties, format_pod_line($indent_info_column, 16730 '\p{In_*}', 16731 '\p{Block: *}' 16732 . (($has_In_conflicts) 16733 ? " $exception_message" 16734 : ""), 16735 $DISCOURAGED); 16736 @block_warning = << "END"; 16737 16738In particular, matches in the Block property have single forms 16739defined by Perl that begin with C<"In_">, C<"Is_>, or even with no prefix at 16740all, Like all B<DISCOURAGED> forms, these are not stable. For example, 16741C<\\p{Block=Deseret}> can currently be written as C<\\p{In_Deseret}>, 16742C<\\p{Is_Deseret}>, or C<\\p{Deseret}>. But, a new Unicode version may 16743come along that would force Perl to change the meaning of one or more of 16744these, and your program would no longer be correct. Currently there are no 16745such conflicts with the form that begins C<"In_">, but there are many with the 16746other two shortcuts, and Unicode continues to define new properties that begin 16747with C<"In">, so it's quite possible that a conflict will occur in the future. 16748The compound form is guaranteed to not become obsolete, and its meaning is 16749clearer anyway. See L<perlunicode/"Blocks"> for more information about this. 16750 16751User-defined properties must begin with "In" or "Is". These override any 16752Unicode property of the same name. 16753END 16754 } 16755 my $text = $Is_flags_text; 16756 $text = "$exception_message $text" if $has_Is_conflicts; 16757 16758 # And the 'Is_ line'; 16759 push @match_properties, format_pod_line($indent_info_column, 16760 '\p{Is_*}', 16761 "\\p{*} $text"); 16762 push @match_properties, format_pod_line($indent_info_column, 16763 '\p{Name=*}', 16764 "Combination of Name and Name_Alias properties; has special" 16765 . " loose matching rules, for which see Unicode UAX #44"); 16766 push @match_properties, format_pod_line($indent_info_column, 16767 '\p{Na=*}', 16768 '\p{Name=*}'); 16769 16770 # Sort the properties array for output. It is sorted alphabetically 16771 # except numerically for numeric properties, and only output unique lines. 16772 @match_properties = sort pod_alphanumeric_sort uniques @match_properties; 16773 16774 my $formatted_properties = simple_fold(\@match_properties, 16775 "", 16776 # indent succeeding lines by two extra 16777 # which looks better 16778 $indent_info_column + 2, 16779 16780 # shorten the line length by how much 16781 # the formatter indents, so the folded 16782 # line will fit in the space 16783 # presumably available 16784 $automatic_pod_indent); 16785 # Add column headings, indented to be a little more centered, but not 16786 # exactly 16787 $formatted_properties = format_pod_line($indent_info_column, 16788 ' NAME', 16789 ' INFO') 16790 . "\n" 16791 . $formatted_properties; 16792 16793 # Generate pod documentation lines for the tables that match nothing 16794 my $zero_matches = ""; 16795 if (@zero_match_tables) { 16796 @zero_match_tables = uniques(@zero_match_tables); 16797 $zero_matches = join "\n\n", 16798 map { $_ = '=item \p{' . $_->complete_name . "}" } 16799 sort { $a->complete_name cmp $b->complete_name } 16800 @zero_match_tables; 16801 16802 $zero_matches = <<END; 16803 16804=head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters 16805 16806Unicode has some property-value pairs that currently don't match anything. 16807This happens generally either because they are obsolete, or they exist for 16808symmetry with other forms, but no language has yet been encoded that uses 16809them. In this version of Unicode, the following match zero code points: 16810 16811=over 4 16812 16813$zero_matches 16814 16815=back 16816 16817END 16818 } 16819 16820 # Generate list of properties that we don't accept, grouped by the reasons 16821 # why. This is so only put out the 'why' once, and then list all the 16822 # properties that have that reason under it. 16823 16824 my %why_list; # The keys are the reasons; the values are lists of 16825 # properties that have the key as their reason 16826 16827 # For each property, add it to the list that are suppressed for its reason 16828 # The sort will cause the alphabetically first properties to be added to 16829 # each list first, so each list will be sorted. 16830 foreach my $property (sort keys %why_suppressed) { 16831 next unless $why_suppressed{$property}; 16832 push @{$why_list{$why_suppressed{$property}}}, $property; 16833 } 16834 16835 # For each reason (sorted by the first property that has that reason)... 16836 my @bad_re_properties; 16837 foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] } 16838 keys %why_list) 16839 { 16840 # Add to the output, all the properties that have that reason. 16841 my $has_item = 0; # Flag if actually output anything. 16842 foreach my $name (@{$why_list{$why}}) { 16843 16844 # Split compound names into $property and $table components 16845 my $property = $name; 16846 my $table; 16847 if ($property =~ / (.*) = (.*) /x) { 16848 $property = $1; 16849 $table = $2; 16850 } 16851 16852 # This release of Unicode may not have a property that is 16853 # suppressed, so don't reference a non-existent one. 16854 $property = property_ref($property); 16855 next if ! defined $property; 16856 16857 # And since this list is only for match tables, don't list the 16858 # ones that don't have match tables. 16859 next if ! $property->to_create_match_tables; 16860 16861 # Find any abbreviation, and turn it into a compound name if this 16862 # is a property=value pair. 16863 my $short_name = $property->name; 16864 $short_name .= '=' . $property->table($table)->name if $table; 16865 16866 # Start with an empty line. 16867 push @bad_re_properties, "\n\n" unless $has_item; 16868 16869 # And add the property as an item for the reason. 16870 push @bad_re_properties, "\n=item I<$name> ($short_name)\n"; 16871 $has_item = 1; 16872 } 16873 16874 # And add the reason under the list of properties, if such a list 16875 # actually got generated. Note that the header got added 16876 # unconditionally before. But pod ignores extra blank lines, so no 16877 # harm. 16878 push @bad_re_properties, "\n$why\n" if $has_item; 16879 16880 } # End of looping through each reason. 16881 16882 if (! @bad_re_properties) { 16883 push @bad_re_properties, 16884 "*** This installation accepts ALL non-Unihan properties ***"; 16885 } 16886 else { 16887 # Add =over only if non-empty to avoid an empty =over/=back section, 16888 # which is considered bad form. 16889 unshift @bad_re_properties, "\n=over 4\n"; 16890 push @bad_re_properties, "\n=back\n"; 16891 } 16892 16893 # Similarly, generate a list of files that we don't use, grouped by the 16894 # reasons why (Don't output if the reason is empty). First, create a hash 16895 # whose keys are the reasons, and whose values are anonymous arrays of all 16896 # the files that share that reason. 16897 my %grouped_by_reason; 16898 foreach my $file (keys %skipped_files) { 16899 next unless $skipped_files{$file}; 16900 push @{$grouped_by_reason{$skipped_files{$file}}}, $file; 16901 } 16902 16903 # Then, sort each group. 16904 foreach my $group (keys %grouped_by_reason) { 16905 @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b } 16906 @{$grouped_by_reason{$group}} ; 16907 } 16908 16909 # Finally, create the output text. For each reason (sorted by the 16910 # alphabetically first file that has that reason)... 16911 my @unused_files; 16912 foreach my $reason (sort { lc $grouped_by_reason{$a}->[0] 16913 cmp lc $grouped_by_reason{$b}->[0] 16914 } 16915 keys %grouped_by_reason) 16916 { 16917 # Add all the files that have that reason to the output. Start 16918 # with an empty line. 16919 push @unused_files, "\n\n"; 16920 push @unused_files, map { "\n=item F<$_> \n" } 16921 @{$grouped_by_reason{$reason}}; 16922 # And add the reason under the list of files 16923 push @unused_files, "\n$reason\n"; 16924 } 16925 16926 # Similarly, create the output text for the UCD section of the pod 16927 my @ucd_pod; 16928 foreach my $key (keys %ucd_pod) { 16929 next unless $ucd_pod{$key}->{'output_this'}; 16930 push @ucd_pod, format_pod_line($indent_info_column, 16931 $ucd_pod{$key}->{'name'}, 16932 $ucd_pod{$key}->{'info'}, 16933 $ucd_pod{$key}->{'status'}, 16934 ); 16935 } 16936 16937 # Sort alphabetically, and fold for output 16938 @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod; 16939 my $ucd_pod = simple_fold(\@ucd_pod, 16940 ' ', 16941 $indent_info_column, 16942 $automatic_pod_indent); 16943 $ucd_pod = format_pod_line($indent_info_column, 'NAME', ' INFO') 16944 . "\n" 16945 . $ucd_pod; 16946 my $space_hex = sprintf("%02x", ord " "); 16947 local $" = ""; 16948 16949 # Everything is ready to assemble. 16950 my @OUT = << "END"; 16951=begin comment 16952 16953$HEADER 16954 16955To change this file, edit $0 instead. 16956 16957=end comment 16958 16959=head1 NAME 16960 16961$pod_file - Index of Unicode Version $unicode_version character properties in Perl 16962 16963=head1 DESCRIPTION 16964 16965This document provides information about the portion of the Unicode database 16966that deals with character properties, that is the portion that is defined on 16967single code points. (L</Other information in the Unicode data base> 16968below briefly mentions other data that Unicode provides.) 16969 16970Perl can provide access to all non-provisional Unicode character properties, 16971though not all are enabled by default. The omitted ones are the Unihan 16972properties and certain 16973deprecated or Unicode-internal properties. (An installation may choose to 16974recompile Perl's tables to change this. See L</Unicode character 16975properties that are NOT accepted by Perl>.) 16976 16977For most purposes, access to Unicode properties from the Perl core is through 16978regular expression matches, as described in the next section. 16979For some special purposes, and to access the properties that are not suitable 16980for regular expression matching, all the Unicode character properties that 16981Perl handles are accessible via the standard L<Unicode::UCD> module, as 16982described in the section L</Properties accessible through Unicode::UCD>. 16983 16984Perl also provides some additional extensions and short-cut synonyms 16985for Unicode properties. 16986 16987This document merely lists all available properties and does not attempt to 16988explain what each property really means. There is a brief description of each 16989Perl extension; see L<perlunicode/Other Properties> for more information on 16990these. There is some detail about Blocks, Scripts, General_Category, 16991and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the 16992official Unicode properties, refer to the Unicode standard. A good starting 16993place is L<$unicode_reference_url>. 16994 16995Note that you can define your own properties; see 16996L<perlunicode/"User-Defined Character Properties">. 16997 16998=head1 Properties accessible through C<\\p{}> and C<\\P{}> 16999 17000The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to 17001most of the Unicode character properties. The table below shows all these 17002constructs, both single and compound forms. 17003 17004B<Compound forms> consist of two components, separated by an equals sign or a 17005colon. The first component is the property name, and the second component is 17006the particular value of the property to match against, for example, 17007C<\\p{Script_Extensions: Greek}> and C<\\p{Script_Extensions=Greek}> both mean 17008to match characters whose Script_Extensions property value is Greek. 17009(C<Script_Extensions> is an improved version of the C<Script> property.) 17010 17011B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for 17012their equivalent compound forms. The table shows these equivalences. (In our 17013example, C<\\p{Greek}> is a just a shortcut for 17014C<\\p{Script_Extensions=Greek}>). There are also a few Perl-defined single 17015forms that are not shortcuts for a compound form. One such is C<\\p{Word}>. 17016These are also listed in the table. 17017 17018In parsing these constructs, Perl always ignores Upper/lower case differences 17019everywhere within the {braces}. Thus C<\\p{Greek}> means the same thing as 17020C<\\p{greek}>. But note that changing the case of the C<"p"> or C<"P"> before 17021the left brace completely changes the meaning of the construct, from "match" 17022(for C<\\p{}>) to "doesn't match" (for C<\\P{}>). Casing in this document is 17023for improved legibility. 17024 17025Also, white space, hyphens, and underscores are normally ignored 17026everywhere between the {braces}, and hence can be freely added or removed 17027even if the C</x> modifier hasn't been specified on the regular expression. 17028But in the table below $a_bold_stricter at the beginning of an entry 17029means that tighter (stricter) rules are used for that entry: 17030 17031=over 4 17032 17033=over 4 17034 17035=item Single form (C<\\p{name}>) tighter rules: 17036 17037White space, hyphens, and underscores ARE significant 17038except for: 17039 17040=over 4 17041 17042=item * white space adjacent to a non-word character 17043 17044=item * underscores separating digits in numbers 17045 17046=back 17047 17048That means, for example, that you can freely add or remove white space 17049adjacent to (but within) the braces without affecting the meaning. 17050 17051=item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules: 17052 17053The tighter rules given above for the single form apply to everything to the 17054right of the colon or equals; the looser rules still apply to everything to 17055the left. 17056 17057That means, for example, that you can freely add or remove white space 17058adjacent to (but within) the braces and the colon or equal sign. 17059 17060=back 17061 17062=back 17063 17064Some properties are considered obsolete by Unicode, but still available. 17065There are several varieties of obsolescence: 17066 17067=over 4 17068 17069=over 4 17070 17071=item Stabilized 17072 17073A property may be stabilized. Such a determination does not indicate 17074that the property should or should not be used; instead it is a declaration 17075that the property will not be maintained nor extended for newly encoded 17076characters. Such properties are marked with $a_bold_stabilized in the 17077table. 17078 17079=item Deprecated 17080 17081A property may be deprecated, perhaps because its original intent 17082has been replaced by another property, or because its specification was 17083somehow defective. This means that its use is strongly 17084discouraged, so much so that a warning will be issued if used, unless the 17085regular expression is in the scope of a C<S<no warnings 'deprecated'>> 17086statement. $A_bold_deprecated flags each such entry in the table, and 17087the entry there for the longest, most descriptive version of the property will 17088give the reason it is deprecated, and perhaps advice. Perl may issue such a 17089warning, even for properties that aren't officially deprecated by Unicode, 17090when there used to be characters or code points that were matched by them, but 17091no longer. This is to warn you that your program may not work like it did on 17092earlier Unicode releases. 17093 17094A deprecated property may be made unavailable in a future Perl version, so it 17095is best to move away from them. 17096 17097A deprecated property may also be stabilized, but this fact is not shown. 17098 17099=item Obsolete 17100 17101Properties marked with $a_bold_obsolete in the table are considered (plain) 17102obsolete. Generally this designation is given to properties that Unicode once 17103used for internal purposes (but not any longer). 17104 17105=item Discouraged 17106 17107This is not actually a Unicode-specified obsolescence, but applies to certain 17108Perl extensions that are present for backwards compatibility, but are 17109discouraged from being used. These are not obsolete, but their meanings are 17110not stable. Future Unicode versions could force any of these extensions to be 17111removed without warning, replaced by another property with the same name that 17112means something different. $A_bold_discouraged flags each such entry in the 17113table. Use the equivalent shown instead. 17114 17115@block_warning 17116 17117=back 17118 17119=back 17120 17121The table below has two columns. The left column contains the C<\\p{}> 17122constructs to look up, possibly preceded by the flags mentioned above; and 17123the right column contains information about them, like a description, or 17124synonyms. The table shows both the single and compound forms for each 17125property that has them. If the left column is a short name for a property, 17126the right column will give its longer, more descriptive name; and if the left 17127column is the longest name, the right column will show any equivalent shortest 17128name, in both single and compound forms if applicable. 17129 17130If braces are not needed to specify a property (e.g., C<\\pL>), the left 17131column contains both forms, with and without braces. 17132 17133The right column will also caution you if a property means something different 17134than what might normally be expected. 17135 17136All single forms are Perl extensions; a few compound forms are as well, and 17137are noted as such. 17138 17139Numbers in (parentheses) indicate the total number of Unicode code points 17140matched by the property. For the entries that give the longest, most 17141descriptive version of the property, the count is followed by a list of some 17142of the code points matched by it. The list includes all the matched 17143characters in the 0-255 range, enclosed in the familiar [brackets] the same as 17144a regular expression bracketed character class. Following that, the next few 17145higher matching ranges are also given. To avoid visual ambiguity, the SPACE 17146character is represented as C<\\x$space_hex>. 17147 17148For emphasis, those properties that match no code points at all are listed as 17149well in a separate section following the table. 17150 17151Most properties match the same code points regardless of whether C<"/i"> 17152case-insensitive matching is specified or not. But a few properties are 17153affected. These are shown with the notation S<C<(/i= I<other_property>)>> 17154in the second column. Under case-insensitive matching they match the 17155same code pode points as the property I<other_property>. 17156 17157There is no description given for most non-Perl defined properties (See 17158L<$unicode_reference_url> for that). 17159 17160For compactness, 'B<*>' is used as a wildcard instead of showing all possible 17161combinations. For example, entries like: 17162 17163 \\p{Gc: *} \\p{General_Category: *} 17164 17165mean that 'Gc' is a synonym for 'General_Category', and anything that is valid 17166for the latter is also valid for the former. Similarly, 17167 17168 \\p{Is_*} \\p{*} 17169 17170means that if and only if, for example, C<\\p{Foo}> exists, then 17171C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing. 17172And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and 17173C<\\p{IsFoo=Bar}>. "*" here is restricted to something not beginning with an 17174underscore. 17175 17176Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'. 17177And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and 17178'N*' to indicate this, and doesn't have separate entries for the other 17179possibilities. Note that not all properties which have values 'Yes' and 'No' 17180are binary, and they have all their values spelled out without using this wild 17181card, and a C<NOT> clause in their description that highlights their not being 17182binary. These also require the compound form to match them, whereas true 17183binary properties have both single and compound forms available. 17184 17185Note that all non-essential underscores are removed in the display of the 17186short names below. 17187 17188B<Legend summary:> 17189 17190=over 4 17191 17192=item Z<>B<*> is a wild-card 17193 17194=item B<(\\d+)> in the info column gives the number of Unicode code points matched 17195by this property. 17196 17197=item B<$DEPRECATED> means this is deprecated. 17198 17199=item B<$OBSOLETE> means this is obsolete. 17200 17201=item B<$STABILIZED> means this is stabilized. 17202 17203=item B<$STRICTER> means tighter (stricter) name matching applies. 17204 17205=item B<$DISCOURAGED> means use of this form is discouraged, and may not be 17206stable. 17207 17208=back 17209 17210$formatted_properties 17211 17212$zero_matches 17213 17214=head1 Properties accessible through Unicode::UCD 17215 17216The value of any Unicode (not including Perl extensions) character 17217property mentioned above for any single code point is available through 17218L<Unicode::UCD/charprop()>. L<Unicode::UCD/charprops_all()> returns the 17219values of all the Unicode properties for a given code point. 17220 17221Besides these, all the Unicode character properties mentioned above 17222(except for those marked as for internal use by Perl) are also 17223accessible by L<Unicode::UCD/prop_invlist()>. 17224 17225Due to their nature, not all Unicode character properties are suitable for 17226regular expression matches, nor C<prop_invlist()>. The remaining 17227non-provisional, non-internal ones are accessible via 17228L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation 17229hasn't included; see L<below for which those are|/Unicode character properties 17230that are NOT accepted by Perl>). 17231 17232For compatibility with other parts of Perl, all the single forms given in the 17233table in the L<section above|/Properties accessible through \\p{} and \\P{}> 17234are recognized. BUT, there are some ambiguities between some Perl extensions 17235and the Unicode properties, all of which are silently resolved in favor of the 17236official Unicode property. To avoid surprises, you should only use 17237C<prop_invmap()> for forms listed in the table below, which omits the 17238non-recommended ones. The affected forms are the Perl single form equivalents 17239of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of 17240C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property, 17241whose short name is C<sc>. The table indicates the current ambiguities in the 17242INFO column, beginning with the word C<"NOT">. 17243 17244The standard Unicode properties listed below are documented in 17245L<$unicode_reference_url>; Perl_Decimal_Digit is documented in 17246L<Unicode::UCD/prop_invmap()>. The other Perl extensions are in 17247L<perlunicode/Other Properties>; 17248 17249The first column in the table is a name for the property; the second column is 17250an alternative name, if any, plus possibly some annotations. The alternative 17251name is the property's full name, unless that would simply repeat the first 17252column, in which case the second column indicates the property's short name 17253(if different). The annotations are given only in the entry for the full 17254name. The annotations for binary properties include a list of the first few 17255ranges that the property matches. To avoid any ambiguity, the SPACE character 17256is represented as C<\\x$space_hex>. 17257 17258If a property is obsolete, etc, the entry will be flagged with the same 17259characters used in the table in the L<section above|/Properties accessible 17260through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>. 17261 17262$ucd_pod 17263 17264=head1 Properties accessible through other means 17265 17266Certain properties are accessible also via core function calls. These are: 17267 17268 Lowercase_Mapping lc() and lcfirst() 17269 Titlecase_Mapping ucfirst() 17270 Uppercase_Mapping uc() 17271 17272Also, Case_Folding is accessible through the C</i> modifier in regular 17273expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>> 17274operator. 17275 17276Besides being able to say C<\\p{Name=...}>, the Name and Name_Aliases 17277properties are accessible through the C<\\N{}> interpolation in double-quoted 17278strings and regular expressions; and functions C<charnames::viacode()>, 17279C<charnames::vianame()>, and C<charnames::string_vianame()> (which require a 17280C<use charnames ();> to be specified. 17281 17282Finally, most properties related to decomposition are accessible via 17283L<Unicode::Normalize>. 17284 17285=head1 Unicode character properties that are NOT accepted by Perl 17286 17287Perl will generate an error for a few character properties in Unicode when 17288used in a regular expression. The non-Unihan ones are listed below, with the 17289reasons they are not accepted, perhaps with work-arounds. The short names for 17290the properties are listed enclosed in (parentheses). 17291As described after the list, an installation can change the defaults and choose 17292to accept any of these. The list is machine generated based on the 17293choices made for the installation that generated this document. 17294 17295@bad_re_properties 17296 17297An installation can choose to allow any of these to be matched by downloading 17298the Unicode database from L<http://www.unicode.org/Public/> to 17299C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the 17300controlling lists contained in the program 17301C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing. 17302(C<\%Config> is available from the Config module). 17303 17304Also, perl can be recompiled to operate on an earlier version of the Unicode 17305standard. Further information is at 17306C<\$Config{privlib}>/F<unicore/README.perl>. 17307 17308=head1 Other information in the Unicode data base 17309 17310The Unicode data base is delivered in two different formats. The XML version 17311is valid for more modern Unicode releases. The other version is a collection 17312of files. The two are intended to give equivalent information. Perl uses the 17313older form; this allows you to recompile Perl to use early Unicode releases. 17314 17315The only non-character property that Perl currently supports is Named 17316Sequences, in which a sequence of code points 17317is given a name and generally treated as a single entity. (Perl supports 17318these via the C<\\N{...}> double-quotish construct, 17319L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>. 17320 17321Below is a list of the files in the Unicode data base that Perl doesn't 17322currently use, along with very brief descriptions of their purposes. 17323Some of the names of the files have been shortened from those that Unicode 17324uses, in order to allow them to be distinguishable from similarly named files 17325on file systems for which only the first 8 characters of a name are 17326significant. 17327 17328=over 4 17329 17330@unused_files 17331 17332=back 17333 17334=head1 SEE ALSO 17335 17336L<$unicode_reference_url> 17337 17338L<perlrecharclass> 17339 17340L<perlunicode> 17341 17342END 17343 17344 # And write it. The 0 means no utf8. 17345 main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT); 17346 return; 17347} 17348 17349sub make_Name_pm () { 17350 # Create and write Name.pm, which contains subroutines and data to use in 17351 # conjunction with Name.pl 17352 17353 # Maybe there's nothing to do. 17354 return unless $has_hangul_syllables || @code_points_ending_in_code_point; 17355 17356 my @name = <<END; 17357$HEADER 17358$INTERNAL_ONLY_HEADER 17359 17360END 17361 17362 # Convert these structures to output format. 17363 my $code_points_ending_in_code_point = 17364 main::simple_dumper(\@code_points_ending_in_code_point, 17365 ' ' x 8); 17366 my $names = main::simple_dumper(\%names_ending_in_code_point, 17367 ' ' x 8); 17368 my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point, 17369 ' ' x 8); 17370 17371 # Do the same with the Hangul names, 17372 my $jamo; 17373 my $jamo_l; 17374 my $jamo_v; 17375 my $jamo_t; 17376 my $jamo_re; 17377 if ($has_hangul_syllables) { 17378 17379 # Construct a regular expression of all the possible 17380 # combinations of the Hangul syllables. 17381 my @L_re; # Leading consonants 17382 for my $i ($LBase .. $LBase + $LCount - 1) { 17383 push @L_re, $Jamo{$i} 17384 } 17385 my @V_re; # Middle vowels 17386 for my $i ($VBase .. $VBase + $VCount - 1) { 17387 push @V_re, $Jamo{$i} 17388 } 17389 my @T_re; # Trailing consonants 17390 for my $i ($TBase + 1 .. $TBase + $TCount - 1) { 17391 push @T_re, $Jamo{$i} 17392 } 17393 17394 # The whole re is made up of the L V T combination. 17395 $jamo_re = '(' 17396 . join ('|', sort @L_re) 17397 . ')(' 17398 . join ('|', sort @V_re) 17399 . ')(' 17400 . join ('|', sort @T_re) 17401 . ')?'; 17402 17403 # These hashes needed by the algorithm were generated 17404 # during reading of the Jamo.txt file 17405 $jamo = main::simple_dumper(\%Jamo, ' ' x 8); 17406 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8); 17407 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8); 17408 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8); 17409 } 17410 17411 push @name, <<END; 17412 17413package charnames; 17414 17415# This module contains machine-generated tables and code for the 17416# algorithmically-determinable Unicode character names. The following 17417# routines can be used to translate between name and code point and vice versa 17418 17419{ # Closure 17420 17421 # Matches legal code point. 4-6 hex numbers, If there are 6, the first 17422 # two must be 10; if there are 5, the first must not be a 0. Written this 17423 # way to decrease backtracking. The first regex allows the code point to 17424 # be at the end of a word, but to work properly, the word shouldn't end 17425 # with a valid hex character. The second one won't match a code point at 17426 # the end of a word, and doesn't have the run-on issue 17427 my \$run_on_code_point_re = qr/$run_on_code_point_re/; 17428 my \$code_point_re = qr/$code_point_re/; 17429 17430 # In the following hash, the keys are the bases of names which include 17431 # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The value 17432 # of each key is another hash which is used to get the low and high ends 17433 # for each range of code points that apply to the name. 17434 my %names_ending_in_code_point = ( 17435$names 17436 ); 17437 17438 # The following hash is a copy of the previous one, except is for loose 17439 # matching, so each name has blanks and dashes squeezed out 17440 my %loose_names_ending_in_code_point = ( 17441$loose_names 17442 ); 17443 17444 # And the following array gives the inverse mapping from code points to 17445 # names. Lowest code points are first 17446 \@code_points_ending_in_code_point = ( 17447$code_points_ending_in_code_point 17448 ); 17449 17450 # Is exportable, make read-only 17451 Internals::SvREADONLY(\@code_points_ending_in_code_point, 1); 17452END 17453 # Earlier releases didn't have Jamos. No sense outputting 17454 # them unless will be used. 17455 if ($has_hangul_syllables) { 17456 push @name, <<END; 17457 17458 # Convert from code point to Jamo short name for use in composing Hangul 17459 # syllable names 17460 my %Jamo = ( 17461$jamo 17462 ); 17463 17464 # Leading consonant (can be null) 17465 my %Jamo_L = ( 17466$jamo_l 17467 ); 17468 17469 # Vowel 17470 my %Jamo_V = ( 17471$jamo_v 17472 ); 17473 17474 # Optional trailing consonant 17475 my %Jamo_T = ( 17476$jamo_t 17477 ); 17478 17479 # Computed re that splits up a Hangul name into LVT or LV syllables 17480 my \$syllable_re = qr/$jamo_re/; 17481 17482 my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE "; 17483 my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE"; 17484 17485 # These constants names and values were taken from the Unicode standard, 17486 # version 5.1, section 3.12. They are used in conjunction with Hangul 17487 # syllables 17488 my \$SBase = $SBase_string; 17489 my \$LBase = $LBase_string; 17490 my \$VBase = $VBase_string; 17491 my \$TBase = $TBase_string; 17492 my \$SCount = $SCount; 17493 my \$LCount = $LCount; 17494 my \$VCount = $VCount; 17495 my \$TCount = $TCount; 17496 my \$NCount = \$VCount * \$TCount; 17497END 17498 } # End of has Jamos 17499 17500 push @name, << 'END'; 17501 17502 sub name_to_code_point_special { 17503 my ($name, $loose) = @_; 17504 17505 # Returns undef if not one of the specially handled names; otherwise 17506 # returns the code point equivalent to the input name 17507 # $loose is non-zero if to use loose matching, 'name' in that case 17508 # must be input as upper case with all blanks and dashes squeezed out. 17509END 17510 if ($has_hangul_syllables) { 17511 push @name, << 'END'; 17512 17513 if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//) 17514 || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//)) 17515 { 17516 return if $name !~ qr/^$syllable_re$/; 17517 my $L = $Jamo_L{$1}; 17518 my $V = $Jamo_V{$2}; 17519 my $T = (defined $3) ? $Jamo_T{$3} : 0; 17520 return ($L * $VCount + $V) * $TCount + $T + $SBase; 17521 } 17522END 17523 } 17524 push @name, << 'END'; 17525 17526 # Name must end in 'code_point' for this to handle. 17527 return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x) 17528 || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x)); 17529 17530 my $base = $1; 17531 my $code_point = CORE::hex $2; 17532 my $names_ref; 17533 17534 if ($loose) { 17535 $names_ref = \%loose_names_ending_in_code_point; 17536 } 17537 else { 17538 return if $base !~ s/-$//; 17539 $names_ref = \%names_ending_in_code_point; 17540 } 17541 17542 # Name must be one of the ones which has the code point in it. 17543 return if ! $names_ref->{$base}; 17544 17545 # Look through the list of ranges that apply to this name to see if 17546 # the code point is in one of them. 17547 for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) { 17548 return if $names_ref->{$base}{'low'}->[$i] > $code_point; 17549 next if $names_ref->{$base}{'high'}->[$i] < $code_point; 17550 17551 # Here, the code point is in the range. 17552 return $code_point; 17553 } 17554 17555 # Here, looked like the name had a code point number in it, but 17556 # did not match one of the valid ones. 17557 return; 17558 } 17559 17560 sub code_point_to_name_special { 17561 my $code_point = shift; 17562 17563 # Returns the name of a code point if algorithmically determinable; 17564 # undef if not 17565END 17566 if ($has_hangul_syllables) { 17567 push @name, << 'END'; 17568 17569 # If in the Hangul range, calculate the name based on Unicode's 17570 # algorithm 17571 if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) { 17572 use integer; 17573 my $SIndex = $code_point - $SBase; 17574 my $L = $LBase + $SIndex / $NCount; 17575 my $V = $VBase + ($SIndex % $NCount) / $TCount; 17576 my $T = $TBase + $SIndex % $TCount; 17577 $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}"; 17578 $name .= $Jamo{$T} if $T != $TBase; 17579 return $name; 17580 } 17581END 17582 } 17583 push @name, << 'END'; 17584 17585 # Look through list of these code points for one in range. 17586 foreach my $hash (@code_points_ending_in_code_point) { 17587 return if $code_point < $hash->{'low'}; 17588 if ($code_point <= $hash->{'high'}) { 17589 return sprintf("%s-%04X", $hash->{'name'}, $code_point); 17590 } 17591 } 17592 return; # None found 17593 } 17594} # End closure 17595 175961; 17597END 17598 17599 main::write("Name.pm", 0, \@name); # The 0 means no utf8. 17600 return; 17601} 17602 17603sub make_UCD () { 17604 # Create and write UCD.pl, which passes info about the tables to 17605 # Unicode::UCD 17606 17607 # Stringify structures for output 17608 my $loose_property_name_of 17609 = simple_dumper(\%loose_property_name_of, ' ' x 4); 17610 chomp $loose_property_name_of; 17611 17612 my $strict_property_name_of 17613 = simple_dumper(\%strict_property_name_of, ' ' x 4); 17614 chomp $strict_property_name_of; 17615 17616 my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4); 17617 chomp $stricter_to_file_of; 17618 17619 my $inline_definitions = simple_dumper(\@inline_definitions, " " x 4); 17620 chomp $inline_definitions; 17621 17622 my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4); 17623 chomp $loose_to_file_of; 17624 17625 my $nv_floating_to_rational 17626 = simple_dumper(\%nv_floating_to_rational, ' ' x 4); 17627 chomp $nv_floating_to_rational; 17628 17629 my $why_deprecated = simple_dumper(\%Unicode::UCD::why_deprecated, ' ' x 4); 17630 chomp $why_deprecated; 17631 17632 # We set the key to the file when we associated files with tables, but we 17633 # couldn't do the same for the value then, as we might not have the file 17634 # for the alternate table figured out at that time. 17635 foreach my $cased (keys %caseless_equivalent_to) { 17636 my @path = $caseless_equivalent_to{$cased}->file_path; 17637 my $path; 17638 if ($path[0] eq "#") { # Pseudo-directory '#' 17639 $path = join '/', @path; 17640 } 17641 else { # Gets rid of lib/ 17642 $path = join '/', @path[1, -1]; 17643 } 17644 $caseless_equivalent_to{$cased} = $path; 17645 } 17646 my $caseless_equivalent_to 17647 = simple_dumper(\%caseless_equivalent_to, ' ' x 4); 17648 chomp $caseless_equivalent_to; 17649 17650 my $loose_property_to_file_of 17651 = simple_dumper(\%loose_property_to_file_of, ' ' x 4); 17652 chomp $loose_property_to_file_of; 17653 17654 my $strict_property_to_file_of 17655 = simple_dumper(\%strict_property_to_file_of, ' ' x 4); 17656 chomp $strict_property_to_file_of; 17657 17658 my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4); 17659 chomp $file_to_swash_name; 17660 17661 # Create a mapping from each alias of Perl single-form extensions to all 17662 # its equivalent aliases, for quick look-up. 17663 my %perlprop_to_aliases; 17664 foreach my $table ($perl->tables) { 17665 17666 # First create the list of the aliases of each extension 17667 my @aliases_list; # List of legal aliases for this extension 17668 17669 my $table_name = $table->name; 17670 my $standard_table_name = standardize($table_name); 17671 my $table_full_name = $table->full_name; 17672 my $standard_table_full_name = standardize($table_full_name); 17673 17674 # Make sure that the list has both the short and full names 17675 push @aliases_list, $table_name, $table_full_name; 17676 17677 my $found_ucd = 0; # ? Did we actually get an alias that should be 17678 # output for this table 17679 17680 # Go through all the aliases (including the two just added), and add 17681 # any new unique ones to the list 17682 foreach my $alias ($table->aliases) { 17683 17684 # Skip non-legal names 17685 next unless $alias->ok_as_filename; 17686 next unless $alias->ucd; 17687 17688 $found_ucd = 1; # have at least one legal name 17689 17690 my $name = $alias->name; 17691 my $standard = standardize($name); 17692 17693 # Don't repeat a name that is equivalent to one already on the 17694 # list 17695 next if $standard eq $standard_table_name; 17696 next if $standard eq $standard_table_full_name; 17697 17698 push @aliases_list, $name; 17699 } 17700 17701 # If there were no legal names, don't output anything. 17702 next unless $found_ucd; 17703 17704 # To conserve memory in the program reading these in, omit full names 17705 # that are identical to the short name, when those are the only two 17706 # aliases for the property. 17707 if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) { 17708 pop @aliases_list; 17709 } 17710 17711 # Here, @aliases_list is the list of all the aliases that this 17712 # extension legally has. Now can create a map to it from each legal 17713 # standardized alias 17714 foreach my $alias ($table->aliases) { 17715 next unless $alias->ucd; 17716 next unless $alias->ok_as_filename; 17717 push @{$perlprop_to_aliases{standardize($alias->name)}}, 17718 uniques @aliases_list; 17719 } 17720 } 17721 17722 # Make a list of all combinations of properties/values that are suppressed. 17723 my @suppressed; 17724 if (! $debug_skip) { # This tends to fail in this debug mode 17725 foreach my $property_name (keys %why_suppressed) { 17726 17727 # Just the value 17728 my $value_name = $1 if $property_name =~ s/ = ( .* ) //x; 17729 17730 # The hash may contain properties not in this release of Unicode 17731 next unless defined (my $property = property_ref($property_name)); 17732 17733 # Find all combinations 17734 foreach my $prop_alias ($property->aliases) { 17735 my $prop_alias_name = standardize($prop_alias->name); 17736 17737 # If no =value, there's just one combination possible for this 17738 if (! $value_name) { 17739 17740 # The property may be suppressed, but there may be a proxy 17741 # for it, so it shouldn't be listed as suppressed 17742 next if $prop_alias->ucd; 17743 push @suppressed, $prop_alias_name; 17744 } 17745 else { # Otherwise 17746 foreach my $value_alias 17747 ($property->table($value_name)->aliases) 17748 { 17749 next if $value_alias->ucd; 17750 17751 push @suppressed, "$prop_alias_name=" 17752 . standardize($value_alias->name); 17753 } 17754 } 17755 } 17756 } 17757 } 17758 @suppressed = sort @suppressed; # So doesn't change between runs of this 17759 # program 17760 17761 # Convert the structure below (designed for Name.pm) to a form that UCD 17762 # wants, so it doesn't have to modify it at all; i.e. so that it includes 17763 # an element for the Hangul syllables in the appropriate place, and 17764 # otherwise changes the name to include the "-<code point>" suffix. 17765 my @algorithm_names; 17766 my $done_hangul = $v_version lt v2.0.0; # Hanguls as we know them came 17767 # along in this version 17768 # Copy it linearly. 17769 for my $i (0 .. @code_points_ending_in_code_point - 1) { 17770 17771 # Insert the hanguls in the correct place. 17772 if (! $done_hangul 17773 && $code_points_ending_in_code_point[$i]->{'low'} > $SBase) 17774 { 17775 $done_hangul = 1; 17776 push @algorithm_names, { low => $SBase, 17777 high => $SBase + $SCount - 1, 17778 name => '<hangul syllable>', 17779 }; 17780 } 17781 17782 # Copy the current entry, modified. 17783 push @algorithm_names, { 17784 low => $code_points_ending_in_code_point[$i]->{'low'}, 17785 high => $code_points_ending_in_code_point[$i]->{'high'}, 17786 name => 17787 "$code_points_ending_in_code_point[$i]->{'name'}-<code point>", 17788 }; 17789 } 17790 17791 # Serialize these structures for output. 17792 my $loose_to_standard_value 17793 = simple_dumper(\%loose_to_standard_value, ' ' x 4); 17794 chomp $loose_to_standard_value; 17795 17796 my $string_property_loose_to_name 17797 = simple_dumper(\%string_property_loose_to_name, ' ' x 4); 17798 chomp $string_property_loose_to_name; 17799 17800 my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4); 17801 chomp $perlprop_to_aliases; 17802 17803 my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4); 17804 chomp $prop_aliases; 17805 17806 my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4); 17807 chomp $prop_value_aliases; 17808 17809 my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : ""; 17810 chomp $suppressed; 17811 17812 my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4); 17813 chomp $algorithm_names; 17814 17815 my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4); 17816 chomp $ambiguous_names; 17817 17818 my $combination_property = simple_dumper(\%combination_property, ' ' x 4); 17819 chomp $combination_property; 17820 17821 my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4); 17822 chomp $loose_defaults; 17823 17824 my @ucd = <<END; 17825$HEADER 17826$INTERNAL_ONLY_HEADER 17827 17828# This file is for the use of Unicode::UCD 17829 17830# Highest legal Unicode code point 17831\$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING; 17832 17833# Hangul syllables 17834\$Unicode::UCD::HANGUL_BEGIN = $SBase_string; 17835\$Unicode::UCD::HANGUL_COUNT = $SCount; 17836 17837# Maps Unicode (not Perl single-form extensions) property names in loose 17838# standard form to their corresponding standard names 17839\%Unicode::UCD::loose_property_name_of = ( 17840$loose_property_name_of 17841); 17842 17843# Same, but strict names 17844\%Unicode::UCD::strict_property_name_of = ( 17845$strict_property_name_of 17846); 17847 17848# Gives the definitions (in the form of inversion lists) for those properties 17849# whose definitions aren't kept in files 17850\@Unicode::UCD::inline_definitions = ( 17851$inline_definitions 17852); 17853 17854# Maps property, table to file for those using stricter matching. For paths 17855# whose directory is '#', the file is in the form of a numeric index into 17856# \@inline_definitions 17857\%Unicode::UCD::stricter_to_file_of = ( 17858$stricter_to_file_of 17859); 17860 17861# Maps property, table to file for those using loose matching. For paths 17862# whose directory is '#', the file is in the form of a numeric index into 17863# \@inline_definitions 17864\%Unicode::UCD::loose_to_file_of = ( 17865$loose_to_file_of 17866); 17867 17868# Maps floating point to fractional form 17869\%Unicode::UCD::nv_floating_to_rational = ( 17870$nv_floating_to_rational 17871); 17872 17873# If a %e floating point number doesn't have this number of digits in it after 17874# the decimal point to get this close to a fraction, it isn't considered to be 17875# that fraction even if all the digits it does have match. 17876\$Unicode::UCD::e_precision = $E_FLOAT_PRECISION; 17877 17878# Deprecated tables to generate a warning for. The key is the file containing 17879# the table, so as to avoid duplication, as many property names can map to the 17880# file, but we only need one entry for all of them. 17881\%Unicode::UCD::why_deprecated = ( 17882$why_deprecated 17883); 17884 17885# A few properties have different behavior under /i matching. This maps 17886# those to substitute files to use under /i. 17887\%Unicode::UCD::caseless_equivalent = ( 17888$caseless_equivalent_to 17889); 17890 17891# Property names to mapping files 17892\%Unicode::UCD::loose_property_to_file_of = ( 17893$loose_property_to_file_of 17894); 17895 17896# Property names to mapping files 17897\%Unicode::UCD::strict_property_to_file_of = ( 17898$strict_property_to_file_of 17899); 17900 17901# Files to the swash names within them. 17902\%Unicode::UCD::file_to_swash_name = ( 17903$file_to_swash_name 17904); 17905 17906# Keys are all the possible "prop=value" combinations, in loose form; values 17907# are the standard loose name for the 'value' part of the key 17908\%Unicode::UCD::loose_to_standard_value = ( 17909$loose_to_standard_value 17910); 17911 17912# String property loose names to standard loose name 17913\%Unicode::UCD::string_property_loose_to_name = ( 17914$string_property_loose_to_name 17915); 17916 17917# Keys are Perl extensions in loose form; values are each one's list of 17918# aliases 17919\%Unicode::UCD::loose_perlprop_to_name = ( 17920$perlprop_to_aliases 17921); 17922 17923# Keys are standard property name; values are each one's aliases 17924\%Unicode::UCD::prop_aliases = ( 17925$prop_aliases 17926); 17927 17928# Keys of top level are standard property name; values are keys to another 17929# hash, Each one is one of the property's values, in standard form. The 17930# values are that prop-val's aliases. If only one specified, the short and 17931# long alias are identical. 17932\%Unicode::UCD::prop_value_aliases = ( 17933$prop_value_aliases 17934); 17935 17936# Ordered (by code point ordinal) list of the ranges of code points whose 17937# names are algorithmically determined. Each range entry is an anonymous hash 17938# of the start and end points and a template for the names within it. 17939\@Unicode::UCD::algorithmic_named_code_points = ( 17940$algorithm_names 17941); 17942 17943# The properties that as-is have two meanings, and which must be disambiguated 17944\%Unicode::UCD::ambiguous_names = ( 17945$ambiguous_names 17946); 17947 17948# Keys are the prop-val combinations which are the default values for the 17949# given property, expressed in standard loose form 17950\%Unicode::UCD::loose_defaults = ( 17951$loose_defaults 17952); 17953 17954# The properties that are combinations, in that they have both a map table and 17955# a match table. This is actually for UCD.t, so it knows how to test for 17956# these. 17957\%Unicode::UCD::combination_property = ( 17958$combination_property 17959); 17960 17961# All combinations of names that are suppressed. 17962# This is actually for UCD.t, so it knows which properties shouldn't have 17963# entries. If it got any bigger, would probably want to put it in its own 17964# file to use memory only when it was needed, in testing. 17965\@Unicode::UCD::suppressed_properties = ( 17966$suppressed 17967); 17968 179691; 17970END 17971 17972 main::write("UCD.pl", 0, \@ucd); # The 0 means no utf8. 17973 return; 17974} 17975 17976sub write_all_tables() { 17977 # Write out all the tables generated by this program to files, as well as 17978 # the supporting data structures, pod file, and .t file. 17979 17980 my @writables; # List of tables that actually get written 17981 my %match_tables_to_write; # Used to collapse identical match tables 17982 # into one file. Each key is a hash function 17983 # result to partition tables into buckets. 17984 # Each value is an array of the tables that 17985 # fit in the bucket. 17986 17987 # For each property ... 17988 # (sort so that if there is an immutable file name, it has precedence, so 17989 # some other property can't come in and take over its file name. (We 17990 # don't care if both defined, as they had better be different anyway.) 17991 # The property named 'Perl' needs to be first (it doesn't have any 17992 # immutable file name) because empty properties are defined in terms of 17993 # its table named 'All' under the -annotate option.) We also sort by 17994 # the property's name. This is just for repeatability of the outputs 17995 # between runs of this program, but does not affect correctness. 17996 PROPERTY: 17997 foreach my $property ($perl, 17998 sort { return -1 if defined $a->file; 17999 return 1 if defined $b->file; 18000 return $a->name cmp $b->name; 18001 } grep { $_ != $perl } property_ref('*')) 18002 { 18003 my $type = $property->type; 18004 18005 # And for each table for that property, starting with the mapping 18006 # table for it ... 18007 TABLE: 18008 foreach my $table($property, 18009 18010 # and all the match tables for it (if any), sorted so 18011 # the ones with the shortest associated file name come 18012 # first. The length sorting prevents problems of a 18013 # longer file taking a name that might have to be used 18014 # by a shorter one. The alphabetic sorting prevents 18015 # differences between releases 18016 sort { my $ext_a = $a->external_name; 18017 return 1 if ! defined $ext_a; 18018 my $ext_b = $b->external_name; 18019 return -1 if ! defined $ext_b; 18020 18021 # But return the non-complement table before 18022 # the complement one, as the latter is defined 18023 # in terms of the former, and needs to have 18024 # the information for the former available. 18025 return 1 if $a->complement != 0; 18026 return -1 if $b->complement != 0; 18027 18028 # Similarly, return a subservient table after 18029 # a leader 18030 return 1 if $a->leader != $a; 18031 return -1 if $b->leader != $b; 18032 18033 my $cmp = length $ext_a <=> length $ext_b; 18034 18035 # Return result if lengths not equal 18036 return $cmp if $cmp; 18037 18038 # Alphabetic if lengths equal 18039 return $ext_a cmp $ext_b 18040 } $property->tables 18041 ) 18042 { 18043 18044 # Here we have a table associated with a property. It could be 18045 # the map table (done first for each property), or one of the 18046 # other tables. Determine which type. 18047 my $is_property = $table->isa('Property'); 18048 18049 my $name = $table->name; 18050 my $complete_name = $table->complete_name; 18051 18052 # See if should suppress the table if is empty, but warn if it 18053 # contains something. 18054 my $suppress_if_empty_warn_if_not 18055 = $why_suppress_if_empty_warn_if_not{$complete_name} || 0; 18056 18057 # Calculate if this table should have any code points associated 18058 # with it or not. 18059 my $expected_empty = 18060 18061 # $perl should be empty 18062 ($is_property && ($table == $perl)) 18063 18064 # Match tables in properties we skipped populating should be 18065 # empty 18066 || (! $is_property && ! $property->to_create_match_tables) 18067 18068 # Tables and properties that are expected to have no code 18069 # points should be empty 18070 || $suppress_if_empty_warn_if_not 18071 ; 18072 18073 # Set a boolean if this table is the complement of an empty binary 18074 # table 18075 my $is_complement_of_empty_binary = 18076 $type == $BINARY && 18077 (($table == $property->table('Y') 18078 && $property->table('N')->is_empty) 18079 || ($table == $property->table('N') 18080 && $property->table('Y')->is_empty)); 18081 18082 if ($table->is_empty) { 18083 18084 if ($suppress_if_empty_warn_if_not) { 18085 $table->set_fate($SUPPRESSED, 18086 $suppress_if_empty_warn_if_not); 18087 } 18088 18089 # Suppress (by skipping them) expected empty tables. 18090 next TABLE if $expected_empty; 18091 18092 # And setup to later output a warning for those that aren't 18093 # known to be allowed to be empty. Don't do the warning if 18094 # this table is a child of another one to avoid duplicating 18095 # the warning that should come from the parent one. 18096 if (($table == $property || $table->parent == $table) 18097 && $table->fate != $SUPPRESSED 18098 && $table->fate != $MAP_PROXIED 18099 && ! grep { $complete_name =~ /^$_$/ } 18100 @tables_that_may_be_empty) 18101 { 18102 push @unhandled_properties, "$table"; 18103 } 18104 18105 # The old way of expressing an empty match list was to 18106 # complement the list that matches everything. The new way is 18107 # to create an empty inversion list, but this doesn't work for 18108 # annotating, so use the old way then. 18109 $table->set_complement($All) if $annotate 18110 && $table != $property; 18111 } 18112 elsif ($expected_empty) { 18113 my $because = ""; 18114 if ($suppress_if_empty_warn_if_not) { 18115 $because = " because $suppress_if_empty_warn_if_not"; 18116 } 18117 18118 Carp::my_carp("Not expecting property $table$because. Generating file for it anyway."); 18119 } 18120 18121 # Some tables should match everything 18122 my $expected_full = 18123 ($table->fate == $SUPPRESSED) 18124 ? 0 18125 : ($is_property) 18126 ? # All these types of map tables will be full because 18127 # they will have been populated with defaults 18128 ($type == $ENUM) 18129 18130 : # A match table should match everything if its method 18131 # shows it should 18132 ($table->matches_all 18133 18134 # The complement of an empty binary table will match 18135 # everything 18136 || $is_complement_of_empty_binary 18137 ) 18138 ; 18139 18140 my $count = $table->count; 18141 if ($expected_full) { 18142 if ($count != $MAX_WORKING_CODEPOINTS) { 18143 Carp::my_carp("$table matches only " 18144 . clarify_number($count) 18145 . " Unicode code points but should match " 18146 . clarify_number($MAX_WORKING_CODEPOINTS) 18147 . " (off by " 18148 . clarify_number(abs($MAX_WORKING_CODEPOINTS - $count)) 18149 . "). Proceeding anyway."); 18150 } 18151 18152 # Here is expected to be full. If it is because it is the 18153 # complement of an (empty) binary table that is to be 18154 # suppressed, then suppress this one as well. 18155 if ($is_complement_of_empty_binary) { 18156 my $opposing_name = ($name eq 'Y') ? 'N' : 'Y'; 18157 my $opposing = $property->table($opposing_name); 18158 my $opposing_status = $opposing->status; 18159 if ($opposing_status) { 18160 $table->set_status($opposing_status, 18161 $opposing->status_info); 18162 } 18163 } 18164 } 18165 elsif ($count == $MAX_UNICODE_CODEPOINTS 18166 && $name ne "Any" 18167 && ($table == $property || $table->leader == $table) 18168 && $table->property->status ne $NORMAL) 18169 { 18170 Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway."); 18171 } 18172 18173 if ($table->fate >= $SUPPRESSED) { 18174 if (! $is_property) { 18175 my @children = $table->children; 18176 foreach my $child (@children) { 18177 if ($child->fate < $SUPPRESSED) { 18178 Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't"); 18179 } 18180 } 18181 } 18182 next TABLE; 18183 18184 } 18185 18186 if (! $is_property) { 18187 18188 make_ucd_table_pod_entries($table) if $table->property == $perl; 18189 18190 # Several things need to be done just once for each related 18191 # group of match tables. Do them on the parent. 18192 if ($table->parent == $table) { 18193 18194 # Add an entry in the pod file for the table; it also does 18195 # the children. 18196 make_re_pod_entries($table) if defined $pod_directory; 18197 18198 # See if the table matches identical code points with 18199 # something that has already been processed and is ready 18200 # for output. In that case, no need to have two files 18201 # with the same code points in them. We use the table's 18202 # hash() method to store these in buckets, so that it is 18203 # quite likely that if two tables are in the same bucket 18204 # they will be identical, so don't have to compare tables 18205 # frequently. The tables have to have the same status to 18206 # share a file, so add this to the bucket hash. (The 18207 # reason for this latter is that UCD.pm associates a 18208 # status with a file.) We don't check tables that are 18209 # inverses of others, as it would lead to some coding 18210 # complications, and checking all the regular ones should 18211 # find everything. 18212 if ($table->complement == 0) { 18213 my $hash = $table->hash . ';' . $table->status; 18214 18215 # Look at each table that is in the same bucket as 18216 # this one would be. 18217 foreach my $comparison 18218 (@{$match_tables_to_write{$hash}}) 18219 { 18220 # If the table doesn't point back to this one, we 18221 # see if it matches identically 18222 if ( $comparison->leader != $table 18223 && $table->matches_identically_to($comparison)) 18224 { 18225 $table->set_equivalent_to($comparison, 18226 Related => 0); 18227 next TABLE; 18228 } 18229 } 18230 18231 # Here, not equivalent, add this table to the bucket. 18232 push @{$match_tables_to_write{$hash}}, $table; 18233 } 18234 } 18235 } 18236 else { 18237 18238 # Here is the property itself. 18239 # Don't write out or make references to the $perl property 18240 next if $table == $perl; 18241 18242 make_ucd_table_pod_entries($table); 18243 18244 # There is a mapping stored of the various synonyms to the 18245 # standardized name of the property for Unicode::UCD. 18246 # Also, the pod file contains entries of the form: 18247 # \p{alias: *} \p{full: *} 18248 # rather than show every possible combination of things. 18249 18250 my @property_aliases = $property->aliases; 18251 18252 my $full_property_name = $property->full_name; 18253 my $property_name = $property->name; 18254 my $standard_property_name = standardize($property_name); 18255 my $standard_property_full_name 18256 = standardize($full_property_name); 18257 18258 # We also create for Unicode::UCD a list of aliases for 18259 # the property. The list starts with the property name; 18260 # then its full name. 18261 my @property_list; 18262 my @standard_list; 18263 if ( $property->fate <= $MAP_PROXIED) { 18264 @property_list = ($property_name, $full_property_name); 18265 @standard_list = ($standard_property_name, 18266 $standard_property_full_name); 18267 } 18268 18269 # For each synonym ... 18270 for my $i (0 .. @property_aliases - 1) { 18271 my $alias = $property_aliases[$i]; 18272 my $alias_name = $alias->name; 18273 my $alias_standard = standardize($alias_name); 18274 18275 18276 # Add other aliases to the list of property aliases 18277 if ($property->fate <= $MAP_PROXIED 18278 && ! grep { $alias_standard eq $_ } @standard_list) 18279 { 18280 push @property_list, $alias_name; 18281 push @standard_list, $alias_standard; 18282 } 18283 18284 # For Unicode::UCD, set the mapping of the alias to the 18285 # property 18286 if ($type == $STRING) { 18287 if ($property->fate <= $MAP_PROXIED) { 18288 $string_property_loose_to_name{$alias_standard} 18289 = $standard_property_name; 18290 } 18291 } 18292 else { 18293 my $hash_ref = ($alias_standard =~ /^_/) 18294 ? \%strict_property_name_of 18295 : \%loose_property_name_of; 18296 if (exists $hash_ref->{$alias_standard}) { 18297 Carp::my_carp("There already is a property with the same standard name as $alias_name: $hash_ref->{$alias_standard}. Old name is retained"); 18298 } 18299 else { 18300 $hash_ref->{$alias_standard} 18301 = $standard_property_name; 18302 } 18303 18304 # Now for the re pod entry for this alias. Skip if not 18305 # outputting a pod; skip the first one, which is the 18306 # full name so won't have an entry like: '\p{full: *} 18307 # \p{full: *}', and skip if don't want an entry for 18308 # this one. 18309 next if $i == 0 18310 || ! defined $pod_directory 18311 || ! $alias->make_re_pod_entry; 18312 18313 my $rhs = "\\p{$full_property_name: *}"; 18314 if ($property != $perl && $table->perl_extension) { 18315 $rhs .= ' (Perl extension)'; 18316 } 18317 push @match_properties, 18318 format_pod_line($indent_info_column, 18319 '\p{' . $alias->name . ': *}', 18320 $rhs, 18321 $alias->status); 18322 } 18323 } 18324 18325 # The list of all possible names is attached to each alias, so 18326 # lookup is easy 18327 if (@property_list) { 18328 push @{$prop_aliases{$standard_list[0]}}, @property_list; 18329 } 18330 18331 if ($property->fate <= $MAP_PROXIED) { 18332 18333 # Similarly, we create for Unicode::UCD a list of 18334 # property-value aliases. 18335 18336 # Look at each table in the property... 18337 foreach my $table ($property->tables) { 18338 my @values_list; 18339 my $table_full_name = $table->full_name; 18340 my $standard_table_full_name 18341 = standardize($table_full_name); 18342 my $table_name = $table->name; 18343 my $standard_table_name = standardize($table_name); 18344 18345 # The list starts with the table name and its full 18346 # name. 18347 push @values_list, $table_name, $table_full_name; 18348 18349 # We add to the table each unique alias that isn't 18350 # discouraged from use. 18351 foreach my $alias ($table->aliases) { 18352 next if $alias->status 18353 && $alias->status eq $DISCOURAGED; 18354 my $name = $alias->name; 18355 my $standard = standardize($name); 18356 next if $standard eq $standard_table_name; 18357 next if $standard eq $standard_table_full_name; 18358 push @values_list, $name; 18359 } 18360 18361 # Here @values_list is a list of all the aliases for 18362 # the table. That is, all the property-values given 18363 # by this table. By agreement with Unicode::UCD, 18364 # if the name and full name are identical, and there 18365 # are no other names, drop the duplicate entry to save 18366 # memory. 18367 if (@values_list == 2 18368 && $values_list[0] eq $values_list[1]) 18369 { 18370 pop @values_list 18371 } 18372 18373 # To save memory, unlike the similar list for property 18374 # aliases above, only the standard forms have the list. 18375 # This forces an extra step of converting from input 18376 # name to standard name, but the savings are 18377 # considerable. (There is only marginal savings if we 18378 # did this with the property aliases.) 18379 push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list; 18380 } 18381 } 18382 18383 # Don't write out a mapping file if not desired. 18384 next if ! $property->to_output_map; 18385 } 18386 18387 # Here, we know we want to write out the table, but don't do it 18388 # yet because there may be other tables that come along and will 18389 # want to share the file, and the file's comments will change to 18390 # mention them. So save for later. 18391 push @writables, $table; 18392 18393 } # End of looping through the property and all its tables. 18394 } # End of looping through all properties. 18395 18396 # Now have all the tables that will have files written for them. Do it. 18397 foreach my $table (@writables) { 18398 my @directory; 18399 my $filename; 18400 my $property = $table->property; 18401 my $is_property = ($table == $property); 18402 18403 # For very short tables, instead of writing them out to actual files, 18404 # we in-line their inversion list definitions into UCD.pm. The 18405 # definition replaces the file name, and the special pseudo-directory 18406 # '#' is used to signal this. This significantly cuts down the number 18407 # of files written at little extra cost to the hashes in UCD.pm. 18408 # And it means, no run-time files to read to get the definitions. 18409 if (! $is_property 18410 && ! $annotate # For annotation, we want to explicitly show 18411 # everything, so keep in files 18412 && $table->ranges <= 3) 18413 { 18414 my @ranges = $table->ranges; 18415 my $count = @ranges; 18416 if ($count == 0) { # 0th index reserved for 0-length lists 18417 $filename = 0; 18418 } 18419 elsif ($table->leader != $table) { 18420 18421 # Here, is a table that is equivalent to another; code 18422 # in register_file_for_name() causes its leader's definition 18423 # to be used 18424 18425 next; 18426 } 18427 else { # No equivalent table so far. 18428 18429 # Build up its definition range-by-range. 18430 my $definition = ""; 18431 while (defined (my $range = shift @ranges)) { 18432 my $end = $range->end; 18433 if ($end < $MAX_WORKING_CODEPOINT) { 18434 $count++; 18435 $end = "\n" . ($end + 1); 18436 } 18437 else { # Extends to infinity, hence no 'end' 18438 $end = ""; 18439 } 18440 $definition .= "\n" . $range->start . $end; 18441 } 18442 $definition = "V$count" . $definition; 18443 $filename = @inline_definitions; 18444 push @inline_definitions, $definition; 18445 } 18446 @directory = "#"; 18447 register_file_for_name($table, \@directory, $filename); 18448 next; 18449 } 18450 18451 if (! $is_property) { 18452 # Match tables for the property go in lib/$subdirectory, which is 18453 # the property's name. Don't use the standard file name for this, 18454 # as may get an unfamiliar alias 18455 @directory = ($matches_directory, ($property->match_subdir) 18456 ? $property->match_subdir 18457 : $property->external_name); 18458 } 18459 else { 18460 18461 @directory = $table->directory; 18462 $filename = $table->file; 18463 } 18464 18465 # Use specified filename if available, or default to property's 18466 # shortest name. We need an 8.3 safe filename (which means "an 8 18467 # safe" filename, since after the dot is only 'pl', which is < 3) 18468 # The 2nd parameter is if the filename shouldn't be changed, and 18469 # it shouldn't iff there is a hard-coded name for this table. 18470 $filename = construct_filename( 18471 $filename || $table->external_name, 18472 ! $filename, # mutable if no filename 18473 \@directory); 18474 18475 register_file_for_name($table, \@directory, $filename); 18476 18477 # Only need to write one file when shared by more than one 18478 # property 18479 next if ! $is_property 18480 && ($table->leader != $table || $table->complement != 0); 18481 18482 # Construct a nice comment to add to the file 18483 $table->set_final_comment; 18484 18485 $table->write; 18486 } 18487 18488 18489 # Write out the pod file 18490 make_pod; 18491 18492 # And Name.pm, UCD.pl 18493 make_Name_pm; 18494 make_UCD; 18495 18496 make_property_test_script() if $make_test_script; 18497 make_normalization_test_script() if $make_norm_test_script; 18498 return; 18499} 18500 18501my @white_space_separators = ( # This used only for making the test script. 18502 "", 18503 ' ', 18504 "\t", 18505 ' ' 18506 ); 18507 18508sub generate_separator($lhs) { 18509 # This used only for making the test script. It generates the colon or 18510 # equal separator between the property and property value, with random 18511 # white space surrounding the separator 18512 18513 return "" if $lhs eq ""; # No separator if there's only one (the r) side 18514 18515 # Choose space before and after randomly 18516 my $spaces_before =$white_space_separators[rand(@white_space_separators)]; 18517 my $spaces_after = $white_space_separators[rand(@white_space_separators)]; 18518 18519 # And return the whole complex, half the time using a colon, half the 18520 # equals 18521 return $spaces_before 18522 . (rand() < 0.5) ? '=' : ':' 18523 . $spaces_after; 18524} 18525 18526sub generate_tests($lhs, $rhs, $valid_code, $invalid_code, $warning) { 18527 # This used only for making the test script. It generates test cases that 18528 # are expected to compile successfully in perl. Note that the LHS and 18529 # RHS are assumed to already be as randomized as the caller wants. 18530 18531 # $lhs # The property: what's to the left of the colon 18532 # or equals separator 18533 # $rhs # The property value; what's to the right 18534 # $valid_code # A code point that's known to be in the 18535 # table given by LHS=RHS; undef if table is 18536 # empty 18537 # $invalid_code # A code point known to not be in the table; 18538 # undef if the table is all code points 18539 # $warning 18540 18541 # Get the colon or equal 18542 my $separator = generate_separator($lhs); 18543 18544 # The whole 'property=value' 18545 my $name = "$lhs$separator$rhs"; 18546 18547 my @output; 18548 # Create a complete set of tests, with complements. 18549 if (defined $valid_code) { 18550 push @output, <<"EOC" 18551Expect(1, $valid_code, '\\p{$name}', $warning); 18552Expect(0, $valid_code, '\\p{^$name}', $warning); 18553Expect(0, $valid_code, '\\P{$name}', $warning); 18554Expect(1, $valid_code, '\\P{^$name}', $warning); 18555EOC 18556 } 18557 if (defined $invalid_code) { 18558 push @output, <<"EOC" 18559Expect(0, $invalid_code, '\\p{$name}', $warning); 18560Expect(1, $invalid_code, '\\p{^$name}', $warning); 18561Expect(1, $invalid_code, '\\P{$name}', $warning); 18562Expect(0, $invalid_code, '\\P{^$name}', $warning); 18563EOC 18564 } 18565 return @output; 18566} 18567 18568sub generate_wildcard_tests($lhs, $rhs, $valid_code, $invalid_code, $warning) { 18569 # This used only for making the test script. It generates wildcardl 18570 # matching test cases that are expected to compile successfully in perl. 18571 18572 # $lhs # The property: what's to the left of the 18573 # or equals separator 18574 # $rhs # The property value; what's to the right 18575 # $valid_code # A code point that's known to be in the 18576 # table given by LHS=RHS; undef if table is 18577 # empty 18578 # $invalid_code # A code point known to not be in the table; 18579 # undef if the table is all code points 18580 # $warning 18581 18582 return if $lhs eq ""; 18583 return if $lhs =~ / ^ Is_ /x; # These are not currently supported 18584 18585 # Generate a standardized pattern, with colon being the delimitter 18586 my $wildcard = "$lhs=:\\A$rhs\\z:"; 18587 18588 my @output; 18589 push @output, "Expect(1, $valid_code, '\\p{$wildcard}', $warning);" 18590 if defined $valid_code; 18591 push @output, "Expect(0, $invalid_code, '\\p{$wildcard}', $warning);" 18592 if defined $invalid_code; 18593 return @output; 18594} 18595 18596sub generate_error($lhs, $rhs, $already_in_error=0) { 18597 # This used only for making the test script. It generates test cases that 18598 # are expected to not only not match, but to be syntax or similar errors 18599 18600 # $lhs # The property: what's to the left of the 18601 # colon or equals separator 18602 # $rhs # The property value; what's to the right 18603 # $already_in_error # Boolean; if true it's known that the 18604 # unmodified LHS and RHS will cause an error. 18605 # This routine should not force another one 18606 # Get the colon or equal 18607 my $separator = generate_separator($lhs); 18608 18609 # Since this is an error only, don't bother to randomly decide whether to 18610 # put the error on the left or right side; and assume that the RHS is 18611 # loosely matched, again for convenience rather than rigor. 18612 $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error; 18613 18614 my $property = $lhs . $separator . $rhs; 18615 18616 return <<"EOC"; 18617Error('\\p{$property}'); 18618Error('\\P{$property}'); 18619EOC 18620} 18621 18622# These are used only for making the test script 18623# XXX Maybe should also have a bad strict seps, which includes underscore. 18624 18625my @good_loose_seps = ( 18626 " ", 18627 "-", 18628 "\t", 18629 "", 18630 "_", 18631 ); 18632my @bad_loose_seps = ( 18633 "/a/", 18634 ':=', 18635 ); 18636 18637sub randomize_stricter_name($name) { 18638 # This used only for making the test script. Take the input name and 18639 # return a randomized, but valid version of it under the stricter matching 18640 # rules. 18641 18642 # If the name looks like a number (integer, floating, or rational), do 18643 # some extra work 18644 if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) { 18645 my $sign = $1; 18646 my $number = $2; 18647 my $separator = $3; 18648 18649 # If there isn't a sign, part of the time add a plus 18650 # Note: Not testing having any denominator having a minus sign 18651 if (! $sign) { 18652 $sign = '+' if rand() <= .3; 18653 } 18654 18655 # And add 0 or more leading zeros. 18656 $name = $sign . ('0' x int rand(10)) . $number; 18657 18658 if (defined $separator) { 18659 my $extra_zeros = '0' x int rand(10); 18660 18661 if ($separator eq '.') { 18662 18663 # Similarly, add 0 or more trailing zeros after a decimal 18664 # point 18665 $name .= $extra_zeros; 18666 } 18667 else { 18668 18669 # Or, leading zeros before the denominator 18670 $name =~ s,/,/$extra_zeros,; 18671 } 18672 } 18673 } 18674 18675 # For legibility of the test, only change the case of whole sections at a 18676 # time. To do this, first split into sections. The split returns the 18677 # delimiters 18678 my @sections; 18679 for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) { 18680 trace $section if main::DEBUG && $to_trace; 18681 18682 if (length $section > 1 && $section !~ /\D/) { 18683 18684 # If the section is a sequence of digits, about half the time 18685 # randomly add underscores between some of them. 18686 if (rand() > .5) { 18687 18688 # Figure out how many underscores to add. max is 1 less than 18689 # the number of digits. (But add 1 at the end to make sure 18690 # result isn't 0, and compensate earlier by subtracting 2 18691 # instead of 1) 18692 my $num_underscores = int rand(length($section) - 2) + 1; 18693 18694 # And add them evenly throughout, for convenience, not rigor 18695 use integer; 18696 my $spacing = (length($section) - 1)/ $num_underscores; 18697 my $temp = $section; 18698 $section = ""; 18699 for my $i (1 .. $num_underscores) { 18700 $section .= substr($temp, 0, $spacing, "") . '_'; 18701 } 18702 $section .= $temp; 18703 } 18704 push @sections, $section; 18705 } 18706 else { 18707 18708 # Here not a sequence of digits. Change the case of the section 18709 # randomly 18710 my $switch = int rand(4); 18711 if ($switch == 0) { 18712 push @sections, uc $section; 18713 } 18714 elsif ($switch == 1) { 18715 push @sections, lc $section; 18716 } 18717 elsif ($switch == 2) { 18718 push @sections, ucfirst $section; 18719 } 18720 else { 18721 push @sections, $section; 18722 } 18723 } 18724 } 18725 trace "returning", join "", @sections if main::DEBUG && $to_trace; 18726 return join "", @sections; 18727} 18728 18729sub randomize_loose_name($name, $want_error=0) { 18730 # This used only for making the test script 18731 18732 $name = randomize_stricter_name($name); 18733 18734 my @parts; 18735 push @parts, $good_loose_seps[rand(@good_loose_seps)]; 18736 18737 # Preserve trailing ones for the sake of not stripping the underscore from 18738 # 'L_' 18739 for my $part (split /[-\s_]+ (?= . )/, $name) { 18740 if (@parts) { 18741 if ($want_error and rand() < 0.3) { 18742 push @parts, $bad_loose_seps[rand(@bad_loose_seps)]; 18743 $want_error = 0; 18744 } 18745 else { 18746 push @parts, $good_loose_seps[rand(@good_loose_seps)]; 18747 } 18748 } 18749 push @parts, $part; 18750 } 18751 my $new = join("", @parts); 18752 trace "$name => $new" if main::DEBUG && $to_trace; 18753 18754 if ($want_error) { 18755 if (rand() >= 0.5) { 18756 $new .= $bad_loose_seps[rand(@bad_loose_seps)]; 18757 } 18758 else { 18759 $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new; 18760 } 18761 } 18762 return $new; 18763} 18764 18765# Used to make sure don't generate duplicate test cases. 18766my %test_generated; 18767 18768sub make_property_test_script() { 18769 # This used only for making the test script 18770 # this written directly -- it's huge. 18771 18772 print "Making test script\n" if $verbosity >= $PROGRESS; 18773 18774 # This uses randomness to test different possibilities without testing all 18775 # possibilities. To ensure repeatability, set the seed to 0. But if 18776 # tests are added, it will perturb all later ones in the .t file 18777 srand 0; 18778 18779 $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name 18780 18781 # Create a list of what the %f representation is for each rational number. 18782 # This will be used below. 18783 my @valid_base_floats = '0.0'; 18784 foreach my $e_representation (keys %nv_floating_to_rational) { 18785 push @valid_base_floats, 18786 eval $nv_floating_to_rational{$e_representation}; 18787 } 18788 18789 # It doesn't matter whether the elements of this array contain single lines 18790 # or multiple lines. main::write doesn't count the lines. 18791 my @output; 18792 18793 push @output, <<'EOF_CODE'; 18794Error('\p{Script=InGreek}'); # Bug #69018 18795Test_GCB("1100 $nobreak 1161"); # Bug #70940 18796Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722 18797Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722 18798Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726 18799Error('\p{InKana}'); # 'Kana' is not a block so InKana shouldn't compile 18800Expect(1, 0xB6, '\p{In=V1_1}', ""); # Didn't use to work 18801Expect(1, 0x3A2,'\p{In=NA}', ""); # Didn't use to work 18802 18803# Make sure this gets tested; it was not part of the official test suite at 18804# the time this was added. Note that this is as it would appear in the 18805# official suite, and gets modified to check for the perl tailoring by 18806# Test_WB() 18807Test_WB("$breakable 0020 $breakable 0020 $breakable 0308 $breakable"); 18808Test_LB("$nobreak 200B $nobreak 0020 $nobreak 0020 $breakable 2060 $breakable"); 18809Expect(1, ord(" "), '\p{gc=:(?aa)s:}', ""); # /aa is valid 18810Expect(1, ord(" "), '\p{gc=:(?-s)s:}', ""); # /-s is valid 18811EOF_CODE 18812 18813 # Sort these so get results in same order on different runs of this 18814 # program 18815 foreach my $property (sort { $a->has_dependency <=> $b->has_dependency 18816 or 18817 lc $a->name cmp lc $b->name 18818 } property_ref('*')) 18819 { 18820 # Non-binary properties should not match \p{}; Test all for that. 18821 if ($property->type != $BINARY && $property->type != $FORCED_BINARY) { 18822 my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } 18823 $property->aliases; 18824 foreach my $property_alias ($property->aliases) { 18825 my $name = standardize($property_alias->name); 18826 18827 # But some names are ambiguous, meaning a binary property with 18828 # the same name when used in \p{}, and a different 18829 # (non-binary) property in other contexts. 18830 next if grep { $name eq $_ } keys %ambiguous_names; 18831 18832 push @output, <<"EOF_CODE"; 18833Error('\\p{$name}'); 18834Error('\\P{$name}'); 18835EOF_CODE 18836 } 18837 } 18838 foreach my $table (sort { $a->has_dependency <=> $b->has_dependency 18839 or 18840 lc $a->name cmp lc $b->name 18841 } $property->tables) 18842 { 18843 18844 # Find code points that match, and don't match this table. 18845 my $valid = $table->get_valid_code_point; 18846 my $invalid = $table->get_invalid_code_point; 18847 my $warning = ($table->status eq $DEPRECATED) 18848 ? "'deprecated'" 18849 : '""'; 18850 18851 # Test each possible combination of the property's aliases with 18852 # the table's. If this gets to be too many, could do what is done 18853 # in the set_final_comment() for Tables 18854 my @table_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->aliases; 18855 next unless @table_aliases; 18856 my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->property->aliases; 18857 next unless @property_aliases; 18858 18859 # Every property can be optionally be prefixed by 'Is_', so test 18860 # that those work, by creating such a new alias for each 18861 # pre-existing one. 18862 push @property_aliases, map { Alias->new("Is_" . $_->name, 18863 $_->loose_match, 18864 $_->make_re_pod_entry, 18865 $_->ok_as_filename, 18866 $_->status, 18867 $_->ucd, 18868 ) 18869 } @property_aliases; 18870 my $max = max(scalar @table_aliases, scalar @property_aliases); 18871 for my $j (0 .. $max - 1) { 18872 18873 # The current alias for property is the next one on the list, 18874 # or if beyond the end, start over. Similarly for table 18875 my $property_name 18876 = $property_aliases[$j % @property_aliases]->name; 18877 18878 $property_name = "" if $table->property == $perl; 18879 my $table_alias = $table_aliases[$j % @table_aliases]; 18880 my $table_name = $table_alias->name; 18881 my $loose_match = $table_alias->loose_match; 18882 18883 # If the table doesn't have a file, any test for it is 18884 # already guaranteed to be in error 18885 my $already_error = ! $table->file_path; 18886 18887 # A table that begins with these could actually be a 18888 # user-defined property, so won't be compile time errors, as 18889 # the definitions of those can be deferred until runtime 18890 next if $already_error && $table_name =~ / ^ I[ns] /x; 18891 18892 # Generate error cases for this alias. 18893 push @output, generate_error($property_name, 18894 $table_name, 18895 $already_error); 18896 18897 # If the table is guaranteed to always generate an error, 18898 # quit now without generating success cases. 18899 next if $already_error; 18900 18901 # Now for the success cases. First, wildcard matching, as it 18902 # shouldn't have any randomization. 18903 if ($table_alias->status eq $NORMAL) { 18904 push @output, generate_wildcard_tests($property_name, 18905 $table_name, 18906 $valid, 18907 $invalid, 18908 $warning, 18909 ); 18910 } 18911 my $random; 18912 if ($loose_match) { 18913 18914 # For loose matching, create an extra test case for the 18915 # standard name. 18916 my $standard = standardize($table_name); 18917 18918 # $test_name should be a unique combination for each test 18919 # case; used just to avoid duplicate tests 18920 my $test_name = "$property_name=$standard"; 18921 18922 # Don't output duplicate test cases. 18923 if (! exists $test_generated{$test_name}) { 18924 $test_generated{$test_name} = 1; 18925 push @output, generate_tests($property_name, 18926 $standard, 18927 $valid, 18928 $invalid, 18929 $warning, 18930 ); 18931 if ($table_alias->status eq $NORMAL) { 18932 push @output, generate_wildcard_tests( 18933 $property_name, 18934 $standard, 18935 $valid, 18936 $invalid, 18937 $warning, 18938 ); 18939 } 18940 } 18941 $random = randomize_loose_name($table_name) 18942 } 18943 else { # Stricter match 18944 $random = randomize_stricter_name($table_name); 18945 } 18946 18947 # Now for the main test case for this alias. 18948 my $test_name = "$property_name=$random"; 18949 if (! exists $test_generated{$test_name}) { 18950 $test_generated{$test_name} = 1; 18951 push @output, generate_tests($property_name, 18952 $random, 18953 $valid, 18954 $invalid, 18955 $warning, 18956 ); 18957 18958 if ($property->name eq 'nv') { 18959 if ($table_name !~ qr{/}) { 18960 push @output, generate_tests($property_name, 18961 sprintf("%.15e", $table_name), 18962 $valid, 18963 $invalid, 18964 $warning, 18965 ); 18966 } 18967 else { 18968 # If the name is a rational number, add tests for a 18969 # non-reduced form, and for a floating point equivalent. 18970 18971 # 60 is a number divisible by a bunch of things 18972 my ($numerator, $denominator) = $table_name 18973 =~ m! (.+) / (.+) !x; 18974 $numerator *= 60; 18975 $denominator *= 60; 18976 push @output, generate_tests($property_name, 18977 "$numerator/$denominator", 18978 $valid, 18979 $invalid, 18980 $warning, 18981 ); 18982 18983 # Calculate the float, and the %e representation 18984 my $float = eval $table_name; 18985 my $e_representation = sprintf("%.*e", 18986 $E_FLOAT_PRECISION, $float); 18987 # Parse that 18988 my ($non_zeros, $zeros, $exponent_sign, $exponent) 18989 = $e_representation 18990 =~ / -? [1-9] \. (\d*?) (0*) e ([+-]) (\d+) /x; 18991 my $min_e_precision; 18992 my $min_f_precision; 18993 18994 if ($exponent_sign eq '+' && $exponent != 0) { 18995 Carp::my_carp_bug("Not yet equipped to handle" 18996 . " positive exponents"); 18997 return; 18998 } 18999 else { 19000 # We're trying to find the minimum precision that 19001 # is needed to indicate this particular rational 19002 # for the given $E_FLOAT_PRECISION. For %e, any 19003 # trailing zeros, like 1.500e-02 aren't needed, so 19004 # the correct value is how many non-trailing zeros 19005 # there are after the decimal point. 19006 $min_e_precision = length $non_zeros; 19007 19008 # For %f, like .01500, we want at least 19009 # $E_FLOAT_PRECISION digits, but any trailing 19010 # zeros aren't needed, so we can subtract the 19011 # length of those. But we also need to include 19012 # the zeros after the decimal point, but before 19013 # the first significant digit. 19014 $min_f_precision = $E_FLOAT_PRECISION 19015 + $exponent 19016 - length $zeros; 19017 } 19018 19019 # Make tests for each possible precision from 1 to 19020 # just past the worst case. 19021 my $upper_limit = ($min_e_precision > $min_f_precision) 19022 ? $min_e_precision 19023 : $min_f_precision; 19024 19025 for my $i (1 .. $upper_limit + 1) { 19026 for my $format ("e", "f") { 19027 my $this_table 19028 = sprintf("%.*$format", $i, $float); 19029 19030 # If we don't have enough precision digits, 19031 # make a fail test; otherwise a pass test. 19032 my $pass = ($format eq "e") 19033 ? $i >= $min_e_precision 19034 : $i >= $min_f_precision; 19035 if ($pass) { 19036 push @output, generate_tests($property_name, 19037 $this_table, 19038 $valid, 19039 $invalid, 19040 $warning, 19041 ); 19042 } 19043 elsif ( $format eq "e" 19044 19045 # Here we would fail, but in the %f 19046 # case, the representation at this 19047 # precision could actually be a 19048 # valid one for some other rational 19049 || ! grep { $this_table 19050 =~ / ^ $_ 0* $ /x } 19051 @valid_base_floats) 19052 { 19053 push @output, 19054 generate_error($property_name, 19055 $this_table, 19056 1 # 1 => already an 19057 # error 19058 ); 19059 } 19060 } 19061 } 19062 } 19063 } 19064 } 19065 } 19066 $table->DESTROY(); 19067 } 19068 $property->DESTROY(); 19069 } 19070 19071 # Make any test of the boundary (break) properties TODO if the code 19072 # doesn't match the version being compiled 19073 my $TODO_FAILING_BREAKS = ($version_of_mk_invlist_bounds ne $v_version) 19074 ? "\nsub TODO_FAILING_BREAKS { 1 }\n" 19075 : "\nsub TODO_FAILING_BREAKS { 0 }\n"; 19076 19077 @output= map { 19078 map s/^/ /mgr, 19079 map "$_;\n", 19080 split /;\n/, $_ 19081 } @output; 19082 19083 # Cause there to be 'if' statements to only execute a portion of this 19084 # long-running test each time, so that we can have a bunch of .t's running 19085 # in parallel 19086 my $chunks = 10 # Number of test files 19087 - 1 # For GCB & SB 19088 - 1 # For WB 19089 - 4; # LB split into this many files 19090 my @output_chunked; 19091 my $chunk_count=0; 19092 my $chunk_size= int(@output / $chunks) + 1; 19093 while (@output) { 19094 $chunk_count++; 19095 my @chunk= splice @output, 0, $chunk_size; 19096 push @output_chunked, 19097 "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n", 19098 @chunk, 19099 "}\n"; 19100 } 19101 19102 $chunk_count++; 19103 push @output_chunked, 19104 "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n", 19105 (map {" Test_GCB('$_');\n"} @backslash_X_tests), 19106 (map {" Test_SB('$_');\n"} @SB_tests), 19107 "}\n"; 19108 19109 19110 $chunk_size= int(@LB_tests / 4) + 1; 19111 @LB_tests = map {" Test_LB('$_');\n"} @LB_tests; 19112 while (@LB_tests) { 19113 $chunk_count++; 19114 my @chunk= splice @LB_tests, 0, $chunk_size; 19115 push @output_chunked, 19116 "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n", 19117 @chunk, 19118 "}\n"; 19119 } 19120 19121 $chunk_count++; 19122 push @output_chunked, 19123 "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n", 19124 (map {" Test_WB('$_');\n"} @WB_tests), 19125 "}\n"; 19126 19127 &write($t_path, 19128 0, # Not utf8; 19129 [$HEADER, 19130 $TODO_FAILING_BREAKS, 19131 <DATA>, 19132 @output_chunked, 19133 "Finished();\n", 19134 ]); 19135 19136 return; 19137} 19138 19139sub make_normalization_test_script() { 19140 print "Making normalization test script\n" if $verbosity >= $PROGRESS; 19141 19142 my $n_path = 'TestNorm.pl'; 19143 19144 unshift @normalization_tests, <<'END'; 19145use utf8; 19146use Test::More; 19147 19148sub ord_string { # Convert packed ords to printable string 19149 use charnames (); 19150 return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' } 19151 unpack "U*", shift) . "'"; 19152 #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) . "'"; 19153} 19154 19155sub Test_N { 19156 my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_; 19157 my $display_source = ord_string($source); 19158 my $display_nfc = ord_string($nfc); 19159 my $display_nfd = ord_string($nfd); 19160 my $display_nfkc = ord_string($nfkc); 19161 my $display_nfkd = ord_string($nfkd); 19162 19163 use Unicode::Normalize; 19164 # NFC 19165 # nfc == toNFC(source) == toNFC(nfc) == toNFC(nfd) 19166 # nfkc == toNFC(nfkc) == toNFC(nfkd) 19167 # 19168 # NFD 19169 # nfd == toNFD(source) == toNFD(nfc) == toNFD(nfd) 19170 # nfkd == toNFD(nfkc) == toNFD(nfkd) 19171 # 19172 # NFKC 19173 # nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) == 19174 # toNFKC(nfkc) == toNFKC(nfkd) 19175 # 19176 # NFKD 19177 # nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) == 19178 # toNFKD(nfkc) == toNFKD(nfkd) 19179 19180 is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc"); 19181 is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc"); 19182 is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc"); 19183 is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc"); 19184 is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc"); 19185 19186 is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd"); 19187 is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd"); 19188 is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd"); 19189 is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd"); 19190 is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd"); 19191 19192 is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc"); 19193 is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc"); 19194 is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc"); 19195 is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc"); 19196 is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc"); 19197 19198 is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd"); 19199 is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd"); 19200 is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd"); 19201 is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd"); 19202 is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd"); 19203} 19204END 19205 19206 &write($n_path, 19207 1, # Is utf8; 19208 [ 19209 @normalization_tests, 19210 'done_testing();' 19211 ]); 19212 return; 19213} 19214 19215# Skip reasons, so will be exact same text and hence the files with each 19216# reason will get grouped together in perluniprops. 19217my $Documentation = "Documentation"; 19218my $Indic_Skip 19219 = "Provisional; for the analysis and processing of Indic scripts"; 19220my $Validation = "Validation Tests"; 19221my $Validation_Documentation = "Documentation of validation Tests"; 19222my $Unused_Skip = "Currently unused by Perl"; 19223 19224# This is a list of the input files and how to handle them. The files are 19225# processed in their order in this list. Some reordering is possible if 19226# desired, but the PropertyAliases and PropValueAliases files should be first, 19227# and the extracted before the others (as data in an extracted file can be 19228# over-ridden by the non-extracted. Some other files depend on data derived 19229# from an earlier file, like UnicodeData requires data from Jamo, and the case 19230# changing and folding requires data from Unicode. Mostly, it is safest to 19231# order by first version releases in (except the Jamo). 19232# 19233# The version strings allow the program to know whether to expect a file or 19234# not, but if a file exists in the directory, it will be processed, even if it 19235# is in a version earlier than expected, so you can copy files from a later 19236# release into an earlier release's directory. 19237my @input_file_objects = ( 19238 Input_file->new('PropertyAliases.txt', v3.2, 19239 Handler => \&process_PropertyAliases, 19240 Early => [ \&substitute_PropertyAliases ], 19241 Required_Even_in_Debug_Skip => 1, 19242 ), 19243 Input_file->new(undef, v0, # No file associated with this 19244 Progress_Message => 'Finishing property setup', 19245 Handler => \&finish_property_setup, 19246 ), 19247 Input_file->new('PropValueAliases.txt', v3.2, 19248 Handler => \&process_PropValueAliases, 19249 Early => [ \&substitute_PropValueAliases ], 19250 Has_Missings_Defaults => $NOT_IGNORED, 19251 Required_Even_in_Debug_Skip => 1, 19252 ), 19253 Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0, 19254 Property => 'General_Category', 19255 ), 19256 Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0, 19257 Property => 'Canonical_Combining_Class', 19258 Has_Missings_Defaults => $NOT_IGNORED, 19259 ), 19260 Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0, 19261 Property => 'Numeric_Type', 19262 Has_Missings_Defaults => $NOT_IGNORED, 19263 ), 19264 Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0, 19265 Property => 'East_Asian_Width', 19266 Has_Missings_Defaults => $NOT_IGNORED, 19267 ), 19268 Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0, 19269 Property => 'Line_Break', 19270 Has_Missings_Defaults => $NOT_IGNORED, 19271 ), 19272 Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1, 19273 Property => 'Bidi_Class', 19274 Has_Missings_Defaults => $NOT_IGNORED, 19275 ), 19276 Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0, 19277 Property => 'Decomposition_Type', 19278 Has_Missings_Defaults => $NOT_IGNORED, 19279 ), 19280 Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0), 19281 Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0, 19282 Property => 'Numeric_Value', 19283 Each_Line_Handler => \&filter_numeric_value_line, 19284 Has_Missings_Defaults => $NOT_IGNORED, 19285 ), 19286 Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0, 19287 Property => 'Joining_Group', 19288 Has_Missings_Defaults => $NOT_IGNORED, 19289 ), 19290 19291 Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0, 19292 Property => 'Joining_Type', 19293 Has_Missings_Defaults => $NOT_IGNORED, 19294 ), 19295 Input_file->new("${EXTRACTED}DName.txt", v10.0.0, 19296 Skip => 'This file adds no new information not already' 19297 . ' present in other files', 19298 # And it's unnecessary programmer work to handle this new 19299 # format. Previous Derived files actually had bug fixes 19300 # in them that were useful, but that should not be the 19301 # case here. 19302 ), 19303 Input_file->new('Jamo.txt', v2.0.0, 19304 Property => 'Jamo_Short_Name', 19305 Each_Line_Handler => \&filter_jamo_line, 19306 ), 19307 Input_file->new('UnicodeData.txt', v1.1.5, 19308 Pre_Handler => \&setup_UnicodeData, 19309 19310 # We clean up this file for some early versions. 19311 Each_Line_Handler => [ (($v_version lt v2.0.0 ) 19312 ? \&filter_v1_ucd 19313 : ($v_version eq v2.1.5) 19314 ? \&filter_v2_1_5_ucd 19315 19316 # And for 5.14 Perls with 6.0, 19317 # have to also make changes 19318 : ($v_version ge v6.0.0 19319 && $^V lt v5.17.0) 19320 ? \&filter_v6_ucd 19321 : undef), 19322 19323 # Early versions did not have the 19324 # proper Unicode_1 names for the 19325 # controls 19326 (($v_version lt v3.0.0) 19327 ? \&filter_early_U1_names 19328 : undef), 19329 19330 # Early versions did not correctly 19331 # use the later method for giving 19332 # decimal digit values 19333 (($v_version le v3.2.0) 19334 ? \&filter_bad_Nd_ucd 19335 : undef), 19336 19337 # And the main filter 19338 \&filter_UnicodeData_line, 19339 ], 19340 EOF_Handler => \&EOF_UnicodeData, 19341 ), 19342 Input_file->new('CJKXREF.TXT', v1.1.5, 19343 Withdrawn => v2.0.0, 19344 Skip => 'Gives the mapping of CJK code points ' 19345 . 'between Unicode and various other standards', 19346 ), 19347 Input_file->new('ArabicShaping.txt', v2.0.0, 19348 Each_Line_Handler => 19349 ($v_version lt 4.1.0) 19350 ? \&filter_old_style_arabic_shaping 19351 : undef, 19352 # The first field after the range is a "schematic name" 19353 # not used by Perl 19354 Properties => [ '<ignored>', 'Joining_Type', 'Joining_Group' ], 19355 Has_Missings_Defaults => $NOT_IGNORED, 19356 ), 19357 Input_file->new('Blocks.txt', v2.0.0, 19358 Property => 'Block', 19359 Has_Missings_Defaults => $NOT_IGNORED, 19360 Each_Line_Handler => \&filter_blocks_lines 19361 ), 19362 Input_file->new('Index.txt', v2.0.0, 19363 Skip => 'Alphabetical index of Unicode characters', 19364 ), 19365 Input_file->new('NamesList.txt', v2.0.0, 19366 Skip => 'Annotated list of characters', 19367 ), 19368 Input_file->new('PropList.txt', v2.0.0, 19369 Each_Line_Handler => (($v_version lt v3.1.0) 19370 ? \&filter_old_style_proplist 19371 : undef), 19372 ), 19373 Input_file->new('Props.txt', v2.0.0, 19374 Withdrawn => v3.0.0, 19375 Skip => 'A subset of F<PropList.txt> (which is used instead)', 19376 ), 19377 Input_file->new('ReadMe.txt', v2.0.0, 19378 Skip => $Documentation, 19379 ), 19380 Input_file->new('Unihan.txt', v2.0.0, 19381 Withdrawn => v5.2.0, 19382 Construction_Time_Handler => \&construct_unihan, 19383 Pre_Handler => \&setup_unihan, 19384 Optional => [ "", 19385 'Unicode_Radical_Stroke' 19386 ], 19387 Each_Line_Handler => \&filter_unihan_line, 19388 ), 19389 Input_file->new('SpecialCasing.txt', v2.1.8, 19390 Each_Line_Handler => ($v_version eq 2.1.8) 19391 ? \&filter_2_1_8_special_casing_line 19392 : \&filter_special_casing_line, 19393 Pre_Handler => \&setup_special_casing, 19394 Has_Missings_Defaults => $IGNORED, 19395 ), 19396 Input_file->new( 19397 'LineBreak.txt', v3.0.0, 19398 Has_Missings_Defaults => $NOT_IGNORED, 19399 Property => 'Line_Break', 19400 # Early versions had problematic syntax 19401 Each_Line_Handler => ($v_version ge v3.1.0) 19402 ? undef 19403 : ($v_version lt v3.0.0) 19404 ? \&filter_substitute_lb 19405 : \&filter_early_ea_lb, 19406 # Must use long names for property values see comments at 19407 # sub filter_substitute_lb 19408 Early => [ "LBsubst.txt", '_Perl_LB', 'Alphabetic', 19409 'Alphabetic', # default to this because XX -> 19410 # AL 19411 19412 # Don't use _Perl_LB as a synonym for 19413 # Line_Break in later perls, as it is tailored 19414 # and isn't the same as Line_Break 19415 'ONLY_EARLY' ], 19416 ), 19417 Input_file->new('EastAsianWidth.txt', v3.0.0, 19418 Property => 'East_Asian_Width', 19419 Has_Missings_Defaults => $NOT_IGNORED, 19420 # Early versions had problematic syntax 19421 Each_Line_Handler => (($v_version lt v3.1.0) 19422 ? \&filter_early_ea_lb 19423 : undef), 19424 ), 19425 Input_file->new('CompositionExclusions.txt', v3.0.0, 19426 Property => 'Composition_Exclusion', 19427 ), 19428 Input_file->new('UnicodeData.html', v3.0.0, 19429 Withdrawn => v4.0.1, 19430 Skip => $Documentation, 19431 ), 19432 Input_file->new('BidiMirroring.txt', v3.0.1, 19433 Property => 'Bidi_Mirroring_Glyph', 19434 Has_Missings_Defaults => ($v_version lt v6.2.0) 19435 ? $NO_DEFAULTS 19436 # Is <none> which doesn't mean 19437 # anything to us, we will use the 19438 # null string 19439 : $IGNORED, 19440 ), 19441 Input_file->new('NamesList.html', v3.0.0, 19442 Skip => 'Describes the format and contents of ' 19443 . 'F<NamesList.txt>', 19444 ), 19445 Input_file->new('UnicodeCharacterDatabase.html', v3.0.0, 19446 Withdrawn => v5.1, 19447 Skip => $Documentation, 19448 ), 19449 Input_file->new('CaseFolding.txt', v3.0.1, 19450 Pre_Handler => \&setup_case_folding, 19451 Each_Line_Handler => 19452 [ ($v_version lt v3.1.0) 19453 ? \&filter_old_style_case_folding 19454 : undef, 19455 \&filter_case_folding_line 19456 ], 19457 Has_Missings_Defaults => $IGNORED, 19458 ), 19459 Input_file->new("NormTest.txt", v3.0.1, 19460 Handler => \&process_NormalizationsTest, 19461 Skip => ($make_norm_test_script) ? 0 : $Validation, 19462 ), 19463 Input_file->new('DCoreProperties.txt', v3.1.0, 19464 # 5.2 changed this file 19465 Has_Missings_Defaults => (($v_version ge v5.2.0) 19466 ? $NOT_IGNORED 19467 : $NO_DEFAULTS), 19468 ), 19469 Input_file->new('DProperties.html', v3.1.0, 19470 Withdrawn => v3.2.0, 19471 Skip => $Documentation, 19472 ), 19473 Input_file->new('PropList.html', v3.1.0, 19474 Withdrawn => v5.1, 19475 Skip => $Documentation, 19476 ), 19477 Input_file->new('Scripts.txt', v3.1.0, 19478 Property => 'Script', 19479 Each_Line_Handler => (($v_version le v4.0.0) 19480 ? \&filter_all_caps_script_names 19481 : undef), 19482 Has_Missings_Defaults => $NOT_IGNORED, 19483 ), 19484 Input_file->new('DNormalizationProps.txt', v3.1.0, 19485 Has_Missings_Defaults => $NOT_IGNORED, 19486 Each_Line_Handler => (($v_version lt v4.0.1) 19487 ? \&filter_old_style_normalization_lines 19488 : undef), 19489 ), 19490 Input_file->new('DerivedProperties.html', v3.1.1, 19491 Withdrawn => v5.1, 19492 Skip => $Documentation, 19493 ), 19494 Input_file->new('DAge.txt', v3.2.0, 19495 Has_Missings_Defaults => $NOT_IGNORED, 19496 Property => 'Age' 19497 ), 19498 Input_file->new('HangulSyllableType.txt', v4.0, 19499 Has_Missings_Defaults => $NOT_IGNORED, 19500 Early => [ \&generate_hst, 'Hangul_Syllable_Type' ], 19501 Property => 'Hangul_Syllable_Type' 19502 ), 19503 Input_file->new('NormalizationCorrections.txt', v3.2.0, 19504 # This documents the cumulative fixes to erroneous 19505 # normalizations in earlier Unicode versions. Its main 19506 # purpose is so that someone running on an earlier 19507 # version can use this file to override what got 19508 # published in that earlier release. It would be easy 19509 # for mktables to handle this file. But all the 19510 # corrections in it should already be in the other files 19511 # for the release it is. To get it to actually mean 19512 # something useful, someone would have to be using an 19513 # earlier Unicode release, and copy it into the directory 19514 # for that release and recompile. So far there has been 19515 # no demand to do that, so this hasn't been implemented. 19516 Skip => 'Documentation of corrections already ' 19517 . 'incorporated into the Unicode data base', 19518 ), 19519 Input_file->new('StandardizedVariants.html', v3.2.0, 19520 Skip => 'Obsoleted as of Unicode 9.0, but previously ' 19521 . 'provided a visual display of the standard ' 19522 . 'variant sequences derived from ' 19523 . 'F<StandardizedVariants.txt>.', 19524 # I don't know why the html came earlier than the 19525 # .txt, but both are skipped anyway, so it doesn't 19526 # matter. 19527 ), 19528 Input_file->new('StandardizedVariants.txt', v4.0.0, 19529 Skip => 'Certain glyph variations for character display ' 19530 . 'are standardized. This lists the non-Unihan ' 19531 . 'ones; the Unihan ones are also not used by ' 19532 . 'Perl, and are in a separate Unicode data base ' 19533 . 'L<http://www.unicode.org/ivd>', 19534 ), 19535 Input_file->new('UCD.html', v4.0.0, 19536 Withdrawn => v5.2, 19537 Skip => $Documentation, 19538 ), 19539 Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0, 19540 Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter' ], 19541 Property => 'Word_Break', 19542 Has_Missings_Defaults => $NOT_IGNORED, 19543 ), 19544 Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1, 19545 Early => [ \&generate_GCB, '_Perl_GCB' ], 19546 Property => 'Grapheme_Cluster_Break', 19547 Has_Missings_Defaults => $NOT_IGNORED, 19548 ), 19549 Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0, 19550 Handler => \&process_GCB_test, 19551 retain_trailing_comments => 1, 19552 ), 19553 Input_file->new("$AUXILIARY/GraphemeBreakTest.html", v4.1.0, 19554 Skip => $Validation_Documentation, 19555 ), 19556 Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0, 19557 Handler => \&process_SB_test, 19558 retain_trailing_comments => 1, 19559 ), 19560 Input_file->new("$AUXILIARY/SentenceBreakTest.html", v4.1.0, 19561 Skip => $Validation_Documentation, 19562 ), 19563 Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0, 19564 Handler => \&process_WB_test, 19565 retain_trailing_comments => 1, 19566 ), 19567 Input_file->new("$AUXILIARY/WordBreakTest.html", v4.1.0, 19568 Skip => $Validation_Documentation, 19569 ), 19570 Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0, 19571 Property => 'Sentence_Break', 19572 Early => [ "SBsubst.txt", '_Perl_SB', 'OLetter' ], 19573 Has_Missings_Defaults => $NOT_IGNORED, 19574 ), 19575 Input_file->new('NamedSequences.txt', v4.1.0, 19576 Handler => \&process_NamedSequences 19577 ), 19578 Input_file->new('Unihan.html', v4.1.0, 19579 Withdrawn => v5.2, 19580 Skip => $Documentation, 19581 ), 19582 Input_file->new('NameAliases.txt', v5.0, 19583 Property => 'Name_Alias', 19584 Each_Line_Handler => ($v_version le v6.0.0) 19585 ? \&filter_early_version_name_alias_line 19586 : \&filter_later_version_name_alias_line, 19587 ), 19588 # NameAliases.txt came along in v5.0. The above constructor handles 19589 # this. But until 6.1, it was lacking some information needed by core 19590 # perl. The constructor below handles that. It is either a kludge or 19591 # clever, depending on your point of view. The 'Withdrawn' parameter 19592 # indicates not to use it at all starting in 6.1 (so the above 19593 # constructor applies), and the 'v6.1' parameter indicates to use the 19594 # Early parameter before 6.1. Therefore 'Early" is always used, 19595 # yielding the internal-only property '_Perl_Name_Alias', which it 19596 # gets from a NameAliases.txt from 6.1 or later stored in 19597 # N_Asubst.txt. In combination with the above constructor, 19598 # 'Name_Alias' is publicly accessible starting with v5.0, and the 19599 # better 6.1 version is accessible to perl core in all releases. 19600 Input_file->new("NameAliases.txt", v6.1, 19601 Withdrawn => v6.1, 19602 Early => [ "N_Asubst.txt", '_Perl_Name_Alias', "" ], 19603 Property => 'Name_Alias', 19604 EOF_Handler => \&fixup_early_perl_name_alias, 19605 Each_Line_Handler => 19606 \&filter_later_version_name_alias_line, 19607 ), 19608 Input_file->new('NamedSqProv.txt', v5.0.0, 19609 Skip => 'Named sequences proposed for inclusion in a ' 19610 . 'later version of the Unicode Standard; if you ' 19611 . 'need them now, you can append this file to ' 19612 . 'F<NamedSequences.txt> and recompile perl', 19613 ), 19614 Input_file->new("$AUXILIARY/LBTest.txt", v5.1.0, 19615 Handler => \&process_LB_test, 19616 retain_trailing_comments => 1, 19617 ), 19618 Input_file->new("$AUXILIARY/LineBreakTest.html", v5.1.0, 19619 Skip => $Validation_Documentation, 19620 ), 19621 Input_file->new("BidiTest.txt", v5.2.0, 19622 Skip => $Validation, 19623 ), 19624 Input_file->new('UnihanIndicesDictionary.txt', v5.2.0, 19625 Optional => "", 19626 Each_Line_Handler => \&filter_unihan_line, 19627 ), 19628 Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0, 19629 Optional => "", 19630 Each_Line_Handler => \&filter_unihan_line, 19631 ), 19632 Input_file->new('UnihanIRGSources.txt', v5.2.0, 19633 Optional => [ "", 19634 'kCompatibilityVariant', 19635 'kIICore', 19636 'kIRG_GSource', 19637 'kIRG_HSource', 19638 'kIRG_JSource', 19639 'kIRG_KPSource', 19640 'kIRG_MSource', 19641 'kIRG_KSource', 19642 'kIRG_SSource', 19643 'kIRG_TSource', 19644 'kIRG_USource', 19645 'kIRG_UKSource', 19646 'kIRG_VSource', 19647 ], 19648 Pre_Handler => \&setup_unihan, 19649 Each_Line_Handler => \&filter_unihan_line, 19650 ), 19651 Input_file->new('UnihanNumericValues.txt', v5.2.0, 19652 Optional => [ "", 19653 'kAccountingNumeric', 19654 'kOtherNumeric', 19655 'kPrimaryNumeric', 19656 ], 19657 Each_Line_Handler => \&filter_unihan_line, 19658 ), 19659 Input_file->new('UnihanOtherMappings.txt', v5.2.0, 19660 Optional => "", 19661 Each_Line_Handler => \&filter_unihan_line, 19662 ), 19663 Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0, 19664 Optional => [ "", 19665 'Unicode_Radical_Stroke' 19666 ], 19667 Each_Line_Handler => \&filter_unihan_line, 19668 ), 19669 Input_file->new('UnihanReadings.txt', v5.2.0, 19670 Optional => "", 19671 Each_Line_Handler => \&filter_unihan_line, 19672 ), 19673 Input_file->new('UnihanVariants.txt', v5.2.0, 19674 Optional => "", 19675 Each_Line_Handler => \&filter_unihan_line, 19676 ), 19677 Input_file->new('CJKRadicals.txt', v5.2.0, 19678 Skip => 'Maps the kRSUnicode property values to ' 19679 . 'corresponding code points', 19680 ), 19681 Input_file->new('EmojiSources.txt', v6.0.0, 19682 Skip => 'Maps certain Unicode code points to their ' 19683 . 'legacy Japanese cell-phone values', 19684 ), 19685 # This file is actually not usable as-is until 6.1.0, because the property 19686 # is provisional, so its name is missing from PropertyAliases.txt until 19687 # that release, so that further work would have to be done to get it to 19688 # work properly 19689 Input_file->new('ScriptExtensions.txt', v6.0.0, 19690 Property => 'Script_Extensions', 19691 Early => [ sub {} ], # Doesn't do anything but ensures 19692 # that this isn't skipped for early 19693 # versions 19694 Pre_Handler => \&setup_script_extensions, 19695 Each_Line_Handler => \&filter_script_extensions_line, 19696 Has_Missings_Defaults => (($v_version le v6.0.0) 19697 ? $NO_DEFAULTS 19698 : $IGNORED), 19699 ), 19700 # These two Indic files are actually not usable as-is until 6.1.0, 19701 # because they are provisional, so their property values are missing from 19702 # PropValueAliases.txt until that release, so that further work would have 19703 # to be done to get them to work properly. 19704 Input_file->new('IndicMatraCategory.txt', v6.0.0, 19705 Withdrawn => v8.0.0, 19706 Property => 'Indic_Matra_Category', 19707 Has_Missings_Defaults => $NOT_IGNORED, 19708 Skip => $Indic_Skip, 19709 ), 19710 Input_file->new('IndicSyllabicCategory.txt', v6.0.0, 19711 Property => 'Indic_Syllabic_Category', 19712 Has_Missings_Defaults => $NOT_IGNORED, 19713 Skip => (($v_version lt v8.0.0) 19714 ? $Indic_Skip 19715 : 0), 19716 ), 19717 Input_file->new('USourceData.txt', v6.2.0, 19718 Skip => 'Documentation of status and cross reference of ' 19719 . 'proposals for encoding by Unicode of Unihan ' 19720 . 'characters', 19721 ), 19722 Input_file->new('USourceGlyphs.pdf', v6.2.0, 19723 Skip => 'Pictures of the characters in F<USourceData.txt>', 19724 ), 19725 Input_file->new('BidiBrackets.txt', v6.3.0, 19726 Properties => [ 'Bidi_Paired_Bracket', 19727 'Bidi_Paired_Bracket_Type' 19728 ], 19729 Has_Missings_Defaults => $NO_DEFAULTS, 19730 ), 19731 Input_file->new("BidiCharacterTest.txt", v6.3.0, 19732 Skip => $Validation, 19733 ), 19734 Input_file->new('IndicPositionalCategory.txt', v8.0.0, 19735 Property => 'Indic_Positional_Category', 19736 Has_Missings_Defaults => $NOT_IGNORED, 19737 ), 19738 Input_file->new('TangutSources.txt', v9.0.0, 19739 Skip => 'Specifies source mappings for Tangut ideographs' 19740 . ' and components. This data file also includes' 19741 . ' informative radical-stroke values that are used' 19742 . ' internally by Unicode', 19743 ), 19744 Input_file->new('VerticalOrientation.txt', v10.0.0, 19745 Property => 'Vertical_Orientation', 19746 Has_Missings_Defaults => $NOT_IGNORED, 19747 ), 19748 Input_file->new('NushuSources.txt', v10.0.0, 19749 Skip => 'Specifies source material for Nushu characters', 19750 ), 19751 Input_file->new('EquivalentUnifiedIdeograph.txt', v11.0.0, 19752 Property => 'Equivalent_Unified_Ideograph', 19753 Has_Missings_Defaults => $NOT_IGNORED, 19754 ), 19755 Input_file->new('EmojiData.txt', v11.0.0, 19756 # Is in UAX #51 and not the UCD, so must be updated 19757 # separately, and the first line edited to indicate the 19758 # UCD release we're pretending it to be in. The UTC says 19759 # this is a transitional state, and in fact was moved to 19760 # the UCD in 13.0 19761 Withdrawn => v13.0.0, 19762 Pre_Handler => \&setup_emojidata, 19763 Has_Missings_Defaults => $NOT_IGNORED, 19764 Each_Line_Handler => \&filter_emojidata_line, 19765 UCD => 0, 19766 ), 19767 Input_file->new("$EMOJI/emoji.txt", v13.0.0, 19768 Has_Missings_Defaults => $NOT_IGNORED, 19769 UCD => 0, 19770 ), 19771 Input_file->new("$EMOJI/ReadMe.txt", v13.0.0, 19772 Skip => $Documentation, 19773 UCD => 0, 19774 ), 19775 Input_file->new('IdStatus.txt', v13.0.0, 19776 Pre_Handler => \&setup_IdStatus, 19777 Property => 'Identifier_Status', 19778 UCD => 0, 19779 ), 19780 Input_file->new('IdType.txt', v13.0.0, 19781 Pre_Handler => \&setup_IdType, 19782 Each_Line_Handler => \&filter_IdType_line, 19783 Property => 'Identifier_Type', 19784 UCD => 0, 19785 ), 19786 Input_file->new('confusables.txt', v15.0.0, 19787 Skip => $Unused_Skip, 19788 UCD => 0, 19789 ), 19790 Input_file->new('confusablesSummary.txt', v15.0.0, 19791 Skip => $Unused_Skip, 19792 UCD => 0, 19793 ), 19794 Input_file->new('intentional.txt', v15.0.0, 19795 Skip => $Unused_Skip, 19796 UCD => 0, 19797 ), 19798); 19799 19800# End of all the preliminaries. 19801# Do it... 19802 19803if (@missing_early_files) { 19804 print simple_fold(join_lines(<<END 19805 19806The compilation cannot be completed because one or more required input files, 19807listed below, are missing. This is because you are compiling Unicode version 19808$unicode_version, which predates the existence of these file(s). To fully 19809function, perl needs the data that these files would have contained if they 19810had been in this release. To work around this, create copies of later 19811versions of the missing files in the directory containing '$0'. (Perl will 19812make the necessary adjustments to the data to compensate for it not being the 19813same version as is being compiled.) The files are available from unicode.org, 19814via either ftp or http. If using http, they will be under 19815www.unicode.org/versions/. Below are listed the source file name of each 19816missing file, the Unicode version to copy it from, and the name to store it 19817as. (Note that the listed source file name may not be exactly the one that 19818Unicode calls it. If you don't find it, you can look it up in 'README.perl' 19819to get the correct name.) 19820END 19821 )); 19822 print simple_fold(join_lines("\n$_")) for @missing_early_files; 19823 exit 2; 19824} 19825 19826if ($compare_versions) { 19827 Carp::my_carp(<<END 19828Warning. \$compare_versions is set. Output is not suitable for production 19829END 19830 ); 19831} 19832 19833# Put into %potential_files a list of all the files in the directory structure 19834# that could be inputs to this program 19835File::Find::find({ 19836 wanted=>sub { 19837 return unless / \. ( txt | htm l? ) $ /xi; # Some platforms change the 19838 # name's case 19839 my $full = lc(File::Spec->rel2abs($_)); 19840 $potential_files{$full} = 1; 19841 return; 19842 } 19843}, File::Spec->curdir()); 19844 19845my @mktables_list_output_files; 19846my $old_start_time = 0; 19847my $old_options = ""; 19848 19849if (! -e $file_list) { 19850 print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE; 19851 $write_unchanged_files = 1; 19852} elsif ($write_unchanged_files) { 19853 print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE; 19854} 19855else { 19856 print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE; 19857 my $file_handle; 19858 if (! open $file_handle, "<", $file_list) { 19859 Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!"); 19860 $glob_list = 1; 19861 } 19862 else { 19863 my @input; 19864 19865 # Read and parse mktables.lst, placing the results from the first part 19866 # into @input, and the second part into @mktables_list_output_files 19867 for my $list ( \@input, \@mktables_list_output_files ) { 19868 while (<$file_handle>) { 19869 s/^ \s+ | \s+ $//xg; 19870 if (/^ \s* \# \s* Autogenerated\ starting\ on\ (\d+)/x) { 19871 $old_start_time = $1; 19872 next; 19873 } 19874 if (/^ \s* \# \s* From\ options\ (.+) /x) { 19875 $old_options = $1; 19876 next; 19877 } 19878 next if /^ \s* (?: \# .* )? $/x; 19879 last if /^ =+ $/x; 19880 my ( $file ) = split /\t/; 19881 push @$list, $file; 19882 } 19883 @$list = uniques(@$list); 19884 next; 19885 } 19886 19887 # Look through all the input files 19888 foreach my $input (@input) { 19889 next if $input eq 'version'; # Already have checked this. 19890 19891 # Ignore if doesn't exist. The checking about whether we care or 19892 # not is done via the Input_file object. 19893 next if ! file_exists($input); 19894 19895 # The paths are stored with relative names, and with '/' as the 19896 # delimiter; convert to absolute on this machine 19897 my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input))); 19898 $potential_files{lc $full} = 1; 19899 } 19900 } 19901 19902 close $file_handle; 19903} 19904 19905if ($glob_list) { 19906 19907 # Here wants to process all .txt files in the directory structure. 19908 # Convert them to full path names. They are stored in the platform's 19909 # relative style 19910 my @known_files; 19911 foreach my $object (@input_file_objects) { 19912 my $file = $object->file; 19913 next unless defined $file; 19914 push @known_files, File::Spec->rel2abs($file); 19915 } 19916 19917 my @unknown_input_files; 19918 foreach my $file (keys %potential_files) { # The keys are stored in lc 19919 next if grep { $file eq lc($_) } @known_files; 19920 19921 # Here, the file is unknown to us. Get relative path name 19922 $file = File::Spec->abs2rel($file); 19923 push @unknown_input_files, $file; 19924 19925 # What will happen is we create a data structure for it, and add it to 19926 # the list of input files to process. First get the subdirectories 19927 # into an array 19928 my (undef, $directories, undef) = File::Spec->splitpath($file); 19929 $directories =~ s;/$;;; # Can have extraneous trailing '/' 19930 my @directories = File::Spec->splitdir($directories); 19931 19932 # If the file isn't extracted (meaning none of the directories is the 19933 # extracted one), just add it to the end of the list of inputs. 19934 if (! grep { $EXTRACTED_DIR eq $_ } @directories) { 19935 push @input_file_objects, Input_file->new($file, v0); 19936 } 19937 else { 19938 19939 # Here, the file is extracted. It needs to go ahead of most other 19940 # processing. Search for the first input file that isn't a 19941 # special required property (that is, find one whose first_release 19942 # is non-0), and isn't extracted. Also, the Age property file is 19943 # processed before the extracted ones, just in case 19944 # $compare_versions is set. 19945 for (my $i = 0; $i < @input_file_objects; $i++) { 19946 if ($input_file_objects[$i]->first_released ne v0 19947 && lc($input_file_objects[$i]->file) ne 'dage.txt' 19948 && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i) 19949 { 19950 splice @input_file_objects, $i, 0, 19951 Input_file->new($file, v0); 19952 last; 19953 } 19954 } 19955 19956 } 19957 } 19958 if (@unknown_input_files) { 19959 print STDERR simple_fold(join_lines(<<END 19960 19961The following files are unknown as to how to handle. Assuming they are 19962typical property files. You'll know by later error messages if it worked or 19963not: 19964END 19965 ) . " " . join(", ", @unknown_input_files) . "\n\n"); 19966 } 19967} # End of looking through directory structure for more .txt files. 19968 19969# Create the list of input files from the objects we have defined, plus 19970# version 19971my @input_files = qw(version Makefile); 19972foreach my $object (@input_file_objects) { 19973 my $file = $object->file; 19974 next if ! defined $file; # Not all objects have files 19975 next if defined $object->skip;; 19976 push @input_files, $file; 19977} 19978 19979if ( $verbosity >= $VERBOSE ) { 19980 print "Expecting ".scalar( @input_files )." input files. ", 19981 "Checking ".scalar( @mktables_list_output_files )." output files.\n"; 19982} 19983 19984# We set $most_recent to be the most recently changed input file, including 19985# this program itself (done much earlier in this file) 19986foreach my $in (@input_files) { 19987 next unless -e $in; # Keep going even if missing a file 19988 my $mod_time = (stat $in)[9]; 19989 $most_recent = $mod_time if $mod_time > $most_recent; 19990 19991 # See that the input files have distinct names, to warn someone if they 19992 # are adding a new one 19993 if ($make_list) { 19994 my ($volume, $directories, $file ) = File::Spec->splitpath($in); 19995 $directories =~ s;/$;;; # Can have extraneous trailing '/' 19996 my @directories = File::Spec->splitdir($directories); 19997 construct_filename($file, 'mutable', \@directories); 19998 } 19999} 20000 20001# We use 'Makefile' just to see if it has changed since the last time we 20002# rebuilt. Now discard it. 20003@input_files = grep { $_ ne 'Makefile' } @input_files; 20004 20005my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild 20006 || ! scalar @mktables_list_output_files # or if no outputs known 20007 || $old_start_time < $most_recent # or out-of-date 20008 || $old_options ne $command_line_arguments; # or with different 20009 # options 20010 20011# Now we check to see if any output files are older than youngest, if 20012# they are, we need to continue on, otherwise we can presumably bail. 20013if (! $rebuild) { 20014 foreach my $out (@mktables_list_output_files) { 20015 if ( ! file_exists($out)) { 20016 print "'$out' is missing.\n" if $verbosity >= $VERBOSE; 20017 $rebuild = 1; 20018 last; 20019 } 20020 #local $to_trace = 1 if main::DEBUG; 20021 trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace; 20022 if ( (stat $out)[9] <= $most_recent ) { 20023 #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace; 20024 print "'$out' is too old.\n" if $verbosity >= $VERBOSE; 20025 $rebuild = 1; 20026 last; 20027 } 20028 } 20029} 20030if (! $rebuild) { 20031 print "$0: Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n"; 20032 exit(0); 20033} 20034print "$0: Must rebuild tables.\n" if $verbosity >= $VERBOSE; 20035 20036# Ready to do the major processing. First create the perl pseudo-property. 20037$perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1); 20038 20039# Process each input file 20040foreach my $file (@input_file_objects) { 20041 $file->run; 20042} 20043 20044# Finish the table generation. 20045 20046print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS; 20047finish_Unicode(); 20048 20049# For the very specialized case of comparing two Unicode versions... 20050if (DEBUG && $compare_versions) { 20051 handle_compare_versions(); 20052} 20053 20054print "Compiling Perl properties\n" if $verbosity >= $PROGRESS; 20055compile_perl(); 20056 20057print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS; 20058add_perl_synonyms(); 20059 20060print "Writing tables\n" if $verbosity >= $PROGRESS; 20061write_all_tables(); 20062 20063# Write mktables.lst 20064if ( $file_list and $make_list ) { 20065 20066 print "Updating '$file_list'\n" if $verbosity >= $PROGRESS; 20067 foreach my $file (@input_files, @files_actually_output) { 20068 my (undef, $directories, $basefile) = File::Spec->splitpath($file); 20069 my @directories = grep length, File::Spec->splitdir($directories); 20070 $file = join '/', @directories, $basefile; 20071 } 20072 20073 my $ofh; 20074 if (! open $ofh,">",$file_list) { 20075 Carp::my_carp("Can't write to '$file_list'. Skipping: $!"); 20076 return 20077 } 20078 else { 20079 my $localtime = localtime $start_time; 20080 print $ofh <<"END"; 20081# 20082# $file_list -- File list for $0. 20083# 20084# Autogenerated starting on $start_time ($localtime) 20085# From options $command_line_arguments 20086# 20087# - First section is input files 20088# ($0 itself is not listed but is automatically considered an input) 20089# - Section separator is /^=+\$/ 20090# - Second section is a list of output files. 20091# - Lines matching /^\\s*#/ are treated as comments 20092# which along with blank lines are ignored. 20093# 20094 20095# Input files: 20096 20097END 20098 print $ofh "$_\n" for sort(@input_files); 20099 print $ofh "\n=================================\n# Output files:\n\n"; 20100 print $ofh "$_\n" for sort @files_actually_output; 20101 print $ofh "\n# ",scalar(@input_files)," input files\n", 20102 "# ",scalar(@files_actually_output)+1," output files\n\n", 20103 "# End list\n"; 20104 close $ofh 20105 or Carp::my_carp("Failed to close $ofh: $!"); 20106 20107 print "Filelist has ",scalar(@input_files)," input files and ", 20108 scalar(@files_actually_output)+1," output files\n" 20109 if $verbosity >= $VERBOSE; 20110 } 20111} 20112 20113# Output these warnings unless -q explicitly specified. 20114if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) { 20115 if (@unhandled_properties) { 20116 print "\nProperties and tables that unexpectedly have no code points\n"; 20117 foreach my $property (sort @unhandled_properties) { 20118 print $property, "\n"; 20119 } 20120 } 20121 20122 if (%potential_files) { 20123 print "\nInput files that are not considered:\n"; 20124 foreach my $file (sort keys %potential_files) { 20125 print File::Spec->abs2rel($file), "\n"; 20126 } 20127 } 20128 print "\nAll done\n" if $verbosity >= $VERBOSE; 20129} 20130 20131if ($version_of_mk_invlist_bounds lt $v_version) { 20132 Carp::my_carp("WARNING: \\b{} algorithms (regen/mk_invlist.pl) need" 20133 . " to be checked and possibly updated to Unicode" 20134 . " $string_version. Failing tests will be marked TODO"); 20135} 20136 20137exit(0); 20138 20139# TRAILING CODE IS USED BY make_property_test_script() 20140__DATA__ 20141 20142use strict; 20143use warnings; 20144 20145use feature 'signatures'; 20146 20147no warnings 'experimental::uniprop_wildcards'; 20148 20149# Test qr/\X/ and the \p{} regular expression constructs. This file is 20150# constructed by mktables from the tables it generates, so if mktables is 20151# buggy, this won't necessarily catch those bugs. Tests are generated for all 20152# feasible properties; a few aren't currently feasible; see 20153# is_code_point_usable() in mktables for details. 20154 20155# Standard test packages are not used because this manipulates SIG_WARN. It 20156# exits 0 if every non-skipped test succeeded; -1 if any failed. 20157 20158my $Tests = 0; 20159my $Fails = 0; 20160 20161# loc_tools.pl requires this function to be defined 20162sub ok($pass, @msg) { 20163 print "not " unless $pass; 20164 print "ok "; 20165 print ++$Tests; 20166 print " - ", join "", @msg if @msg; 20167 print "\n"; 20168} 20169 20170sub Expect($expected, $ord, $regex, $warning_type='') { 20171 my $line = (caller)[2]; 20172 20173 # Convert the code point to hex form 20174 my $string = sprintf "\"\\x{%04X}\"", $ord; 20175 20176 my @tests = ""; 20177 20178 # The first time through, use all warnings. If the input should generate 20179 # a warning, add another time through with them turned off 20180 push @tests, "no warnings '$warning_type';" if $warning_type; 20181 20182 foreach my $no_warnings (@tests) { 20183 20184 # Store any warning messages instead of outputting them 20185 local $SIG{__WARN__} = $SIG{__WARN__}; 20186 my $warning_message; 20187 $SIG{__WARN__} = sub { $warning_message = $_[0] }; 20188 20189 $Tests++; 20190 20191 # A string eval is needed because of the 'no warnings'. 20192 # Assumes no parentheses in the regular expression 20193 my $result = eval "$no_warnings 20194 my \$RegObj = qr($regex); 20195 $string =~ \$RegObj ? 1 : 0"; 20196 if (not defined $result) { 20197 print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n"; 20198 $Fails++; 20199 } 20200 elsif ($result ^ $expected) { 20201 print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n"; 20202 $Fails++; 20203 } 20204 elsif ($warning_message) { 20205 if (! $warning_type || ($warning_type && $no_warnings)) { 20206 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n"; 20207 $Fails++; 20208 } 20209 else { 20210 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n"; 20211 } 20212 } 20213 elsif ($warning_type && ! $no_warnings) { 20214 print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n"; 20215 $Fails++; 20216 } 20217 else { 20218 print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n"; 20219 } 20220 } 20221 return; 20222} 20223 20224sub Error($regex) { 20225 $Tests++; 20226 if (eval { 'x' =~ qr/$regex/; 1 }) { 20227 $Fails++; 20228 my $line = (caller)[2]; 20229 print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n"; 20230 } 20231 else { 20232 my $line = (caller)[2]; 20233 print "ok $Tests - got and expected error for qr/$regex/; line $line\n"; 20234 } 20235 return; 20236} 20237 20238# Break test files (e.g. GCBTest.txt) character that break allowed here 20239my $breakable_utf8 = my $breakable = chr(utf8::unicode_to_native(0xF7)); 20240utf8::upgrade($breakable_utf8); 20241 20242# Break test files (e.g. GCBTest.txt) character that indicates can't break 20243# here 20244my $nobreak_utf8 = my $nobreak = chr(utf8::unicode_to_native(0xD7)); 20245utf8::upgrade($nobreak_utf8); 20246 20247my $are_ctype_locales_available; 20248my $utf8_locale; 20249chdir 't' if -d 't'; 20250eval { require "./loc_tools.pl" }; 20251if (defined &locales_enabled) { 20252 $are_ctype_locales_available = locales_enabled('LC_CTYPE'); 20253 if ($are_ctype_locales_available) { 20254 $utf8_locale = &find_utf8_ctype_locale; 20255 } 20256} 20257 20258# Eval'd so can run on versions earlier than the property is available in 20259my $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}\p{WB=ZWJ}]/'; 20260if (! defined $WB_Extend_or_Format_re) { 20261 $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}]/'; 20262} 20263 20264sub _test_break($template, $break_type) { 20265 # Test various break property matches. The 2nd parameter gives the 20266 # property name. The input is a line from auxiliary/*Test.txt for the 20267 # given property. Each such line is a sequence of Unicode (not native) 20268 # code points given by their hex numbers, separated by the two characters 20269 # defined just before this subroutine that indicate that either there can 20270 # or cannot be a break between the adjacent code points. All these are 20271 # tested. 20272 # 20273 # For the gcb property extra tests are made. if there isn't a break, that 20274 # means the sequence forms an extended grapheme cluster, which means that 20275 # \X should match the whole thing. If there is a break, \X should stop 20276 # there. This is all converted by this routine into a match: $string =~ 20277 # /(\X)/, Each \X should match the next cluster; and that is what is 20278 # checked. 20279 20280 my $line = (caller 1)[2]; # Line number 20281 my $comment = ""; 20282 20283 if ($template =~ / ( .*? ) \s* \# (.*) /x) { 20284 $template = $1; 20285 $comment = $2; 20286 20287 # Replace leading spaces with a single one. 20288 $comment =~ s/ ^ \s* / # /x; 20289 } 20290 20291 # The line contains characters above the ASCII range, but in Latin1. It 20292 # may or may not be in utf8, and if it is, it may or may not know it. So, 20293 # convert these characters to 8 bits. If knows is in utf8, simply 20294 # downgrade. 20295 if (utf8::is_utf8($template)) { 20296 utf8::downgrade($template); 20297 } else { 20298 20299 # Otherwise, if it is in utf8, but doesn't know it, the next lines 20300 # convert the two problematic characters to their 8-bit equivalents. 20301 # If it isn't in utf8, they don't harm anything. 20302 use bytes; 20303 $template =~ s/$nobreak_utf8/$nobreak/g; 20304 $template =~ s/$breakable_utf8/$breakable/g; 20305 } 20306 20307 # Perl customizes wb. So change the official tests accordingly 20308 if ($break_type eq 'wb' && $WB_Extend_or_Format_re) { 20309 20310 # Split into elements that alternate between code point and 20311 # break/no-break 20312 my @line = split / +/, $template; 20313 20314 # Look at each code point and its following one 20315 for (my $i = 1; $i < @line - 1 - 1; $i+=2) { 20316 20317 # The customization only involves changing some breaks to 20318 # non-breaks. 20319 next if $line[$i+1] =~ /$nobreak/; 20320 20321 my $lhs = chr utf8::unicode_to_native(hex $line[$i]); 20322 my $rhs = chr utf8::unicode_to_native(hex $line[$i+2]); 20323 20324 # And it only affects adjacent space characters. 20325 next if $lhs !~ /\s/u; 20326 20327 # But, we want to make sure to test spaces followed by a Extend 20328 # or Format. 20329 next if $rhs !~ /\s|$WB_Extend_or_Format_re/; 20330 20331 # To test the customization, add some white-space before this to 20332 # create a span. The $lhs white space may or may not be bound to 20333 # that span, and also with the $rhs. If the $rhs is a binding 20334 # character, the $lhs is bound to it and not to the span, unless 20335 # $lhs is vertical space. In all other cases, the $lhs is bound 20336 # to the span. If the $rhs is white space, it is bound to the 20337 # $lhs 20338 my $bound; 20339 my $span; 20340 if ($rhs =~ /$WB_Extend_or_Format_re/) { 20341 if ($lhs =~ /\v/) { 20342 $bound = $breakable; 20343 $span = $nobreak; 20344 } 20345 else { 20346 $bound = $nobreak; 20347 $span = $breakable; 20348 } 20349 } 20350 else { 20351 $span = $nobreak; 20352 $bound = $nobreak; 20353 } 20354 20355 splice @line, $i, 0, ( '0020', $nobreak, '0020', $span); 20356 $i += 4; 20357 $line[$i+1] = $bound; 20358 } 20359 $template = join " ", @line; 20360 } 20361 20362 # The input is just the break/no-break symbols and sequences of Unicode 20363 # code points as hex digits separated by spaces for legibility. e.g.: 20364 # �� 0020 �� 0308 �� 0020 �� 20365 # Convert to native \x format 20366 $template =~ s/ \s* ( [[:xdigit:]]+ ) \s* /sprintf("\\x{%02X}", utf8::unicode_to_native(hex $1))/gex; 20367 $template =~ s/ \s* //gx; # Probably the line above removed all spaces; 20368 # but be sure 20369 20370 # Make a copy of the input with the symbols replaced by \b{} and \B{} as 20371 # appropriate 20372 my $break_pattern = $template =~ s/ $breakable /\\b{$break_type}/grx; 20373 $break_pattern =~ s/ $nobreak /\\B{$break_type}/gx; 20374 20375 my $display_string = $template =~ s/[$breakable$nobreak]//gr; 20376 my $string = eval "\"$display_string\""; 20377 20378 # The remaining massaging of the input is for the \X tests. Get rid of 20379 # the leading and trailing breakables 20380 $template =~ s/^ \s* $breakable \s* //x; 20381 $template =~ s/ \s* $breakable \s* $ //x; 20382 20383 # Delete no-breaks 20384 $template =~ s/ \s* $nobreak \s* //xg; 20385 20386 # Split the input into segments that are breakable between them. 20387 my @should_display = split /\s*$breakable\s*/, $template; 20388 my @should_match = map { eval "\"$_\"" } @should_display; 20389 20390 # If a string can be represented in both non-ut8 and utf8, test both cases 20391 my $display_upgrade = ""; 20392 UPGRADE: 20393 for my $to_upgrade (0 .. 1) { 20394 20395 if ($to_upgrade) { 20396 20397 # If already in utf8, would just be a repeat 20398 next UPGRADE if utf8::is_utf8($string); 20399 20400 utf8::upgrade($string); 20401 $display_upgrade = " (utf8-upgraded)"; 20402 } 20403 20404 my @modifiers = qw(a aa d u i); 20405 if ($are_ctype_locales_available) { 20406 push @modifiers, "l$utf8_locale" if defined $utf8_locale; 20407 20408 # The /l modifier has C after it to indicate the locale to try 20409 push @modifiers, "lC"; 20410 } 20411 20412 # Test for each of the regex modifiers. 20413 for my $modifier (@modifiers) { 20414 my $display_locale = ""; 20415 20416 # For /l, set the locale to what it says to. 20417 if ($modifier =~ / ^ l (.*) /x) { 20418 my $locale = $1; 20419 $display_locale = "(locale = $locale)"; 20420 POSIX::setlocale(POSIX::LC_CTYPE(), $locale); 20421 $modifier = 'l'; 20422 } 20423 20424 no warnings qw(locale regexp surrogate); 20425 my $pattern = "(?$modifier:$break_pattern)"; 20426 20427 # Actually do the test 20428 my $matched_text; 20429 my $matched = $string =~ qr/$pattern/; 20430 if ($matched) { 20431 $matched_text = "matched"; 20432 } 20433 else { 20434 $matched_text = "failed to match"; 20435 print "not "; 20436 20437 if (TODO_FAILING_BREAKS) { 20438 $comment = " # $comment" unless $comment =~ / ^ \s* \# /x; 20439 $comment =~ s/#/# TODO/; 20440 } 20441 } 20442 print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$pattern/$display_upgrade; line $line $display_locale$comment\n"; 20443 20444 # Only print the comment on the first use of this line 20445 $comment = ""; 20446 20447 # Repeat with the first \B{} in the pattern. This makes sure the 20448 # code in regexec.c:find_byclass() for \B gets executed 20449 if ($pattern =~ / ( .*? : ) .* ( \\B\{ .* ) /x) { 20450 my $B_pattern = "$1$2"; 20451 $matched = $string =~ qr/$B_pattern/; 20452 print "not " unless $matched; 20453 $matched_text = ($matched) ? "matched" : "failed to match"; 20454 print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$B_pattern/$display_upgrade; line $line $display_locale"; 20455 print " # TODO" if TODO_FAILING_BREAKS && ! $matched; 20456 print "\n"; 20457 } 20458 } 20459 20460 next if $break_type ne 'gcb'; 20461 20462 # Finally, do the \X match. 20463 my @matches = $string =~ /(\X)/g; 20464 20465 # Look through each matched cluster to verify that it matches what we 20466 # expect. 20467 my $min = (@matches < @should_match) ? @matches : @should_match; 20468 for my $i (0 .. $min - 1) { 20469 $Tests++; 20470 if ($matches[$i] eq $should_match[$i]) { 20471 print "ok $Tests - "; 20472 if ($i == 0) { 20473 print "In \"$display_string\" =~ /(\\X)/g, \\X #1"; 20474 } else { 20475 print "And \\X #", $i + 1, 20476 } 20477 print " correctly matched $should_display[$i]; line $line\n"; 20478 } else { 20479 $matches[$i] = join("", map { sprintf "\\x{%04X}", ord $_ } 20480 split "", $matches[$i]); 20481 print "not ok $Tests -"; 20482 print " # TODO" if TODO_FAILING_BREAKS; 20483 print " In \"$display_string\" =~ /(\\X)/g, \\X #", 20484 $i + 1, 20485 " should have matched $should_display[$i]", 20486 " but instead matched $matches[$i]", 20487 ". Abandoning rest of line $line\n"; 20488 next UPGRADE; 20489 } 20490 } 20491 20492 # And the number of matches should equal the number of expected matches. 20493 $Tests++; 20494 if (@matches == @should_match) { 20495 print "ok $Tests - Nothing was left over; line $line\n"; 20496 } else { 20497 print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line"; 20498 print " # TODO" if TODO_FAILING_BREAKS; 20499 print "\n"; 20500 } 20501 } 20502 20503 return; 20504} 20505 20506sub Test_GCB($t) { 20507 _test_break($t, 'gcb'); 20508} 20509 20510sub Test_LB($t) { 20511 _test_break($t, 'lb'); 20512} 20513 20514sub Test_SB($t) { 20515 _test_break($t, 'sb'); 20516} 20517 20518sub Test_WB($t) { 20519 _test_break($t, 'wb'); 20520} 20521 20522sub Finished() { 20523 print "1..$Tests\n"; 20524 exit($Fails ? -1 : 0); 20525} 20526 20527