1#!/usr/bin/perl -w
2
3# !!!!!!!!!!!!!!       IF YOU MODIFY THIS FILE       !!!!!!!!!!!!!!!!!!!!!!!!!
4# Any files created or read by this program should be listed in 'mktables.lst'
5# Use -makelist to regenerate it.
6
7# There was an attempt when this was first rewritten to make it 5.8
8# compatible, but that has now been abandoned, and newer constructs are used
9# as convenient.
10
11# NOTE: this script can run quite slowly in older/slower systems.
12# It can also consume a lot of memory (128 MB or more), you may need
13# to raise your process resource limits (e.g. in bash, "ulimit -a"
14# to inspect, and "ulimit -d ..." or "ulimit -m ..." to set)
15
16my $start_time;
17BEGIN { # Get the time the script started running; do it at compilation to
18        # get it as close as possible
19    $start_time= time;
20}
21
22require 5.010_001;
23use strict;
24use warnings;
25use builtin qw(refaddr);
26use Carp;
27use Config;
28use File::Find;
29use File::Path;
30use File::Spec;
31use Text::Tabs;
32use re "/aa";
33
34use feature 'state';
35use feature 'signatures';
36no warnings qw( experimental::builtin );
37
38sub DEBUG () { 0 }  # Set to 0 for production; 1 for development
39$| = 1 if DEBUG;
40my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
41
42sub NON_ASCII_PLATFORM { ord("A") != 65 }
43
44# When a new version of Unicode is published, unfortunately the algorithms for
45# dealing with various bounds, like \b{gcb}, \b{lb} may have to be updated
46# manually.  The changes may or may not be backward compatible with older
47# releases.  The code is in regen/mk_invlist.pl and regexec.c.  Make the
48# changes, then come back here and set the variable below to what version the
49# code is expecting.  If a newer version of Unicode is being compiled than
50# expected, a warning will be generated.  If an older version is being
51# compiled, any bounds tests that fail in the generated test file (-maketest
52# option) will be marked as TODO.
53my $version_of_mk_invlist_bounds = v15.0.0;
54
55##########################################################################
56#
57# mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
58# from the Unicode database files (lib/unicore/.../*.txt),  It also generates
59# a pod file and .t files, depending on option parameters.
60#
61# The structure of this file is:
62#   First these introductory comments; then
63#   code needed for everywhere, such as debugging stuff; then
64#   code to handle input parameters; then
65#   data structures likely to be of external interest (some of which depend on
66#       the input parameters, so follows them; then
67#   more data structures and subroutine and package (class) definitions; then
68#   the small actual loop to process the input files and finish up; then
69#   a __DATA__ section, for the .t tests
70#
71# This program works on all releases of Unicode so far.  The outputs have been
72# scrutinized most intently for release 5.1.  The others have been checked for
73# somewhat more than just sanity.  It can handle all non-provisional Unicode
74# character properties in those releases.
75#
76# This program is mostly about Unicode character (or code point) properties.
77# A property describes some attribute or quality of a code point, like if it
78# is lowercase or not, its name, what version of Unicode it was first defined
79# in, or what its uppercase equivalent is.  Unicode deals with these disparate
80# possibilities by making all properties into mappings from each code point
81# into some corresponding value.  In the case of it being lowercase or not,
82# the mapping is either to 'Y' or 'N' (or various synonyms thereof).  Each
83# property maps each Unicode code point to a single value, called a "property
84# value".  (Some more recently defined properties, map a code point to a set
85# of values.)
86#
87# When using a property in a regular expression, what is desired isn't the
88# mapping of the code point to its property's value, but the reverse (or the
89# mathematical "inverse relation"): starting with the property value, "Does a
90# code point map to it?"  These are written in a "compound" form:
91# \p{property=value}, e.g., \p{category=punctuation}.  This program generates
92# files containing the lists of code points that map to each such regular
93# expression property value, one file per list
94#
95# There is also a single form shortcut that Perl adds for many of the commonly
96# used properties.  This happens for all binary properties, plus script,
97# general_category, and block properties.
98#
99# Thus the outputs of this program are files.  There are map files, mostly in
100# the 'To' directory; and there are list files for use in regular expression
101# matching, all in subdirectories of the 'lib' directory, with each
102# subdirectory being named for the property that the lists in it are for.
103# Bookkeeping, test, and documentation files are also generated.
104
105my $matches_directory = 'lib';   # Where match (\p{}) files go.
106my $map_directory = 'To';        # Where map files go.
107
108# DATA STRUCTURES
109#
110# The major data structures of this program are Property, of course, but also
111# Table.  There are two kinds of tables, very similar to each other.
112# "Match_Table" is the data structure giving the list of code points that have
113# a particular property value, mentioned above.  There is also a "Map_Table"
114# data structure which gives the property's mapping from code point to value.
115# There are two structures because the match tables need to be combined in
116# various ways, such as constructing unions, intersections, complements, etc.,
117# and the map ones don't.  And there would be problems, perhaps subtle, if
118# a map table were inadvertently operated on in some of those ways.
119# The use of separate classes with operations defined on one but not the other
120# prevents accidentally confusing the two.
121#
122# At the heart of each table's data structure is a "Range_List", which is just
123# an ordered list of "Ranges", plus ancillary information, and methods to
124# operate on them.  A Range is a compact way to store property information.
125# Each range has a starting code point, an ending code point, and a value that
126# is meant to apply to all the code points between the two end points,
127# inclusive.  For a map table, this value is the property value for those
128# code points.  Two such ranges could be written like this:
129#   0x41 .. 0x5A, 'Upper',
130#   0x61 .. 0x7A, 'Lower'
131#
132# Each range also has a type used as a convenience to classify the values.
133# Most ranges in this program will be Type 0, or normal, but there are some
134# ranges that have a non-zero type.  These are used only in map tables, and
135# are for mappings that don't fit into the normal scheme of things.  Mappings
136# that require a hash entry to communicate with utf8.c are one example;
137# another example is mappings for charnames.pm to use which indicate a name
138# that is algorithmically determinable from its code point (and the reverse).
139# These are used to significantly compact these tables, instead of listing
140# each one of the tens of thousands individually.
141#
142# In a match table, the value of a range is irrelevant (and hence the type as
143# well, which will always be 0), and arbitrarily set to the empty string.
144# Using the example above, there would be two match tables for those two
145# entries, one named Upper would contain the 0x41..0x5A range, and the other
146# named Lower would contain 0x61..0x7A.
147#
148# Actually, there are two types of range lists, "Range_Map" is the one
149# associated with map tables, and "Range_List" with match tables.
150# Again, this is so that methods can be defined on one and not the others so
151# as to prevent operating on them in incorrect ways.
152#
153# Eventually, most tables are written out to files to be read by Unicode::UCD.
154# All tables could in theory be written, but some are suppressed because there
155# is no current practical use for them.  It is easy to change which get
156# written by changing various lists that are near the top of the actual code
157# in this file.  The table data structures contain enough ancillary
158# information to allow them to be treated as separate entities for writing,
159# such as the path to each one's file.  There is a heading in each map table
160# that gives the format of its entries, and what the map is for all the code
161# points missing from it.  (This allows tables to be more compact.)
162#
163# The Property data structure contains one or more tables.  All properties
164# contain a map table (except the $perl property which is a
165# pseudo-property containing only match tables), and any properties that
166# are usable in regular expression matches also contain various matching
167# tables, one for each value the property can have.  A binary property can
168# have two values, True and False (or Y and N, which are preferred by Unicode
169# terminology).  Thus each of these properties will have a map table that
170# takes every code point and maps it to Y or N (but having ranges cuts the
171# number of entries in that table way down), and two match tables, one
172# which has a list of all the code points that map to Y, and one for all the
173# code points that map to N.  (For each binary property, a third table is also
174# generated for the pseudo Perl property.  It contains the identical code
175# points as the Y table, but can be written in regular expressions, not in the
176# compound form, but in a "single" form like \p{IsUppercase}.)  Many
177# properties are binary, but some properties have several possible values,
178# some have many, and properties like Name have a different value for every
179# named code point.  Those will not, unless the controlling lists are changed,
180# have their match tables written out.  But all the ones which can be used in
181# regular expression \p{} and \P{} constructs will.  Prior to 5.14, generally
182# a property would have either its map table or its match tables written but
183# not both.  Again, what gets written is controlled by lists which can easily
184# be changed.  Starting in 5.14, advantage was taken of this, and all the map
185# tables needed to reconstruct the Unicode db are now written out, while
186# suppressing the Unicode .txt files that contain the data.  Our tables are
187# much more compact than the .txt files, so a significant space savings was
188# achieved.  Also, tables are not written out that are trivially derivable
189# from tables that do get written.  So, there typically is no file containing
190# the code points not matched by a binary property (the table for \P{} versus
191# lowercase \p{}), since you just need to invert the True table to get the
192# False table.
193
194# Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on
195# how many match tables there are and the content of the maps.  This 'Type' is
196# different than a range 'Type', so don't get confused by the two concepts
197# having the same name.
198#
199# For information about the Unicode properties, see Unicode's UAX44 document:
200
201my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
202
203# As stated earlier, this program will work on any release of Unicode so far.
204# Most obvious problems in earlier data have NOT been corrected except when
205# necessary to make Perl or this program work reasonably, and to keep out
206# potential security issues.  For example, no folding information was given in
207# early releases, so this program substitutes lower case instead, just so that
208# a regular expression with the /i option will do something that actually
209# gives the right results in many cases.  There are also a couple other
210# corrections for version 1.1.5, commented at the point they are made.  As an
211# example of corrections that weren't made (but could be) is this statement
212# from DerivedAge.txt: "The supplementary private use code points and the
213# non-character code points were assigned in version 2.0, but not specifically
214# listed in the UCD until versions 3.0 and 3.1 respectively."  (To be precise
215# it was 3.0.1 not 3.0.0)  More information on Unicode version glitches is
216# further down in these introductory comments.
217#
218# This program works on all non-provisional properties as of the current
219# Unicode release, though the files for some are suppressed for various
220# reasons.  You can change which are output by changing lists in this program.
221#
222# The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
223# loose matchings rules (from Unicode TR18):
224#
225#    The recommended names for UCD properties and property values are in
226#    PropertyAliases.txt [Prop] and PropertyValueAliases.txt
227#    [PropValue]. There are both abbreviated names and longer, more
228#    descriptive names. It is strongly recommended that both names be
229#    recognized, and that loose matching of property names be used,
230#    whereby the case distinctions, whitespace, hyphens, and underbar
231#    are ignored.
232#
233# The program still allows Fuzzy to override its determination of if loose
234# matching should be used, but it isn't currently used, as it is no longer
235# needed; the calculations it makes are good enough.
236#
237# SUMMARY OF HOW IT WORKS:
238#
239#   Process arguments
240#
241#   A list is constructed containing each input file that is to be processed
242#
243#   Each file on the list is processed in a loop, using the associated handler
244#   code for each:
245#        The PropertyAliases.txt and PropValueAliases.txt files are processed
246#            first.  These files name the properties and property values.
247#            Objects are created of all the property and property value names
248#            that the rest of the input should expect, including all synonyms.
249#        The other input files give mappings from properties to property
250#           values.  That is, they list code points and say what the mapping
251#           is under the given property.  Some files give the mappings for
252#           just one property; and some for many.  This program goes through
253#           each file and populates the properties and their map tables from
254#           them.  Some properties are listed in more than one file, and
255#           Unicode has set up a precedence as to which has priority if there
256#           is a conflict.  Thus the order of processing matters, and this
257#           program handles the conflict possibility by processing the
258#           overriding input files last, so that if necessary they replace
259#           earlier values.
260#        After this is all done, the program creates the property mappings not
261#            furnished by Unicode, but derivable from what it does give.
262#        The tables of code points that match each property value in each
263#            property that is accessible by regular expressions are created.
264#        The Perl-defined properties are created and populated.  Many of these
265#            require data determined from the earlier steps
266#        Any Perl-defined synonyms are created, and name clashes between Perl
267#            and Unicode are reconciled and warned about.
268#        All the properties are written to files
269#        Any other files are written, and final warnings issued.
270#
271# For clarity, a number of operators have been overloaded to work on tables:
272#   ~ means invert (take all characters not in the set).  The more
273#       conventional '!' is not used because of the possibility of confusing
274#       it with the actual boolean operation.
275#   + means union
276#   - means subtraction
277#   & means intersection
278# The precedence of these is the order listed.  Parentheses should be
279# copiously used.  These are not a general scheme.  The operations aren't
280# defined for a number of things, deliberately, to avoid getting into trouble.
281# Operations are done on references and affect the underlying structures, so
282# that the copy constructors for them have been overloaded to not return a new
283# clone, but the input object itself.
284#
285# The bool operator is deliberately not overloaded to avoid confusion with
286# "should it mean if the object merely exists, or also is non-empty?".
287#
288# WHY CERTAIN DESIGN DECISIONS WERE MADE
289#
290# This program needs to be able to run under miniperl.  Therefore, it uses a
291# minimum of other modules, and hence implements some things itself that could
292# be gotten from CPAN
293#
294# This program uses inputs published by the Unicode Consortium.  These can
295# change incompatibly between releases without the Perl maintainers realizing
296# it.  Therefore this program is now designed to try to flag these.  It looks
297# at the directories where the inputs are, and flags any unrecognized files.
298# It keeps track of all the properties in the files it handles, and flags any
299# that it doesn't know how to handle.  It also flags any input lines that
300# don't match the expected syntax, among other checks.
301#
302# It is also designed so if a new input file matches one of the known
303# templates, one hopefully just needs to add it to a list to have it
304# processed.
305#
306# As mentioned earlier, some properties are given in more than one file.  In
307# particular, the files in the extracted directory are supposedly just
308# reformattings of the others.  But they contain information not easily
309# derivable from the other files, including results for Unihan (which isn't
310# usually available to this program) and for unassigned code points.  They
311# also have historically had errors or been incomplete.  In an attempt to
312# create the best possible data, this program thus processes them first to
313# glean information missing from the other files; then processes those other
314# files to override any errors in the extracted ones.  Much of the design was
315# driven by this need to store things and then possibly override them.
316#
317# It tries to keep fatal errors to a minimum, to generate something usable for
318# testing purposes.  It always looks for files that could be inputs, and will
319# warn about any that it doesn't know how to handle (the -q option suppresses
320# the warning).
321#
322# Why is there more than one type of range?
323#   This simplified things.  There are some very specialized code points that
324#   have to be handled specially for output, such as Hangul syllable names.
325#   By creating a range type (done late in the development process), it
326#   allowed this to be stored with the range, and overridden by other input.
327#   Originally these were stored in another data structure, and it became a
328#   mess trying to decide if a second file that was for the same property was
329#   overriding the earlier one or not.
330#
331# Why are there two kinds of tables, match and map?
332#   (And there is a base class shared by the two as well.)  As stated above,
333#   they actually are for different things.  Development proceeded much more
334#   smoothly when I (khw) realized the distinction.  Map tables are used to
335#   give the property value for every code point (actually every code point
336#   that doesn't map to a default value).  Match tables are used for regular
337#   expression matches, and are essentially the inverse mapping.  Separating
338#   the two allows more specialized methods, and error checks so that one
339#   can't just take the intersection of two map tables, for example, as that
340#   is nonsensical.
341#
342# What about 'fate' and 'status'.  The concept of a table's fate was created
343#   late when it became clear that something more was needed.  The difference
344#   between this and 'status' is unclean, and could be improved if someone
345#   wanted to spend the effort.
346#
347# DEBUGGING
348#
349# This program is written so it will run under miniperl.  Occasionally changes
350# will cause an error where the backtrace doesn't work well under miniperl.
351# To diagnose the problem, you can instead run it under regular perl, if you
352# have one compiled.
353#
354# There is a good trace facility.  To enable it, first sub DEBUG must be set
355# to return true.  Then a line like
356#
357# local $to_trace = 1 if main::DEBUG;
358#
359# can be added to enable tracing in its lexical scope (plus dynamic) or until
360# you insert another line:
361#
362# local $to_trace = 0 if main::DEBUG;
363#
364# To actually trace, use a line like "trace $a, @b, %c, ...;
365#
366# Some of the more complex subroutines already have trace statements in them.
367# Permanent trace statements should be like:
368#
369# trace ... if main::DEBUG && $to_trace;
370#
371# main::stack_trace() will display what its name implies
372#
373# If there is just one or a few files that you're debugging, you can easily
374# cause most everything else to be skipped.  Change the line
375#
376# my $debug_skip = 0;
377#
378# to 1, and every file whose object is in @input_file_objects and doesn't have
379# a, 'non_skip => 1,' in its constructor will be skipped.  However, skipping
380# Jamo.txt or UnicodeData.txt will likely cause fatal errors.
381#
382# To compare the output tables, it may be useful to specify the -annotate
383# flag.  (As of this writing, this can't be done on a clean workspace, due to
384# requirements in Text::Tabs used in this option; so first run mktables
385# without this option.)  This option adds comment lines to each table, one for
386# each non-algorithmically named character giving, currently its code point,
387# name, and graphic representation if printable (and you have a font that
388# knows about it).  This makes it easier to see what the particular code
389# points are in each output table.  Non-named code points are annotated with a
390# description of their status, and contiguous ones with the same description
391# will be output as a range rather than individually.  Algorithmically named
392# characters are also output as ranges, except when there are just a few
393# contiguous ones.
394#
395# FUTURE ISSUES
396#
397# The program would break if Unicode were to change its names so that
398# interior white space, underscores, or dashes differences were significant
399# within property and property value names.
400#
401# It might be easier to use the xml versions of the UCD if this program ever
402# would need heavy revision, and the ability to handle old versions was not
403# required.
404#
405# There is the potential for name collisions, in that Perl has chosen names
406# that Unicode could decide it also likes.  There have been such collisions in
407# the past, with mostly Perl deciding to adopt the Unicode definition of the
408# name.  However in the 5.2 Unicode beta testing, there were a number of such
409# collisions, which were withdrawn before the final release, because of Perl's
410# and other's protests.  These all involved new properties which began with
411# 'Is'.  Based on the protests, Unicode is unlikely to try that again.  Also,
412# many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
413# Unicode document, so they are unlikely to be used by Unicode for another
414# purpose.  However, they might try something beginning with 'In', or use any
415# of the other Perl-defined properties.  This program will warn you of name
416# collisions, and refuse to generate tables with them, but manual intervention
417# will be required in this event.  One scheme that could be implemented, if
418# necessary, would be to have this program generate another file, or add a
419# field to mktables.lst that gives the date of first definition of a property.
420# Each new release of Unicode would use that file as a basis for the next
421# iteration.  And the Perl synonym addition code could sort based on the age
422# of the property, so older properties get priority, and newer ones that clash
423# would be refused; hence existing code would not be impacted, and some other
424# synonym would have to be used for the new property.  This is ugly, and
425# manual intervention would certainly be easier to do in the short run; lets
426# hope it never comes to this.
427#
428# A NOTE ON UNIHAN
429#
430# This program can generate tables from the Unihan database.  But that DB
431# isn't normally available, so it is marked as optional.  Prior to version
432# 5.2, this database was in a single file, Unihan.txt.  In 5.2 the database
433# was split into 8 different files, all beginning with the letters 'Unihan'.
434# If you plunk those files down into the directory mktables ($0) is in, this
435# program will read them and automatically create tables for the properties
436# from it that are listed in PropertyAliases.txt and PropValueAliases.txt,
437# plus any you add to the @cjk_properties array and the @cjk_property_values
438# array, being sure to add necessary '# @missings' lines to the latter.  For
439# Unicode versions earlier than 5.2, most of the Unihan properties are not
440# listed at all in PropertyAliases nor PropValueAliases.  This program assumes
441# for these early releases that you want the properties that are specified in
442# the 5.2 release.
443#
444# You may need to adjust the entries to suit your purposes.  setup_unihan(),
445# and filter_unihan_line() are the functions where this is done.  This program
446# already does some adjusting to make the lines look more like the rest of the
447# Unicode DB;  You can see what that is in filter_unihan_line()
448#
449# There is a bug in the 3.2 data file in which some values for the
450# kPrimaryNumeric property have commas and an unexpected comment.  A filter
451# could be added to correct these; or for a particular installation, the
452# Unihan.txt file could be edited to fix them.
453#
454# HOW TO ADD A FILE TO BE PROCESSED
455#
456# A new file from Unicode needs to have an object constructed for it in
457# @input_file_objects, probably at the end or at the end of the extracted
458# ones.  The program should warn you if its name will clash with others on
459# restrictive file systems, like DOS.  If so, figure out a better name, and
460# add lines to the README.perl file giving that.  If the file is a character
461# property, it should be in the format that Unicode has implicitly
462# standardized for such files for the more recently introduced ones.
463# If so, the Input_file constructor for @input_file_objects can just be the
464# file name and release it first appeared in.  If not, then it should be
465# possible to construct an each_line_handler() to massage the line into the
466# standardized form.
467#
468# For non-character properties, more code will be needed.  You can look at
469# the existing entries for clues.
470#
471# UNICODE VERSIONS NOTES
472#
473# The Unicode UCD has had a number of errors in it over the versions.  And
474# these remain, by policy, in the standard for that version.  Therefore it is
475# risky to correct them, because code may be expecting the error.  So this
476# program doesn't generally make changes, unless the error breaks the Perl
477# core.  As an example, some versions of 2.1.x Jamo.txt have the wrong value
478# for U+1105, which causes real problems for the algorithms for Jamo
479# calculations, so it is changed here.
480#
481# But it isn't so clear cut as to what to do about concepts that are
482# introduced in a later release; should they extend back to earlier releases
483# where the concept just didn't exist?  It was easier to do this than to not,
484# so that's what was done.  For example, the default value for code points not
485# in the files for various properties was probably undefined until changed by
486# some version.  No_Block for blocks is such an example.  This program will
487# assign No_Block even in Unicode versions that didn't have it.  This has the
488# benefit that code being written doesn't have to special case earlier
489# versions; and the detriment that it doesn't match the Standard precisely for
490# the affected versions.
491#
492# Here are some observations about some of the issues in early versions:
493#
494# Prior to version 3.0, there were 3 character decompositions.  These are not
495# handled by Unicode::Normalize, nor will it compile when presented a version
496# that has them.  However, you can trivially get it to compile by simply
497# ignoring those decompositions, by changing the croak to a carp.  At the time
498# of this writing, the line (in dist/Unicode-Normalize/Normalize.pm or
499# dist/Unicode-Normalize/mkheader) reads
500#
501#   croak("Weird Canonical Decomposition of U+$h");
502#
503# Simply comment it out.  It will compile, but will not know about any three
504# character decompositions.
505
506# The number of code points in \p{alpha=True} halved in 2.1.9.  It turns out
507# that the reason is that the CJK block starting at 4E00 was removed from
508# PropList, and was not put back in until 3.1.0.  The Perl extension (the
509# single property name \p{alpha}) has the correct values.  But the compound
510# form is simply not generated until 3.1, as it can be argued that prior to
511# this release, this was not an official property.  The comments for
512# filter_old_style_proplist() give more details.
513#
514# Unicode introduced the synonym Space for White_Space in 4.1.  Perl has
515# always had a \p{Space}.  In release 3.2 only, they are not synonymous.  The
516# reason is that 3.2 introduced U+205F=medium math space, which was not
517# classed as white space, but Perl figured out that it should have been. 4.0
518# reclassified it correctly.
519#
520# Another change between 3.2 and 4.0 is the CCC property value ATBL.  In 3.2
521# this was erroneously a synonym for 202 (it should be 200).  In 4.0, ATB
522# became 202, and ATBL was left with no code points, as all the ones that
523# mapped to 202 stayed mapped to 202.  Thus if your program used the numeric
524# name for the class, it would not have been affected, but if it used the
525# mnemonic, it would have been.
526#
527# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1.  Before that, code
528# points which eventually came to have this script property value, instead
529# mapped to "Unknown".  But in the next release all these code points were
530# moved to \p{sc=common} instead.
531
532# The tests furnished  by Unicode for testing WordBreak and SentenceBreak
533# generate errors in 5.0 and earlier.
534#
535# The default for missing code points for BidiClass is complicated.  Starting
536# in 3.1.1, the derived file DBidiClass.txt handles this, but this program
537# tries to do the best it can for earlier releases.  It is done in
538# process_PropertyAliases()
539#
540# In version 2.1.2, the entry in UnicodeData.txt:
541#   0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F;
542# should instead be
543#   0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F
544# Without this change, there are casing problems for this character.
545#
546# Search for $string_compare_versions to see how to compare changes to
547# properties between Unicode versions
548#
549##############################################################################
550
551my $UNDEF = ':UNDEF:';  # String to print out for undefined values in tracing
552                        # and errors
553my $MAX_LINE_WIDTH = 78;
554
555# Debugging aid to skip most files so as to not be distracted by them when
556# concentrating on the ones being debugged.  Add
557# non_skip => 1,
558# to the constructor for those files you want processed when you set this.
559# Files with a first version number of 0 are special: they are always
560# processed regardless of the state of this flag.  Generally, Jamo.txt and
561# UnicodeData.txt must not be skipped if you want this program to not die
562# before normal completion.
563my $debug_skip = 0;
564
565
566# Normally these are suppressed.
567my $write_Unicode_deprecated_tables = 0;
568
569# Set to 1 to enable tracing.
570our $to_trace = 0;
571
572{ # Closure for trace: debugging aid
573    my $print_caller = 1;        # ? Include calling subroutine name
574    my $main_with_colon = 'main::';
575    my $main_colon_length = length($main_with_colon);
576
577    sub trace {
578        return unless $to_trace;        # Do nothing if global flag not set
579
580        my @input = @_;
581
582        local $DB::trace = 0;
583        $DB::trace = 0;          # Quiet 'used only once' message
584
585        my $line_number;
586
587        # Loop looking up the stack to get the first non-trace caller
588        my $caller_line;
589        my $caller_name;
590        my $i = 0;
591        do {
592            $line_number = $caller_line;
593            (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
594            $caller = $main_with_colon unless defined $caller;
595
596            $caller_name = $caller;
597
598            # get rid of pkg
599            $caller_name =~ s/.*:://;
600            if (substr($caller_name, 0, $main_colon_length)
601                eq $main_with_colon)
602            {
603                $caller_name = substr($caller_name, $main_colon_length);
604            }
605
606        } until ($caller_name ne 'trace');
607
608        # If the stack was empty, we were called from the top level
609        $caller_name = 'main' if ($caller_name eq ""
610                                    || $caller_name eq 'trace');
611
612        my $output = "";
613        #print STDERR __LINE__, ": ", join ", ", @input, "\n";
614        foreach my $string (@input) {
615            if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
616                $output .= simple_dumper($string);
617            }
618            else {
619                $string = "$string" if ref $string;
620                $string = $UNDEF unless defined $string;
621                chomp $string;
622                $string = '""' if $string eq "";
623                $output .= " " if $output ne ""
624                                && $string ne ""
625                                && substr($output, -1, 1) ne " "
626                                && substr($string, 0, 1) ne " ";
627                $output .= $string;
628            }
629        }
630
631        print STDERR sprintf "%4d: ", $line_number if defined $line_number;
632        print STDERR "$caller_name: " if $print_caller;
633        print STDERR $output, "\n";
634        return;
635    }
636}
637
638sub stack_trace() {
639    local $to_trace = 1 if main::DEBUG;
640    my $line = (caller(0))[2];
641    my $i = 1;
642
643    # Accumulate the stack trace
644    while (1) {
645        my ($pkg, $file, $caller_line, $caller) = caller $i++;
646
647        last unless defined $caller;
648
649        trace "called from $caller() at line $line";
650        $line = $caller_line;
651    }
652}
653
654# This is for a rarely used development feature that allows you to compare two
655# versions of the Unicode standard without having to deal with changes caused
656# by the code points introduced in the later version.  You probably also want
657# to use the -annotate option when using this.  Run this program on a unicore
658# containing the starting release you want to compare.  Save that output
659# structure.  Then, switching to a unicore with the ending release, change the
660# "" in the $string_compare_versions definition just below to a string
661# containing a SINGLE dotted Unicode release number (e.g. "2.1") corresponding
662# to the starting release.  This program will then compile, but throw away all
663# code points introduced after the starting release.  Finally use a diff tool
664# to compare the two directory structures.  They include only the code points
665# common to both releases, and you can see the changes caused just by the
666# underlying release semantic changes.  For versions earlier than 3.2, you
667# must copy a version of DAge.txt into the directory.
668my $string_compare_versions = DEBUG && "";
669my $compare_versions = DEBUG
670                       && $string_compare_versions
671                       && pack "C*", split /\./, $string_compare_versions;
672
673sub uniques {
674    # Returns non-duplicated input values.  From "Perl Best Practices:
675    # Encapsulated Cleverness".  p. 455 in first edition.
676
677    my %seen;
678    # Arguably this breaks encapsulation, if the goal is to permit multiple
679    # distinct objects to stringify to the same value, and be interchangeable.
680    # However, for this program, no two objects stringify identically, and all
681    # lists passed to this function are either objects or strings. So this
682    # doesn't affect correctness, but it does give a couple of percent speedup.
683    no overloading;
684    return grep { ! $seen{$_}++ } @_;
685}
686
687$0 = File::Spec->canonpath($0);
688
689my $make_test_script = 0;      # ? Should we output a test script
690my $make_norm_test_script = 0; # ? Should we output a normalization test script
691my $write_unchanged_files = 0; # ? Should we update the output files even if
692                               #    we don't think they have changed
693my $use_directory = "";        # ? Should we chdir somewhere.
694my $pod_directory;             # input directory to store the pod file.
695my $pod_file = 'perluniprops';
696my $t_path;                     # Path to the .t test file
697my $file_list = 'mktables.lst'; # File to store input and output file names.
698                               # This is used to speed up the build, by not
699                               # executing the main body of the program if
700                               # nothing on the list has changed since the
701                               # previous build
702my $make_list = 1;             # ? Should we write $file_list.  Set to always
703                               # make a list so that when the release manager
704                               # is preparing a release, they won't have to do
705                               # special things
706my $glob_list = 0;             # ? Should we try to include unknown .txt files
707                               # in the input.
708my $output_range_counts = $debugging_build;   # ? Should we include the number
709                                              # of code points in ranges in
710                                              # the output
711my $annotate = 0;              # ? Should character names be in the output
712
713# Verbosity levels; 0 is quiet
714my $NORMAL_VERBOSITY = 1;
715my $PROGRESS = 2;
716my $VERBOSE = 3;
717
718my $verbosity = $NORMAL_VERBOSITY;
719
720# Stored in mktables.lst so that if this program is called with different
721# options, will regenerate even if the files otherwise look like they're
722# up-to-date.
723my $command_line_arguments = join " ", @ARGV;
724
725# Process arguments
726while (@ARGV) {
727    my $arg = shift @ARGV;
728    if ($arg eq '-v') {
729        $verbosity = $VERBOSE;
730    }
731    elsif ($arg eq '-p') {
732        $verbosity = $PROGRESS;
733        $| = 1;     # Flush buffers as we go.
734    }
735    elsif ($arg eq '-q') {
736        $verbosity = 0;
737    }
738    elsif ($arg eq '-w') {
739        # update the files even if they haven't changed
740        $write_unchanged_files = 1;
741    }
742    elsif ($arg eq '-check') {
743        my $this = shift @ARGV;
744        my $ok = shift @ARGV;
745        if ($this ne $ok) {
746            print "Skipping as check params are not the same.\n";
747            exit(0);
748        }
749    }
750    elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
751        -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
752    }
753    elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
754    {
755        $make_test_script = 1;
756    }
757    elsif ($arg eq '-makenormtest')
758    {
759        $make_norm_test_script = 1;
760    }
761    elsif ($arg eq '-makelist') {
762        $make_list = 1;
763    }
764    elsif ($arg eq '-C' && defined ($use_directory = shift)) {
765        -d $use_directory or croak "Unknown directory '$use_directory'";
766    }
767    elsif ($arg eq '-L') {
768
769        # Existence not tested until have chdir'd
770        $file_list = shift;
771    }
772    elsif ($arg eq '-globlist') {
773        $glob_list = 1;
774    }
775    elsif ($arg eq '-c') {
776        $output_range_counts = ! $output_range_counts
777    }
778    elsif ($arg eq '-annotate') {
779        $annotate = 1;
780        $debugging_build = 1;
781        $output_range_counts = 1;
782    }
783    else {
784        my $with_c = 'with';
785        $with_c .= 'out' if $output_range_counts;   # Complements the state
786        croak <<END;
787usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
788          [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
789          [-check A B ]
790  -c          : Output comments $with_c number of code points in ranges
791  -q          : Quiet Mode: Only output serious warnings.
792  -p          : Set verbosity level to normal plus show progress.
793  -v          : Set Verbosity level high:  Show progress and non-serious
794                warnings
795  -w          : Write files regardless
796  -C dir      : Change to this directory before proceeding. All relative paths
797                except those specified by the -P and -T options will be done
798                with respect to this directory.
799  -P dir      : Output $pod_file file to directory 'dir'.
800  -T path     : Create a test script as 'path'; overrides -maketest
801  -L filelist : Use alternate 'filelist' instead of standard one
802  -globlist   : Take as input all non-Test *.txt files in current and sub
803                directories
804  -maketest   : Make test script 'TestProp.pl' in current (or -C directory),
805                overrides -T
806  -makelist   : Rewrite the file list $file_list based on current setup
807  -annotate   : Output an annotation for each character in the table files;
808                useful for debugging mktables, looking at diffs; but is slow
809                and memory intensive
810  -check A B  : Executes $0 only if A and B are the same
811END
812    }
813}
814
815# Stores the most-recently changed file.  If none have changed, can skip the
816# build
817my $most_recent = (stat $0)[9];   # Do this before the chdir!
818
819# Change directories now, because need to read 'version' early.
820if ($use_directory) {
821    if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
822        $pod_directory = File::Spec->rel2abs($pod_directory);
823    }
824    if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
825        $t_path = File::Spec->rel2abs($t_path);
826    }
827    chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
828    if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
829        $pod_directory = File::Spec->abs2rel($pod_directory);
830    }
831    if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
832        $t_path = File::Spec->abs2rel($t_path);
833    }
834}
835
836# Get Unicode version into regular and v-string.  This is done now because
837# various tables below get populated based on it.  These tables are populated
838# here to be near the top of the file, and so easily seeable by those needing
839# to modify things.
840open my $VERSION, "<", "version"
841                    or croak "$0: can't open required file 'version': $!\n";
842my $string_version = <$VERSION>;
843close $VERSION;
844chomp $string_version;
845my $v_version = pack "C*", split /\./, $string_version;        # v string
846
847my $unicode_version = ($compare_versions)
848                      ? (  "$string_compare_versions (using "
849                         . "$string_version rules)")
850                      : $string_version;
851
852# The following are the complete names of properties with property values that
853# are known to not match any code points in some versions of Unicode, but that
854# may change in the future so they should be matchable, hence an empty file is
855# generated for them.
856my @tables_that_may_be_empty;
857push @tables_that_may_be_empty, 'Joining_Type=Left_Joining'
858                                                    if $v_version lt v6.3.0;
859push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
860push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
861push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
862                                                    if $v_version ge v4.1.0;
863push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
864                                                    if $v_version ge v6.0.0;
865push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
866                                                    if $v_version ge v6.1.0;
867push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133'
868                                                    if $v_version ge v6.2.0;
869
870# The lists below are hashes, so the key is the item in the list, and the
871# value is the reason why it is in the list.  This makes generation of
872# documentation easier.
873
874my %why_suppressed;  # No file generated for these.
875
876# Files aren't generated for empty extraneous properties.  This is arguable.
877# Extraneous properties generally come about because a property is no longer
878# used in a newer version of Unicode.  If we generated a file without code
879# points, programs that used to work on that property will still execute
880# without errors.  It just won't ever match (or will always match, with \P{}).
881# This means that the logic is now likely wrong.  I (khw) think its better to
882# find this out by getting an error message.  Just move them to the table
883# above to change this behavior
884my %why_suppress_if_empty_warn_if_not = (
885
886   # It is the only property that has ever officially been removed from the
887   # Standard.  The database never contained any code points for it.
888   'Special_Case_Condition' => 'Obsolete',
889
890   # Apparently never official, but there were code points in some versions of
891   # old-style PropList.txt
892   'Non_Break' => 'Obsolete',
893);
894
895# These would normally go in the warn table just above, but they were changed
896# a long time before this program was written, so warnings about them are
897# moot.
898if ($v_version gt v3.2.0) {
899    push @tables_that_may_be_empty,
900                                'Canonical_Combining_Class=Attached_Below_Left'
901}
902
903# Obsoleted
904if ($v_version ge v11.0.0) {
905    push @tables_that_may_be_empty, qw(
906                                       Grapheme_Cluster_Break=E_Base
907                                       Grapheme_Cluster_Break=E_Base_GAZ
908                                       Grapheme_Cluster_Break=E_Modifier
909                                       Grapheme_Cluster_Break=Glue_After_Zwj
910                                       Word_Break=E_Base
911                                       Word_Break=E_Base_GAZ
912                                       Word_Break=E_Modifier
913                                       Word_Break=Glue_After_Zwj);
914}
915
916# Enum values for to_output_map() method in the Map_Table package. (0 is don't
917# output)
918my $EXTERNAL_MAP = 1;
919my $INTERNAL_MAP = 2;
920my $OUTPUT_ADJUSTED = 3;
921
922# To override computed values for writing the map tables for these properties.
923# The default for enum map tables is to write them out, so that the Unicode
924# .txt files can be removed, but all the data to compute any property value
925# for any code point is available in a more compact form.
926my %global_to_output_map = (
927    # Needed by UCD.pm, but don't want to publicize that it exists, so won't
928    # get stuck supporting it if things change.  Since it is a STRING
929    # property, it normally would be listed in the pod, but INTERNAL_MAP
930    # suppresses that.
931    Unicode_1_Name => $INTERNAL_MAP,
932
933    Present_In => 0,                # Suppress, as easily computed from Age
934    Block => (NON_ASCII_PLATFORM) ? 1 : 0,  # Suppress, as Blocks.txt is
935                                            # retained, but needed for
936                                            # non-ASCII
937
938    # Suppress, as mapping can be found instead from the
939    # Perl_Decomposition_Mapping file
940    Decomposition_Type => 0,
941);
942
943# There are several types of obsolete properties defined by Unicode.  These
944# must be hand-edited for every new Unicode release.
945my %why_deprecated;  # Generates a deprecated warning message if used.
946my %why_stabilized;  # Documentation only
947my %why_obsolete;    # Documentation only
948
949{   # Closure
950    my $simple = 'Perl uses the more complete version';
951    my $unihan = 'Unihan properties are by default not enabled in the Perl core.';
952
953    my $other_properties = 'other properties';
954    my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
955    my $why_no_expand  = "Deprecated by Unicode.  These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used.  For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
956
957    %why_deprecated = (
958        'Grapheme_Link' => 'Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
959        'Jamo_Short_Name' => $contributory,
960        'Line_Break=Surrogate' => 'Surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
961        'Other_Alphabetic' => $contributory,
962        'Other_Default_Ignorable_Code_Point' => $contributory,
963        'Other_Grapheme_Extend' => $contributory,
964        'Other_ID_Continue' => $contributory,
965        'Other_ID_Start' => $contributory,
966        'Other_Lowercase' => $contributory,
967        'Other_Math' => $contributory,
968        'Other_Uppercase' => $contributory,
969        'Expands_On_NFC' => $why_no_expand,
970        'Expands_On_NFD' => $why_no_expand,
971        'Expands_On_NFKC' => $why_no_expand,
972        'Expands_On_NFKD' => $why_no_expand,
973    );
974
975    %why_suppressed = (
976        # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
977        # contains the same information, but without the algorithmically
978        # determinable Hangul syllables'.  This file is not published, so it's
979        # existence is not noted in the comment.
980        'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::',
981
982        # Don't suppress ISO_Comment, as otherwise special handling is needed
983        # to differentiate between it and gc=c, which can be written as 'isc',
984        # which is the same characters as ISO_Comment's short name.
985
986        'Name' => "Accessible via \\N{...} or 'use charnames;' or charprop() or prop_invmap() in Unicode::UCD::",
987
988        'Simple_Case_Folding' => "$simple.  Can access this through casefold(), charprop(), or prop_invmap() in Unicode::UCD",
989        'Simple_Lowercase_Mapping' => "$simple.  Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
990        'Simple_Titlecase_Mapping' => "$simple.  Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
991        'Simple_Uppercase_Mapping' => "$simple.  Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
992
993        FC_NFKC_Closure => 'Deprecated by Unicode, and supplanted in usage by NFKC_Casefold; otherwise not useful',
994    );
995
996    foreach my $property (
997
998            # The following are suppressed because they were made contributory
999            # or deprecated by Unicode before Perl ever thought about
1000            # supporting them.
1001            'Jamo_Short_Name',
1002            'Grapheme_Link',
1003            'Expands_On_NFC',
1004            'Expands_On_NFD',
1005            'Expands_On_NFKC',
1006            'Expands_On_NFKD',
1007
1008            # The following are suppressed because they have been marked
1009            # as deprecated for a sufficient amount of time
1010            'Other_Alphabetic',
1011            'Other_Default_Ignorable_Code_Point',
1012            'Other_Grapheme_Extend',
1013            'Other_ID_Continue',
1014            'Other_ID_Start',
1015            'Other_Lowercase',
1016            'Other_Math',
1017            'Other_Uppercase',
1018    ) {
1019        $why_suppressed{$property} = $why_deprecated{$property};
1020    }
1021
1022    # Customize the message for all the 'Other_' properties
1023    foreach my $property (keys %why_deprecated) {
1024        next if (my $main_property = $property) !~ s/^Other_//;
1025        $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
1026    }
1027}
1028
1029if ($write_Unicode_deprecated_tables) {
1030    foreach my $property (keys %why_suppressed) {
1031        delete $why_suppressed{$property} if $property =~
1032                                                    / ^ Other | Grapheme /x;
1033    }
1034}
1035
1036if ($v_version ge 4.0.0) {
1037    $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
1038    if ($v_version ge 6.0.0) {
1039        $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
1040    }
1041}
1042if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
1043    $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
1044    if ($v_version ge 6.0.0) {
1045        $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed';
1046    }
1047}
1048
1049# Probably obsolete forever
1050if ($v_version ge v4.1.0) {
1051    $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete.  All code points previously matched by this have been moved to "Script=Common".';
1052}
1053if ($v_version ge v6.0.0) {
1054    $why_suppressed{'Script=Katakana_Or_Hiragana'} .= '  Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana" (or both)';
1055    $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
1056}
1057
1058# This program can create files for enumerated-like properties, such as
1059# 'Numeric_Type'.  This file would be the same format as for a string
1060# property, with a mapping from code point to its value, so you could look up,
1061# for example, the script a code point is in.  But no one so far wants this
1062# mapping, or they have found another way to get it since this is a new
1063# feature.  So no file is generated except if it is in this list.
1064my @output_mapped_properties = split "\n", <<END;
1065END
1066
1067# If you want more Unihan properties than the default, you need to add them to
1068# these arrays.  Depending on the property type, @missing lines might have to
1069# be added to the second array.  A sample entry would be (including the '#'):
1070# @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
1071my @cjk_properties = split "\n", <<'END';
1072END
1073my @cjk_property_values = split "\n", <<'END';
1074END
1075
1076# The input files don't list every code point.  Those not listed are to be
1077# defaulted to some value.  Below are hard-coded what those values are for
1078# non-binary properties as of 5.1.  Starting in 5.0, there are
1079# machine-parsable comment lines in the files that give the defaults; so this
1080# list shouldn't have to be extended.  The claim is that all missing entries
1081# for binary properties will default to 'N'.  Unicode tried to change that in
1082# 5.2, but the beta period produced enough protest that they backed off.
1083#
1084# The defaults for the fields that appear in UnicodeData.txt in this hash must
1085# be in the form that it expects.  The others may be synonyms.
1086my $CODE_POINT = '<code point>';
1087my %default_mapping = (
1088    Age => "Unassigned",
1089    # Bidi_Class => Complicated; set in code
1090    Bidi_Mirroring_Glyph => "",
1091    Block => 'No_Block',
1092    Canonical_Combining_Class => 0,
1093    Case_Folding => $CODE_POINT,
1094    Decomposition_Mapping => $CODE_POINT,
1095    Decomposition_Type => 'None',
1096    East_Asian_Width => "Neutral",
1097    FC_NFKC_Closure => $CODE_POINT,
1098    General_Category => ($v_version le 6.3.0) ? 'Cn' : 'Unassigned',
1099    Grapheme_Cluster_Break => 'Other',
1100    Hangul_Syllable_Type => 'NA',
1101    ISO_Comment => "",
1102    Jamo_Short_Name => "",
1103    Joining_Group => "No_Joining_Group",
1104    # Joining_Type => Complicated; set in code
1105    kIICore => 'N',   #                       Is converted to binary
1106    #Line_Break => Complicated; set in code
1107    Lowercase_Mapping => $CODE_POINT,
1108    Name => "",
1109    Name_Alias => "",
1110    NFC_QC => 'Yes',
1111    NFD_QC => 'Yes',
1112    NFKC_QC => 'Yes',
1113    NFKD_QC => 'Yes',
1114    Numeric_Type => 'None',
1115    Numeric_Value => 'NaN',
1116    Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1117    Sentence_Break => 'Other',
1118    Simple_Case_Folding => $CODE_POINT,
1119    Simple_Lowercase_Mapping => $CODE_POINT,
1120    Simple_Titlecase_Mapping => $CODE_POINT,
1121    Simple_Uppercase_Mapping => $CODE_POINT,
1122    Titlecase_Mapping => $CODE_POINT,
1123    Unicode_1_Name => "",
1124    Unicode_Radical_Stroke => "",
1125    Uppercase_Mapping => $CODE_POINT,
1126    Word_Break => 'Other',
1127);
1128
1129### End of externally interesting definitions, except for @input_file_objects
1130
1131my $HEADER=<<"EOF";
1132# !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
1133# This file is machine-generated by $0 from the Unicode
1134# database, Version $unicode_version.  Any changes made here will be lost!
1135EOF
1136
1137my $INTERNAL_ONLY_HEADER = <<"EOF";
1138
1139# !!!!!!!   INTERNAL PERL USE ONLY   !!!!!!!
1140# This file is for internal use by core Perl only.  The format and even the
1141# name or existence of this file are subject to change without notice.  Don't
1142# use it directly.  Use Unicode::UCD to access the Unicode character data
1143# base.
1144EOF
1145
1146my $DEVELOPMENT_ONLY=<<"EOF";
1147# !!!!!!!   DEVELOPMENT USE ONLY   !!!!!!!
1148# This file contains information artificially constrained to code points
1149# present in Unicode release $string_compare_versions.
1150# IT CANNOT BE RELIED ON.  It is for use during development only and should
1151# not be used for production.
1152
1153EOF
1154
1155my $MAX_UNICODE_CODEPOINT_STRING = ($v_version ge v2.0.0)
1156                                   ? "10FFFF"
1157                                   : "FFFF";
1158my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
1159my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
1160
1161# We work with above-Unicode code points, up to IV_MAX, but we may want to use
1162# sentinels above that number.  Therefore for internal use, we use a much
1163# smaller number, translating it to IV_MAX only for output.  The exact number
1164# is immaterial (all above-Unicode code points are treated exactly the same),
1165# but the algorithm requires it to be at least
1166# 2 * $MAX_UNICODE_CODEPOINTS + 1
1167my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8;
1168my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1;
1169my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT);
1170
1171my $MAX_PLATFORM_CODEPOINT = ~0 >> 1;
1172
1173# Matches legal code point.  4-6 hex numbers, If there are 6, the first
1174# two must be 10; if there are 5, the first must not be a 0.  Written this way
1175# to decrease backtracking.  The first regex allows the code point to be at
1176# the end of a word, but to work properly, the word shouldn't end with a valid
1177# hex character.  The second one won't match a code point at the end of a
1178# word, and doesn't have the run-on issue
1179my $run_on_code_point_re =
1180            qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1181my $code_point_re = qr/\b$run_on_code_point_re/;
1182
1183# This matches the beginning of the line in the Unicode DB files that give the
1184# defaults for code points not listed (i.e., missing) in the file.  The code
1185# depends on this ending with a semi-colon, so it can assume it is a valid
1186# field when the line is split() by semi-colons
1187my $missing_defaults_prefix = qr/ ^ \# \s+ \@missing: \s+
1188                                           ($code_point_re)
1189                                           \.\.
1190                                           ($code_point_re)
1191                                       \s* ;
1192                                /x;
1193
1194# Property types.  Unicode has more types, but these are sufficient for our
1195# purposes.
1196my $UNKNOWN = -1;   # initialized to illegal value
1197my $NON_STRING = 1; # Either binary or enum
1198my $BINARY = 2;
1199my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal
1200                       # tables, additional true and false tables are
1201                       # generated so that false is anything matching the
1202                       # default value, and true is everything else.
1203my $ENUM = 4;       # Include catalog
1204my $STRING = 5;     # Anything else: string or misc
1205
1206# Some input files have lines that give default values for code points not
1207# contained in the file.  Sometimes these should be ignored.
1208my $NO_DEFAULTS = 0;        # Must evaluate to false
1209my $NOT_IGNORED = 1;
1210my $IGNORED = 2;
1211
1212# Range types.  Each range has a type.  Most ranges are type 0, for normal,
1213# and will appear in the main body of the tables in the output files, but
1214# there are other types of ranges as well, listed below, that are specially
1215# handled.   There are pseudo-types as well that will never be stored as a
1216# type, but will affect the calculation of the type.
1217
1218# 0 is for normal, non-specials
1219my $MULTI_CP = 1;           # Sequence of more than code point
1220my $HANGUL_SYLLABLE = 2;
1221my $CP_IN_NAME = 3;         # The NAME contains the code point appended to it.
1222my $NULL = 4;               # The map is to the null string; utf8.c can't
1223                            # handle these, nor is there an accepted syntax
1224                            # for them in \p{} constructs
1225my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
1226                             # otherwise be $MULTI_CP type are instead type 0
1227
1228# process_generic_property_file() can accept certain overrides in its input.
1229# Each of these must begin AND end with $CMD_DELIM.
1230my $CMD_DELIM = "\a";
1231my $REPLACE_CMD = 'replace';    # Override the Replace
1232my $MAP_TYPE_CMD = 'map_type';  # Override the Type
1233
1234my $NO = 0;
1235my $YES = 1;
1236
1237# Values for the Replace argument to add_range.
1238# $NO                      # Don't replace; add only the code points not
1239                           # already present.
1240my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1241                           # the comments at the subroutine definition.
1242my $UNCONDITIONALLY = 2;   # Replace without conditions.
1243my $MULTIPLE_BEFORE = 4;   # Don't replace, but add a duplicate record if
1244                           # already there
1245my $MULTIPLE_AFTER = 5;    # Don't replace, but add a duplicate record if
1246                           # already there
1247my $CROAK = 6;             # Die with an error if is already there
1248
1249# Flags to give property statuses.  The phrases are to remind maintainers that
1250# if the flag is changed, the indefinite article referring to it in the
1251# documentation may need to be as well.
1252my $NORMAL = "";
1253my $DEPRECATED = 'D';
1254my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1255my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1256my $DISCOURAGED = 'X';
1257my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1258my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1259my $STRICTER = 'T';
1260my $a_bold_stricter = "a 'B<$STRICTER>'";
1261my $A_bold_stricter = "A 'B<$STRICTER>'";
1262my $STABILIZED = 'S';
1263my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1264my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1265my $OBSOLETE = 'O';
1266my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1267my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1268
1269# Aliases can also have an extra status:
1270my $INTERNAL_ALIAS = 'P';
1271
1272my %status_past_participles = (
1273    $DISCOURAGED => 'discouraged',
1274    $STABILIZED => 'stabilized',
1275    $OBSOLETE => 'obsolete',
1276    $DEPRECATED => 'deprecated',
1277    $INTERNAL_ALIAS => 'reserved for Perl core internal use only',
1278);
1279
1280# Table fates.  These are somewhat ordered, so that fates < $MAP_PROXIED should be
1281# externally documented.
1282my $ORDINARY = 0;       # The normal fate.
1283my $MAP_PROXIED = 1;    # The map table for the property isn't written out,
1284                        # but there is a file written that can be used to
1285                        # reconstruct this table
1286my $INTERNAL_ONLY = 2;  # The file for this table is written out, but it is
1287                        # for Perl's internal use only
1288my $SUPPRESSED = 3;     # The file for this table is not written out, and as a
1289                        # result, we don't bother to do many computations on
1290                        # it.
1291my $PLACEHOLDER = 4;    # Like $SUPPRESSED, but we go through all the
1292                        # computations anyway, as the values are needed for
1293                        # things to work.  This happens when we have Perl
1294                        # extensions that depend on Unicode tables that
1295                        # wouldn't normally be in a given Unicode version.
1296
1297# The format of the values of the tables:
1298my $EMPTY_FORMAT = "";
1299my $BINARY_FORMAT = 'b';
1300my $DECIMAL_FORMAT = 'd';
1301my $FLOAT_FORMAT = 'f';
1302my $INTEGER_FORMAT = 'i';
1303my $HEX_FORMAT = 'x';
1304my $RATIONAL_FORMAT = 'r';
1305my $STRING_FORMAT = 's';
1306my $ADJUST_FORMAT = 'a';
1307my $HEX_ADJUST_FORMAT = 'ax';
1308my $DECOMP_STRING_FORMAT = 'c';
1309my $STRING_WHITE_SPACE_LIST = 'sw';
1310
1311my %map_table_formats = (
1312    $BINARY_FORMAT => 'binary',
1313    $DECIMAL_FORMAT => 'single decimal digit',
1314    $FLOAT_FORMAT => 'floating point number',
1315    $INTEGER_FORMAT => 'integer',
1316    $HEX_FORMAT => 'non-negative hex whole number; a code point',
1317    $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1318    $STRING_FORMAT => 'string',
1319    $ADJUST_FORMAT => 'some entries need adjustment',
1320    $HEX_ADJUST_FORMAT => 'mapped value in hex; some entries need adjustment',
1321    $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
1322    $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
1323);
1324
1325# Unicode didn't put such derived files in a separate directory at first.
1326my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1327my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1328my $AUXILIARY = 'auxiliary';
1329my $EMOJI = 'emoji';
1330
1331# Hashes and arrays that will eventually go into UCD.pl for the use of UCD.pm
1332my %loose_to_file_of;       # loosely maps table names to their respective
1333                            # files
1334my %stricter_to_file_of;    # same; but for stricter mapping.
1335my %loose_property_to_file_of; # Maps a loose property name to its map file
1336my %strict_property_to_file_of; # Same, but strict
1337my @inline_definitions = "V0"; # Each element gives a definition of a unique
1338                            # inversion list.  When a definition is inlined,
1339                            # its value in the hash it's in (one of the two
1340                            # defined just above) will include an index into
1341                            # this array.  The 0th element is initialized to
1342                            # the definition for a zero length inversion list
1343my %file_to_swash_name;     # Maps the file name to its corresponding key name
1344                            # in the hash %Unicode::UCD::SwashInfo
1345my %nv_floating_to_rational; # maps numeric values floating point numbers to
1346                             # their rational equivalent
1347my %loose_property_name_of; # Loosely maps (non_string) property names to
1348                            # standard form
1349my %strict_property_name_of; # Strictly maps (non_string) property names to
1350                            # standard form
1351my %string_property_loose_to_name; # Same, for string properties.
1352my %loose_defaults;         # keys are of form "prop=value", where 'prop' is
1353                            # the property name in standard loose form, and
1354                            # 'value' is the default value for that property,
1355                            # also in standard loose form.
1356my %loose_to_standard_value; # loosely maps table names to the canonical
1357                            # alias for them
1358my %ambiguous_names;        # keys are alias names (in standard form) that
1359                            # have more than one possible meaning.
1360my %combination_property;   # keys are alias names (in standard form) that
1361                            # have both a map table, and a binary one that
1362                            # yields true for all non-null maps.
1363my %prop_aliases;           # Keys are standard property name; values are each
1364                            # one's aliases
1365my %prop_value_aliases;     # Keys of top level are standard property name;
1366                            # values are keys to another hash,  Each one is
1367                            # one of the property's values, in standard form.
1368                            # The values are that prop-val's aliases.
1369my %skipped_files;          # List of files that we skip
1370my %ucd_pod;    # Holds entries that will go into the UCD section of the pod
1371
1372# Most properties are immune to caseless matching, otherwise you would get
1373# nonsensical results, as properties are a function of a code point, not
1374# everything that is caselessly equivalent to that code point.  For example,
1375# Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1376# be true because 's' and 'S' are equivalent caselessly.  However,
1377# traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1378# extend that concept to those very few properties that are like this.  Each
1379# such property will match the full range caselessly.  They are hard-coded in
1380# the program; it's not worth trying to make it general as it's extremely
1381# unlikely that they will ever change.
1382my %caseless_equivalent_to;
1383
1384# This is the range of characters that were in Release 1 of Unicode, and
1385# removed in Release 2 (replaced with the current Hangul syllables starting at
1386# U+AC00).  The range was reused starting in Release 3 for other purposes.
1387my $FIRST_REMOVED_HANGUL_SYLLABLE = 0x3400;
1388my $FINAL_REMOVED_HANGUL_SYLLABLE = 0x4DFF;
1389
1390# These constants names and values were taken from the Unicode standard,
1391# version 5.1, section 3.12.  They are used in conjunction with Hangul
1392# syllables.  The '_string' versions are so generated tables can retain the
1393# hex format, which is the more familiar value
1394my $SBase_string = "0xAC00";
1395my $SBase = CORE::hex $SBase_string;
1396my $LBase_string = "0x1100";
1397my $LBase = CORE::hex $LBase_string;
1398my $VBase_string = "0x1161";
1399my $VBase = CORE::hex $VBase_string;
1400my $TBase_string = "0x11A7";
1401my $TBase = CORE::hex $TBase_string;
1402my $SCount = 11172;
1403my $LCount = 19;
1404my $VCount = 21;
1405my $TCount = 28;
1406my $NCount = $VCount * $TCount;
1407
1408# For Hangul syllables;  These store the numbers from Jamo.txt in conjunction
1409# with the above published constants.
1410my %Jamo;
1411my %Jamo_L;     # Leading consonants
1412my %Jamo_V;     # Vowels
1413my %Jamo_T;     # Trailing consonants
1414
1415# For code points whose name contains its ordinal as a '-ABCD' suffix.
1416# The key is the base name of the code point, and the value is an
1417# array giving all the ranges that use this base name.  Each range
1418# is actually a hash giving the 'low' and 'high' values of it.
1419my %names_ending_in_code_point;
1420my %loose_names_ending_in_code_point;   # Same as above, but has blanks, dashes
1421                                        # removed from the names
1422# Inverse mapping.  The list of ranges that have these kinds of
1423# names.  Each element contains the low, high, and base names in an
1424# anonymous hash.
1425my @code_points_ending_in_code_point;
1426
1427# To hold Unicode's normalization test suite
1428my @normalization_tests;
1429
1430# Boolean: does this Unicode version have the hangul syllables, and are we
1431# writing out a table for them?
1432my $has_hangul_syllables = 0;
1433
1434# Does this Unicode version have code points whose names end in their
1435# respective code points, and are we writing out a table for them?  0 for no;
1436# otherwise points to first property that a table is needed for them, so that
1437# if multiple tables are needed, we don't create duplicates
1438my $needing_code_points_ending_in_code_point = 0;
1439
1440my @backslash_X_tests;     # List of tests read in for testing \X
1441my @LB_tests;              # List of tests read in for testing \b{lb}
1442my @SB_tests;              # List of tests read in for testing \b{sb}
1443my @WB_tests;              # List of tests read in for testing \b{wb}
1444my @unhandled_properties;  # Will contain a list of properties found in
1445                           # the input that we didn't process.
1446my @match_properties;      # Properties that have match tables, to be
1447                           # listed in the pod
1448my @map_properties;        # Properties that get map files written
1449my @named_sequences;       # NamedSequences.txt contents.
1450my %potential_files;       # Generated list of all .txt files in the directory
1451                           # structure so we can warn if something is being
1452                           # ignored.
1453my @missing_early_files;   # Generated list of absent files that we need to
1454                           # proceed in compiling this early Unicode version
1455my @files_actually_output; # List of files we generated.
1456my @more_Names;            # Some code point names are compound; this is used
1457                           # to store the extra components of them.
1458my $E_FLOAT_PRECISION = 3; # The minimum number of digits after the decimal
1459                           # point of a normalized floating point number
1460                           # needed to match before we consider it equivalent
1461                           # to a candidate rational
1462
1463# These store references to certain commonly used property objects
1464my $age;
1465my $ccc;
1466my $gc;
1467my $perl;
1468my $block;
1469my $perl_charname;
1470my $print;
1471my $All;
1472my $Assigned;   # All assigned characters in this Unicode release
1473my $DI;         # Default_Ignorable_Code_Point property
1474my $NChar;      # Noncharacter_Code_Point property
1475my $script;
1476my $scx;        # Script_Extensions property
1477my $idt;        # Identifier_Type property
1478
1479# Are there conflicting names because of beginning with 'In_', or 'Is_'
1480my $has_In_conflicts = 0;
1481my $has_Is_conflicts = 0;
1482
1483sub internal_file_to_platform ($file=undef) {
1484    # Convert our file paths which have '/' separators to those of the
1485    # platform.
1486
1487    return undef unless defined $file;
1488
1489    return File::Spec->join(split '/', $file);
1490}
1491
1492sub file_exists ($file=undef) {   # platform independent '-e'.  This program internally
1493                        # uses slash as a path separator.
1494    return 0 unless defined $file;
1495    return -e internal_file_to_platform($file);
1496}
1497
1498sub objaddr($addr) {
1499    # Returns the address of the blessed input object.
1500    # It doesn't check for blessedness because that would do a string eval
1501    # every call, and the program is structured so that this is never called
1502    # for a non-blessed object.
1503
1504    return pack 'J', refaddr $addr;
1505}
1506
1507# These are used only if $annotate is true.
1508# The entire range of Unicode characters is examined to populate these
1509# after all the input has been processed.  But most can be skipped, as they
1510# have the same descriptive phrases, such as being unassigned
1511my @viacode;            # Contains the 1 million character names
1512my @age;                # And their ages ("" if none)
1513my @printable;          # boolean: And are those characters printable?
1514my @annotate_char_type; # Contains a type of those characters, specifically
1515                        # for the purposes of annotation.
1516my $annotate_ranges;    # A map of ranges of code points that have the same
1517                        # name for the purposes of annotation.  They map to the
1518                        # upper edge of the range, so that the end point can
1519                        # be immediately found.  This is used to skip ahead to
1520                        # the end of a range, and avoid processing each
1521                        # individual code point in it.
1522my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1523                                   # characters, but excluding those which are
1524                                   # also noncharacter code points
1525
1526# The annotation types are an extension of the regular range types, though
1527# some of the latter are folded into one.  Make the new types negative to
1528# avoid conflicting with the regular types
1529my $SURROGATE_TYPE = -1;
1530my $UNASSIGNED_TYPE = -2;
1531my $PRIVATE_USE_TYPE = -3;
1532my $NONCHARACTER_TYPE = -4;
1533my $CONTROL_TYPE = -5;
1534my $ABOVE_UNICODE_TYPE = -6;
1535my $UNKNOWN_TYPE = -7;  # Used only if there is a bug in this program
1536
1537sub populate_char_info ($i) {
1538    # Used only with the $annotate option.  Populates the arrays with the
1539    # input code point's info that are needed for outputting more detailed
1540    # comments.  If calling context wants a return, it is the end point of
1541    # any contiguous range of characters that share essentially the same info
1542
1543    $viacode[$i] = $perl_charname->value_of($i) || "";
1544    $age[$i] = (defined $age)
1545               ? (($age->value_of($i) =~ / ^ \d+ \. \d+ $ /x)
1546                  ? $age->value_of($i)
1547                  : "")
1548               : "";
1549
1550    # A character is generally printable if Unicode says it is,
1551    # but below we make sure that most Unicode general category 'C' types
1552    # aren't.
1553    $printable[$i] = $print->contains($i);
1554
1555    # But the characters in this range were removed in v2.0 and replaced by
1556    # different ones later.  Modern fonts will be for the replacement
1557    # characters, so suppress printing them.
1558    if (($v_version lt v2.0
1559         || ($compare_versions && $compare_versions lt v2.0))
1560        && (   $i >= $FIRST_REMOVED_HANGUL_SYLLABLE
1561            && $i <= $FINAL_REMOVED_HANGUL_SYLLABLE))
1562    {
1563        $printable[$i] = 0;
1564    }
1565
1566    $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1567
1568    # Only these two regular types are treated specially for annotations
1569    # purposes
1570    $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1571                                && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1572
1573    # Give a generic name to all code points that don't have a real name.
1574    # We output ranges, if applicable, for these.  Also calculate the end
1575    # point of the range.
1576    my $end;
1577    if (! $viacode[$i]) {
1578        if ($i > $MAX_UNICODE_CODEPOINT) {
1579            $viacode[$i] = 'Above-Unicode';
1580            $annotate_char_type[$i] = $ABOVE_UNICODE_TYPE;
1581            $printable[$i] = 0;
1582            $end = $MAX_WORKING_CODEPOINT;
1583        }
1584        elsif ($gc-> table('Private_use')->contains($i)) {
1585            $viacode[$i] = 'Private Use';
1586            $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1587            $printable[$i] = 0;
1588            $end = $gc->table('Private_Use')->containing_range($i)->end;
1589        }
1590        elsif ($NChar->contains($i)) {
1591            $viacode[$i] = 'Noncharacter';
1592            $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1593            $printable[$i] = 0;
1594            $end = $NChar->containing_range($i)->end;
1595        }
1596        elsif ($gc-> table('Control')->contains($i)) {
1597            my $name_ref = property_ref('Name_Alias');
1598            $name_ref = property_ref('Unicode_1_Name') if ! defined $name_ref;
1599            $viacode[$i] = (defined $name_ref)
1600                           ? $name_ref->value_of($i)
1601                           : 'Control';
1602            $annotate_char_type[$i] = $CONTROL_TYPE;
1603            $printable[$i] = 0;
1604        }
1605        elsif ($gc-> table('Unassigned')->contains($i)) {
1606            $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1607            $printable[$i] = 0;
1608            $viacode[$i] = 'Unassigned';
1609
1610            if (defined $block) { # No blocks in earliest releases
1611                $viacode[$i] .= ', block=' . $block-> value_of($i);
1612                $end = $gc-> table('Unassigned')->containing_range($i)->end;
1613
1614                # Because we name the unassigned by the blocks they are in, it
1615                # can't go past the end of that block, and it also can't go
1616                # past the unassigned range it is in.  The special table makes
1617                # sure that the non-characters, which are unassigned, are
1618                # separated out.
1619                $end = min($block->containing_range($i)->end,
1620                           $unassigned_sans_noncharacters->
1621                                                    containing_range($i)->end);
1622            }
1623            else {
1624                $end = $i + 1;
1625                while ($unassigned_sans_noncharacters->contains($end)) {
1626                    $end++;
1627                }
1628                $end--;
1629            }
1630        }
1631        elsif ($perl->table('_Perl_Surrogate')->contains($i)) {
1632            $viacode[$i] = 'Surrogate';
1633            $annotate_char_type[$i] = $SURROGATE_TYPE;
1634            $printable[$i] = 0;
1635            $end = $gc->table('Surrogate')->containing_range($i)->end;
1636        }
1637        else {
1638            Carp::my_carp_bug("Can't figure out how to annotate "
1639                              . sprintf("U+%04X", $i)
1640                              . ".  Proceeding anyway.");
1641            $viacode[$i] = 'UNKNOWN';
1642            $annotate_char_type[$i] = $UNKNOWN_TYPE;
1643            $printable[$i] = 0;
1644        }
1645    }
1646
1647    # Here, has a name, but if it's one in which the code point number is
1648    # appended to the name, do that.
1649    elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1650        $viacode[$i] .= sprintf("-%04X", $i);
1651
1652        my $limit = $perl_charname->containing_range($i)->end;
1653        if (defined $age) {
1654            # Do all these as groups of the same age, instead of individually,
1655            # because their names are so meaningless, and there are typically
1656            # large quantities of them.
1657            $end = $i + 1;
1658            while ($end <= $limit && $age->value_of($end) == $age[$i]) {
1659                $end++;
1660            }
1661            $end--;
1662        }
1663        else {
1664            $end = $limit;
1665        }
1666    }
1667
1668    # And here, has a name, but if it's a hangul syllable one, replace it with
1669    # the correct name from the Unicode algorithm
1670    elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1671        use integer;
1672        my $SIndex = $i - $SBase;
1673        my $L = $LBase + $SIndex / $NCount;
1674        my $V = $VBase + ($SIndex % $NCount) / $TCount;
1675        my $T = $TBase + $SIndex % $TCount;
1676        $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1677        $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1678        $end = $perl_charname->containing_range($i)->end;
1679    }
1680
1681    return if ! defined wantarray;
1682    return $i if ! defined $end;    # If not a range, return the input
1683
1684    # Save this whole range so can find the end point quickly
1685    $annotate_ranges->add_map($i, $end, $end);
1686
1687    return $end;
1688}
1689
1690sub max($a, $b) {
1691    return $a >= $b ? $a : $b;
1692}
1693
1694sub min($a, $b) {
1695    return $a <= $b ? $a : $b;
1696}
1697
1698sub clarify_number ($number) {
1699    # This returns the input number with underscores inserted every 3 digits
1700    # in large (5 digits or more) numbers.  Input must be entirely digits, not
1701    # checked.
1702
1703    my $pos = length($number) - 3;
1704    return $number if $pos <= 1;
1705    while ($pos > 0) {
1706        substr($number, $pos, 0) = '_';
1707        $pos -= 3;
1708    }
1709    return $number;
1710}
1711
1712sub clarify_code_point_count ($number) {
1713    # This is like clarify_number(), but the input is assumed to be a count of
1714    # code points, rather than a generic number.
1715
1716    my $append = "";
1717
1718    if ($number > $MAX_UNICODE_CODEPOINTS) {
1719        $number -= ($MAX_WORKING_CODEPOINTS - $MAX_UNICODE_CODEPOINTS);
1720        return "All above-Unicode code points" if $number == 0;
1721        $append = " + all above-Unicode code points";
1722    }
1723    return clarify_number($number) . $append;
1724}
1725
1726package Carp;
1727
1728# These routines give a uniform treatment of messages in this program.  They
1729# are placed in the Carp package to cause the stack trace to not include them,
1730# although an alternative would be to use another package and set @CARP_NOT
1731# for it.
1732
1733our $Verbose = 1 if main::DEBUG;  # Useful info when debugging
1734
1735# This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1736# and overload trying to load Scalar:Util under miniperl.  See
1737# http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1738undef $overload::VERSION;
1739
1740sub my_carp($message="", $nofold=0) {
1741
1742    if ($message) {
1743        $message = main::join_lines($message);
1744        $message =~ s/^$0: *//;     # Remove initial program name
1745        $message =~ s/[.;,]+$//;    # Remove certain ending punctuation
1746        $message = "\n$0: $message;";
1747
1748        # Fold the message with program name, semi-colon end punctuation
1749        # (which looks good with the message that carp appends to it), and a
1750        # hanging indent for continuation lines.
1751        $message = main::simple_fold($message, "", 4) unless $nofold;
1752        $message =~ s/\n$//;        # Remove the trailing nl so what carp
1753                                    # appends is to the same line
1754    }
1755
1756    return $message if defined wantarray;   # If a caller just wants the msg
1757
1758    carp $message;
1759    return;
1760}
1761
1762sub my_carp_bug($message="") {
1763    # This is called when it is clear that the problem is caused by a bug in
1764    # this program.
1765    $message =~ s/^$0: *//;
1766    $message = my_carp("Bug in $0.  Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1767    carp $message;
1768    return;
1769}
1770
1771sub carp_too_few_args($args_ref, $count) {
1772    my_carp_bug("Need at least $count arguments to "
1773        . (caller 1)[3]
1774        . ".  Instead got: '"
1775        . join ', ', @$args_ref
1776        . "'.  No action taken.");
1777    return;
1778}
1779
1780sub carp_extra_args($args_ref) {
1781    unless (ref $args_ref) {
1782        my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref.  Not checking arguments.");
1783        return;
1784    }
1785    my ($package, $file, $line) = caller;
1786    my $subroutine = (caller 1)[3];
1787
1788    my $list;
1789    if (ref $args_ref eq 'HASH') {
1790        foreach my $key (keys %$args_ref) {
1791            $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
1792        }
1793        $list = join ', ', each %{$args_ref};
1794    }
1795    elsif (ref $args_ref eq 'ARRAY') {
1796        foreach my $arg (@$args_ref) {
1797            $arg = $UNDEF unless defined $arg;
1798        }
1799        $list = join ', ', @$args_ref;
1800    }
1801    else {
1802        my_carp_bug("Can't cope with ref "
1803                . ref($args_ref)
1804                . " . argument to 'carp_extra_args'.  Not checking arguments.");
1805        return;
1806    }
1807
1808    my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine.  Skipped.");
1809    return;
1810}
1811
1812package main;
1813
1814{ # Closure
1815
1816    # This program uses the inside-out method for objects, as recommended in
1817    # "Perl Best Practices".  (This is the best solution still, since this has
1818    # to run under miniperl.)  This closure aids in generating those.  There
1819    # are two routines.  setup_package() is called once per package to set
1820    # things up, and then set_access() is called for each hash representing a
1821    # field in the object.  These routines arrange for the object to be
1822    # properly destroyed when no longer used, and for standard accessor
1823    # functions to be generated.  If you need more complex accessors, just
1824    # write your own and leave those accesses out of the call to set_access().
1825    # More details below.
1826
1827    my %constructor_fields; # fields that are to be used in constructors; see
1828                            # below
1829
1830    # The values of this hash will be the package names as keys to other
1831    # hashes containing the name of each field in the package as keys, and
1832    # references to their respective hashes as values.
1833    my %package_fields;
1834
1835    sub setup_package {
1836        # Sets up the package, creating standard DESTROY and dump methods
1837        # (unless already defined).  The dump method is used in debugging by
1838        # simple_dumper().
1839        # The optional parameters are:
1840        #   a)  a reference to a hash, that gets populated by later
1841        #       set_access() calls with one of the accesses being
1842        #       'constructor'.  The caller can then refer to this, but it is
1843        #       not otherwise used by these two routines.
1844        #   b)  a reference to a callback routine to call during destruction
1845        #       of the object, before any fields are actually destroyed
1846
1847        my %args = @_;
1848        my $constructor_ref = delete $args{'Constructor_Fields'};
1849        my $destroy_callback = delete $args{'Destroy_Callback'};
1850        Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1851
1852        my %fields;
1853        my $package = (caller)[0];
1854
1855        $package_fields{$package} = \%fields;
1856        $constructor_fields{$package} = $constructor_ref;
1857
1858        unless ($package->can('DESTROY')) {
1859            my $destroy_name = "${package}::DESTROY";
1860            no strict "refs";
1861
1862            # Use typeglob to give the anonymous subroutine the name we want
1863            *$destroy_name = sub ($self) {
1864                my $addr = pack 'J', refaddr $self;
1865
1866                $self->$destroy_callback if $destroy_callback;
1867                foreach my $field (keys %{$package_fields{$package}}) {
1868                    #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1869                    delete $package_fields{$package}{$field}{$addr};
1870                }
1871                return;
1872            }
1873        }
1874
1875        unless ($package->can('dump')) {
1876            my $dump_name = "${package}::dump";
1877            no strict "refs";
1878            *$dump_name = sub ($self, @_args) {
1879                return dump_inside_out($self, $package_fields{$package}, @_args);
1880            }
1881        }
1882        return;
1883    }
1884
1885    sub set_access($name, $field, @accessors) {
1886        # Arrange for the input field to be garbage collected when no longer
1887        # needed.  Also, creates standard accessor functions for the field
1888        # based on the optional parameters-- none if none of these parameters:
1889        #   'addable'    creates an 'add_NAME()' accessor function.
1890        #   'readable' or 'readable_array'   creates a 'NAME()' accessor
1891        #                function.
1892        #   'settable'   creates a 'set_NAME()' accessor function.
1893        #   'constructor' doesn't create an accessor function, but adds the
1894        #                field to the hash that was previously passed to
1895        #                setup_package();
1896        # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1897        # 'add' etc. all mean 'addable'.
1898        # The read accessor function will work on both array and scalar
1899        # values.  If another accessor in the parameter list is 'a', the read
1900        # access assumes an array.  You can also force it to be array access
1901        # by specifying 'readable_array' instead of 'readable'
1902        #
1903        # A sort-of 'protected' access can be set-up by preceding the addable,
1904        # readable or settable with some initial portion of 'protected_' (but,
1905        # the underscore is required), like 'p_a', 'pro_set', etc.  The
1906        # "protection" is only by convention.  All that happens is that the
1907        # accessor functions' names begin with an underscore.  So instead of
1908        # calling set_foo, the call is _set_foo.  (Real protection could be
1909        # accomplished by having a new subroutine, end_package, called at the
1910        # end of each package, and then storing the __LINE__ ranges and
1911        # checking them on every accessor.  But that is way overkill.)
1912
1913        # We create anonymous subroutines as the accessors and then use
1914        # typeglobs to assign them to the proper package and name
1915
1916        # $name 	Name of the field
1917        # $field 	Reference to the inside-out hash containing the
1918		# 			field
1919
1920        my $package = (caller)[0];
1921
1922        if (! exists $package_fields{$package}) {
1923            croak "$0: Must call 'setup_package' before 'set_access'";
1924        }
1925
1926        # Stash the field so DESTROY can get it.
1927        $package_fields{$package}{$name} = $field;
1928
1929        # Remaining arguments are the accessors.  For each...
1930        foreach my $access (@accessors) {
1931            my $access = lc $access;
1932
1933            my $protected = "";
1934
1935            # Match the input as far as it goes.
1936            if ($access =~ /^(p[^_]*)_/) {
1937                $protected = $1;
1938                if (substr('protected_', 0, length $protected)
1939                    eq $protected)
1940                {
1941
1942                    # Add 1 for the underscore not included in $protected
1943                    $access = substr($access, length($protected) + 1);
1944                    $protected = '_';
1945                }
1946                else {
1947                    $protected = "";
1948                }
1949            }
1950
1951            if (substr('addable', 0, length $access) eq $access) {
1952                my $subname = "${package}::${protected}add_$name";
1953                no strict "refs";
1954
1955                # add_ accessor.  Don't add if already there, which we
1956                # determine using 'eq' for scalars and '==' otherwise.
1957                *$subname = sub ($self, $value) {
1958                    use strict "refs";
1959                    my $addr = pack 'J', refaddr $self;
1960                    if (ref $value) {
1961                        return if grep { $value == $_ } @{$field->{$addr}};
1962                    }
1963                    else {
1964                        return if grep { $value eq $_ } @{$field->{$addr}};
1965                    }
1966                    push @{$field->{$addr}}, $value;
1967                    return;
1968                }
1969            }
1970            elsif (substr('constructor', 0, length $access) eq $access) {
1971                if ($protected) {
1972                    Carp::my_carp_bug("Can't set-up 'protected' constructors")
1973                }
1974                else {
1975                    $constructor_fields{$package}{$name} = $field;
1976                }
1977            }
1978            elsif (substr('readable_array', 0, length $access) eq $access) {
1979
1980                # Here has read access.  If one of the other parameters for
1981                # access is array, or this one specifies array (by being more
1982                # than just 'readable_'), then create a subroutine that
1983                # assumes the data is an array.  Otherwise just a scalar
1984                my $subname = "${package}::${protected}$name";
1985                if (grep { /^a/i } (@accessors)
1986                    or length($access) > length('readable_'))
1987                {
1988                    no strict "refs";
1989                    *$subname = sub ($_addr) {
1990                        use strict "refs";
1991                        my $addr = pack 'J', refaddr $_addr;
1992                        if (ref $field->{$addr} ne 'ARRAY') {
1993                            my $type = ref $field->{$addr};
1994                            $type = 'scalar' unless $type;
1995                            Carp::my_carp_bug("Trying to read $name as an array when it is a $type.  Big problems.");
1996                            return;
1997                        }
1998                        return scalar @{$field->{$addr}} unless wantarray;
1999
2000                        # Make a copy; had problems with caller modifying the
2001                        # original otherwise
2002                        my @return = @{$field->{$addr}};
2003                        return @return;
2004                    }
2005                }
2006                else {
2007
2008                    # Here not an array value, a simpler function.
2009                    no strict "refs";
2010                    *$subname = sub ($addr) {
2011                        use strict "refs";
2012                        return $field->{pack 'J', refaddr $addr};
2013                    }
2014                }
2015            }
2016            elsif (substr('settable', 0, length $access) eq $access) {
2017                my $subname = "${package}::${protected}set_$name";
2018                no strict "refs";
2019                *$subname = sub ($self, $value) {
2020                    use strict "refs";
2021                    # $self is $_[0]; $value is $_[1]
2022                    $field->{pack 'J', refaddr $self} = $value;
2023                    return;
2024                }
2025            }
2026            else {
2027                Carp::my_carp_bug("Unknown accessor type $access.  No accessor set.");
2028            }
2029        }
2030        return;
2031    }
2032}
2033
2034package Input_file;
2035
2036# All input files use this object, which stores various attributes about them,
2037# and provides for convenient, uniform handling.  The run method wraps the
2038# processing.  It handles all the bookkeeping of opening, reading, and closing
2039# the file, returning only significant input lines.
2040#
2041# Each object gets a handler which processes the body of the file, and is
2042# called by run().  All character property files must use the generic,
2043# default handler, which has code scrubbed to handle things you might not
2044# expect, including automatic EBCDIC handling.  For files that don't deal with
2045# mapping code points to a property value, such as test files,
2046# PropertyAliases, PropValueAliases, and named sequences, you can override the
2047# handler to be a custom one.  Such a handler should basically be a
2048# while(next_line()) {...} loop.
2049#
2050# You can also set up handlers to
2051#   0) call during object construction time, after everything else is done
2052#   1) call before the first line is read, for pre processing
2053#   2) call to adjust each line of the input before the main handler gets
2054#      them.  This can be automatically generated, if appropriately simple
2055#      enough, by specifying a Properties parameter in the constructor.
2056#   3) call upon EOF before the main handler exits its loop
2057#   4) call at the end, for post processing
2058#
2059# $_ is used to store the input line, and is to be filtered by the
2060# each_line_handler()s.  So, if the format of the line is not in the desired
2061# format for the main handler, these are used to do that adjusting.  They can
2062# be stacked (by enclosing them in an [ anonymous array ] in the constructor,
2063# so the $_ output of one is used as the input to the next.  The EOF handler
2064# is also stackable, but none of the others are, but could easily be changed
2065# to be so.
2066#
2067# Some properties are used by the Perl core but aren't defined until later
2068# Unicode releases.  The perl interpreter would have problems working when
2069# compiled with an earlier Unicode version that doesn't have them, so we need
2070# to define them somehow for those releases.  The 'Early' constructor
2071# parameter can be used to automatically handle this.  It is essentially
2072# ignored if the Unicode version being compiled has a data file for this
2073# property.  Either code to execute or a file to read can be specified.
2074# Details are at the %early definition.
2075#
2076# Most of the handlers can call insert_lines() or insert_adjusted_lines()
2077# which insert the parameters as lines to be processed before the next input
2078# file line is read.  This allows the EOF handler(s) to flush buffers, for
2079# example.  The difference between the two routines is that the lines inserted
2080# by insert_lines() are subjected to the each_line_handler()s.  (So if you
2081# called it from such a handler, you would get infinite recursion without some
2082# mechanism to prevent that.)  Lines inserted by insert_adjusted_lines() go
2083# directly to the main handler without any adjustments.  If the
2084# post-processing handler calls any of these, there will be no effect.  Some
2085# error checking for these conditions could be added, but it hasn't been done.
2086#
2087# carp_bad_line() should be called to warn of bad input lines, which clears $_
2088# to prevent further processing of the line.  This routine will output the
2089# message as a warning once, and then keep a count of the lines that have the
2090# same message, and output that count at the end of the file's processing.
2091# This keeps the number of messages down to a manageable amount.
2092#
2093# get_missings() should be called to retrieve any @missing input lines.
2094# Messages will be raised if this isn't done if the options aren't to ignore
2095# missings.
2096
2097sub trace { return main::trace(@_); }
2098
2099{ # Closure
2100    # Keep track of fields that are to be put into the constructor.
2101    my %constructor_fields;
2102
2103    main::setup_package(Constructor_Fields => \%constructor_fields);
2104
2105    my %file; # Input file name, required
2106    main::set_access('file', \%file, qw{ c r });
2107
2108    my %first_released; # Unicode version file was first released in, required
2109    main::set_access('first_released', \%first_released, qw{ c r });
2110
2111    my %handler;    # Subroutine to process the input file, defaults to
2112                    # 'process_generic_property_file'
2113    main::set_access('handler', \%handler, qw{ c });
2114
2115    my %property;
2116    # name of property this file is for.  defaults to none, meaning not
2117    # applicable, or is otherwise determinable, for example, from each line.
2118    main::set_access('property', \%property, qw{ c r });
2119
2120    my %optional;
2121    # This is either an unsigned number, or a list of property names.  In the
2122    # former case, if it is non-zero, it means the file is optional, so if the
2123    # file is absent, no warning about that is output.  In the latter case, it
2124    # is a list of properties that the file (exclusively) defines.  If the
2125    # file is present, tables for those properties will be produced; if
2126    # absent, none will, even if they are listed elsewhere (namely
2127    # PropertyAliases.txt and PropValueAliases.txt) as being in this release,
2128    # and no warnings will be raised about them not being available.  (And no
2129    # warning about the file itself will be raised.)
2130    main::set_access('optional', \%optional, qw{ c readable_array } );
2131
2132    my %non_skip;
2133    # This is used for debugging, to skip processing of all but a few input
2134    # files.  Add 'non_skip => 1' to the constructor for those files you want
2135    # processed when you set the $debug_skip global.
2136    main::set_access('non_skip', \%non_skip, 'c');
2137
2138    my %skip;
2139    # This is used to skip processing of this input file (semi-) permanently.
2140    # The value should be the reason the file is being skipped.  It is used
2141    # for files that we aren't planning to process anytime soon, but want to
2142    # allow to be in the directory and be checked for their names not
2143    # conflicting with any other files on a DOS 8.3 name filesystem, but to
2144    # not otherwise be processed, and to not raise a warning about not being
2145    # handled.  In the constructor call, any value that evaluates to a numeric
2146    # 0 or undef means don't skip.  Any other value is a string giving the
2147    # reason it is being skipped, and this will appear in generated pod.
2148    # However, an empty string reason will suppress the pod entry.
2149    # Internally, calls that evaluate to numeric 0 are changed into undef to
2150    # distinguish them from an empty string call.
2151    main::set_access('skip', \%skip, 'c', 'r');
2152
2153    my %each_line_handler;
2154    # list of subroutines to look at and filter each non-comment line in the
2155    # file.  defaults to none.  The subroutines are called in order, each is
2156    # to adjust $_ for the next one, and the final one adjusts it for
2157    # 'handler'
2158    main::set_access('each_line_handler', \%each_line_handler, 'c');
2159
2160    my %retain_trailing_comments;
2161    # This is used to not discard the comments that end data lines.  This
2162    # would be used only for files with non-typical syntax, and most code here
2163    # assumes that comments have been stripped, so special handlers would have
2164    # to be written.  It is assumed that the code will use these in
2165    # single-quoted contexts, and so any "'" marks in the comment will be
2166    # prefixed by a backslash.
2167    main::set_access('retain_trailing_comments', \%retain_trailing_comments, 'c');
2168
2169    my %properties; # Optional ordered list of the properties that occur in each
2170    # meaningful line of the input file.  If present, an appropriate
2171    # each_line_handler() is automatically generated and pushed onto the stack
2172    # of such handlers.  This is useful when a file contains multiple
2173    # properties per line, but no other special considerations are necessary.
2174    # The special value "<ignored>" means to discard the corresponding input
2175    # field.
2176    # Any @missing lines in the file should also match this syntax; no such
2177    # files exist as of 6.3.  But if it happens in a future release, the code
2178    # could be expanded to properly parse them.
2179    main::set_access('properties', \%properties, qw{ c r });
2180
2181    my %has_missings_defaults;
2182    # ? Are there lines in the file giving default values for code points
2183    # missing from it?.  Defaults to NO_DEFAULTS.  Otherwise NOT_IGNORED is
2184    # the norm, but IGNORED means it has such lines, but the handler doesn't
2185    # use them.  Having these three states allows us to catch changes to the
2186    # UCD that this program should track.  XXX This could be expanded to
2187    # specify the syntax for such lines, like %properties above.
2188    main::set_access('has_missings_defaults',
2189                                        \%has_missings_defaults, qw{ c r });
2190
2191    my %construction_time_handler;
2192    # Subroutine to call at the end of the new method.  If undef, no such
2193    # handler is called.
2194    main::set_access('construction_time_handler',
2195                                        \%construction_time_handler, qw{ c });
2196
2197    my %pre_handler;
2198    # Subroutine to call before doing anything else in the file.  If undef, no
2199    # such handler is called.
2200    main::set_access('pre_handler', \%pre_handler, qw{ c });
2201
2202    my %eof_handler;
2203    # Subroutines to call upon getting an EOF on the input file, but before
2204    # that is returned to the main handler.  This is to allow buffers to be
2205    # flushed.  The handler is expected to call insert_lines() or
2206    # insert_adjusted() with the buffered material
2207    main::set_access('eof_handler', \%eof_handler, qw{ c });
2208
2209    my %post_handler;
2210    # Subroutine to call after all the lines of the file are read in and
2211    # processed.  If undef, no such handler is called.  Note that this cannot
2212    # add lines to be processed; instead use eof_handler
2213    main::set_access('post_handler', \%post_handler, qw{ c });
2214
2215    my %progress_message;
2216    # Message to print to display progress in lieu of the standard one
2217    main::set_access('progress_message', \%progress_message, qw{ c });
2218
2219    my %handle;
2220    # cache open file handle, internal.  Is undef if file hasn't been
2221    # processed at all, empty if has;
2222    main::set_access('handle', \%handle);
2223
2224    my %added_lines;
2225    # cache of lines added virtually to the file, internal
2226    main::set_access('added_lines', \%added_lines);
2227
2228    my %remapped_lines;
2229    # cache of lines added virtually to the file, internal
2230    main::set_access('remapped_lines', \%remapped_lines);
2231
2232    my %errors;
2233    # cache of errors found, internal
2234    main::set_access('errors', \%errors);
2235
2236    my %missings;
2237    # storage of '@missing' defaults lines
2238    main::set_access('missings', \%missings);
2239
2240    my %early;
2241    # Used for properties that must be defined (for Perl's purposes) on
2242    # versions of Unicode earlier than Unicode itself defines them.  The
2243    # parameter is an array (it would be better to be a hash, but not worth
2244    # bothering about due to its rare use).
2245    #
2246    # The first element is either a code reference to call when in a release
2247    # earlier than the Unicode file is available in, or it is an alternate
2248    # file to use instead of the non-existent one.  This file must have been
2249    # plunked down in the same directory as mktables.  Should you be compiling
2250    # on a release that needs such a file, mktables will abort the
2251    # compilation, and tell you where to get the necessary file(s), and what
2252    # name(s) to use to store them as.
2253    # In the case of specifying an alternate file, the array must contain two
2254    # further elements:
2255    #
2256    # [1] is the name of the property that will be generated by this file.
2257    # The class automatically takes the input file and excludes any code
2258    # points in it that were not assigned in the Unicode version being
2259    # compiled.  It then uses this result to define the property in the given
2260    # version.  Since the property doesn't actually exist in the Unicode
2261    # version being compiled, this should be a name accessible only by core
2262    # perl.  If it is the same name as the regular property, the constructor
2263    # will mark the output table as a $PLACEHOLDER so that it doesn't actually
2264    # get output, and so will be unusable by non-core code.  Otherwise it gets
2265    # marked as $INTERNAL_ONLY.
2266    #
2267    # [2] is a property value to assign (only when compiling Unicode 1.1.5) to
2268    # the Hangul syllables in that release (which were ripped out in version
2269    # 2) for the given property .  (Hence it is ignored except when compiling
2270    # version 1.  You only get one value that applies to all of them, which
2271    # may not be the actual reality, but probably nobody cares anyway for
2272    # these obsolete characters.)
2273    #
2274    # [3] if present is the default value for the property to assign for code
2275    # points not given in the input.  If not present, the default from the
2276    # normal property is used
2277    #
2278    # [-1] If there is an extra final element that is the string 'ONLY_EARLY'.
2279    # it means to not add the name in [1] as an alias to the property name
2280    # used for these.  Normally, when compiling Unicode versions that don't
2281    # invoke the early handling, the name is added as a synonym.
2282    #
2283    # Not all files can be handled in the above way, and so the code ref
2284    # alternative is available.  It can do whatever it needs to.  The other
2285    # array elements are optional in this case, and the code is free to use or
2286    # ignore them if they are present.
2287    #
2288    # Internally, the constructor unshifts a 0 or 1 onto this array to
2289    # indicate if an early alternative is actually being used or not.  This
2290    # makes for easier testing later on.
2291    main::set_access('early', \%early, 'c');
2292
2293    my %only_early;
2294    main::set_access('only_early', \%only_early, 'c');
2295
2296    my %required_even_in_debug_skip;
2297    # debug_skip is used to speed up compilation during debugging by skipping
2298    # processing files that are not needed for the task at hand.  However,
2299    # some files pretty much can never be skipped, and this is used to specify
2300    # that this is one of them.  In order to skip this file, the call to the
2301    # constructor must be edited to comment out this parameter.
2302    main::set_access('required_even_in_debug_skip',
2303                     \%required_even_in_debug_skip, 'c');
2304
2305    my %withdrawn;
2306    # Some files get removed from the Unicode DB.  This is a version object
2307    # giving the first release without this file.
2308    main::set_access('withdrawn', \%withdrawn, 'c');
2309
2310    my %ucd;
2311    # Some files are not actually part of the Unicode Character Database.
2312    # These typically have a different way of indicating their version
2313    main::set_access('ucd', \%ucd, 'c');
2314
2315    my %in_this_release;
2316    # Calculated value from %first_released and %withdrawn.  Are we compiling
2317    # a Unicode release which includes this file?
2318    main::set_access('in_this_release', \%in_this_release);
2319
2320    sub _next_line;
2321    sub _next_line_with_remapped_range;
2322
2323    sub new {
2324        my $class = shift;
2325
2326        my $self = bless \do{ my $anonymous_scalar }, $class;
2327        my $addr = pack 'J', refaddr $self;
2328
2329        # Set defaults
2330        $handler{$addr} = \&main::process_generic_property_file;
2331        $retain_trailing_comments{$addr} = 0;
2332        $non_skip{$addr} = 0;
2333        $skip{$addr} = undef;
2334        $has_missings_defaults{$addr} = $NO_DEFAULTS;
2335        $handle{$addr} = undef;
2336        $added_lines{$addr} = [ ];
2337        $remapped_lines{$addr} = [ ];
2338        $each_line_handler{$addr} = [ ];
2339        $eof_handler{$addr} = [ ];
2340        $errors{$addr} = { };
2341        $missings{$addr} = [ ];
2342        $early{$addr} = [ ];
2343        $optional{$addr} = [ ];
2344        $ucd{$addr} = 1;
2345
2346        # Two positional parameters.
2347        return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
2348        $file{$addr} = main::internal_file_to_platform(shift);
2349        $first_released{$addr} = shift;
2350
2351        # The rest of the arguments are key => value pairs
2352        # %constructor_fields has been set up earlier to list all possible
2353        # ones.  Either set or push, depending on how the default has been set
2354        # up just above.
2355        my %args = @_;
2356        foreach my $key (keys %args) {
2357            my $argument = $args{$key};
2358
2359            # Note that the fields are the lower case of the constructor keys
2360            my $hash = $constructor_fields{lc $key};
2361            if (! defined $hash) {
2362                Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self.  Skipped");
2363                next;
2364            }
2365            if (ref $hash->{$addr} eq 'ARRAY') {
2366                if (ref $argument eq 'ARRAY') {
2367                    foreach my $argument (@{$argument}) {
2368                        next if ! defined $argument;
2369                        push @{$hash->{$addr}}, $argument;
2370                    }
2371                }
2372                else {
2373                    push @{$hash->{$addr}}, $argument if defined $argument;
2374                }
2375            }
2376            else {
2377                $hash->{$addr} = $argument;
2378            }
2379            delete $args{$key};
2380        };
2381
2382        $non_skip{$addr} = 1 if $required_even_in_debug_skip{$addr};
2383
2384        # Convert 0 (meaning don't skip) to undef
2385        undef $skip{$addr} unless $skip{$addr};
2386
2387        # Handle the case where this file is optional
2388        my $pod_message_for_non_existent_optional = "";
2389        if ($optional{$addr}->@*) {
2390
2391            # First element is the pod message
2392            $pod_message_for_non_existent_optional
2393                                                = shift $optional{$addr}->@*;
2394            # Convert a 0 'Optional' argument to an empty list to make later
2395            # code more concise.
2396            if (   $optional{$addr}->@*
2397                && $optional{$addr}->@* == 1
2398                && $optional{$addr}[0] ne ""
2399                && $optional{$addr}[0] !~ /\D/
2400                && $optional{$addr}[0] == 0)
2401            {
2402                $optional{$addr} = [ ];
2403            }
2404            else {  # But if the only element doesn't evaluate to 0, make sure
2405                    # that this file is indeed considered optional below.
2406                unshift $optional{$addr}->@*, 1;
2407            }
2408        }
2409
2410        my $progress;
2411        my $function_instead_of_file = 0;
2412
2413        if ($early{$addr}->@* && $early{$addr}[-1] eq 'ONLY_EARLY') {
2414            $only_early{$addr} = 1;
2415            pop $early{$addr}->@*;
2416        }
2417
2418        # If we are compiling a Unicode release earlier than the file became
2419        # available, the constructor may have supplied a substitute
2420        if ($first_released{$addr} gt $v_version && $early{$addr}->@*) {
2421
2422            # Yes, we have a substitute, that we will use; mark it so
2423            unshift $early{$addr}->@*, 1;
2424
2425            # See the definition of %early for what the array elements mean.
2426            # Note that we have just unshifted onto the array, so the numbers
2427            # below are +1 of those in the %early description.
2428            # If we have a property this defines, create a table and default
2429            # map for it now (at essentially compile time), so that it will be
2430            # available for the whole of run time.  (We will want to add this
2431            # name as an alias when we are using the official property name;
2432            # but this must be deferred until run(), because at construction
2433            # time the official names have yet to be defined.)
2434            if ($early{$addr}[2]) {
2435                my $fate = ($property{$addr}
2436                            && $property{$addr} eq $early{$addr}[2])
2437                          ? $PLACEHOLDER
2438                          : $INTERNAL_ONLY;
2439                my $prop_object = Property->new($early{$addr}[2],
2440                                                Fate => $fate,
2441                                                Perl_Extension => 1,
2442                                                );
2443
2444                # If not specified by the constructor, use the default mapping
2445                # for the regular property for this substitute one.
2446                if ($early{$addr}[4]) {
2447                    $prop_object->set_default_map($early{$addr}[4]);
2448                }
2449                elsif (    defined $property{$addr}
2450                       &&  defined $default_mapping{$property{$addr}})
2451                {
2452                    $prop_object
2453                        ->set_default_map($default_mapping{$property{$addr}});
2454                }
2455            }
2456
2457            if (ref $early{$addr}[1] eq 'CODE') {
2458                $function_instead_of_file = 1;
2459
2460                # If the first element of the array is a code ref, the others
2461                # are optional.
2462                $handler{$addr} = $early{$addr}[1];
2463                $property{$addr} = $early{$addr}[2]
2464                                                if defined $early{$addr}[2];
2465                $progress = "substitute $file{$addr}";
2466
2467                undef $file{$addr};
2468            }
2469            else {  # Specifying a substitute file
2470
2471                if (! main::file_exists($early{$addr}[1])) {
2472
2473                    # If we don't see the substitute file, generate an error
2474                    # message giving the needed things, and add it to the list
2475                    # of such to output before actual processing happens
2476                    # (hence the user finds out all of them in one run).
2477                    # Instead of creating a general method for NameAliases,
2478                    # hard-code it here, as there is unlikely to ever be a
2479                    # second one which needs special handling.
2480                    my $string_version = ($file{$addr} eq "NameAliases.txt")
2481                                    ? 'at least 6.1 (the later, the better)'
2482                                    : sprintf "%vd", $first_released{$addr};
2483                    push @missing_early_files, <<END;
2484'$file{$addr}' version $string_version should be copied to '$early{$addr}[1]'.
2485END
2486                    ;
2487                    return;
2488                }
2489                $progress = $early{$addr}[1];
2490                $progress .= ", substituting for $file{$addr}" if $file{$addr};
2491                $file{$addr} = $early{$addr}[1];
2492                $property{$addr} = $early{$addr}[2];
2493
2494                # Ignore code points not in the version being compiled
2495                push $each_line_handler{$addr}->@*, \&_exclude_unassigned;
2496
2497                if (   $v_version lt v2.0        # Hanguls in this release ...
2498                    && defined $early{$addr}[3]) # ... need special treatment
2499                {
2500                    push $eof_handler{$addr}->@*, \&_fixup_obsolete_hanguls;
2501                }
2502            }
2503
2504            # And this substitute is valid for all releases.
2505            $first_released{$addr} = v0;
2506        }
2507        else {  # Normal behavior
2508            $progress = $file{$addr};
2509            unshift $early{$addr}->@*, 0; # No substitute
2510        }
2511
2512        my $file = $file{$addr};
2513        $progress_message{$addr} = "Processing $progress"
2514                                            unless $progress_message{$addr};
2515
2516        # A file should be there if it is within the window of versions for
2517        # which Unicode supplies it
2518        if ($withdrawn{$addr} && $withdrawn{$addr} le $v_version) {
2519            $in_this_release{$addr} = 0;
2520            $skip{$addr} = "";
2521        }
2522        else {
2523            $in_this_release{$addr} = $first_released{$addr} le $v_version;
2524
2525            # Check that the file for this object (possibly using a substitute
2526            # for early releases) exists or we have a function alternative
2527            if (   ! $function_instead_of_file
2528                && ! main::file_exists($file))
2529            {
2530                # Here there is nothing available for this release.  This is
2531                # fine if we aren't expecting anything in this release.
2532                if (! $in_this_release{$addr}) {
2533                    $skip{$addr} = "";  # Don't remark since we expected
2534                                        # nothing and got nothing
2535                }
2536                elsif ($optional{$addr}->@*) {
2537
2538                    # Here the file is optional in this release; Use the
2539                    # passed in text to document this case in the pod.
2540                    $skip{$addr} = $pod_message_for_non_existent_optional;
2541                }
2542                elsif (   $in_this_release{$addr}
2543                       && ! defined $skip{$addr}
2544                       && defined $file)
2545                { # Doesn't exist but should.
2546                    $skip{$addr} = "'$file' not found.  Possibly Big problems";
2547                    Carp::my_carp($skip{$addr});
2548                }
2549            }
2550            elsif ($debug_skip && ! defined $skip{$addr} && ! $non_skip{$addr})
2551            {
2552
2553                # The file exists; if not skipped for another reason, and we are
2554                # skipping most everything during debugging builds, use that as
2555                # the skip reason.
2556                $skip{$addr} = '$debug_skip is on'
2557            }
2558        }
2559
2560        if (   ! $debug_skip
2561            && $non_skip{$addr}
2562            && ! $required_even_in_debug_skip{$addr}
2563            && $verbosity)
2564        {
2565            print "Warning: " . __PACKAGE__ . " constructor for $file has useless 'non_skip' in it\n";
2566        }
2567
2568        # Here, we have figured out if we will be skipping this file or not.
2569        # If so, we add any single property it defines to any passed in
2570        # optional property list.  These will be dealt with at run time.
2571        if (defined $skip{$addr}) {
2572            if ($property{$addr}) {
2573                push $optional{$addr}->@*, $property{$addr};
2574            }
2575        } # Otherwise, are going to process the file.
2576        elsif ($property{$addr}) {
2577
2578            # If the file has a property defined in the constructor for it, it
2579            # means that the property is not listed in the file's entries.  So
2580            # add a handler (to the list of line handlers) to insert the
2581            # property name into the lines, to provide a uniform interface to
2582            # the final processing subroutine.
2583            push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2584        }
2585        elsif ($properties{$addr}) {
2586
2587            # Similarly, there may be more than one property represented on
2588            # each line, with no clue but the constructor input what those
2589            # might be.  Add a handler for each line in the input so that it
2590            # creates a separate input line for each property in those input
2591            # lines, thus making them suitable to handle generically.
2592
2593            push @{$each_line_handler{$addr}},
2594                 sub {
2595                    my $file = shift;
2596                    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2597                    my @fields = split /\s*;\s*/, $_, -1;
2598
2599                    if (@fields - 1 > @{$properties{$addr}}) {
2600                        $file->carp_bad_line('Extra fields');
2601                        $_ = "";
2602                        return;
2603                    }
2604                    my $range = shift @fields;  # 0th element is always the
2605                                                # range
2606
2607                    # The next fields in the input line correspond
2608                    # respectively to the stored properties.
2609                    for my $i (0 ..  @{$properties{$addr}} - 1) {
2610                        my $property_name = $properties{$addr}[$i];
2611                        next if $property_name eq '<ignored>';
2612                        $file->insert_adjusted_lines(
2613                              "$range; $property_name; $fields[$i]");
2614                    }
2615                    $_ = "";
2616
2617                    return;
2618                };
2619        }
2620
2621        {   # On non-ascii platforms, we use a special pre-handler
2622            no strict;
2623            no warnings 'once';
2624            *next_line = (main::NON_ASCII_PLATFORM)
2625                         ? *_next_line_with_remapped_range
2626                         : *_next_line;
2627        }
2628
2629        &{$construction_time_handler{$addr}}($self)
2630                                        if $construction_time_handler{$addr};
2631
2632        return $self;
2633    }
2634
2635
2636    use overload
2637        fallback => 0,
2638        qw("") => "_operator_stringify",
2639        "." => \&main::_operator_dot,
2640        ".=" => \&main::_operator_dot_equal,
2641    ;
2642
2643    sub _operator_stringify($self, $other="", $reversed=0) {
2644        return __PACKAGE__ . " object for " . $self->file;
2645    }
2646
2647    sub run($self) {
2648        # Process the input object $self.  This opens and closes the file and
2649        # calls all the handlers for it.  Currently,  this can only be called
2650        # once per file, as it destroy's the EOF handlers
2651
2652        # flag to make sure extracted files are processed early
2653        state $seen_non_extracted = 0;
2654
2655        my $addr = pack 'J', refaddr $self;
2656
2657        my $file = $file{$addr};
2658
2659        if (! $file) {
2660            $handle{$addr} = 'pretend_is_open';
2661        }
2662        else {
2663            if ($seen_non_extracted) {
2664                if ($file =~ /$EXTRACTED/i) # Some platforms may change the
2665                                            # case of the file's name
2666                {
2667                    Carp::my_carp_bug(main::join_lines(<<END
2668$file should be processed just after the 'Prop...Alias' files, and before
2669anything not in the $EXTRACTED_DIR directory.  Proceeding, but the results may
2670have subtle problems
2671END
2672                    ));
2673                }
2674            }
2675            elsif ($EXTRACTED_DIR
2676
2677                    # We only do this check for generic property files
2678                    && $handler{$addr} == \&main::process_generic_property_file
2679
2680                    && $file !~ /$EXTRACTED/i)
2681            {
2682                # We don't set this (by the 'if' above) if we have no
2683                # extracted directory, so if running on an early version,
2684                # this test won't work.  Not worth worrying about.
2685                $seen_non_extracted = 1;
2686            }
2687
2688            # Mark the file as having being processed, and warn if it
2689            # isn't a file we are expecting.  As we process the files,
2690            # they are deleted from the hash, so any that remain at the
2691            # end of the program are files that we didn't process.
2692            my $fkey = File::Spec->rel2abs($file);
2693            my $exists = delete $potential_files{lc($fkey)};
2694
2695            Carp::my_carp("Was not expecting '$file'.")
2696                                    if $exists && ! $in_this_release{$addr};
2697
2698            # If there is special handling for compiling Unicode releases
2699            # earlier than the first one in which Unicode defines this
2700            # property ...
2701            if ($early{$addr}->@* > 1) {
2702
2703                # Mark as processed any substitute file that would be used in
2704                # such a release
2705                $fkey = File::Spec->rel2abs($early{$addr}[1]);
2706                delete $potential_files{lc($fkey)};
2707
2708                # As commented in the constructor code, when using the
2709                # official property, we still have to allow the publicly
2710                # inaccessible early name so that the core code which uses it
2711                # will work regardless.
2712                if (   ! $only_early{$addr}
2713                    && ! $early{$addr}[0]
2714                    && $early{$addr}->@* > 2)
2715                {
2716                    my $early_property_name = $early{$addr}[2];
2717                    if ($property{$addr} ne $early_property_name) {
2718                        main::property_ref($property{$addr})
2719                                            ->add_alias($early_property_name);
2720                    }
2721                }
2722            }
2723
2724            # We may be skipping this file ...
2725            if (defined $skip{$addr}) {
2726
2727                # If the file isn't supposed to be in this release, there is
2728                # nothing to do
2729                if ($in_this_release{$addr}) {
2730
2731                    # But otherwise, we may print a message
2732                    if ($debug_skip) {
2733                        print STDERR "Skipping input file '$file'",
2734                                     " because '$skip{$addr}'\n";
2735                    }
2736
2737                    # And add it to the list of skipped files, which is later
2738                    # used to make the pod
2739                    $skipped_files{$file} = $skip{$addr};
2740
2741                    # The 'optional' list contains properties that are also to
2742                    # be skipped along with the file.  (There may also be
2743                    # digits which are just placeholders to make sure it isn't
2744                    # an empty list
2745                    foreach my $property ($optional{$addr}->@*) {
2746                        next unless $property =~ /\D/;
2747                        my $prop_object = main::property_ref($property);
2748                        next unless defined $prop_object;
2749                        $prop_object->set_fate($SUPPRESSED, $skip{$addr});
2750                    }
2751                }
2752
2753                return;
2754            }
2755
2756            # Here, we are going to process the file.  Open it, converting the
2757            # slashes used in this program into the proper form for the OS
2758            my $file_handle;
2759            if (not open $file_handle, "<", $file) {
2760                Carp::my_carp("Can't open $file.  Skipping: $!");
2761                return;
2762            }
2763            $handle{$addr} = $file_handle; # Cache the open file handle
2764
2765            # If possible, make sure that the file is the correct version.
2766            # (This data isn't available on early Unicode releases or in
2767            # UnicodeData.txt.)  We don't do this check if we are using a
2768            # substitute file instead of the official one (though the code
2769            # could be extended to do so).
2770            if ($in_this_release{$addr}
2771                && ! $early{$addr}[0]
2772                && lc($file) ne 'unicodedata.txt')
2773            {
2774                my $this_version;
2775
2776                if ($file !~ /^Unihan/i) {
2777
2778                    # The non-Unihan files started getting version numbers in
2779                    # 3.2, but some files in 4.0 are unchanged from 3.2, and
2780                    # marked as 3.2.  4.0.1 is the first version where there
2781                    # are no files marked as being from less than 4.0, though
2782                    # some are marked as 4.0.  In versions after that, the
2783                    # numbers are correct.
2784                    if ($v_version ge v4.0.1) {
2785                        $_ = <$file_handle>;    # The version number is in the
2786                                                # very first line if it is a
2787                                                # UCD file; otherwise, it
2788                                                # might be
2789                        goto valid_version if $_ =~ / - $string_version \. /x;
2790                        chomp;
2791                        if ($ucd{$addr}) {
2792                            $_ =~ s/^#\s*//;
2793
2794                            # 4.0.1 had some valid files that weren't updated.
2795                            goto valid_version
2796                                    if $v_version eq v4.0.1 && $_ =~ /4\.0\.0/;
2797                            $this_version = $_;
2798                            goto wrong_version;
2799                        }
2800                        else {
2801                            my $BOM = "\x{FEFF}";
2802                            utf8::encode($BOM);
2803                            my $BOM_re = qr/ ^ (?:$BOM)? /x;
2804
2805                            do {
2806                                chomp;
2807
2808                                # BOM; seems to be on many lines in some
2809                                # files!!
2810                                $_ =~ s/$BOM_re//;
2811
2812                                if (/./) {
2813
2814                                    # Only look for the version if in the
2815                                    # first comment block.
2816                                    goto no_version unless $_ =~ /^#/;
2817
2818                                    if ($_ =~ /Version:? (\S*)/) {
2819                                        $this_version = $1;
2820                                        goto valid_version
2821                                          if  $this_version eq $string_version;
2822                                        goto valid_version
2823                                            if  "$this_version.0"
2824                                                            eq $string_version;
2825                                    }
2826                                }
2827                            } while (<$file_handle>);
2828
2829                            goto no_version;
2830                        }
2831                    }
2832                }
2833                elsif ($v_version ge v6.0.0) { # Unihan
2834
2835                    # Unihan files didn't get accurate version numbers until
2836                    # 6.0.  The version is somewhere in the first comment
2837                    # block
2838                    while (<$file_handle>) {
2839                        goto no_version if $_ !~ /^#/;
2840                        chomp;
2841                        $_ =~ s/^#\s*//;
2842                        next if $_ !~ / version: /x;
2843                        goto valid_version if $_ =~ /$string_version/;
2844                        goto wrong_version;
2845                    }
2846                    goto no_version;
2847                }
2848                else {  # Old Unihan; have to assume is valid
2849                    goto valid_version;
2850                }
2851
2852              wrong_version:
2853                die Carp::my_carp("File '$file' is version "
2854                                . "'$this_version'.  It should be "
2855                                . "version $string_version");
2856              no_version:
2857                Carp::my_carp_bug("Could not find the expected "
2858                                . "version info in file '$file'");
2859            }
2860        }
2861
2862      valid_version:
2863        print "$progress_message{$addr}\n" if $verbosity >= $PROGRESS;
2864
2865        # Call any special handler for before the file.
2866        &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2867
2868        # Then the main handler
2869        &{$handler{$addr}}($self);
2870
2871        # Then any special post-file handler.
2872        &{$post_handler{$addr}}($self) if $post_handler{$addr};
2873
2874        # If any errors have been accumulated, output the counts (as the first
2875        # error message in each class was output when it was encountered).
2876        if ($errors{$addr}) {
2877            my $total = 0;
2878            my $types = 0;
2879            foreach my $error (keys %{$errors{$addr}}) {
2880                $total += $errors{$addr}->{$error};
2881                delete $errors{$addr}->{$error};
2882                $types++;
2883            }
2884            if ($total > 1) {
2885                my $message
2886                        = "A total of $total lines had errors in $file.  ";
2887
2888                $message .= ($types == 1)
2889                            ? '(Only the first one was displayed.)'
2890                            : '(Only the first of each type was displayed.)';
2891                Carp::my_carp($message);
2892            }
2893        }
2894
2895        if (@{$missings{$addr}}) {
2896            Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines.  Generated tables likely are wrong");
2897        }
2898
2899        # If a real file handle, close it.
2900        close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2901                                                        ref $handle{$addr};
2902        $handle{$addr} = "";   # Uses empty to indicate that has already seen
2903                               # the file, as opposed to undef
2904        return;
2905    }
2906
2907    sub _next_line($self) {
2908        # Sets $_ to be the next logical input line, if any.  Returns non-zero
2909        # if such a line exists.  'logical' means that any lines that have
2910        # been added via insert_lines() will be returned in $_ before the file
2911        # is read again.
2912
2913        my $addr = pack 'J', refaddr $self;
2914
2915        # Here the file is open (or if the handle is not a ref, is an open
2916        # 'virtual' file).  Get the next line; any inserted lines get priority
2917        # over the file itself.
2918        my $adjusted;
2919
2920        LINE:
2921        while (1) { # Loop until find non-comment, non-empty line
2922            #local $to_trace = 1 if main::DEBUG;
2923            my $inserted_ref = shift @{$added_lines{$addr}};
2924            if (defined $inserted_ref) {
2925                ($adjusted, $_) = @{$inserted_ref};
2926                trace $adjusted, $_ if main::DEBUG && $to_trace;
2927                return 1 if $adjusted;
2928            }
2929            else {
2930                last if ! ref $handle{$addr}; # Don't read unless is real file
2931                last if ! defined ($_ = readline $handle{$addr});
2932            }
2933            chomp;
2934            trace $_ if main::DEBUG && $to_trace;
2935
2936            # See if this line is the comment line that defines what property
2937            # value that code points that are not listed in the file should
2938            # have.  The format or existence of these lines is not guaranteed
2939            # by Unicode since they are comments, but the documentation says
2940            # that this was added for machine-readability, so probably won't
2941            # change.  This works starting in Unicode Version 5.0.  They look
2942            # like:
2943            #
2944            # @missing: 0000..10FFFF; Not_Reordered
2945            # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2946            # @missing: 0000..10FFFF; ; NaN
2947            #
2948            # Save the line for a later get_missings() call.
2949            if (/$missing_defaults_prefix/) {
2950                if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2951                    $self->carp_bad_line("Unexpected \@missing line.  Assuming no missing entries");
2952                }
2953                elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2954                    my $start = $1;     # The pattern saves the beginning and
2955                    my $end = $2;       # end points of the range the default
2956                                        # is for
2957                    my @defaults = split /\s* ; \s*/x, $_;
2958
2959                    # The first field is the @missing, which ends in a
2960                    # semi-colon, so can safely shift.
2961                    shift @defaults;
2962
2963                    # Some of these lines may have empty field placeholders
2964                    # which get in the way.  An example is:
2965                    # @missing: 0000..10FFFF; ; NaN
2966                    # Remove them.  Process starting from the top so the
2967                    # splice doesn't affect things still to be looked at.
2968                    for (my $i = @defaults - 1; $i >= 0; $i--) {
2969                        next if $defaults[$i] ne "";
2970                        splice @defaults, $i, 1;
2971                    }
2972
2973                    # What's left should be just the property (maybe) and the
2974                    # default.  Having only one element means it doesn't have
2975                    # the property.
2976                    my $default;
2977                    my $property;
2978                    if (@defaults >= 1) {
2979                        if (@defaults == 1) {
2980                            $default = $defaults[0];
2981                        }
2982                        else {
2983                            $property = $defaults[0];
2984                            $default = $defaults[1];
2985                        }
2986                    }
2987
2988                    if (@defaults < 1
2989                        || @defaults > 2
2990                        || ($default =~ /^</
2991                            && $default !~ /^<code *point>$/i
2992                            && $default !~ /^<none>$/i
2993                            && $default !~ /^<script>$/i))
2994                    {
2995                        $self->carp_bad_line("Unrecognized \@missing line: $_.  Assuming no missing entries");
2996                    }
2997                    else {
2998
2999                        # If the property is missing from the line, it should
3000                        # be the one for the whole file
3001                        $property = $property{$addr} if ! defined $property;
3002
3003                        # Change <none> to the null string, which is what it
3004                        # really means.  If the default is the code point
3005                        # itself, set it to <code point>, which is what
3006                        # Unicode uses (but sometimes they've forgotten the
3007                        # space)
3008                        if ($default =~ /^<none>$/i) {
3009                            $default = "";
3010                        }
3011                        elsif ($default =~ /^<code *point>$/i) {
3012                            $default = $CODE_POINT;
3013                        }
3014                        elsif ($default =~ /^<script>$/i) {
3015
3016                            # Special case this one.  Currently is from
3017                            # ScriptExtensions.txt, and means for all unlisted
3018                            # code points, use their Script property values.
3019                            # For the code points not listed in that file, the
3020                            # default value is 'Unknown'.
3021                            $default = "Unknown";
3022                        }
3023
3024                        # Store them as a sub-hash as part of an array, with
3025                        # both components.
3026                        push @{$missings{$addr}}, { start    => hex $start,
3027                                                    end      => hex $end,
3028                                                    default  => $default,
3029                                                    property => $property
3030                                                  };
3031                    }
3032                }
3033
3034                # There is nothing for the caller to process on this comment
3035                # line.
3036                next;
3037            }
3038
3039            # Unless to keep, remove comments.  If to keep, ignore
3040            # comment-only lines
3041            if ($retain_trailing_comments{$addr}) {
3042                next if / ^ \s* \# /x;
3043
3044                # But escape any single quotes (done in both the comment and
3045                # non-comment portion; this could be a bug someday, but not
3046                # likely)
3047                s/'/\\'/g;
3048            }
3049            else {
3050                s/#.*//;
3051            }
3052
3053            # Remove trailing space, and skip this line if the result is empty
3054            s/\s+$//;
3055            next if /^$/;
3056
3057            # Call any handlers for this line, and skip further processing of
3058            # the line if the handler sets the line to null.
3059            foreach my $sub_ref (@{$each_line_handler{$addr}}) {
3060                &{$sub_ref}($self);
3061                next LINE if /^$/;
3062            }
3063
3064            # Here the line is ok.  return success.
3065            return 1;
3066        } # End of looping through lines.
3067
3068        # If there are EOF handlers, call each (only once) and if it generates
3069        # more lines to process go back in the loop to handle them.
3070        while ($eof_handler{$addr}->@*) {
3071            &{$eof_handler{$addr}[0]}($self);
3072            shift $eof_handler{$addr}->@*;   # Currently only get one shot at it.
3073            goto LINE if $added_lines{$addr};
3074        }
3075
3076        # Return failure -- no more lines.
3077        return 0;
3078
3079    }
3080
3081    sub _next_line_with_remapped_range($self) {
3082        # like _next_line(), but for use on non-ASCII platforms.  It sets $_
3083        # to be the next logical input line, if any.  Returns non-zero if such
3084        # a line exists.  'logical' means that any lines that have been added
3085        # via insert_lines() will be returned in $_ before the file is read
3086        # again.
3087        #
3088        # The difference from _next_line() is that this remaps the Unicode
3089        # code points in the input to those of the native platform.  Each
3090        # input line contains a single code point, or a single contiguous
3091        # range of them  This routine splits each range into its individual
3092        # code points and caches them.  It returns the cached values,
3093        # translated into their native equivalents, one at a time, for each
3094        # call, before reading the next line.  Since native values can only be
3095        # a single byte wide, no translation is needed for code points above
3096        # 0xFF, and ranges that are entirely above that number are not split.
3097        # If an input line contains the range 254-1000, it would be split into
3098        # three elements: 254, 255, and 256-1000.  (The downstream table
3099        # insertion code will sort and coalesce the individual code points
3100        # into appropriate ranges.)
3101
3102        my $addr = pack 'J', refaddr $self;
3103
3104        while (1) {
3105
3106            # Look in cache before reading the next line.  Return any cached
3107            # value, translated
3108            my $inserted = shift @{$remapped_lines{$addr}};
3109            if (defined $inserted) {
3110                trace $inserted if main::DEBUG && $to_trace;
3111                $_ = $inserted =~ s/^ ( \d+ ) /sprintf("%04X", utf8::unicode_to_native($1))/xer;
3112                trace $_ if main::DEBUG && $to_trace;
3113                return 1;
3114            }
3115
3116            # Get the next line.
3117            return 0 unless _next_line($self);
3118
3119            # If there is a special handler for it, return the line,
3120            # untranslated.  This should happen only for files that are
3121            # special, not being code-point related, such as property names.
3122            return 1 if $handler{$addr}
3123                                    != \&main::process_generic_property_file;
3124
3125            my ($range, $property_name, $map, @remainder)
3126                = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
3127
3128            if (@remainder
3129                || ! defined $property_name
3130                || $range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
3131            {
3132                Carp::my_carp_bug("Unrecognized input line '$_'.  Ignored");
3133            }
3134
3135            my $low = hex $1;
3136            my $high = (defined $2) ? hex $2 : $low;
3137
3138            # If the input maps the range to another code point, remap the
3139            # target if it is between 0 and 255.
3140            my $tail;
3141            if (defined $map) {
3142                $map =~ s/\b 00 ( [0-9A-F]{2} ) \b/sprintf("%04X", utf8::unicode_to_native(hex $1))/gxe;
3143                $tail = "$property_name; $map";
3144                $_ = "$range; $tail";
3145            }
3146            else {
3147                $tail = $property_name;
3148            }
3149
3150            # If entire range is above 255, just return it, unchanged (except
3151            # any mapped-to code point, already changed above)
3152            return 1 if $low > 255;
3153
3154            # Cache an entry for every code point < 255.  For those in the
3155            # range above 255, return a dummy entry for just that portion of
3156            # the range.  Note that this will be out-of-order, but that is not
3157            # a problem.
3158            foreach my $code_point ($low .. $high) {
3159                if ($code_point > 255) {
3160                    $_ = sprintf "%04X..%04X; $tail", $code_point, $high;
3161                    return 1;
3162                }
3163                push @{$remapped_lines{$addr}}, "$code_point; $tail";
3164            }
3165        } # End of looping through lines.
3166
3167        # NOTREACHED
3168    }
3169
3170#   Not currently used, not fully tested.
3171#    sub peek {
3172#        # Non-destructive lookahead one non-adjusted, non-comment, non-blank
3173#        # record.  Not callable from an each_line_handler(), nor does it call
3174#        # an each_line_handler() on the line.
3175#
3176#        my $self = shift;
3177#        my $addr = pack 'J', refaddr $self;
3178#
3179#        foreach my $inserted_ref (@{$added_lines{$addr}}) {
3180#            my ($adjusted, $line) = @{$inserted_ref};
3181#            next if $adjusted;
3182#
3183#            # Remove comments and trailing space, and return a non-empty
3184#            # resulting line
3185#            $line =~ s/#.*//;
3186#            $line =~ s/\s+$//;
3187#            return $line if $line ne "";
3188#        }
3189#
3190#        return if ! ref $handle{$addr}; # Don't read unless is real file
3191#        while (1) { # Loop until find non-comment, non-empty line
3192#            local $to_trace = 1 if main::DEBUG;
3193#            trace $_ if main::DEBUG && $to_trace;
3194#            return if ! defined (my $line = readline $handle{$addr});
3195#            chomp $line;
3196#            push @{$added_lines{$addr}}, [ 0, $line ];
3197#
3198#            $line =~ s/#.*//;
3199#            $line =~ s/\s+$//;
3200#            return $line if $line ne "";
3201#        }
3202#
3203#        return;
3204#    }
3205
3206
3207    sub insert_lines($self, @lines) {
3208        # Lines can be inserted so that it looks like they were in the input
3209        # file at the place it was when this routine is called.  See also
3210        # insert_adjusted_lines().  Lines inserted via this routine go through
3211        # any each_line_handler()
3212
3213        # Each inserted line is an array, with the first element being 0 to
3214        # indicate that this line hasn't been adjusted, and needs to be
3215        # processed.
3216        push @{$added_lines{pack 'J', refaddr $self}}, map { [ 0, $_ ] } @lines;
3217        return;
3218    }
3219
3220    sub insert_adjusted_lines($self, @lines) {
3221        # Lines can be inserted so that it looks like they were in the input
3222        # file at the place it was when this routine is called.  See also
3223        # insert_lines().  Lines inserted via this routine are already fully
3224        # adjusted, ready to be processed; each_line_handler()s handlers will
3225        # not be called.  This means this is not a completely general
3226        # facility, as only the last each_line_handler on the stack should
3227        # call this.  It could be made more general, by passing to each of the
3228        # line_handlers their position on the stack, which they would pass on
3229        # to this routine, and that would replace the boolean first element in
3230        # the anonymous array pushed here, so that the next_line routine could
3231        # use that to call only those handlers whose index is after it on the
3232        # stack.  But this is overkill for what is needed now.
3233
3234        trace $self if main::DEBUG && $to_trace;
3235
3236        # Each inserted line is an array, with the first element being 1 to
3237        # indicate that this line has been adjusted
3238        push @{$added_lines{pack 'J', refaddr $self}}, map { [ 1, $_ ] } @lines;
3239        return;
3240    }
3241
3242    sub get_missings($self) {
3243        # Returns the stored up @missings lines' values, and clears the list.
3244        # The values are in a hash, consisting of 'default' and 'property'.
3245        # However, since these lines can be stacked up, the return is an array
3246        # of all these hashes.
3247
3248        my $addr = pack 'J', refaddr $self;
3249
3250        # If not accepting a list return, just return the first one.
3251        return shift @{$missings{$addr}} unless wantarray;
3252
3253        my @return = @{$missings{$addr}};
3254        undef @{$missings{$addr}};
3255        return @return;
3256    }
3257
3258    sub _exclude_unassigned($self) {
3259
3260        # Takes the range in $_ and excludes code points that aren't assigned
3261        # in this release
3262
3263        state $skip_inserted_count = 0;
3264
3265        # Ignore recursive calls.
3266        if ($skip_inserted_count) {
3267            $skip_inserted_count--;
3268            return;
3269        }
3270
3271        # Find what code points are assigned in this release
3272        main::calculate_Assigned() if ! defined $Assigned;
3273
3274        my ($range, @remainder)
3275            = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
3276
3277        # Examine the range.
3278        if ($range =~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
3279        {
3280            my $low = hex $1;
3281            my $high = (defined $2) ? hex $2 : $low;
3282
3283            # Split the range into subranges of just those code points in it
3284            # that are assigned.
3285            my @ranges = (Range_List->new(Initialize
3286                              => Range->new($low, $high)) & $Assigned)->ranges;
3287
3288            # Do nothing if nothing in the original range is assigned in this
3289            # release; handle normally if everything is in this release.
3290            if (! @ranges) {
3291                $_ = "";
3292            }
3293            elsif (@ranges != 1) {
3294
3295                # Here, some code points in the original range aren't in this
3296                # release; @ranges gives the ones that are.  Create fake input
3297                # lines for each of the ranges, and set things up so that when
3298                # this routine is called on that fake input, it will do
3299                # nothing.
3300                $skip_inserted_count = @ranges;
3301                my $remainder = join ";", @remainder;
3302                for my $range (@ranges) {
3303                    $self->insert_lines(sprintf("%04X..%04X;%s",
3304                                    $range->start, $range->end, $remainder));
3305                }
3306                $_ = "";    # The original range is now defunct.
3307            }
3308        }
3309
3310        return;
3311    }
3312
3313    sub _fixup_obsolete_hanguls($self) {
3314
3315        # This is called only when compiling Unicode version 1.  All Unicode
3316        # data for subsequent releases assumes that the code points that were
3317        # Hangul syllables in this release only are something else, so if
3318        # using such data, we have to override it
3319
3320        my $addr = pack 'J', refaddr $self;
3321
3322        my $object = main::property_ref($property{$addr});
3323        $object->add_map($FIRST_REMOVED_HANGUL_SYLLABLE,
3324                         $FINAL_REMOVED_HANGUL_SYLLABLE,
3325                         $early{$addr}[3],  # Passed-in value for these
3326                         Replace => $UNCONDITIONALLY);
3327    }
3328
3329    sub _insert_property_into_line($self) {
3330        # Add a property field to $_, if this file requires it.
3331
3332        my $property = $property{pack 'J', refaddr $self};
3333        $_ =~ s/(;|$)/; $property$1/;
3334        return;
3335    }
3336
3337    sub carp_bad_line($self, $message="") {
3338        # Output consistent error messages, using either a generic one, or the
3339        # one given by the optional parameter.  To avoid gazillions of the
3340        # same message in case the syntax of a  file is way off, this routine
3341        # only outputs the first instance of each message, incrementing a
3342        # count so the totals can be output at the end of the file.
3343
3344        my $addr = pack 'J', refaddr $self;
3345
3346        $message = 'Unexpected line' unless $message;
3347
3348        # No trailing punctuation so as to fit with our addenda.
3349        $message =~ s/[.:;,]$//;
3350
3351        # If haven't seen this exact message before, output it now.  Otherwise
3352        # increment the count of how many times it has occurred
3353        unless ($errors{$addr}->{$message}) {
3354            Carp::my_carp("$message in '$_' in "
3355                            . $file{$addr}
3356                            . " at line $..  Skipping this line;");
3357            $errors{$addr}->{$message} = 1;
3358        }
3359        else {
3360            $errors{$addr}->{$message}++;
3361        }
3362
3363        # Clear the line to prevent any further (meaningful) processing of it.
3364        $_ = "";
3365
3366        return;
3367    }
3368} # End closure
3369
3370package Multi_Default;
3371
3372sub trace { return main::trace(@_); }
3373
3374# Certain properties in early versions of Unicode had more than one possible
3375# default for code points missing from the files.  In these cases, one
3376# default applies to everything left over after all the others are applied,
3377# and for each of the others, there is a description of which class of code
3378# points applies to it.  This object helps implement this by storing the
3379# defaults, and for all but that final default, an eval string that generates
3380# the class that it applies to.  That class must be a Range_List, or contains
3381# a Range_List that the overloaded operators recognize as to be operated on.
3382# A string is used because this is called early when we know symbolically what
3383# needs to be done, but typically before any data is gathered.  Thus the
3384# evaluation gets delayed until we have at hand all the needed information.
3385
3386{   # Closure
3387
3388    main::setup_package();
3389
3390    my %class_defaults;
3391    # The defaults structure for the classes
3392    main::set_access('class_defaults', \%class_defaults, 'readable_array');
3393
3394    my %other_default;
3395    # The default that applies to everything left over.
3396    main::set_access('other_default', \%other_default, 'r');
3397
3398    my %iterator;
3399
3400    sub new {
3401        # The constructor is called with default => eval pairs, terminated by
3402        # the left-over default. e.g.
3403        # Multi_Default->new(
3404        #        'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
3405        #               -  0x200D',
3406        #        'R' => 'some other expression that evaluates to code points',
3407        #        .
3408        #        .
3409        #        .
3410        #        'U'));
3411        # It is best to leave the final value be the one that matches the
3412        # above-Unicode code points.
3413
3414        my $class = shift;
3415
3416        my $self = bless \do{my $anonymous_scalar}, $class;
3417        my $addr = pack 'J', refaddr $self;
3418        $iterator{$addr} = 0;
3419
3420        return $self unless @_;
3421
3422        while (@_ > 1) {
3423            $self->append_default(shift, shift);
3424        }
3425
3426        $self->set_final_default(shift);
3427
3428        return $self;
3429    }
3430
3431    sub append_default($self, $new_default, $eval) {
3432        my $addr = pack 'J', refaddr $self;
3433
3434        # Pushes a default setting to the current list
3435        push $class_defaults{$addr}->@*, [ $new_default, $eval ];
3436    }
3437
3438    sub set_final_default($self, $new_default) {
3439        my $addr = pack 'J', refaddr $self;
3440        $other_default{$addr} = $new_default;
3441    }
3442
3443    sub get_next_defaults($self) {
3444        # Iterates and returns the next class of defaults.
3445
3446        my $addr = pack 'J', refaddr $self;
3447        if ($iterator{$addr}++ < $class_defaults{$addr}->@*) {
3448            return $class_defaults{$addr}->[$iterator{$addr}-1]->@*;
3449        }
3450
3451        $iterator{$addr} = 0;
3452        return undef;
3453    }
3454}
3455
3456package Alias;
3457
3458# An alias is one of the names that a table goes by.  This class defines them
3459# including some attributes.  Everything is currently setup in the
3460# constructor.
3461
3462
3463{   # Closure
3464
3465    main::setup_package();
3466
3467    my %name;
3468    main::set_access('name', \%name, 'r');
3469
3470    my %loose_match;
3471    # Should this name match loosely or not.
3472    main::set_access('loose_match', \%loose_match, 'r');
3473
3474    my %make_re_pod_entry;
3475    # Some aliases should not get their own entries in the re section of the
3476    # pod, because they are covered by a wild-card, and some we want to
3477    # discourage use of.  Binary
3478    main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's');
3479
3480    my %ucd;
3481    # Is this documented to be accessible via Unicode::UCD
3482    main::set_access('ucd', \%ucd, 'r', 's');
3483
3484    my %status;
3485    # Aliases have a status, like deprecated, or even suppressed (which means
3486    # they don't appear in documentation).  Enum
3487    main::set_access('status', \%status, 'r');
3488
3489    my %ok_as_filename;
3490    # Similarly, some aliases should not be considered as usable ones for
3491    # external use, such as file names, or we don't want documentation to
3492    # recommend them.  Boolean
3493    main::set_access('ok_as_filename', \%ok_as_filename, 'r');
3494
3495    sub new {
3496        my $class = shift;
3497
3498        my $self = bless \do { my $anonymous_scalar }, $class;
3499        my $addr = pack 'J', refaddr $self;
3500
3501        $name{$addr} = shift;
3502        $loose_match{$addr} = shift;
3503        $make_re_pod_entry{$addr} = shift;
3504        $ok_as_filename{$addr} = shift;
3505        $status{$addr} = shift;
3506        $ucd{$addr} = shift;
3507
3508        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3509
3510        # Null names are never ok externally
3511        $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
3512
3513        return $self;
3514    }
3515}
3516
3517package Range;
3518
3519# A range is the basic unit for storing code points, and is described in the
3520# comments at the beginning of the program.  Each range has a starting code
3521# point; an ending code point (not less than the starting one); a value
3522# that applies to every code point in between the two end-points, inclusive;
3523# and an enum type that applies to the value.  The type is for the user's
3524# convenience, and has no meaning here, except that a non-zero type is
3525# considered to not obey the normal Unicode rules for having standard forms.
3526#
3527# The same structure is used for both map and match tables, even though in the
3528# latter, the value (and hence type) is irrelevant and could be used as a
3529# comment.  In map tables, the value is what all the code points in the range
3530# map to.  Type 0 values have the standardized version of the value stored as
3531# well, so as to not have to recalculate it a lot.
3532
3533sub trace { return main::trace(@_); }
3534
3535{   # Closure
3536
3537    main::setup_package();
3538
3539    my %start;
3540    main::set_access('start', \%start, 'r', 's');
3541
3542    my %end;
3543    main::set_access('end', \%end, 'r', 's');
3544
3545    my %value;
3546    main::set_access('value', \%value, 'r', 's');
3547
3548    my %type;
3549    main::set_access('type', \%type, 'r');
3550
3551    my %standard_form;
3552    # The value in internal standard form.  Defined only if the type is 0.
3553    main::set_access('standard_form', \%standard_form);
3554
3555    # Note that if these fields change, the dump() method should as well
3556
3557    sub new($class, $_addr, $_end, @_args) {
3558        my $self = bless \do { my $anonymous_scalar }, $class;
3559        my $addr = pack 'J', refaddr $self;
3560
3561        $start{$addr} = $_addr;
3562        $end{$addr}   = $_end;
3563
3564        my %args = @_args;
3565
3566        my $value = delete $args{'Value'};  # Can be 0
3567        $value = "" unless defined $value;
3568        $value{$addr} = $value;
3569
3570        $type{$addr} = delete $args{'Type'} || 0;
3571
3572        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3573
3574        return $self;
3575    }
3576
3577    use overload
3578        fallback => 0,
3579        qw("") => "_operator_stringify",
3580        "." => \&main::_operator_dot,
3581        ".=" => \&main::_operator_dot_equal,
3582    ;
3583
3584    sub _operator_stringify($self, $other="", $reversed=0) {
3585        my $addr = pack 'J', refaddr $self;
3586
3587        # Output it like '0041..0065 (value)'
3588        my $return = sprintf("%04X", $start{$addr})
3589                        .  '..'
3590                        . sprintf("%04X", $end{$addr});
3591        my $value = $value{$addr};
3592        my $type = $type{$addr};
3593        $return .= ' (';
3594        $return .= "$value";
3595        $return .= ", Type=$type" if $type != 0;
3596        $return .= ')';
3597
3598        return $return;
3599    }
3600
3601    sub standard_form($self) {
3602        # Calculate the standard form only if needed, and cache the result.
3603        # The standard form is the value itself if the type is special.
3604        # This represents a considerable CPU and memory saving - at the time
3605        # of writing there are 368676 non-special objects, but the standard
3606        # form is only requested for 22047 of them - ie about 6%.
3607
3608        my $addr = pack 'J', refaddr $self;
3609
3610        return $standard_form{$addr} if defined $standard_form{$addr};
3611
3612        my $value = $value{$addr};
3613        return $value if $type{$addr};
3614        return $standard_form{$addr} = main::standardize($value);
3615    }
3616
3617    sub dump($self, $indent) {
3618        # Human, not machine readable.  For machine readable, comment out this
3619        # entire routine and let the standard one take effect.
3620        my $addr = pack 'J', refaddr $self;
3621
3622        my $return = $indent
3623                    . sprintf("%04X", $start{$addr})
3624                    . '..'
3625                    . sprintf("%04X", $end{$addr})
3626                    . " '$value{$addr}';";
3627        if (! defined $standard_form{$addr}) {
3628            $return .= "(type=$type{$addr})";
3629        }
3630        elsif ($standard_form{$addr} ne $value{$addr}) {
3631            $return .= "(standard '$standard_form{$addr}')";
3632        }
3633        return $return;
3634    }
3635} # End closure
3636
3637package _Range_List_Base;
3638
3639# Base class for range lists.  A range list is simply an ordered list of
3640# ranges, so that the ranges with the lowest starting numbers are first in it.
3641#
3642# When a new range is added that is adjacent to an existing range that has the
3643# same value and type, it merges with it to form a larger range.
3644#
3645# Ranges generally do not overlap, except that there can be multiple entries
3646# of single code point ranges.  This is because of NameAliases.txt.
3647#
3648# In this program, there is a standard value such that if two different
3649# values, have the same standard value, they are considered equivalent.  This
3650# value was chosen so that it gives correct results on Unicode data
3651
3652# There are a number of methods to manipulate range lists, and some operators
3653# are overloaded to handle them.
3654
3655sub trace { return main::trace(@_); }
3656
3657{ # Closure
3658
3659    our $addr;
3660
3661    # Max is initialized to a negative value that isn't adjacent to 0, for
3662    # simpler tests
3663    my $max_init = -2;
3664
3665    main::setup_package();
3666
3667    my %ranges;
3668    # The list of ranges
3669    main::set_access('ranges', \%ranges, 'readable_array');
3670
3671    my %max;
3672    # The highest code point in the list.  This was originally a method, but
3673    # actual measurements said it was used a lot.
3674    main::set_access('max', \%max, 'r');
3675
3676    my %each_range_iterator;
3677    # Iterator position for each_range()
3678    main::set_access('each_range_iterator', \%each_range_iterator);
3679
3680    my %owner_name_of;
3681    # Name of parent this is attached to, if any.  Solely for better error
3682    # messages.
3683    main::set_access('owner_name_of', \%owner_name_of, 'p_r');
3684
3685    my %_search_ranges_cache;
3686    # A cache of the previous result from _search_ranges(), for better
3687    # performance
3688    main::set_access('_search_ranges_cache', \%_search_ranges_cache);
3689
3690    sub new {
3691        my $class = shift;
3692        my %args = @_;
3693
3694        # Optional initialization data for the range list.  NOTE: For large
3695        # ranges, it is better to use Range object rather than
3696        #   [ low .. high ]
3697        # as it iterates through each one individually in the latter case.
3698        my $initialize = delete $args{'Initialize'};
3699
3700        my $self;
3701
3702        # Use _union() to initialize.  _union() returns an object of this
3703        # class, which means that it will call this constructor recursively.
3704        # But it won't have this $initialize parameter so that it won't
3705        # infinitely loop on this.
3706        return _union($class, $initialize, %args) if defined $initialize;
3707
3708        $self = bless \do { my $anonymous_scalar }, $class;
3709        my $addr = pack 'J', refaddr $self;
3710
3711        # Optional parent object, only for debug info.
3712        $owner_name_of{$addr} = delete $args{'Owner'};
3713        $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
3714
3715        # Stringify, in case it is an object.
3716        $owner_name_of{$addr} = "$owner_name_of{$addr}";
3717
3718        # This is used only for error messages, and so a colon is added
3719        $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
3720
3721        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3722
3723        $max{$addr} = $max_init;
3724
3725        $_search_ranges_cache{$addr} = 0;
3726        $ranges{$addr} = [];
3727
3728        return $self;
3729    }
3730
3731    use overload
3732        fallback => 0,
3733        qw("") => "_operator_stringify",
3734        "." => \&main::_operator_dot,
3735        ".=" => \&main::_operator_dot_equal,
3736    ;
3737
3738    sub _operator_stringify($self, $other="", $reversed=0) {
3739        my $addr = pack 'J', refaddr $self;
3740
3741        return "Range_List attached to '$owner_name_of{$addr}'"
3742                                                if $owner_name_of{$addr};
3743        return "anonymous Range_List " . \$self;
3744    }
3745
3746    sub _union {
3747        # Returns the union of the input code points.  It can be called as
3748        # either a constructor or a method.  If called as a method, the result
3749        # will be a new() instance of the calling object, containing the union
3750        # of that object with the other parameter's code points;  if called as
3751        # a constructor, the first parameter gives the class that the new object
3752        # should be, and the second parameter gives the code points to go into
3753        # it.
3754        # In either case, there are two parameters looked at by this routine;
3755        # any additional parameters are passed to the new() constructor.
3756        #
3757        # The code points can come in the form of some object that contains
3758        # ranges, and has a conventionally named method to access them; or
3759        # they can be an array of individual code points (as integers); or
3760        # just a single code point.
3761        #
3762        # If they are ranges, this routine doesn't make any effort to preserve
3763        # the range values and types of one input over the other.  Therefore
3764        # this base class should not allow _union to be called from other than
3765        # initialization code, so as to prevent two tables from being added
3766        # together where the range values matter.  The general form of this
3767        # routine therefore belongs in a derived class, but it was moved here
3768        # to avoid duplication of code.  The failure to overload this in this
3769        # class keeps it safe.
3770        #
3771        # It does make the effort during initialization to accept tables with
3772        # multiple values for the same code point, and to preserve the order
3773        # of these.  If there is only one input range or range set, it doesn't
3774        # sort (as it should already be sorted to the desired order), and will
3775        # accept multiple values per code point.  Otherwise it will merge
3776        # multiple values into a single one.
3777
3778        my $self;
3779        my @args;   # Arguments to pass to the constructor
3780
3781        my $class = shift;
3782
3783        # If a method call, will start the union with the object itself, and
3784        # the class of the new object will be the same as self.
3785        if (ref $class) {
3786            $self = $class;
3787            $class = ref $self;
3788            push @args, $self;
3789        }
3790
3791        # Add the other required parameter.
3792        push @args, shift;
3793        # Rest of parameters are passed on to the constructor
3794
3795        # Accumulate all records from both lists.
3796        my @records;
3797        my $input_count = 0;
3798        for my $arg (@args) {
3799            #local $to_trace = 0 if main::DEBUG;
3800            trace "argument = $arg" if main::DEBUG && $to_trace;
3801            if (! defined $arg) {
3802                my $message = "";
3803                if (defined $self) {
3804                    $message .= $owner_name_of{pack 'J', refaddr $self};
3805                }
3806                Carp::my_carp_bug($message . "Undefined argument to _union.  No union done.");
3807                return;
3808            }
3809
3810            $arg = [ $arg ] if ! ref $arg;
3811            my $type = ref $arg;
3812            if ($type eq 'ARRAY') {
3813                foreach my $element (@$arg) {
3814                    push @records, Range->new($element, $element);
3815                    $input_count++;
3816                }
3817            }
3818            elsif ($arg->isa('Range')) {
3819                push @records, $arg;
3820                $input_count++;
3821            }
3822            elsif ($arg->can('ranges')) {
3823                push @records, $arg->ranges;
3824                $input_count++;
3825            }
3826            else {
3827                my $message = "";
3828                if (defined $self) {
3829                    $message .= $owner_name_of{pack 'J', refaddr $self};
3830                }
3831                Carp::my_carp_bug($message . "Cannot take the union of a $type.  No union done.");
3832                return;
3833            }
3834        }
3835
3836        # Sort with the range containing the lowest ordinal first, but if
3837        # two ranges start at the same code point, sort with the bigger range
3838        # of the two first, because it takes fewer cycles.
3839        if ($input_count > 1) {
3840            @records = sort { ($a->start <=> $b->start)
3841                                      or
3842                                    # if b is shorter than a, b->end will be
3843                                    # less than a->end, and we want to select
3844                                    # a, so want to return -1
3845                                    ($b->end <=> $a->end)
3846                                   } @records;
3847        }
3848
3849        my $new = $class->new(@_);
3850
3851        # Fold in records so long as they add new information.
3852        for my $set (@records) {
3853            my $start = $set->start;
3854            my $end   = $set->end;
3855            my $value = $set->value;
3856            my $type  = $set->type;
3857            if ($start > $new->max) {
3858                $new->_add_delete('+', $start, $end, $value, Type => $type);
3859            }
3860            elsif ($end > $new->max) {
3861                $new->_add_delete('+', $new->max +1, $end, $value,
3862                                                                Type => $type);
3863            }
3864            elsif ($input_count == 1) {
3865                # Here, overlaps existing range, but is from a single input,
3866                # so preserve the multiple values from that input.
3867                $new->_add_delete('+', $start, $end, $value, Type => $type,
3868                                                Replace => $MULTIPLE_AFTER);
3869            }
3870        }
3871
3872        return $new;
3873    }
3874
3875    sub range_count($self) {        # Return the number of ranges in the range list
3876        return scalar @{$ranges{pack 'J', refaddr $self}};
3877    }
3878
3879    sub min($self) {
3880        # Returns the minimum code point currently in the range list, or if
3881        # the range list is empty, 2 beyond the max possible.  This is a
3882        # method because used so rarely, that not worth saving between calls,
3883        # and having to worry about changing it as ranges are added and
3884        # deleted.
3885
3886        my $addr = pack 'J', refaddr $self;
3887
3888        # If the range list is empty, return a large value that isn't adjacent
3889        # to any that could be in the range list, for simpler tests
3890        return $MAX_WORKING_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
3891        return $ranges{$addr}->[0]->start;
3892    }
3893
3894    sub contains($self, $codepoint) {
3895        # Boolean: Is argument in the range list?  If so returns $i such that:
3896        #   range[$i]->end < $codepoint <= range[$i+1]->end
3897        # which is one beyond what you want; this is so that the 0th range
3898        # doesn't return false
3899
3900        my $i = $self->_search_ranges($codepoint);
3901        return 0 unless defined $i;
3902
3903        # The search returns $i, such that
3904        #   range[$i-1]->end < $codepoint <= range[$i]->end
3905        # So is in the table if and only iff it is at least the start position
3906        # of range $i.
3907        return 0 if $ranges{pack 'J', refaddr $self}->[$i]->start > $codepoint;
3908        return $i + 1;
3909    }
3910
3911    sub containing_range($self, $codepoint) {
3912        # Returns the range object that contains the code point, undef if none
3913        my $i = $self->contains($codepoint);
3914        return unless $i;
3915
3916        # contains() returns 1 beyond where we should look
3917        return $ranges{pack 'J', refaddr $self}->[$i-1];
3918    }
3919
3920    sub value_of($self, $codepoint) {
3921        # Returns the value associated with the code point, undef if none
3922        my $range = $self->containing_range($codepoint);
3923        return unless defined $range;
3924
3925        return $range->value;
3926    }
3927
3928    sub type_of($self, $codepoint) {
3929        # Returns the type of the range containing the code point, undef if
3930        # the code point is not in the table
3931        my $range = $self->containing_range($codepoint);
3932        return unless defined $range;
3933
3934        return $range->type;
3935    }
3936
3937    sub _search_ranges($self, $code_point) {
3938        # Find the range in the list which contains a code point, or where it
3939        # should go if were to add it.  That is, it returns $i, such that:
3940        #   range[$i-1]->end < $codepoint <= range[$i]->end
3941        # Returns undef if no such $i is possible (e.g. at end of table), or
3942        # if there is an error.
3943        my $addr = pack 'J', refaddr $self;
3944
3945        return if $code_point > $max{$addr};
3946        my $r = $ranges{$addr};                # The current list of ranges
3947        my $range_list_size = scalar @$r;
3948        my $i;
3949
3950        use integer;        # want integer division
3951
3952        # Use the cached result as the starting guess for this one, because,
3953        # an experiment on 5.1 showed that 90% of the time the cache was the
3954        # same as the result on the next call (and 7% it was one less).
3955        $i = $_search_ranges_cache{$addr};
3956        $i = 0 if $i >= $range_list_size;   # Reset if no longer valid (prob.
3957                                            # from an intervening deletion
3958        #local $to_trace = 1 if main::DEBUG;
3959        trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
3960        return $i if $code_point <= $r->[$i]->end
3961                     && ($i == 0 || $r->[$i-1]->end < $code_point);
3962
3963        # Here the cache doesn't yield the correct $i.  Try adding 1.
3964        if ($i < $range_list_size - 1
3965            && $r->[$i]->end < $code_point &&
3966            $code_point <= $r->[$i+1]->end)
3967        {
3968            $i++;
3969            trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
3970            $_search_ranges_cache{$addr} = $i;
3971            return $i;
3972        }
3973
3974        # Here, adding 1 also didn't work.  We do a binary search to
3975        # find the correct position, starting with current $i
3976        my $lower = 0;
3977        my $upper = $range_list_size - 1;
3978        while (1) {
3979            trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
3980
3981            if ($code_point <= $r->[$i]->end) {
3982
3983                # Here we have met the upper constraint.  We can quit if we
3984                # also meet the lower one.
3985                last if $i == 0 || $r->[$i-1]->end < $code_point;
3986
3987                $upper = $i;        # Still too high.
3988
3989            }
3990            else {
3991
3992                # Here, $r[$i]->end < $code_point, so look higher up.
3993                $lower = $i;
3994            }
3995
3996            # Split search domain in half to try again.
3997            my $temp = ($upper + $lower) / 2;
3998
3999            # No point in continuing unless $i changes for next time
4000            # in the loop.
4001            if ($temp == $i) {
4002
4003                # We can't reach the highest element because of the averaging.
4004                # So if one below the upper edge, force it there and try one
4005                # more time.
4006                if ($i == $range_list_size - 2) {
4007
4008                    trace "Forcing to upper edge" if main::DEBUG && $to_trace;
4009                    $i = $range_list_size - 1;
4010
4011                    # Change $lower as well so if fails next time through,
4012                    # taking the average will yield the same $i, and we will
4013                    # quit with the error message just below.
4014                    $lower = $i;
4015                    next;
4016                }
4017                Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go.  No action taken.");
4018                return;
4019            }
4020            $i = $temp;
4021        } # End of while loop
4022
4023        if (main::DEBUG && $to_trace) {
4024            trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
4025            trace "i=  [ $i ]", $r->[$i];
4026            trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
4027        }
4028
4029        # Here we have found the offset.  Cache it as a starting point for the
4030        # next call.
4031        $_search_ranges_cache{$addr} = $i;
4032        return $i;
4033    }
4034
4035    sub _add_delete {
4036        # Add, replace or delete ranges to or from a list.  The $type
4037        # parameter gives which:
4038        #   '+' => insert or replace a range, returning a list of any changed
4039        #          ranges.
4040        #   '-' => delete a range, returning a list of any deleted ranges.
4041        #
4042        # The next three parameters give respectively the start, end, and
4043        # value associated with the range.  'value' should be null unless the
4044        # operation is '+';
4045        #
4046        # The range list is kept sorted so that the range with the lowest
4047        # starting position is first in the list, and generally, adjacent
4048        # ranges with the same values are merged into a single larger one (see
4049        # exceptions below).
4050        #
4051        # There are more parameters; all are key => value pairs:
4052        #   Type    gives the type of the value.  It is only valid for '+'.
4053        #           All ranges have types; if this parameter is omitted, 0 is
4054        #           assumed.  Ranges with type 0 are assumed to obey the
4055        #           Unicode rules for casing, etc; ranges with other types are
4056        #           not.  Otherwise, the type is arbitrary, for the caller's
4057        #           convenience, and looked at only by this routine to keep
4058        #           adjacent ranges of different types from being merged into
4059        #           a single larger range, and when Replace =>
4060        #           $IF_NOT_EQUIVALENT is specified (see just below).
4061        #   Replace  determines what to do if the range list already contains
4062        #            ranges which coincide with all or portions of the input
4063        #            range.  It is only valid for '+':
4064        #       => $NO            means that the new value is not to replace
4065        #                         any existing ones, but any empty gaps of the
4066        #                         range list coinciding with the input range
4067        #                         will be filled in with the new value.
4068        #       => $UNCONDITIONALLY  means to replace the existing values with
4069        #                         this one unconditionally.  However, if the
4070        #                         new and old values are identical, the
4071        #                         replacement is skipped to save cycles
4072        #       => $IF_NOT_EQUIVALENT means to replace the existing values
4073        #          (the default)  with this one if they are not equivalent.
4074        #                         Ranges are equivalent if their types are the
4075        #                         same, and they are the same string; or if
4076        #                         both are type 0 ranges, if their Unicode
4077        #                         standard forms are identical.  In this last
4078        #                         case, the routine chooses the more "modern"
4079        #                         one to use.  This is because some of the
4080        #                         older files are formatted with values that
4081        #                         are, for example, ALL CAPs, whereas the
4082        #                         derived files have a more modern style,
4083        #                         which looks better.  By looking for this
4084        #                         style when the pre-existing and replacement
4085        #                         standard forms are the same, we can move to
4086        #                         the modern style
4087        #       => $MULTIPLE_BEFORE means that if this range duplicates an
4088        #                         existing one, but has a different value,
4089        #                         don't replace the existing one, but insert
4090        #                         this one so that the same range can occur
4091        #                         multiple times.  They are stored LIFO, so
4092        #                         that the final one inserted is the first one
4093        #                         returned in an ordered search of the table.
4094        #                         If this is an exact duplicate, including the
4095        #                         value, the original will be moved to be
4096        #                         first, before any other duplicate ranges
4097        #                         with different values.
4098        #       => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
4099        #                         FIFO, so that this one is inserted after all
4100        #                         others that currently exist.  If this is an
4101        #                         exact duplicate, including value, of an
4102        #                         existing range, this one is discarded
4103        #                         (leaving the existing one in its original,
4104        #                         higher priority position
4105        #       => $CROAK         Die with an error if is already there
4106        #       => anything else  is the same as => $IF_NOT_EQUIVALENT
4107        #
4108        # "same value" means identical for non-type-0 ranges, and it means
4109        # having the same standard forms for type-0 ranges.
4110
4111        return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
4112
4113        my $self = shift;
4114        my $operation = shift;   # '+' for add/replace; '-' for delete;
4115        my $start = shift;
4116        my $end   = shift;
4117        my $value = shift;
4118
4119        my %args = @_;
4120
4121        $value = "" if not defined $value;        # warning: $value can be "0"
4122
4123        my $replace = delete $args{'Replace'};
4124        $replace = $IF_NOT_EQUIVALENT unless defined $replace;
4125
4126        my $type = delete $args{'Type'};
4127        $type = 0 unless defined $type;
4128
4129        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4130
4131        my $addr = pack 'J', refaddr $self;
4132
4133        if ($operation ne '+' && $operation ne '-') {
4134            Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'.  No action taken.");
4135            return;
4136        }
4137        unless (defined $start && defined $end) {
4138            Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete.  No action taken.");
4139            return;
4140        }
4141        unless ($end >= $start) {
4142            Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . ").  No action taken.");
4143            return;
4144        }
4145        #local $to_trace = 1 if main::DEBUG;
4146
4147        if ($operation eq '-') {
4148            if ($replace != $IF_NOT_EQUIVALENT) {
4149                Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list.  Assuming Replace => \$IF_NOT_EQUIVALENT.");
4150                $replace = $IF_NOT_EQUIVALENT;
4151            }
4152            if ($type) {
4153                Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list.  Assuming Type => 0.");
4154                $type = 0;
4155            }
4156            if ($value ne "") {
4157                Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list.  Assuming Value => \"\".");
4158                $value = "";
4159            }
4160        }
4161
4162        my $r = $ranges{$addr};               # The current list of ranges
4163        my $range_list_size = scalar @$r;     # And its size
4164        my $max = $max{$addr};                # The current high code point in
4165                                              # the list of ranges
4166
4167        # Do a special case requiring fewer machine cycles when the new range
4168        # starts after the current highest point.  The Unicode input data is
4169        # structured so this is common.
4170        if ($start > $max) {
4171
4172            trace "$owner_name_of{$addr} $operation", sprintf("%04X..%04X (%s) type=%d; prev max=%04X", $start, $end, $value, $type, $max) if main::DEBUG && $to_trace;
4173            return if $operation eq '-'; # Deleting a non-existing range is a
4174                                         # no-op
4175
4176            # If the new range doesn't logically extend the current final one
4177            # in the range list, create a new range at the end of the range
4178            # list.  (max cleverly is initialized to a negative number not
4179            # adjacent to 0 if the range list is empty, so even adding a range
4180            # to an empty range list starting at 0 will have this 'if'
4181            # succeed.)
4182            if ($start > $max + 1        # non-adjacent means can't extend.
4183                || @{$r}[-1]->value ne $value # values differ, can't extend.
4184                || @{$r}[-1]->type != $type # types differ, can't extend.
4185            ) {
4186                push @$r, Range->new($start, $end,
4187                                     Value => $value,
4188                                     Type => $type);
4189            }
4190            else {
4191
4192                # Here, the new range starts just after the current highest in
4193                # the range list, and they have the same type and value.
4194                # Extend the existing range to incorporate the new one.
4195                @{$r}[-1]->set_end($end);
4196            }
4197
4198            # This becomes the new maximum.
4199            $max{$addr} = $end;
4200
4201            return;
4202        }
4203        #local $to_trace = 0 if main::DEBUG;
4204
4205        trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
4206
4207        # Here, the input range isn't after the whole rest of the range list.
4208        # Most likely 'splice' will be needed.  The rest of the routine finds
4209        # the needed splice parameters, and if necessary, does the splice.
4210        # First, find the offset parameter needed by the splice function for
4211        # the input range.  Note that the input range may span multiple
4212        # existing ones, but we'll worry about that later.  For now, just find
4213        # the beginning.  If the input range is to be inserted starting in a
4214        # position not currently in the range list, it must (obviously) come
4215        # just after the range below it, and just before the range above it.
4216        # Slightly less obviously, it will occupy the position currently
4217        # occupied by the range that is to come after it.  More formally, we
4218        # are looking for the position, $i, in the array of ranges, such that:
4219        #
4220        # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
4221        #
4222        # (The ordered relationships within existing ranges are also shown in
4223        # the equation above).  However, if the start of the input range is
4224        # within an existing range, the splice offset should point to that
4225        # existing range's position in the list; that is $i satisfies a
4226        # somewhat different equation, namely:
4227        #
4228        #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
4229        #
4230        # More briefly, $start can come before or after r[$i]->start, and at
4231        # this point, we don't know which it will be.  However, these
4232        # two equations share these constraints:
4233        #
4234        #   r[$i-1]->end < $start <= r[$i]->end
4235        #
4236        # And that is good enough to find $i.
4237
4238        my $i = $self->_search_ranges($start);
4239        if (! defined $i) {
4240            Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined.  Operation '$operation' not performed");
4241            return;
4242        }
4243
4244        # The search function returns $i such that:
4245        #
4246        # r[$i-1]->end < $start <= r[$i]->end
4247        #
4248        # That means that $i points to the first range in the range list
4249        # that could possibly be affected by this operation.  We still don't
4250        # know if the start of the input range is within r[$i], or if it
4251        # points to empty space between r[$i-1] and r[$i].
4252        trace "[$i] is the beginning splice point.  Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
4253
4254        # Special case the insertion of data that is not to replace any
4255        # existing data.
4256        if ($replace == $NO) {  # If $NO, has to be operation '+'
4257            #local $to_trace = 1 if main::DEBUG;
4258            trace "Doesn't replace" if main::DEBUG && $to_trace;
4259
4260            # Here, the new range is to take effect only on those code points
4261            # that aren't already in an existing range.  This can be done by
4262            # looking through the existing range list and finding the gaps in
4263            # the ranges that this new range affects, and then calling this
4264            # function recursively on each of those gaps, leaving untouched
4265            # anything already in the list.  Gather up a list of the changed
4266            # gaps first so that changes to the internal state as new ranges
4267            # are added won't be a problem.
4268            my @gap_list;
4269
4270            # First, if the starting point of the input range is outside an
4271            # existing one, there is a gap from there to the beginning of the
4272            # existing range -- add a span to fill the part that this new
4273            # range occupies
4274            if ($start < $r->[$i]->start) {
4275                push @gap_list, Range->new($start,
4276                                           main::min($end,
4277                                                     $r->[$i]->start - 1),
4278                                           Type => $type);
4279                trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
4280            }
4281
4282            # Then look through the range list for other gaps until we reach
4283            # the highest range affected by the input one.
4284            my $j;
4285            for ($j = $i+1; $j < $range_list_size; $j++) {
4286                trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
4287                last if $end < $r->[$j]->start;
4288
4289                # If there is a gap between when this range starts and the
4290                # previous one ends, add a span to fill it.  Note that just
4291                # because there are two ranges doesn't mean there is a
4292                # non-zero gap between them.  It could be that they have
4293                # different values or types
4294                if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
4295                    push @gap_list,
4296                        Range->new($r->[$j-1]->end + 1,
4297                                   $r->[$j]->start - 1,
4298                                   Type => $type);
4299                    trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
4300                }
4301            }
4302
4303            # Here, we have either found an existing range in the range list,
4304            # beyond the area affected by the input one, or we fell off the
4305            # end of the loop because the input range affects the whole rest
4306            # of the range list.  In either case, $j is 1 higher than the
4307            # highest affected range.  If $j == $i, it means that there are no
4308            # affected ranges, that the entire insertion is in the gap between
4309            # r[$i-1], and r[$i], which we already have taken care of before
4310            # the loop.
4311            # On the other hand, if there are affected ranges, it might be
4312            # that there is a gap that needs filling after the final such
4313            # range to the end of the input range
4314            if ($r->[$j-1]->end < $end) {
4315                    push @gap_list, Range->new(main::max($start,
4316                                                         $r->[$j-1]->end + 1),
4317                                               $end,
4318                                               Type => $type);
4319                    trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
4320            }
4321
4322            # Call recursively to fill in all the gaps.
4323            foreach my $gap (@gap_list) {
4324                $self->_add_delete($operation,
4325                                   $gap->start,
4326                                   $gap->end,
4327                                   $value,
4328                                   Type => $type);
4329            }
4330
4331            return;
4332        }
4333
4334        # Here, we have taken care of the case where $replace is $NO.
4335        # Remember that here, r[$i-1]->end < $start <= r[$i]->end
4336        # If inserting a multiple record, this is where it goes, before the
4337        # first (if any) existing one if inserting LIFO.  (If this is to go
4338        # afterwards, FIFO, we below move the pointer to there.)  These imply
4339        # an insertion, and no change to any existing ranges.  Note that $i
4340        # can be -1 if this new range doesn't actually duplicate any existing,
4341        # and comes at the beginning of the list.
4342        if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) {
4343
4344            if ($start != $end) {
4345                Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point.  No action taken.");
4346                return;
4347            }
4348
4349            # If the new code point is within a current range ...
4350            if ($end >= $r->[$i]->start) {
4351
4352                # Don't add an exact duplicate, as it isn't really a multiple
4353                my $existing_value = $r->[$i]->value;
4354                my $existing_type = $r->[$i]->type;
4355                return if $value eq $existing_value && $type eq $existing_type;
4356
4357                # If the multiple value is part of an existing range, we want
4358                # to split up that range, so that only the single code point
4359                # is affected.  To do this, we first call ourselves
4360                # recursively to delete that code point from the table, having
4361                # preserved its current data above.  Then we call ourselves
4362                # recursively again to add the new multiple, which we know by
4363                # the test just above is different than the current code
4364                # point's value, so it will become a range containing a single
4365                # code point: just itself.  Finally, we add back in the
4366                # pre-existing code point, which will again be a single code
4367                # point range.  Because 'i' likely will have changed as a
4368                # result of these operations, we can't just continue on, but
4369                # do this operation recursively as well.  If we are inserting
4370                # LIFO, the pre-existing code point needs to go after the new
4371                # one, so use MULTIPLE_AFTER; and vice versa.
4372                if ($r->[$i]->start != $r->[$i]->end) {
4373                    $self->_add_delete('-', $start, $end, "");
4374                    $self->_add_delete('+', $start, $end, $value, Type => $type);
4375                    return $self->_add_delete('+',
4376                            $start, $end,
4377                            $existing_value,
4378                            Type => $existing_type,
4379                            Replace => ($replace == $MULTIPLE_BEFORE)
4380                                       ? $MULTIPLE_AFTER
4381                                       : $MULTIPLE_BEFORE);
4382                }
4383            }
4384
4385            # If to place this new record after, move to beyond all existing
4386            # ones; but don't add this one if identical to any of them, as it
4387            # isn't really a multiple.  This leaves the original order, so
4388            # that the current request is ignored.  The reasoning is that the
4389            # previous request that wanted this record to have high priority
4390            # should have precedence.
4391            if ($replace == $MULTIPLE_AFTER) {
4392                while ($i < @$r && $r->[$i]->start == $start) {
4393                    return if $value eq $r->[$i]->value
4394                              && $type eq $r->[$i]->type;
4395                    $i++;
4396                }
4397            }
4398            else {
4399                # If instead we are to place this new record before any
4400                # existing ones, remove any identical ones that come after it.
4401                # This changes the existing order so that the new one is
4402                # first, as is being requested.
4403                for (my $j = $i + 1;
4404                     $j < @$r && $r->[$j]->start == $start;
4405                     $j++)
4406                {
4407                    if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) {
4408                        splice @$r, $j, 1;
4409                        last;   # There should only be one instance, so no
4410                                # need to keep looking
4411                    }
4412                }
4413            }
4414
4415            trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
4416            my @return = splice @$r,
4417                                $i,
4418                                0,
4419                                Range->new($start,
4420                                           $end,
4421                                           Value => $value,
4422                                           Type => $type);
4423            if (main::DEBUG && $to_trace) {
4424                trace "After splice:";
4425                trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4426                trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4427                trace "i  =[", $i, "]", $r->[$i] if $i >= 0;
4428                trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4429                trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4430                trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
4431            }
4432            return @return;
4433        }
4434
4435        # Here, we have taken care of $NO and $MULTIPLE_foo replaces.  This
4436        # leaves delete, insert, and replace either unconditionally or if not
4437        # equivalent.  $i still points to the first potential affected range.
4438        # Now find the highest range affected, which will determine the length
4439        # parameter to splice.  (The input range can span multiple existing
4440        # ones.)  If this isn't a deletion, while we are looking through the
4441        # range list, see also if this is a replacement rather than a clean
4442        # insertion; that is if it will change the values of at least one
4443        # existing range.  Start off assuming it is an insert, until find it
4444        # isn't.
4445        my $clean_insert = $operation eq '+';
4446        my $j;        # This will point to the highest affected range
4447
4448        # For non-zero types, the standard form is the value itself;
4449        my $standard_form = ($type) ? $value : main::standardize($value);
4450
4451        for ($j = $i; $j < $range_list_size; $j++) {
4452            trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
4453
4454            # If find a range that it doesn't overlap into, we can stop
4455            # searching
4456            last if $end < $r->[$j]->start;
4457
4458            # Here, overlaps the range at $j.  If the values don't match,
4459            # and so far we think this is a clean insertion, it becomes a
4460            # non-clean insertion, i.e., a 'change' or 'replace' instead.
4461            if ($clean_insert) {
4462                if ($r->[$j]->standard_form ne $standard_form) {
4463                    $clean_insert = 0;
4464                    if ($replace == $CROAK) {
4465                        main::croak("The range to add "
4466                        . sprintf("%04X", $start)
4467                        . '-'
4468                        . sprintf("%04X", $end)
4469                        . " with value '$value' overlaps an existing range $r->[$j]");
4470                    }
4471                }
4472                else {
4473
4474                    # Here, the two values are essentially the same.  If the
4475                    # two are actually identical, replacing wouldn't change
4476                    # anything so skip it.
4477                    my $pre_existing = $r->[$j]->value;
4478                    if ($pre_existing ne $value) {
4479
4480                        # Here the new and old standardized values are the
4481                        # same, but the non-standardized values aren't.  If
4482                        # replacing unconditionally, then replace
4483                        if( $replace == $UNCONDITIONALLY) {
4484                            $clean_insert = 0;
4485                        }
4486                        else {
4487
4488                            # Here, are replacing conditionally.  Decide to
4489                            # replace or not based on which appears to look
4490                            # the "nicest".  If one is mixed case and the
4491                            # other isn't, choose the mixed case one.
4492                            my $new_mixed = $value =~ /[A-Z]/
4493                                            && $value =~ /[a-z]/;
4494                            my $old_mixed = $pre_existing =~ /[A-Z]/
4495                                            && $pre_existing =~ /[a-z]/;
4496
4497                            if ($old_mixed != $new_mixed) {
4498                                $clean_insert = 0 if $new_mixed;
4499                                if (main::DEBUG && $to_trace) {
4500                                    if ($clean_insert) {
4501                                        trace "Retaining $pre_existing over $value";
4502                                    }
4503                                    else {
4504                                        trace "Replacing $pre_existing with $value";
4505                                    }
4506                                }
4507                            }
4508                            else {
4509
4510                                # Here casing wasn't different between the two.
4511                                # If one has hyphens or underscores and the
4512                                # other doesn't, choose the one with the
4513                                # punctuation.
4514                                my $new_punct = $value =~ /[-_]/;
4515                                my $old_punct = $pre_existing =~ /[-_]/;
4516
4517                                if ($old_punct != $new_punct) {
4518                                    $clean_insert = 0 if $new_punct;
4519                                    if (main::DEBUG && $to_trace) {
4520                                        if ($clean_insert) {
4521                                            trace "Retaining $pre_existing over $value";
4522                                        }
4523                                        else {
4524                                            trace "Replacing $pre_existing with $value";
4525                                        }
4526                                    }
4527                                }   # else existing one is just as "good";
4528                                    # retain it to save cycles.
4529                            }
4530                        }
4531                    }
4532                }
4533            }
4534        } # End of loop looking for highest affected range.
4535
4536        # Here, $j points to one beyond the highest range that this insertion
4537        # affects (hence to beyond the range list if that range is the final
4538        # one in the range list).
4539
4540        # The splice length is all the affected ranges.  Get it before
4541        # subtracting, for efficiency, so we don't have to later add 1.
4542        my $length = $j - $i;
4543
4544        $j--;        # $j now points to the highest affected range.
4545        trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
4546
4547        # Here, have taken care of $NO and $MULTIPLE_foo replaces.
4548        # $j points to the highest affected range.  But it can be < $i or even
4549        # -1.  These happen only if the insertion is entirely in the gap
4550        # between r[$i-1] and r[$i].  Here's why: j < i means that the j loop
4551        # above exited first time through with $end < $r->[$i]->start.  (And
4552        # then we subtracted one from j)  This implies also that $start <
4553        # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
4554        # $start, so the entire input range is in the gap.
4555        if ($j < $i) {
4556
4557            # Here the entire input range is in the gap before $i.
4558
4559            if (main::DEBUG && $to_trace) {
4560                if ($i) {
4561                    trace "Entire range is between $r->[$i-1] and $r->[$i]";
4562                }
4563                else {
4564                    trace "Entire range is before $r->[$i]";
4565                }
4566            }
4567            return if $operation ne '+'; # Deletion of a non-existent range is
4568                                         # a no-op
4569        }
4570        else {
4571
4572            # Here part of the input range is not in the gap before $i.  Thus,
4573            # there is at least one affected one, and $j points to the highest
4574            # such one.
4575
4576            # At this point, here is the situation:
4577            # This is not an insertion of a multiple, nor of tentative ($NO)
4578            # data.
4579            #   $i  points to the first element in the current range list that
4580            #            may be affected by this operation.  In fact, we know
4581            #            that the range at $i is affected because we are in
4582            #            the else branch of this 'if'
4583            #   $j  points to the highest affected range.
4584            # In other words,
4585            #   r[$i-1]->end < $start <= r[$i]->end
4586            # And:
4587            #   r[$i-1]->end < $start <= $end < r[$j+1]->start
4588            #
4589            # Also:
4590            #   $clean_insert is a boolean which is set true if and only if
4591            #        this is a "clean insertion", i.e., not a change nor a
4592            #        deletion (multiple was handled above).
4593
4594            # We now have enough information to decide if this call is a no-op
4595            # or not.  It is a no-op if this is an insertion of already
4596            # existing data.  To be so, it must be contained entirely in one
4597            # range.
4598
4599            if (main::DEBUG && $to_trace && $clean_insert
4600                                         && $start >= $r->[$i]->start
4601                                         && $end   <= $r->[$i]->end)
4602            {
4603                    trace "no-op";
4604            }
4605            return if $clean_insert
4606                      && $start >= $r->[$i]->start
4607                      && $end   <= $r->[$i]->end;
4608        }
4609
4610        # Here, we know that some action will have to be taken.  We have
4611        # calculated the offset and length (though adjustments may be needed)
4612        # for the splice.  Now start constructing the replacement list.
4613        my @replacement;
4614        my $splice_start = $i;
4615
4616        my $extends_below;
4617        my $extends_above;
4618
4619        # See if should extend any adjacent ranges.
4620        if ($operation eq '-') { # Don't extend deletions
4621            $extends_below = $extends_above = 0;
4622        }
4623        else {  # Here, should extend any adjacent ranges.  See if there are
4624                # any.
4625            $extends_below = ($i > 0
4626                            # can't extend unless adjacent
4627                            && $r->[$i-1]->end == $start -1
4628                            # can't extend unless are same standard value
4629                            && $r->[$i-1]->standard_form eq $standard_form
4630                            # can't extend unless share type
4631                            && $r->[$i-1]->type == $type);
4632            $extends_above = ($j+1 < $range_list_size
4633                            && $r->[$j+1]->start == $end +1
4634                            && $r->[$j+1]->standard_form eq $standard_form
4635                            && $r->[$j+1]->type == $type);
4636        }
4637        if ($extends_below && $extends_above) { # Adds to both
4638            $splice_start--;     # start replace at element below
4639            $length += 2;        # will replace on both sides
4640            trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
4641
4642            # The result will fill in any gap, replacing both sides, and
4643            # create one large range.
4644            @replacement = Range->new($r->[$i-1]->start,
4645                                      $r->[$j+1]->end,
4646                                      Value => $value,
4647                                      Type => $type);
4648        }
4649        else {
4650
4651            # Here we know that the result won't just be the conglomeration of
4652            # a new range with both its adjacent neighbors.  But it could
4653            # extend one of them.
4654
4655            if ($extends_below) {
4656
4657                # Here the new element adds to the one below, but not to the
4658                # one above.  If inserting, and only to that one range,  can
4659                # just change its ending to include the new one.
4660                if ($length == 0 && $clean_insert) {
4661                    $r->[$i-1]->set_end($end);
4662                    trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
4663                    return;
4664                }
4665                else {
4666                    trace "Changing inserted range to start at ", sprintf("%04X",  $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
4667                    $splice_start--;        # start replace at element below
4668                    $length++;              # will replace the element below
4669                    $start = $r->[$i-1]->start;
4670                }
4671            }
4672            elsif ($extends_above) {
4673
4674                # Here the new element adds to the one above, but not below.
4675                # Mirror the code above
4676                if ($length == 0 && $clean_insert) {
4677                    $r->[$j+1]->set_start($start);
4678                    trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
4679                    return;
4680                }
4681                else {
4682                    trace "Changing inserted range to end at ", sprintf("%04X",  $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
4683                    $length++;        # will replace the element above
4684                    $end = $r->[$j+1]->end;
4685                }
4686            }
4687
4688            trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
4689
4690            # Finally, here we know there will have to be a splice.
4691            # If the change or delete affects only the highest portion of the
4692            # first affected range, the range will have to be split.  The
4693            # splice will remove the whole range, but will replace it by a new
4694            # range containing just the unaffected part.  So, in this case,
4695            # add to the replacement list just this unaffected portion.
4696            if (! $extends_below
4697                && $start > $r->[$i]->start && $start <= $r->[$i]->end)
4698            {
4699                push @replacement,
4700                    Range->new($r->[$i]->start,
4701                               $start - 1,
4702                               Value => $r->[$i]->value,
4703                               Type => $r->[$i]->type);
4704            }
4705
4706            # In the case of an insert or change, but not a delete, we have to
4707            # put in the new stuff;  this comes next.
4708            if ($operation eq '+') {
4709                push @replacement, Range->new($start,
4710                                              $end,
4711                                              Value => $value,
4712                                              Type => $type);
4713            }
4714
4715            trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
4716            #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
4717
4718            # And finally, if we're changing or deleting only a portion of the
4719            # highest affected range, it must be split, as the lowest one was.
4720            if (! $extends_above
4721                && $j >= 0  # Remember that j can be -1 if before first
4722                            # current element
4723                && $end >= $r->[$j]->start
4724                && $end < $r->[$j]->end)
4725            {
4726                push @replacement,
4727                    Range->new($end + 1,
4728                               $r->[$j]->end,
4729                               Value => $r->[$j]->value,
4730                               Type => $r->[$j]->type);
4731            }
4732        }
4733
4734        # And do the splice, as calculated above
4735        if (main::DEBUG && $to_trace) {
4736            trace "replacing $length element(s) at $i with ";
4737            foreach my $replacement (@replacement) {
4738                trace "    $replacement";
4739            }
4740            trace "Before splice:";
4741            trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4742            trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4743            trace "i  =[", $i, "]", $r->[$i];
4744            trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4745            trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4746        }
4747
4748        my @return = splice @$r, $splice_start, $length, @replacement;
4749
4750        if (main::DEBUG && $to_trace) {
4751            trace "After splice:";
4752            trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4753            trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4754            trace "i  =[", $i, "]", $r->[$i];
4755            trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4756            trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4757            trace "removed ", @return if @return;
4758        }
4759
4760        # An actual deletion could have changed the maximum in the list.
4761        # There was no deletion if the splice didn't return something, but
4762        # otherwise recalculate it.  This is done too rarely to worry about
4763        # performance.
4764        if ($operation eq '-' && @return) {
4765            if (@$r) {
4766                $max{$addr} = $r->[-1]->end;
4767            }
4768            else {  # Now empty
4769                $max{$addr} = $max_init;
4770            }
4771        }
4772        return @return;
4773    }
4774
4775    sub reset_each_range($self) {  # reset the iterator for each_range();
4776        undef $each_range_iterator{pack 'J', refaddr $self};
4777        return;
4778    }
4779
4780    sub each_range($self) {
4781        # Iterate over each range in a range list.  Results are undefined if
4782        # the range list is changed during the iteration.
4783        my $addr = pack 'J', refaddr $self;
4784
4785        return if $self->is_empty;
4786
4787        $each_range_iterator{$addr} = -1
4788                                if ! defined $each_range_iterator{$addr};
4789        $each_range_iterator{$addr}++;
4790        return $ranges{$addr}->[$each_range_iterator{$addr}]
4791                        if $each_range_iterator{$addr} < @{$ranges{$addr}};
4792        undef $each_range_iterator{$addr};
4793        return;
4794    }
4795
4796    sub count($self) {        # Returns count of code points in range list
4797        my $addr = pack 'J', refaddr $self;
4798
4799        my $count = 0;
4800        foreach my $range (@{$ranges{$addr}}) {
4801            $count += $range->end - $range->start + 1;
4802        }
4803        return $count;
4804    }
4805
4806    sub delete_range($self, $start, $end) {    # Delete a range
4807        return $self->_add_delete('-', $start, $end, "");
4808    }
4809
4810    sub is_empty($self) { # Returns boolean as to if a range list is empty
4811        return scalar @{$ranges{pack 'J', refaddr $self}} == 0;
4812    }
4813
4814    sub hash($self) {
4815        # Quickly returns a scalar suitable for separating tables into
4816        # buckets, i.e. it is a hash function of the contents of a table, so
4817        # there are relatively few conflicts.
4818        my $addr = pack 'J', refaddr $self;
4819
4820        # These are quickly computable.  Return looks like 'min..max;count'
4821        return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
4822    }
4823} # End closure for _Range_List_Base
4824
4825package Range_List;
4826use parent '-norequire', '_Range_List_Base';
4827
4828# A Range_List is a range list for match tables; i.e. the range values are
4829# not significant.  Thus a number of operations can be safely added to it,
4830# such as inversion, intersection.  Note that union is also an unsafe
4831# operation when range values are cared about, and that method is in the base
4832# class, not here.  But things are set up so that that method is callable only
4833# during initialization.  Only in this derived class, is there an operation
4834# that combines two tables.  A Range_Map can thus be used to initialize a
4835# Range_List, and its mappings will be in the list, but are not significant to
4836# this class.
4837
4838sub trace { return main::trace(@_); }
4839
4840{ # Closure
4841
4842    use overload
4843        fallback => 0,
4844        '+' => sub { my $self = shift;
4845                    my $other = shift;
4846
4847                    return $self->_union($other)
4848                },
4849        '+=' => sub { my $self = shift;
4850                    my $other = shift;
4851                    my $reversed = shift;
4852
4853                    if ($reversed) {
4854                        Carp::my_carp_bug("Bad news.  Can't cope with '"
4855                        . ref($other)
4856                        . ' += '
4857                        . ref($self)
4858                        . "'.  undef returned.");
4859                        return;
4860                    }
4861
4862                    return $self->_union($other)
4863                },
4864        '&' => sub { my $self = shift;
4865                    my $other = shift;
4866
4867                    return $self->_intersect($other, 0);
4868                },
4869        '&=' => sub { my $self = shift;
4870                    my $other = shift;
4871                    my $reversed = shift;
4872
4873                    if ($reversed) {
4874                        Carp::my_carp_bug("Bad news.  Can't cope with '"
4875                        . ref($other)
4876                        . ' &= '
4877                        . ref($self)
4878                        . "'.  undef returned.");
4879                        return;
4880                    }
4881
4882                    return $self->_intersect($other, 0);
4883                },
4884        '~' => "_invert",
4885        '-' => "_subtract",
4886    ;
4887
4888    sub _invert($self, @) {
4889        # Returns a new Range_List that gives all code points not in $self.
4890        my $new = Range_List->new;
4891
4892        # Go through each range in the table, finding the gaps between them
4893        my $max = -1;   # Set so no gap before range beginning at 0
4894        for my $range ($self->ranges) {
4895            my $start = $range->start;
4896            my $end   = $range->end;
4897
4898            # If there is a gap before this range, the inverse will contain
4899            # that gap.
4900            if ($start > $max + 1) {
4901                $new->add_range($max + 1, $start - 1);
4902            }
4903            $max = $end;
4904        }
4905
4906        # And finally, add the gap from the end of the table to the max
4907        # possible code point
4908        if ($max < $MAX_WORKING_CODEPOINT) {
4909            $new->add_range($max + 1, $MAX_WORKING_CODEPOINT);
4910        }
4911        return $new;
4912    }
4913
4914    sub _subtract($self, $other, $reversed=0) {
4915        # Returns a new Range_List with the argument deleted from it.  The
4916        # argument can be a single code point, a range, or something that has
4917        # a range, with the _range_list() method on it returning them
4918
4919        if ($reversed) {
4920            Carp::my_carp_bug("Bad news.  Can't cope with '"
4921            . ref($other)
4922            . ' - '
4923            . ref($self)
4924            . "'.  undef returned.");
4925            return;
4926        }
4927
4928        my $new = Range_List->new(Initialize => $self);
4929
4930        if (! ref $other) { # Single code point
4931            $new->delete_range($other, $other);
4932        }
4933        elsif ($other->isa('Range')) {
4934            $new->delete_range($other->start, $other->end);
4935        }
4936        elsif ($other->can('_range_list')) {
4937            foreach my $range ($other->_range_list->ranges) {
4938                $new->delete_range($range->start, $range->end);
4939            }
4940        }
4941        else {
4942            Carp::my_carp_bug("Can't cope with a "
4943                        . ref($other)
4944                        . " argument to '-'.  Subtraction ignored."
4945                        );
4946            return $self;
4947        }
4948
4949        return $new;
4950    }
4951
4952    sub _intersect($a_object, $b_object, $check_if_overlapping=0) {
4953        # Returns either a boolean giving whether the two inputs' range lists
4954        # intersect (overlap), or a new Range_List containing the intersection
4955        # of the two lists.  The optional final parameter being true indicates
4956        # to do the check instead of the intersection.
4957
4958        if (! defined $b_object) {
4959            my $message = "";
4960            $message .= $a_object->_owner_name_of if defined $a_object;
4961            Carp::my_carp_bug($message .= "Called with undefined value.  Intersection not done.");
4962            return;
4963        }
4964
4965        # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
4966        # Thus the intersection could be much more simply be written:
4967        #   return ~(~$a_object + ~$b_object);
4968        # But, this is slower, and when taking the inverse of a large
4969        # range_size_1 table, back when such tables were always stored that
4970        # way, it became prohibitively slow, hence the code was changed to the
4971        # below
4972
4973        if ($b_object->isa('Range')) {
4974            $b_object = Range_List->new(Initialize => $b_object,
4975                                        Owner => $a_object->_owner_name_of);
4976        }
4977        $b_object = $b_object->_range_list if $b_object->can('_range_list');
4978
4979        my @a_ranges = $a_object->ranges;
4980        my @b_ranges = $b_object->ranges;
4981
4982        #local $to_trace = 1 if main::DEBUG;
4983        trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
4984
4985        # Start with the first range in each list
4986        my $a_i = 0;
4987        my $range_a = $a_ranges[$a_i];
4988        my $b_i = 0;
4989        my $range_b = $b_ranges[$b_i];
4990
4991        my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
4992                                                if ! $check_if_overlapping;
4993
4994        # If either list is empty, there is no intersection and no overlap
4995        if (! defined $range_a || ! defined $range_b) {
4996            return $check_if_overlapping ? 0 : $new;
4997        }
4998        trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4999
5000        # Otherwise, must calculate the intersection/overlap.  Start with the
5001        # very first code point in each list
5002        my $a = $range_a->start;
5003        my $b = $range_b->start;
5004
5005        # Loop through all the ranges of each list; in each iteration, $a and
5006        # $b are the current code points in their respective lists
5007        while (1) {
5008
5009            # If $a and $b are the same code point, ...
5010            if ($a == $b) {
5011
5012                # it means the lists overlap.  If just checking for overlap
5013                # know the answer now,
5014                return 1 if $check_if_overlapping;
5015
5016                # The intersection includes this code point plus anything else
5017                # common to both current ranges.
5018                my $start = $a;
5019                my $end = main::min($range_a->end, $range_b->end);
5020                if (! $check_if_overlapping) {
5021                    trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
5022                    $new->add_range($start, $end);
5023                }
5024
5025                # Skip ahead to the end of the current intersect
5026                $a = $b = $end;
5027
5028                # If the current intersect ends at the end of either range (as
5029                # it must for at least one of them), the next possible one
5030                # will be the beginning code point in it's list's next range.
5031                if ($a == $range_a->end) {
5032                    $range_a = $a_ranges[++$a_i];
5033                    last unless defined $range_a;
5034                    $a = $range_a->start;
5035                }
5036                if ($b == $range_b->end) {
5037                    $range_b = $b_ranges[++$b_i];
5038                    last unless defined $range_b;
5039                    $b = $range_b->start;
5040                }
5041
5042                trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
5043            }
5044            elsif ($a < $b) {
5045
5046                # Not equal, but if the range containing $a encompasses $b,
5047                # change $a to be the middle of the range where it does equal
5048                # $b, so the next iteration will get the intersection
5049                if ($range_a->end >= $b) {
5050                    $a = $b;
5051                }
5052                else {
5053
5054                    # Here, the current range containing $a is entirely below
5055                    # $b.  Go try to find a range that could contain $b.
5056                    $a_i = $a_object->_search_ranges($b);
5057
5058                    # If no range found, quit.
5059                    last unless defined $a_i;
5060
5061                    # The search returns $a_i, such that
5062                    #   range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
5063                    # Set $a to the beginning of this new range, and repeat.
5064                    $range_a = $a_ranges[$a_i];
5065                    $a = $range_a->start;
5066                }
5067            }
5068            else { # Here, $b < $a.
5069
5070                # Mirror image code to the leg just above
5071                if ($range_b->end >= $a) {
5072                    $b = $a;
5073                }
5074                else {
5075                    $b_i = $b_object->_search_ranges($a);
5076                    last unless defined $b_i;
5077                    $range_b = $b_ranges[$b_i];
5078                    $b = $range_b->start;
5079                }
5080            }
5081        } # End of looping through ranges.
5082
5083        # Intersection fully computed, or now know that there is no overlap
5084        return $check_if_overlapping ? 0 : $new;
5085    }
5086
5087    sub overlaps($self, $other) {
5088        # Returns boolean giving whether the two arguments overlap somewhere
5089        return $self->_intersect($other, 1);
5090    }
5091
5092    sub add_range($self, $start, $end) {
5093        # Add a range to the list.
5094        return $self->_add_delete('+', $start, $end, "");
5095    }
5096
5097    sub matches_identically_to($self, $other) {
5098        # Return a boolean as to whether or not two Range_Lists match identical
5099        # sets of code points.
5100        # These are ordered in increasing real time to figure out (at least
5101        # until a patch changes that and doesn't change this)
5102        return 0 if $self->max != $other->max;
5103        return 0 if $self->min != $other->min;
5104        return 0 if $self->range_count != $other->range_count;
5105        return 0 if $self->count != $other->count;
5106
5107        # Here they could be identical because all the tests above passed.
5108        # The loop below is somewhat simpler since we know they have the same
5109        # number of elements.  Compare range by range, until reach the end or
5110        # find something that differs.
5111        my @a_ranges = $self->ranges;
5112        my @b_ranges = $other->ranges;
5113        for my $i (0 .. @a_ranges - 1) {
5114            my $a = $a_ranges[$i];
5115            my $b = $b_ranges[$i];
5116            trace "self $a; other $b" if main::DEBUG && $to_trace;
5117            return 0 if ! defined $b
5118                        || $a->start != $b->start
5119                        || $a->end != $b->end;
5120        }
5121        return 1;
5122    }
5123
5124    sub is_code_point_usable($code, $try_hard) {
5125        # This used only for making the test script.  See if the input
5126        # proposed trial code point is one that Perl will handle.  If second
5127        # parameter is 0, it won't select some code points for various
5128        # reasons, noted below.
5129        return 0 if $code < 0;                # Never use a negative
5130
5131        # shun null.  I'm (khw) not sure why this was done, but NULL would be
5132        # the character very frequently used.
5133        return $try_hard if $code == 0x0000;
5134
5135        # shun non-character code points.
5136        return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
5137        return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
5138
5139        return $try_hard if $code > $MAX_UNICODE_CODEPOINT;   # keep in range
5140        return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
5141
5142        return 1;
5143    }
5144
5145    sub get_valid_code_point($self) {
5146        # Return a code point that's part of the range list.  Returns nothing
5147        # if the table is empty or we can't find a suitable code point.  This
5148        # used only for making the test script.
5149
5150        # On first pass, don't choose less desirable code points; if no good
5151        # one is found, repeat, allowing a less desirable one to be selected.
5152        for my $try_hard (0, 1) {
5153
5154            # Look through all the ranges for a usable code point.
5155            for my $set (reverse $self->ranges) {
5156
5157                # Try the edge cases first, starting with the end point of the
5158                # range.
5159                my $end = $set->end;
5160                return $end if is_code_point_usable($end, $try_hard);
5161                $end = $MAX_UNICODE_CODEPOINT + 1 if $end > $MAX_UNICODE_CODEPOINT;
5162
5163                # End point didn't, work.  Start at the beginning and try
5164                # every one until find one that does work.
5165                for my $trial ($set->start .. $end - 1) {
5166                    return $trial if is_code_point_usable($trial, $try_hard);
5167                }
5168            }
5169        }
5170        return ();  # If none found, give up.
5171    }
5172
5173    sub get_invalid_code_point($self) {
5174        # Return a code point that's not part of the table.  Returns nothing
5175        # if the table covers all code points or a suitable code point can't
5176        # be found.  This used only for making the test script.
5177
5178        # Just find a valid code point of the inverse, if any.
5179        return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
5180    }
5181} # end closure for Range_List
5182
5183package Range_Map;
5184use parent '-norequire', '_Range_List_Base';
5185
5186# A Range_Map is a range list in which the range values (called maps) are
5187# significant, and hence shouldn't be manipulated by our other code, which
5188# could be ambiguous or lose things.  For example, in taking the union of two
5189# lists, which share code points, but which have differing values, which one
5190# has precedence in the union?
5191# It turns out that these operations aren't really necessary for map tables,
5192# and so this class was created to make sure they aren't accidentally
5193# applied to them.
5194
5195{ # Closure
5196
5197    sub add_map($self, @add) {
5198        # Add a range containing a mapping value to the list
5199        return $self->_add_delete('+', @add);
5200    }
5201
5202    sub replace_map($self, @list) {
5203        # Replace a range
5204        return $self->_add_delete('+', @list, Replace => $UNCONDITIONALLY);
5205    }
5206
5207    sub add_duplicate {
5208        # Adds entry to a range list which can duplicate an existing entry
5209
5210        my $self = shift;
5211        my $code_point = shift;
5212        my $value = shift;
5213        my %args = @_;
5214        my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE;
5215        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5216
5217        return $self->add_map($code_point, $code_point,
5218                                $value, Replace => $replace);
5219    }
5220} # End of closure for package Range_Map
5221
5222package _Base_Table;
5223
5224# A table is the basic data structure that gets written out into a file for
5225# use by the Perl core.  This is the abstract base class implementing the
5226# common elements from the derived ones.  A list of the methods to be
5227# furnished by an implementing class is just after the constructor.
5228
5229sub standardize { return main::standardize($_[0]); }
5230sub trace { return main::trace(@_); }
5231
5232{ # Closure
5233
5234    main::setup_package();
5235
5236    my %range_list;
5237    # Object containing the ranges of the table.
5238    main::set_access('range_list', \%range_list, 'p_r', 'p_s');
5239
5240    my %full_name;
5241    # The full table name.
5242    main::set_access('full_name', \%full_name, 'r');
5243
5244    my %name;
5245    # The table name, almost always shorter
5246    main::set_access('name', \%name, 'r');
5247
5248    my %short_name;
5249    # The shortest of all the aliases for this table, with underscores removed
5250    main::set_access('short_name', \%short_name);
5251
5252    my %nominal_short_name_length;
5253    # The length of short_name before removing underscores
5254    main::set_access('nominal_short_name_length',
5255                    \%nominal_short_name_length);
5256
5257    my %complete_name;
5258    # The complete name, including property.
5259    main::set_access('complete_name', \%complete_name, 'r');
5260
5261    my %property;
5262    # Parent property this table is attached to.
5263    main::set_access('property', \%property, 'r');
5264
5265    my %aliases;
5266    # Ordered list of alias objects of the table's name.  The first ones in
5267    # the list are output first in comments
5268    main::set_access('aliases', \%aliases, 'readable_array');
5269
5270    my %comment;
5271    # A comment associated with the table for human readers of the files
5272    main::set_access('comment', \%comment, 's');
5273
5274    my %description;
5275    # A comment giving a short description of the table's meaning for human
5276    # readers of the files.
5277    main::set_access('description', \%description, 'readable_array');
5278
5279    my %note;
5280    # A comment giving a short note about the table for human readers of the
5281    # files.
5282    main::set_access('note', \%note, 'readable_array');
5283
5284    my %fate;
5285    # Enum; there are a number of possibilities for what happens to this
5286    # table: it could be normal, or suppressed, or not for external use.  See
5287    # values at definition for $SUPPRESSED.
5288    main::set_access('fate', \%fate, 'r');
5289
5290    my %find_table_from_alias;
5291    # The parent property passes this pointer to a hash which this class adds
5292    # all its aliases to, so that the parent can quickly take an alias and
5293    # find this table.
5294    main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
5295
5296    my %locked;
5297    # After this table is made equivalent to another one; we shouldn't go
5298    # changing the contents because that could mean it's no longer equivalent
5299    main::set_access('locked', \%locked, 'r');
5300
5301    my %file_path;
5302    # This gives the final path to the file containing the table.  Each
5303    # directory in the path is an element in the array
5304    main::set_access('file_path', \%file_path, 'readable_array');
5305
5306    my %status;
5307    # What is the table's status, normal, $OBSOLETE, etc.  Enum
5308    main::set_access('status', \%status, 'r');
5309
5310    my %status_info;
5311    # A comment about its being obsolete, or whatever non normal status it has
5312    main::set_access('status_info', \%status_info, 'r');
5313
5314    my %caseless_equivalent;
5315    # The table this is equivalent to under /i matching, if any.
5316    main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
5317
5318    my %range_size_1;
5319    # Is the table to be output with each range only a single code point?
5320    # This is done to avoid breaking existing code that may have come to rely
5321    # on this behavior in previous versions of this program.)
5322    main::set_access('range_size_1', \%range_size_1, 'r', 's');
5323
5324    my %perl_extension;
5325    # A boolean set iff this table is a Perl extension to the Unicode
5326    # standard.
5327    main::set_access('perl_extension', \%perl_extension, 'r');
5328
5329    my %output_range_counts;
5330    # A boolean set iff this table is to have comments written in the
5331    # output file that contain the number of code points in the range.
5332    # The constructor can override the global flag of the same name.
5333    main::set_access('output_range_counts', \%output_range_counts, 'r');
5334
5335    my %write_as_invlist;
5336    # A boolean set iff the output file for this table is to be in the form of
5337    # an inversion list/map.
5338    main::set_access('write_as_invlist', \%write_as_invlist, 'r');
5339
5340    my %format;
5341    # The format of the entries of the table.  This is calculated from the
5342    # data in the table (or passed in the constructor).  This is an enum e.g.,
5343    # $STRING_FORMAT.  It is marked protected as it should not be generally
5344    # used to override calculations.
5345    main::set_access('format', \%format, 'r', 'p_s');
5346
5347    my %has_dependency;
5348    # A boolean that gives whether some other table in this property is
5349    # defined as the complement of this table.  This is a crude, but currently
5350    # sufficient, mechanism to make this table not get destroyed before what
5351    # is dependent on it is.  Other dependencies could be added, so the name
5352    # was chosen to reflect a more general situation than actually is
5353    # currently the case.
5354    main::set_access('has_dependency', \%has_dependency, 'r', 's');
5355
5356    sub new {
5357        # All arguments are key => value pairs, which you can see below, most
5358        # of which match fields documented above.  Otherwise: Re_Pod_Entry,
5359        # OK_as_Filename, and Fuzzy apply to the names of the table, and are
5360        # documented in the Alias package
5361
5362        return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
5363
5364        my $class = shift;
5365
5366        my $self = bless \do { my $anonymous_scalar }, $class;
5367        my $addr = pack 'J', refaddr $self;
5368
5369        my %args = @_;
5370
5371        $name{$addr} = delete $args{'Name'};
5372        $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
5373        $full_name{$addr} = delete $args{'Full_Name'};
5374        my $complete_name = $complete_name{$addr}
5375                          = delete $args{'Complete_Name'};
5376        $format{$addr} = delete $args{'Format'};
5377        $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
5378        $property{$addr} = delete $args{'_Property'};
5379        $range_list{$addr} = delete $args{'_Range_List'};
5380        $status{$addr} = delete $args{'Status'} || $NORMAL;
5381        $status_info{$addr} = delete $args{'_Status_Info'} || "";
5382        $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
5383        $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
5384        $fate{$addr} = delete $args{'Fate'} || $ORDINARY;
5385        $write_as_invlist{$addr} = delete $args{'Write_As_Invlist'};# No default
5386        my $ucd = delete $args{'UCD'};
5387
5388        my $description = delete $args{'Description'};
5389        my $ok_as_filename = delete $args{'OK_as_Filename'};
5390        my $loose_match = delete $args{'Fuzzy'};
5391        my $note = delete $args{'Note'};
5392        my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
5393        my $perl_extension = delete $args{'Perl_Extension'};
5394        my $suppression_reason = delete $args{'Suppression_Reason'};
5395
5396        # Shouldn't have any left over
5397        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5398
5399        # Can't use || above because conceivably the name could be 0, and
5400        # can't use // operator in case this program gets used in Perl 5.8
5401        $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
5402        $output_range_counts{$addr} = $output_range_counts if
5403                                        ! defined $output_range_counts{$addr};
5404
5405        $aliases{$addr} = [ ];
5406        $comment{$addr} = [ ];
5407        $description{$addr} = [ ];
5408        $note{$addr} = [ ];
5409        $file_path{$addr} = [ ];
5410        $locked{$addr} = "";
5411        $has_dependency{$addr} = 0;
5412
5413        push @{$description{$addr}}, $description if $description;
5414        push @{$note{$addr}}, $note if $note;
5415
5416        if ($fate{$addr} == $PLACEHOLDER) {
5417
5418            # A placeholder table doesn't get documented, is a perl extension,
5419            # and quite likely will be empty
5420            $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
5421            $perl_extension = 1 if ! defined $perl_extension;
5422            $ucd = 0 if ! defined $ucd;
5423            push @tables_that_may_be_empty, $complete_name{$addr};
5424            $self->add_comment(<<END);
5425This is a placeholder because it is not in Version $string_version of Unicode,
5426but is needed by the Perl core to work gracefully.  Because it is not in this
5427version of Unicode, it will not be listed in $pod_file.pod
5428END
5429        }
5430        elsif (exists $why_suppressed{$complete_name}
5431                # Don't suppress if overridden
5432                && ! grep { $_ eq $complete_name{$addr} }
5433                                                    @output_mapped_properties)
5434        {
5435            $fate{$addr} = $SUPPRESSED;
5436        }
5437        elsif ($fate{$addr} == $SUPPRESSED) {
5438            Carp::my_carp_bug("Need reason for suppressing") unless $suppression_reason;
5439            # Though currently unused
5440        }
5441        elsif ($suppression_reason) {
5442            Carp::my_carp_bug("A reason was given for suppressing, but not suppressed");
5443        }
5444
5445        # If hasn't set its status already, see if it is on one of the
5446        # lists of properties or tables that have particular statuses; if
5447        # not, is normal.  The lists are prioritized so the most serious
5448        # ones are checked first
5449        if (! $status{$addr}) {
5450            if (exists $why_deprecated{$complete_name}) {
5451                $status{$addr} = $DEPRECATED;
5452            }
5453            elsif (exists $why_stabilized{$complete_name}) {
5454                $status{$addr} = $STABILIZED;
5455            }
5456            elsif (exists $why_obsolete{$complete_name}) {
5457                $status{$addr} = $OBSOLETE;
5458            }
5459
5460            # Existence above doesn't necessarily mean there is a message
5461            # associated with it.  Use the most serious message.
5462            if ($status{$addr}) {
5463                if ($why_deprecated{$complete_name}) {
5464                    $status_info{$addr}
5465                                = $why_deprecated{$complete_name};
5466                }
5467                elsif ($why_stabilized{$complete_name}) {
5468                    $status_info{$addr}
5469                                = $why_stabilized{$complete_name};
5470                }
5471                elsif ($why_obsolete{$complete_name}) {
5472                    $status_info{$addr}
5473                                = $why_obsolete{$complete_name};
5474                }
5475            }
5476        }
5477
5478        $perl_extension{$addr} = $perl_extension || 0;
5479
5480        # Don't list a property by default that is internal only
5481        if ($fate{$addr} > $MAP_PROXIED) {
5482            $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
5483            $ucd = 0 if ! defined $ucd;
5484        }
5485        else {
5486            $ucd = 1 if ! defined $ucd;
5487        }
5488
5489        # By convention what typically gets printed only or first is what's
5490        # first in the list, so put the full name there for good output
5491        # clarity.  Other routines rely on the full name being first on the
5492        # list
5493        $self->add_alias($full_name{$addr},
5494                            OK_as_Filename => $ok_as_filename,
5495                            Fuzzy => $loose_match,
5496                            Re_Pod_Entry => $make_re_pod_entry,
5497                            Status => $status{$addr},
5498                            UCD => $ucd,
5499                            );
5500
5501        # Then comes the other name, if meaningfully different.
5502        if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
5503            $self->add_alias($name{$addr},
5504                            OK_as_Filename => $ok_as_filename,
5505                            Fuzzy => $loose_match,
5506                            Re_Pod_Entry => $make_re_pod_entry,
5507                            Status => $status{$addr},
5508                            UCD => $ucd,
5509                            );
5510        }
5511
5512        return $self;
5513    }
5514
5515    # Here are the methods that are required to be defined by any derived
5516    # class
5517    for my $sub (qw(
5518                    handle_special_range
5519                    append_to_body
5520                    pre_body
5521                ))
5522                # write() knows how to write out normal ranges, but it calls
5523                # handle_special_range() when it encounters a non-normal one.
5524                # append_to_body() is called by it after it has handled all
5525                # ranges to add anything after the main portion of the table.
5526                # And finally, pre_body() is called after all this to build up
5527                # anything that should appear before the main portion of the
5528                # table.  Doing it this way allows things in the middle to
5529                # affect what should appear before the main portion of the
5530                # table.
5531    {
5532        no strict "refs";
5533        *$sub = sub {
5534            Carp::my_carp_bug( __LINE__
5535                              . ": Must create method '$sub()' for "
5536                              . ref shift);
5537            return;
5538        }
5539    }
5540
5541    use overload
5542        fallback => 0,
5543        "." => \&main::_operator_dot,
5544        ".=" => \&main::_operator_dot_equal,
5545        '!=' => \&main::_operator_not_equal,
5546        '==' => \&main::_operator_equal,
5547    ;
5548
5549    sub ranges {
5550        # Returns the array of ranges associated with this table.
5551
5552        return $range_list{pack 'J', refaddr shift}->ranges;
5553    }
5554
5555    sub add_alias {
5556        # Add a synonym for this table.
5557
5558        return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
5559
5560        my $self = shift;
5561        my $name = shift;       # The name to add.
5562        my $pointer = shift;    # What the alias hash should point to.  For
5563                                # map tables, this is the parent property;
5564                                # for match tables, it is the table itself.
5565
5566        my %args = @_;
5567        my $loose_match = delete $args{'Fuzzy'};
5568
5569        my $ok_as_filename = delete $args{'OK_as_Filename'};
5570        $ok_as_filename = 1 unless defined $ok_as_filename;
5571
5572        # An internal name does not get documented, unless overridden by the
5573        # input; same for making tests for it.
5574        my $status = delete $args{'Status'} || (($name =~ /^_/)
5575                                                ? $INTERNAL_ALIAS
5576                                                : $NORMAL);
5577        my $make_re_pod_entry = delete $args{'Re_Pod_Entry'}
5578                                            // (($status ne $INTERNAL_ALIAS)
5579                                               ? (($name =~ /^_/) ? $NO : $YES)
5580                                               : $NO);
5581        my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1);
5582
5583        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5584
5585        # Capitalize the first letter of the alias unless it is one of the CJK
5586        # ones which specifically begins with a lower 'k'.  Do this because
5587        # Unicode has varied whether they capitalize first letters or not, and
5588        # have later changed their minds and capitalized them, but not the
5589        # other way around.  So do it always and avoid changes from release to
5590        # release
5591        $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
5592
5593        my $addr = pack 'J', refaddr $self;
5594
5595        # Figure out if should be loosely matched if not already specified.
5596        if (! defined $loose_match) {
5597
5598            # Is a loose_match if isn't null, and doesn't begin with an
5599            # underscore and isn't just a number
5600            if ($name ne ""
5601                && substr($name, 0, 1) ne '_'
5602                && $name !~ qr{^[0-9_.+-/]+$})
5603            {
5604                $loose_match = 1;
5605            }
5606            else {
5607                $loose_match = 0;
5608            }
5609        }
5610
5611        # If this alias has already been defined, do nothing.
5612        return if defined $find_table_from_alias{$addr}->{$name};
5613
5614        # That includes if it is standardly equivalent to an existing alias,
5615        # in which case, add this name to the list, so won't have to search
5616        # for it again.
5617        my $standard_name = main::standardize($name);
5618        if (defined $find_table_from_alias{$addr}->{$standard_name}) {
5619            $find_table_from_alias{$addr}->{$name}
5620                        = $find_table_from_alias{$addr}->{$standard_name};
5621            return;
5622        }
5623
5624        # Set the index hash for this alias for future quick reference.
5625        $find_table_from_alias{$addr}->{$name} = $pointer;
5626        $find_table_from_alias{$addr}->{$standard_name} = $pointer;
5627        local $to_trace = 0 if main::DEBUG;
5628        trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
5629        trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
5630
5631
5632        # Put the new alias at the end of the list of aliases unless the final
5633        # element begins with an underscore (meaning it is for internal perl
5634        # use) or is all numeric, in which case, put the new one before that
5635        # one.  This floats any all-numeric or underscore-beginning aliases to
5636        # the end.  This is done so that they are listed last in output lists,
5637        # to encourage the user to use a better name (either more descriptive
5638        # or not an internal-only one) instead.  This ordering is relied on
5639        # implicitly elsewhere in this program, like in short_name()
5640        my $list = $aliases{$addr};
5641        my $insert_position = (@$list == 0
5642                                || (substr($list->[-1]->name, 0, 1) ne '_'
5643                                    && $list->[-1]->name =~ /\D/))
5644                            ? @$list
5645                            : @$list - 1;
5646        splice @$list,
5647                $insert_position,
5648                0,
5649                Alias->new($name, $loose_match, $make_re_pod_entry,
5650                           $ok_as_filename, $status, $ucd);
5651
5652        # This name may be shorter than any existing ones, so clear the cache
5653        # of the shortest, so will have to be recalculated.
5654        undef $short_name{pack 'J', refaddr $self};
5655        return;
5656    }
5657
5658    sub short_name($self, $nominal_length_ptr=undef) {
5659        # Returns a name suitable for use as the base part of a file name.
5660        # That is, shorter wins.  It can return undef if there is no suitable
5661        # name.  The name has all non-essential underscores removed.
5662
5663        # The optional second parameter is a reference to a scalar in which
5664        # this routine will store the length the returned name had before the
5665        # underscores were removed, or undef if the return is undef.
5666
5667        # The shortest name can change if new aliases are added.  So using
5668        # this should be deferred until after all these are added.  The code
5669        # that does that should clear this one's cache.
5670        # Any name with alphabetics is preferred over an all numeric one, even
5671        # if longer.
5672
5673        my $addr = pack 'J', refaddr $self;
5674
5675        # For efficiency, don't recalculate, but this means that adding new
5676        # aliases could change what the shortest is, so the code that does
5677        # that needs to undef this.
5678        if (defined $short_name{$addr}) {
5679            if ($nominal_length_ptr) {
5680                $$nominal_length_ptr = $nominal_short_name_length{$addr};
5681            }
5682            return $short_name{$addr};
5683        }
5684
5685        # Look at each alias
5686        my $is_last_resort = 0;
5687        my $deprecated_or_discouraged
5688                                = qr/ ^ (?: $DEPRECATED | $DISCOURAGED ) $/x;
5689        foreach my $alias ($self->aliases()) {
5690
5691            # Don't use an alias that isn't ok to use for an external name.
5692            next if ! $alias->ok_as_filename;
5693
5694            my $name = main::Standardize($alias->name);
5695            trace $self, $name if main::DEBUG && $to_trace;
5696
5697            # Take the first one, or any non-deprecated non-discouraged one
5698            # over one that is, or a shorter one that isn't numeric.  This
5699            # relies on numeric aliases always being last in the array
5700            # returned by aliases().  Any alpha one will have precedence.
5701            if (   ! defined $short_name{$addr}
5702                || (   $is_last_resort
5703                    && $alias->status !~ $deprecated_or_discouraged)
5704                || ($name =~ /\D/
5705                    && length($name) < length($short_name{$addr})))
5706            {
5707                # Remove interior underscores.
5708                ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
5709
5710                $nominal_short_name_length{$addr} = length $name;
5711                $is_last_resort = $alias->status =~ $deprecated_or_discouraged;
5712            }
5713        }
5714
5715        # If the short name isn't a nice one, perhaps an equivalent table has
5716        # a better one.
5717        if (   $self->can('children')
5718            && (   ! defined $short_name{$addr}
5719                || $short_name{$addr} eq ""
5720                || $short_name{$addr} eq "_"))
5721        {
5722            my $return;
5723            foreach my $follower ($self->children) {    # All equivalents
5724                my $follower_name = $follower->short_name;
5725                next unless defined $follower_name;
5726
5727                # Anything (except undefined) is better than underscore or
5728                # empty
5729                if (! defined $return || $return eq "_") {
5730                    $return = $follower_name;
5731                    next;
5732                }
5733
5734                # If the new follower name isn't "_" and is shorter than the
5735                # current best one, prefer the new one.
5736                next if $follower_name eq "_";
5737                next if length $follower_name > length $return;
5738                $return = $follower_name;
5739            }
5740            $short_name{$addr} = $return if defined $return;
5741        }
5742
5743        # If no suitable external name return undef
5744        if (! defined $short_name{$addr}) {
5745            $$nominal_length_ptr = undef if $nominal_length_ptr;
5746            return;
5747        }
5748
5749        # Don't allow a null short name.
5750        if ($short_name{$addr} eq "") {
5751            $short_name{$addr} = '_';
5752            $nominal_short_name_length{$addr} = 1;
5753        }
5754
5755        trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
5756
5757        if ($nominal_length_ptr) {
5758            $$nominal_length_ptr = $nominal_short_name_length{$addr};
5759        }
5760        return $short_name{$addr};
5761    }
5762
5763    sub external_name($self) {
5764        # Returns the external name that this table should be known by.  This
5765        # is usually the short_name, but not if the short_name is undefined,
5766        # in which case the external_name is arbitrarily set to the
5767        # underscore.
5768
5769        my $short = $self->short_name;
5770        return $short if defined $short;
5771
5772        return '_';
5773    }
5774
5775    sub add_description($self, $description) { # Adds the parameter as a short description.
5776        push @{$description{pack 'J', refaddr $self}}, $description;
5777
5778        return;
5779    }
5780
5781    sub add_note($self, $note) { # Adds the parameter as a short note.
5782        push @{$note{pack 'J', refaddr $self}}, $note;
5783
5784        return;
5785    }
5786
5787    sub add_comment($self, $comment) { # Adds the parameter as a comment.
5788
5789        return unless $debugging_build;
5790
5791        chomp $comment;
5792
5793        push @{$comment{pack 'J', refaddr $self}}, $comment;
5794
5795        return;
5796    }
5797
5798    sub comment($self) {
5799        # Return the current comment for this table.  If called in list
5800        # context, returns the array of comments.  In scalar, returns a string
5801        # of each element joined together with a period ending each.
5802
5803        my $addr = pack 'J', refaddr $self;
5804        my @list = @{$comment{$addr}};
5805        return @list if wantarray;
5806        my $return = "";
5807        foreach my $sentence (@list) {
5808            $return .= '.  ' if $return;
5809            $return .= $sentence;
5810            $return =~ s/\.$//;
5811        }
5812        $return .= '.' if $return;
5813        return $return;
5814    }
5815
5816    sub initialize($self, $initialization) {
5817        # Initialize the table with the argument which is any valid
5818        # initialization for range lists.
5819
5820        my $addr = pack 'J', refaddr $self;
5821
5822        # Replace the current range list with a new one of the same exact
5823        # type.
5824        my $class = ref $range_list{$addr};
5825        $range_list{$addr} = $class->new(Owner => $self,
5826                                        Initialize => $initialization);
5827        return;
5828
5829    }
5830
5831    sub header($self) {
5832        # The header that is output for the table in the file it is written
5833        # in.
5834        my $return = "";
5835        $return .= $DEVELOPMENT_ONLY if $compare_versions;
5836        $return .= $HEADER;
5837        return $return;
5838    }
5839
5840    sub merge_single_annotation_line ($output, $annotation, $annotation_column) {
5841
5842        # This appends an annotation comment, $annotation, to $output,
5843        # starting in or after column $annotation_column, removing any
5844        # pre-existing comment from $output.
5845
5846        $annotation =~ s/^ \s* \# \  //x;
5847        $output =~ s/ \s* ( \# \N* )? \n //x;
5848        $output = Text::Tabs::expand($output);
5849
5850        my $spaces = $annotation_column - length $output;
5851        $spaces = 2 if $spaces < 0;  # Have 2 blanks before the comment
5852
5853        $output = sprintf "%s%*s# %s",
5854                            $output,
5855                            $spaces,
5856                            " ",
5857                            $annotation;
5858        return Text::Tabs::unexpand $output;
5859    }
5860
5861    sub write($self, $use_adjustments=0, $suppress_value=0) {
5862        # Write a representation of the table to its file.  It calls several
5863        # functions furnished by sub-classes of this abstract base class to
5864        # handle non-normal ranges, to add stuff before the table, and at its
5865        # end.  If the table is to be written so that adjustments are
5866        # required, this does that conversion.
5867
5868
5869        # $use_adjustments ? output in adjusted format or not
5870        # $suppress_value Optional, if the value associated with
5871        # a range equals this one, don't write
5872        # the range
5873
5874        my $addr = pack 'J', refaddr $self;
5875        my $write_as_invlist = $write_as_invlist{$addr};
5876
5877        # Start with the header
5878        my @HEADER = $self->header;
5879
5880        # Then the comments
5881        push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
5882                                                        if $comment{$addr};
5883
5884        # Things discovered processing the main body of the document may
5885        # affect what gets output before it, therefore pre_body() isn't called
5886        # until after all other processing of the table is done.
5887
5888        # The main body looks like a 'here' document.  If there are comments,
5889        # get rid of them when processing it.
5890        my @OUT;
5891        if ($annotate || $output_range_counts) {
5892            # Use the line below in Perls that don't have /r
5893            #push @OUT, 'return join "\n",  map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
5894            push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
5895        } else {
5896            push @OUT, "return <<'END';\n";
5897        }
5898
5899        if ($range_list{$addr}->is_empty) {
5900
5901            # This is a kludge for empty tables to silence a warning in
5902            # utf8.c, which can't really deal with empty tables, but it can
5903            # deal with a table that matches nothing, as the inverse of 'All'
5904            # does.
5905            push @OUT, "!Unicode::UCD::All\n";
5906        }
5907        elsif ($self->name eq 'N'
5908
5909               # To save disk space and table cache space, avoid putting out
5910               # binary N tables, but instead create a file which just inverts
5911               # the Y table.  Since the file will still exist and occupy a
5912               # certain number of blocks, might as well output the whole
5913               # thing if it all will fit in one block.   The number of
5914               # ranges below is an approximate number for that.
5915               && ($self->property->type == $BINARY
5916                   || $self->property->type == $FORCED_BINARY)
5917               # && $self->property->tables == 2  Can't do this because the
5918               #        non-binary properties, like NFDQC aren't specifiable
5919               #        by the notation
5920               && $range_list{$addr}->ranges > 15
5921               && ! $annotate)  # Under --annotate, want to see everything
5922        {
5923            push @OUT, "!Unicode::UCD::" . $self->property->name . "\n";
5924        }
5925        else {
5926            my $range_size_1 = $range_size_1{$addr};
5927
5928            # To make it more readable, use a minimum indentation
5929            my $comment_indent;
5930
5931            # These are used only in $annotate option
5932            my $format;         # e.g. $HEX_ADJUST_FORMAT
5933            my $include_name;   # ? Include the character's name in the
5934                                # annotation?
5935            my $include_cp;     # ? Include its code point
5936
5937            if (! $annotate) {
5938                $comment_indent = ($self->isa('Map_Table'))
5939                                  ? 24
5940                                  : ($write_as_invlist)
5941                                    ? 8
5942                                    : 16;
5943            }
5944            else {
5945                $format = $self->format;
5946
5947                # The name of the character is output only for tables that
5948                # don't already include the name in the output.
5949                my $property = $self->property;
5950                $include_name =
5951                    !  ($property == $perl_charname
5952                        || $property == main::property_ref('Unicode_1_Name')
5953                        || $property == main::property_ref('Name')
5954                        || $property == main::property_ref('Name_Alias')
5955                       );
5956
5957                # Don't include the code point in the annotation where all
5958                # lines are a single code point, so it can be easily found in
5959                # the first column
5960                $include_cp = ! $range_size_1;
5961
5962                if (! $self->isa('Map_Table')) {
5963                    $comment_indent = ($write_as_invlist) ? 8 : 16;
5964                }
5965                else {
5966                    $comment_indent = 16;
5967
5968                    # There are just a few short ranges in this table, so no
5969                    # need to include the code point in the annotation.
5970                    $include_cp = 0 if $format eq $DECOMP_STRING_FORMAT;
5971
5972                    # We're trying to get this to look good, as the whole
5973                    # point is to make human-readable tables.  It is easier to
5974                    # read if almost all the annotation comments begin in the
5975                    # same column.  Map tables have varying width maps, so can
5976                    # create a jagged comment appearance.  This code does a
5977                    # preliminary pass through these tables looking for the
5978                    # maximum width map in each, and causing the comments to
5979                    # begin just to the right of that.  However, if the
5980                    # comments begin too far to the right of most lines, it's
5981                    # hard to line them up horizontally with their real data.
5982                    # Therefore we ignore the longest outliers
5983                    my $ignore_longest_X_percent = 2;  # Discard longest X%
5984
5985                    # Each key in this hash is a width of at least one of the
5986                    # maps in the table.  Its value is how many lines have
5987                    # that width.
5988                    my %widths;
5989
5990                    # We won't space things further left than one tab stop
5991                    # after the rest of the line; initializing it to that
5992                    # number saves some work.
5993                    my $max_map_width = 8;
5994
5995                    # Fill in the %widths hash
5996                    my $total = 0;
5997                    for my $set ($range_list{$addr}->ranges) {
5998                        my $value = $set->value;
5999
6000                        # These range types don't appear in the main table
6001                        next if $set->type == 0
6002                                && defined $suppress_value
6003                                && $value eq $suppress_value;
6004                        next if $set->type == $MULTI_CP
6005                                || $set->type == $NULL;
6006
6007                        # Include 2 spaces before the beginning of the
6008                        # comment
6009                        my $this_width = length($value) + 2;
6010
6011                        # Ranges of the remaining non-zero types usually
6012                        # occupy just one line (maybe occasionally two, but
6013                        # this doesn't have to be dead accurate).  This is
6014                        # because these ranges are like "unassigned code
6015                        # points"
6016                        my $count = ($set->type != 0)
6017                                    ? 1
6018                                    : $set->end - $set->start + 1;
6019                        $widths{$this_width} += $count;
6020                        $total += $count;
6021                        $max_map_width = $this_width
6022                                            if $max_map_width < $this_width;
6023                    }
6024
6025                    # If the widest map gives us less than two tab stops
6026                    # worth, just take it as-is.
6027                    if ($max_map_width > 16) {
6028
6029                        # Otherwise go through %widths until we have included
6030                        # the desired percentage of lines in the whole table.
6031                        my $running_total = 0;
6032                        foreach my $width (sort { $a <=> $b } keys %widths)
6033                        {
6034                            $running_total += $widths{$width};
6035                            use integer;
6036                            if ($running_total * 100 / $total
6037                                            >= 100 - $ignore_longest_X_percent)
6038                            {
6039                                $max_map_width = $width;
6040                                last;
6041                            }
6042                        }
6043                    }
6044                    $comment_indent += $max_map_width;
6045                }
6046            }
6047
6048            # Values for previous time through the loop.  Initialize to
6049            # something that won't be adjacent to the first iteration;
6050            # only $previous_end matters for that.
6051            my $previous_start;
6052            my $previous_end = -2;
6053            my $previous_value;
6054
6055            # Values for next time through the portion of the loop that splits
6056            # the range.  0 in $next_start means there is no remaining portion
6057            # to deal with.
6058            my $next_start = 0;
6059            my $next_end;
6060            my $next_value;
6061            my $offset = 0;
6062            my $invlist_count = 0;
6063
6064            my $output_value_in_hex = $self->isa('Map_Table')
6065                                && ($self->format eq $HEX_ADJUST_FORMAT
6066                                    || $self->to_output_map == $EXTERNAL_MAP);
6067            # Use leading zeroes just for files whose format should not be
6068            # changed from what it has been.  Otherwise, they just take up
6069            # space and time to process.
6070            my $hex_format = ($self->isa('Map_Table')
6071                              && $self->to_output_map == $EXTERNAL_MAP)
6072                             ? "%04X"
6073                             : "%X";
6074
6075            # The values for some of these tables are stored in mktables as
6076            # hex strings.  Normally, these are just output as strings without
6077            # change, but when we are doing adjustments, we have to operate on
6078            # these numerically, so we convert those to decimal to do that,
6079            # and back to hex for output
6080            my $convert_map_to_from_hex = 0;
6081            my $output_map_in_hex = 0;
6082            if ($self->isa('Map_Table')) {
6083                $convert_map_to_from_hex
6084                   = ($use_adjustments && $self->format eq $HEX_ADJUST_FORMAT)
6085                      || ($annotate && $self->format eq $HEX_FORMAT);
6086                $output_map_in_hex = $convert_map_to_from_hex
6087                                 || $self->format eq $HEX_FORMAT;
6088            }
6089
6090            # To store any annotations about the characters.
6091            my @annotation;
6092
6093            # Output each range as part of the here document.
6094            RANGE:
6095            for my $set ($range_list{$addr}->ranges) {
6096                if ($set->type != 0) {
6097                    $self->handle_special_range($set);
6098                    next RANGE;
6099                }
6100                my $start = $set->start;
6101                my $end   = $set->end;
6102                my $value  = $set->value;
6103
6104                # Don't output ranges whose value is the one to suppress
6105                next RANGE if defined $suppress_value
6106                              && $value eq $suppress_value;
6107
6108                $value = CORE::hex $value if $convert_map_to_from_hex;
6109
6110
6111                {   # This bare block encloses the scope where we may need to
6112                    # 'redo' to.  Consider a table that is to be written out
6113                    # using single item ranges.  This is given in the
6114                    # $range_size_1 boolean.  To accomplish this, we split the
6115                    # range each time through the loop into two portions, the
6116                    # first item, and the rest.  We handle that first item
6117                    # this time in the loop, and 'redo' to repeat the process
6118                    # for the rest of the range.
6119                    #
6120                    # We may also have to do it, with other special handling,
6121                    # if the table has adjustments.  Consider the table that
6122                    # contains the lowercasing maps.  mktables stores the
6123                    # ASCII range ones as 26 ranges:
6124                    #       ord('A') => ord('a'), .. ord('Z') => ord('z')
6125                    # For compactness, the table that gets written has this as
6126                    # just one range
6127                    #       ( ord('A') .. ord('Z') ) => ord('a')
6128                    # and the software that reads the tables is smart enough
6129                    # to "connect the dots".  This change is accomplished in
6130                    # this loop by looking to see if the current iteration
6131                    # fits the paradigm of the previous iteration, and if so,
6132                    # we merge them by replacing the final output item with
6133                    # the merged data.  Repeated 25 times, this gets A-Z.  But
6134                    # we also have to make sure we don't screw up cases where
6135                    # we have internally stored
6136                    #       ( 0x1C4 .. 0x1C6 ) => 0x1C5
6137                    # This single internal range has to be output as 3 ranges,
6138                    # which is done by splitting, like we do for $range_size_1
6139                    # tables.  (There are very few of such ranges that need to
6140                    # be split, so the gain of doing the combining of other
6141                    # ranges far outweighs the splitting of these.)  The
6142                    # values to use for the redo at the end of this block are
6143                    # set up just below in the scalars whose names begin with
6144                    # '$next_'.
6145
6146                    if (($use_adjustments || $range_size_1) && $end != $start)
6147                    {
6148                        $next_start = $start + 1;
6149                        $next_end = $end;
6150                        $next_value = $value;
6151                        $end = $start;
6152                    }
6153
6154                    if ($use_adjustments && ! $range_size_1) {
6155
6156                        # If this range is adjacent to the previous one, and
6157                        # the values in each are integers that are also
6158                        # adjacent (differ by 1), then this range really
6159                        # extends the previous one that is already in element
6160                        # $OUT[-1].  So we pop that element, and pretend that
6161                        # the range starts with whatever it started with.
6162                        # $offset is incremented by 1 each time so that it
6163                        # gives the current offset from the first element in
6164                        # the accumulating range, and we keep in $value the
6165                        # value of that first element.
6166                        if ($start == $previous_end + 1
6167                            && $value =~ /^ -? \d+ $/xa
6168                            && $previous_value =~ /^ -? \d+ $/xa
6169                            && ($value == ($previous_value + ++$offset)))
6170                        {
6171                            pop @OUT;
6172                            $start = $previous_start;
6173                            $value = $previous_value;
6174                        }
6175                        else {
6176                            $offset = 0;
6177                            if (@annotation == 1) {
6178                                $OUT[-1] = merge_single_annotation_line(
6179                                    $OUT[-1], $annotation[0], $comment_indent);
6180                            }
6181                            else {
6182                                push @OUT, @annotation;
6183                            }
6184                        }
6185                        undef @annotation;
6186
6187                        # Save the current values for the next time through
6188                        # the loop.
6189                        $previous_start = $start;
6190                        $previous_end = $end;
6191                        $previous_value = $value;
6192                    }
6193
6194                    if ($write_as_invlist) {
6195                        if (   $previous_end > 0
6196                            && $output_range_counts{$addr})
6197                        {
6198                            my $complement_count = $start - $previous_end - 1;
6199                            if ($complement_count > 1) {
6200                                $OUT[-1] = merge_single_annotation_line(
6201                                    $OUT[-1],
6202                                       "#"
6203                                     . (" " x 17)
6204                                     . "["
6205                                     .  main::clarify_code_point_count(
6206                                                            $complement_count)
6207                                      . "] in complement\n",
6208                                    $comment_indent);
6209                            }
6210                        }
6211
6212                        # Inversion list format has a single number per line,
6213                        # the starting code point of a range that matches the
6214                        # property
6215                        push @OUT, $start, "\n";
6216                        $invlist_count++;
6217
6218                        # Add a comment with the size of the range, if
6219                        # requested.
6220                        if ($output_range_counts{$addr}) {
6221                            $OUT[-1] = merge_single_annotation_line(
6222                                    $OUT[-1],
6223                                    "# ["
6224                                      . main::clarify_code_point_count($end - $start + 1)
6225                                      . "]\n",
6226                                    $comment_indent);
6227                        }
6228                    }
6229                    elsif ($start != $end) { # If there is a range
6230                        if ($end == $MAX_WORKING_CODEPOINT) {
6231                            push @OUT, sprintf "$hex_format\t$hex_format",
6232                                                $start,
6233                                                $MAX_PLATFORM_CODEPOINT;
6234                        }
6235                        else {
6236                            push @OUT, sprintf "$hex_format\t$hex_format",
6237                                                $start,       $end;
6238                        }
6239                        if (length $value) {
6240                            if ($convert_map_to_from_hex) {
6241                                $OUT[-1] .= sprintf "\t$hex_format\n", $value;
6242                            }
6243                            else {
6244                                $OUT[-1] .= "\t$value\n";
6245                            }
6246                        }
6247
6248                        # Add a comment with the size of the range, if
6249                        # requested.
6250                        if ($output_range_counts{$addr}) {
6251                            $OUT[-1] = merge_single_annotation_line(
6252                                    $OUT[-1],
6253                                    "# ["
6254                                      . main::clarify_code_point_count($end - $start + 1)
6255                                      . "]\n",
6256                                    $comment_indent);
6257                        }
6258                    }
6259                    else { # Here to output a single code point per line.
6260
6261                        # Use any passed in subroutine to output.
6262                        if (ref $range_size_1 eq 'CODE') {
6263                            for my $i ($start .. $end) {
6264                                push @OUT, &{$range_size_1}($i, $value);
6265                            }
6266                        }
6267                        else {
6268
6269                            # Here, caller is ok with default output.
6270                            for (my $i = $start; $i <= $end; $i++) {
6271                                if ($convert_map_to_from_hex) {
6272                                    push @OUT,
6273                                        sprintf "$hex_format\t\t$hex_format\n",
6274                                                 $i,            $value;
6275                                }
6276                                else {
6277                                    push @OUT, sprintf $hex_format, $i;
6278                                    $OUT[-1] .= "\t\t$value" if $value ne "";
6279                                    $OUT[-1] .= "\n";
6280                                }
6281                            }
6282                        }
6283                    }
6284
6285                    if ($annotate) {
6286                        for (my $i = $start; $i <= $end; $i++) {
6287                            my $annotation = "";
6288
6289                            # Get character information if don't have it already
6290                            main::populate_char_info($i)
6291                                                     if ! defined $viacode[$i];
6292                            my $type = $annotate_char_type[$i];
6293
6294                            # Figure out if should output the next code points
6295                            # as part of a range or not.  If this is not in an
6296                            # annotation range, then won't output as a range,
6297                            # so returns $i.  Otherwise use the end of the
6298                            # annotation range, but no further than the
6299                            # maximum possible end point of the loop.
6300                            my $range_end =
6301                                        $range_size_1
6302                                        ? $start
6303                                        : main::min(
6304                                          $annotate_ranges->value_of($i) || $i,
6305                                          $end);
6306
6307                            # Use a range if it is a range, and either is one
6308                            # of the special annotation ranges, or the range
6309                            # is at most 3 long.  This last case causes the
6310                            # algorithmically named code points to be output
6311                            # individually in spans of at most 3, as they are
6312                            # the ones whose $type is > 0.
6313                            if ($range_end != $i
6314                                && ( $type < 0 || $range_end - $i > 2))
6315                            {
6316                                # Here is to output a range.  We don't allow a
6317                                # caller-specified output format--just use the
6318                                # standard one.
6319                                my $range_name = $viacode[$i];
6320
6321                                # For the code points which end in their hex
6322                                # value, we eliminate that from the output
6323                                # annotation, and capitalize only the first
6324                                # letter of each word.
6325                                if ($type == $CP_IN_NAME) {
6326                                    my $hex = sprintf $hex_format, $i;
6327                                    $range_name =~ s/-$hex$//;
6328                                    my @words = split " ", $range_name;
6329                                    for my $word (@words) {
6330                                        $word =
6331                                          ucfirst(lc($word)) if $word ne 'CJK';
6332                                    }
6333                                    $range_name = join " ", @words;
6334                                }
6335                                elsif ($type == $HANGUL_SYLLABLE) {
6336                                    $range_name = "Hangul Syllable";
6337                                }
6338
6339                                # If the annotation would just repeat what's
6340                                # already being output as the range, skip it.
6341                                # (When an inversion list is being written, it
6342                                # isn't a repeat, as that always is in
6343                                # decimal)
6344                                if (   $write_as_invlist
6345                                    || $i != $start
6346                                    || $range_end < $end)
6347                                {
6348                                    if ($range_end < $MAX_WORKING_CODEPOINT)
6349                                    {
6350                                        $annotation = sprintf "%04X..%04X",
6351                                                              $i,   $range_end;
6352                                    }
6353                                    else {
6354                                        $annotation = sprintf "%04X..INFINITY",
6355                                                               $i;
6356                                    }
6357                                }
6358                                else { # Indent if not displaying code points
6359                                    $annotation = " " x 4;
6360                                }
6361
6362                                if ($range_name) {
6363                                    $annotation .= " $age[$i]" if $age[$i];
6364                                    $annotation .= " $range_name";
6365                                }
6366
6367                                # Include the number of code points in the
6368                                # range
6369                                my $count =
6370                                    main::clarify_code_point_count($range_end - $i + 1);
6371                                $annotation .= " [$count]\n";
6372
6373                                # Skip to the end of the range
6374                                $i = $range_end;
6375                            }
6376                            else { # Not in a range.
6377                                my $comment = "";
6378
6379                                # When outputting the names of each character,
6380                                # use the character itself if printable
6381                                $comment .= "'" . main::display_chr($i) . "' "
6382                                                            if $printable[$i];
6383
6384                                my $output_value = $value;
6385
6386                                # Determine the annotation
6387                                if ($format eq $DECOMP_STRING_FORMAT) {
6388
6389                                    # This is very specialized, with the type
6390                                    # of decomposition beginning the line
6391                                    # enclosed in <...>, and the code points
6392                                    # that the code point decomposes to
6393                                    # separated by blanks.  Create two
6394                                    # strings, one of the printable
6395                                    # characters, and one of their official
6396                                    # names.
6397                                    (my $map = $output_value)
6398                                                    =~ s/ \ * < .*? > \ +//x;
6399                                    my $tostr = "";
6400                                    my $to_name = "";
6401                                    my $to_chr = "";
6402                                    foreach my $to (split " ", $map) {
6403                                        $to = CORE::hex $to;
6404                                        $to_name .= " + " if $to_name;
6405                                        $to_chr .= main::display_chr($to);
6406                                        main::populate_char_info($to)
6407                                                    if ! defined $viacode[$to];
6408                                        $to_name .=  $viacode[$to];
6409                                    }
6410
6411                                    $comment .=
6412                                    "=> '$to_chr'; $viacode[$i] => $to_name";
6413                                }
6414                                else {
6415                                    $output_value += $i - $start
6416                                                   if $use_adjustments
6417                                                      # Don't try to adjust a
6418                                                      # non-integer
6419                                                   && $output_value !~ /[-\D]/;
6420
6421                                    if ($output_map_in_hex) {
6422                                        main::populate_char_info($output_value)
6423                                          if ! defined $viacode[$output_value];
6424                                        $comment .= " => '"
6425                                        . main::display_chr($output_value)
6426                                        . "'; " if $printable[$output_value];
6427                                    }
6428                                    if ($include_name && $viacode[$i]) {
6429                                        $comment .= " " if $comment;
6430                                        $comment .= $viacode[$i];
6431                                    }
6432                                    if ($output_map_in_hex) {
6433                                        $comment .=
6434                                                " => $viacode[$output_value]"
6435                                                    if $viacode[$output_value];
6436                                        $output_value = sprintf($hex_format,
6437                                                                $output_value);
6438                                    }
6439                                }
6440
6441                                if ($include_cp) {
6442                                    $annotation = sprintf "%04X %s", $i, $age[$i];
6443                                    if ($use_adjustments) {
6444                                        $annotation .= " => $output_value";
6445                                    }
6446                                }
6447
6448                                if ($comment ne "") {
6449                                    $annotation .= " " if $annotation ne "";
6450                                    $annotation .= $comment;
6451                                }
6452                                $annotation .= "\n" if $annotation ne "";
6453                            }
6454
6455                            if ($annotation ne "") {
6456                                push @annotation, (" " x $comment_indent)
6457                                                  .  "# $annotation";
6458                            }
6459                        }
6460
6461                        # If not adjusting, we don't have to go through the
6462                        # loop again to know that the annotation comes next
6463                        # in the output.
6464                        if (! $use_adjustments) {
6465                            if (@annotation == 1) {
6466                                $OUT[-1] = merge_single_annotation_line(
6467                                    $OUT[-1], $annotation[0], $comment_indent);
6468                            }
6469                            else {
6470                                push @OUT, map { Text::Tabs::unexpand $_ }
6471                                               @annotation;
6472                            }
6473                            undef @annotation;
6474                        }
6475                    }
6476
6477                    # Add the beginning of the range that doesn't match the
6478                    # property, except if the just added match range extends
6479                    # to infinity.  We do this after any annotations for the
6480                    # match range.
6481                    if ($write_as_invlist && $end < $MAX_WORKING_CODEPOINT) {
6482                        push @OUT, $end + 1, "\n";
6483                        $invlist_count++;
6484                    }
6485
6486                    # If we split the range, set up so the next time through
6487                    # we get the remainder, and redo.
6488                    if ($next_start) {
6489                        $start = $next_start;
6490                        $end = $next_end;
6491                        $value = $next_value;
6492                        $next_start = 0;
6493                        redo;
6494                    }
6495                } # End of redo block
6496            } # End of loop through all the table's ranges
6497
6498            push @OUT, @annotation; # Add orphaned annotation, if any
6499
6500            splice @OUT, 1, 0, "V$invlist_count\n" if $invlist_count;
6501        }
6502
6503        # Add anything that goes after the main body, but within the here
6504        # document,
6505        my $append_to_body = $self->append_to_body;
6506        push @OUT, $append_to_body if $append_to_body;
6507
6508        # And finish the here document.
6509        push @OUT, "END\n";
6510
6511        # Done with the main portion of the body.  Can now figure out what
6512        # should appear before it in the file.
6513        my $pre_body = $self->pre_body;
6514        push @HEADER, $pre_body, "\n" if $pre_body;
6515
6516        # All these files should have a .pl suffix added to them.
6517        my @file_with_pl = @{$file_path{$addr}};
6518        $file_with_pl[-1] .= '.pl';
6519
6520        main::write(\@file_with_pl,
6521                    $annotate,      # utf8 iff annotating
6522                    \@HEADER,
6523                    \@OUT);
6524        return;
6525    }
6526
6527    sub set_status($self, $status, $info) {    # Set the table's status
6528        # status The status enum value
6529        # info Any message associated with it.
6530        my $addr = pack 'J', refaddr $self;
6531
6532        $status{$addr} = $status;
6533        $status_info{$addr} = $info;
6534        return;
6535    }
6536
6537    sub set_fate($self, $fate, $reason=undef) {  # Set the fate of a table
6538        my $addr = pack 'J', refaddr $self;
6539
6540        return if $fate{$addr} == $fate;    # If no-op
6541
6542        # Can only change the ordinary fate, except if going to $MAP_PROXIED
6543        return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED;
6544
6545        $fate{$addr} = $fate;
6546
6547        # Don't document anything to do with a non-normal fated table
6548        if ($fate != $ORDINARY) {
6549            my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0;
6550            foreach my $alias ($self->aliases) {
6551                $alias->set_ucd($put_in_pod);
6552
6553                # MAP_PROXIED doesn't affect the match tables
6554                next if $fate == $MAP_PROXIED;
6555                $alias->set_make_re_pod_entry($put_in_pod);
6556            }
6557        }
6558
6559        # Save the reason for suppression for output
6560        if ($fate >= $SUPPRESSED) {
6561            $reason = "" unless defined $reason;
6562            $why_suppressed{$complete_name{$addr}} = $reason;
6563        }
6564
6565        return;
6566    }
6567
6568    sub lock($self) {
6569        # Don't allow changes to the table from now on.  This stores a stack
6570        # trace of where it was called, so that later attempts to modify it
6571        # can immediately show where it got locked.
6572        my $addr = pack 'J', refaddr $self;
6573
6574        $locked{$addr} = "";
6575
6576        my $line = (caller(0))[2];
6577        my $i = 1;
6578
6579        # Accumulate the stack trace
6580        while (1) {
6581            my ($pkg, $file, $caller_line, $caller) = caller $i++;
6582
6583            last unless defined $caller;
6584
6585            $locked{$addr} .= "    called from $caller() at line $line\n";
6586            $line = $caller_line;
6587        }
6588        $locked{$addr} .= "    called from main at line $line\n";
6589
6590        return;
6591    }
6592
6593    sub carp_if_locked($self) {
6594        # Return whether a table is locked or not, and, by the way, complain
6595        # if is locked
6596        my $addr = pack 'J', refaddr $self;
6597
6598        return 0 if ! $locked{$addr};
6599        Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
6600        return 1;
6601    }
6602
6603    sub set_file_path($self, @path) { # Set the final directory path for this table
6604        @{$file_path{pack 'J', refaddr $self}} = @path;
6605        return
6606    }
6607
6608    # Accessors for the range list stored in this table.  First for
6609    # unconditional
6610    for my $sub (qw(
6611                    containing_range
6612                    contains
6613                    count
6614                    each_range
6615                    hash
6616                    is_empty
6617                    matches_identically_to
6618                    max
6619                    min
6620                    range_count
6621                    reset_each_range
6622                    type_of
6623                    value_of
6624                ))
6625    {
6626        no strict "refs";
6627        *$sub = sub {
6628            use strict "refs";
6629            my $self = shift;
6630            return $self->_range_list->$sub(@_);
6631        }
6632    }
6633
6634    # Then for ones that should fail if locked
6635    for my $sub (qw(
6636                    delete_range
6637                ))
6638    {
6639        no strict "refs";
6640        *$sub = sub {
6641            use strict "refs";
6642            my $self = shift;
6643
6644            return if $self->carp_if_locked;
6645            no overloading;
6646            return $self->_range_list->$sub(@_);
6647        }
6648    }
6649
6650} # End closure
6651
6652package Map_Table;
6653use parent '-norequire', '_Base_Table';
6654
6655# A Map Table is a table that contains the mappings from code points to
6656# values.  There are two weird cases:
6657# 1) Anomalous entries are ones that aren't maps of ranges of code points, but
6658#    are written in the table's file at the end of the table nonetheless.  It
6659#    requires specially constructed code to handle these; utf8.c can not read
6660#    these in, so they should not go in $map_directory.  As of this writing,
6661#    the only case that these happen is for named sequences used in
6662#    charnames.pm.   But this code doesn't enforce any syntax on these, so
6663#    something else could come along that uses it.
6664# 2) Specials are anything that doesn't fit syntactically into the body of the
6665#    table.  The ranges for these have a map type of non-zero.  The code below
6666#    knows about and handles each possible type.   In most cases, these are
6667#    written as part of the header.
6668#
6669# A map table deliberately can't be manipulated at will unlike match tables.
6670# This is because of the ambiguities having to do with what to do with
6671# overlapping code points.  And there just isn't a need for those things;
6672# what one wants to do is just query, add, replace, or delete mappings, plus
6673# write the final result.
6674# However, there is a method to get the list of possible ranges that aren't in
6675# this table to use for defaulting missing code point mappings.  And,
6676# map_add_or_replace_non_nulls() does allow one to add another table to this
6677# one, but it is clearly very specialized, and defined that the other's
6678# non-null values replace this one's if there is any overlap.
6679
6680sub trace { return main::trace(@_); }
6681
6682{ # Closure
6683
6684    main::setup_package();
6685
6686    my %default_map;
6687    # Many input files omit some entries; this gives what the mapping for the
6688    # missing entries should be
6689    main::set_access('default_map', \%default_map, 'r');
6690
6691    my %anomalous_entries;
6692    # Things that go in the body of the table which don't fit the normal
6693    # scheme of things, like having a range.  Not much can be done with these
6694    # once there except to output them.  This was created to handle named
6695    # sequences.
6696    main::set_access('anomalous_entry', \%anomalous_entries, 'a');
6697    main::set_access('anomalous_entries',       # Append singular, read plural
6698                    \%anomalous_entries,
6699                    'readable_array');
6700    my %to_output_map;
6701    # Enum as to whether or not to write out this map table, and how:
6702    #   0               don't output
6703    #   $EXTERNAL_MAP   means its existence is noted in the documentation, and
6704    #                   it should not be removed nor its format changed.  This
6705    #                   is done for those files that have traditionally been
6706    #                   output.
6707    #   $INTERNAL_MAP   means Perl reserves the right to do anything it wants
6708    #                   with this file
6709    #   $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of
6710    #                   outputting the actual mappings as-is, we adjust things
6711    #                   to create a much more compact table. Only those few
6712    #                   tables where the mapping is convertible at least to an
6713    #                   integer and compacting makes a big difference should
6714    #                   have this.  Hence, the default is to not do this
6715    #                   unless the table's default mapping is to $CODE_POINT,
6716    #                   and the range size is not 1.
6717    main::set_access('to_output_map', \%to_output_map, 's');
6718
6719    sub new {
6720        my $class = shift;
6721        my $name = shift;
6722
6723        my %args = @_;
6724
6725        # Optional initialization data for the table.
6726        my $initialize = delete $args{'Initialize'};
6727
6728        my $default_map = delete $args{'Default_Map'};
6729        my $property = delete $args{'_Property'};
6730        my $full_name = delete $args{'Full_Name'};
6731        my $to_output_map = delete $args{'To_Output_Map'};
6732
6733        # Rest of parameters passed on
6734
6735        my $range_list = Range_Map->new(Owner => $property);
6736
6737        my $self = $class->SUPER::new(
6738                                    Name => $name,
6739                                    Complete_Name =>  $full_name,
6740                                    Full_Name => $full_name,
6741                                    _Property => $property,
6742                                    _Range_List => $range_list,
6743                                    Write_As_Invlist => 0,
6744                                    %args);
6745
6746        my $addr = pack 'J', refaddr $self;
6747
6748        $anomalous_entries{$addr} = [];
6749        $default_map{$addr} = $default_map;
6750        $to_output_map{$addr} = $to_output_map;
6751
6752        $self->initialize($initialize) if defined $initialize;
6753
6754        return $self;
6755    }
6756
6757    use overload
6758        fallback => 0,
6759        qw("") => "_operator_stringify",
6760    ;
6761
6762    sub _operator_stringify($self, $other="", $reversed=0) {
6763
6764        my $name = $self->property->full_name;
6765        $name = '""' if $name eq "";
6766        return "Map table for Property '$name'";
6767    }
6768
6769    sub add_alias {
6770        # Add a synonym for this table (which means the property itself)
6771        my $self = shift;
6772        my $name = shift;
6773        # Rest of parameters passed on.
6774
6775        $self->SUPER::add_alias($name, $self->property, @_);
6776        return;
6777    }
6778
6779    sub add_map {
6780        # Add a range of code points to the list of specially-handled code
6781        # points.  0 is assumed if the type of special is not passed
6782        # in.
6783
6784        my $self = shift;
6785        my $lower = shift;
6786        my $upper = shift;
6787        my $string = shift;
6788        my %args = @_;
6789
6790        my $type = delete $args{'Type'} || 0;
6791        # Rest of parameters passed on
6792
6793        # Can't change the table if locked.
6794        return if $self->carp_if_locked;
6795
6796        $self->_range_list->add_map($lower, $upper,
6797                                    $string,
6798                                    @_,
6799                                    Type => $type);
6800        return;
6801    }
6802
6803    sub append_to_body($self) {
6804        # Adds to the written HERE document of the table's body any anomalous
6805        # entries in the table..
6806        my $addr = pack 'J', refaddr $self;
6807
6808        return "" unless @{$anomalous_entries{$addr}};
6809        return join("\n", @{$anomalous_entries{$addr}}) . "\n";
6810    }
6811
6812    sub map_add_or_replace_non_nulls($self, $other) {
6813        # This adds the mappings in the table $other to $self.  Non-null
6814        # mappings from $other override those in $self.  It essentially merges
6815        # the two tables, with the second having priority except for null
6816        # mappings.
6817        return if $self->carp_if_locked;
6818
6819        if (! $other->isa(__PACKAGE__)) {
6820            Carp::my_carp_bug("$other should be a "
6821                        . __PACKAGE__
6822                        . ".  Not a '"
6823                        . ref($other)
6824                        . "'.  Not added;");
6825            return;
6826        }
6827
6828        local $to_trace = 0 if main::DEBUG;
6829
6830        my $self_range_list = $self->_range_list;
6831        my $other_range_list = $other->_range_list;
6832        foreach my $range ($other_range_list->ranges) {
6833            my $value = $range->value;
6834            next if $value eq "";
6835            $self_range_list->_add_delete('+',
6836                                          $range->start,
6837                                          $range->end,
6838                                          $value,
6839                                          Type => $range->type,
6840                                          Replace => $UNCONDITIONALLY);
6841        }
6842
6843        return;
6844    }
6845
6846    sub set_default_map($self, $map, $use_full_name=0) {
6847        # Define what code points that are missing from the input files should
6848        # map to.  The optional second parameter 'full_name' indicates to
6849        # force using the full name of the map instead of its standard name.
6850        if ($use_full_name && $use_full_name ne 'full_name') {
6851            Carp::my_carp_bug("Second parameter to set_default_map() if"
6852                            . " present, must be 'full_name'");
6853        }
6854
6855        my $addr = pack 'J', refaddr $self;
6856
6857        # Convert the input to the standard equivalent, if any (won't have any
6858        # for $STRING properties)
6859        my $standard = $self->property->table($map);
6860        if (defined $standard) {
6861            $map = ($use_full_name)
6862                   ? $standard->full_name
6863                   : $standard->name;
6864        }
6865
6866        # Warn if there already is a non-equivalent default map for this
6867        # property.  Note that a default map can be a ref, which means that
6868        # what it actually means is delayed until later in the program, and it
6869        # IS permissible to override it here without a message.
6870        my $default_map = $default_map{$addr};
6871        if (defined $default_map
6872            && ! ref($default_map)
6873            && $default_map ne $map
6874            && main::Standardize($map) ne $default_map)
6875        {
6876            my $property = $self->property;
6877            my $map_table = $property->table($map);
6878            my $default_table = $property->table($default_map);
6879            if (defined $map_table
6880                && defined $default_table
6881                && $map_table != $default_table)
6882            {
6883                Carp::my_carp("Changing the default mapping for "
6884                            . $property
6885                            . " from $default_map to $map'");
6886            }
6887        }
6888
6889        $default_map{$addr} = $map;
6890
6891        # Don't also create any missing table for this map at this point,
6892        # because if we did, it could get done before the main table add is
6893        # done for PropValueAliases.txt; instead the caller will have to make
6894        # sure it exists, if desired.
6895        return;
6896    }
6897
6898    sub to_output_map($self) {
6899        # Returns boolean: should we write this map table?
6900        my $addr = pack 'J', refaddr $self;
6901
6902        # If overridden, use that
6903        return $to_output_map{$addr} if defined $to_output_map{$addr};
6904
6905        my $full_name = $self->full_name;
6906        return $global_to_output_map{$full_name}
6907                                if defined $global_to_output_map{$full_name};
6908
6909        # If table says to output, do so; if says to suppress it, do so.
6910        my $fate = $self->fate;
6911        return $INTERNAL_MAP if $fate == $INTERNAL_ONLY;
6912        return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
6913        return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED;
6914
6915        my $type = $self->property->type;
6916
6917        # Don't want to output binary map tables even for debugging.
6918        return 0 if $type == $BINARY;
6919
6920        # But do want to output string ones.  All the ones that remain to
6921        # be dealt with (i.e. which haven't explicitly been set to external)
6922        # are for internal Perl use only.  The default for those that map to
6923        # $CODE_POINT and haven't been restricted to a single element range
6924        # is to use the adjusted form.
6925        if ($type == $STRING) {
6926            return $INTERNAL_MAP if $self->range_size_1
6927                                    || $default_map{$addr} ne $CODE_POINT;
6928            return $OUTPUT_ADJUSTED;
6929        }
6930
6931        # Otherwise is an $ENUM, do output it, for Perl's purposes
6932        return $INTERNAL_MAP;
6933    }
6934
6935    sub inverse_list($self) {
6936        # Returns a Range_List that is gaps of the current table.  That is,
6937        # the inversion
6938        my $current = Range_List->new(Initialize => $self->_range_list,
6939                                Owner => $self->property);
6940        return ~ $current;
6941    }
6942
6943    sub header($self) {
6944        my $return = $self->SUPER::header();
6945
6946        if ($self->to_output_map >= $INTERNAL_MAP) {
6947            $return .= $INTERNAL_ONLY_HEADER;
6948        }
6949        else {
6950            # Other properties have fixed formats.
6951            my $property_name = $self->property->full_name;
6952
6953            $return .= <<END;
6954
6955# !!!!!!!   IT IS DEPRECATED TO USE THIS FILE   !!!!!!!
6956
6957# This file is for internal use by core Perl only.  It is retained for
6958# backwards compatibility with applications that may have come to rely on it,
6959# but its format and even its name or existence are subject to change without
6960# notice in a future Perl version.  Don't use it directly.  Instead, its
6961# contents are now retrievable through a stable API in the Unicode::UCD
6962# module: Unicode::UCD::prop_invmap('$property_name') (Values for individual
6963# code points can be retrieved via Unicode::UCD::charprop());
6964END
6965        }
6966        return $return;
6967    }
6968
6969    sub set_final_comment($self) {
6970        # Just before output, create the comment that heads the file
6971        # containing this table.
6972
6973        return unless $debugging_build;
6974
6975        # No sense generating a comment if aren't going to write it out.
6976        return if ! $self->to_output_map;
6977
6978        my $addr = pack 'J', refaddr $self;
6979
6980        my $property = $self->property;
6981
6982        # Get all the possible names for this property.  Don't use any that
6983        # aren't ok for use in a file name, etc.  This is perhaps causing that
6984        # flag to do double duty, and may have to be changed in the future to
6985        # have our own flag for just this purpose; but it works now to exclude
6986        # Perl generated synonyms from the lists for properties, where the
6987        # name is always the proper Unicode one.
6988        my @property_aliases = grep { $_->ok_as_filename } $self->aliases;
6989
6990        my $count = $self->count;
6991        my $default_map = $default_map{$addr};
6992
6993        # The ranges that map to the default aren't output, so subtract that
6994        # to get those actually output.  A property with matching tables
6995        # already has the information calculated.
6996        if ($property->type != $STRING && $property->type != $FORCED_BINARY) {
6997            $count -= $property->table($default_map)->count;
6998        }
6999        elsif (defined $default_map) {
7000
7001            # But for $STRING properties, must calculate now.  Subtract the
7002            # count from each range that maps to the default.
7003            foreach my $range ($self->_range_list->ranges) {
7004                if ($range->value eq $default_map) {
7005                    $count -= $range->end +1 - $range->start;
7006                }
7007            }
7008
7009        }
7010
7011        # Get a  string version of $count with underscores in large numbers,
7012        # for clarity.
7013        my $string_count = main::clarify_code_point_count($count);
7014
7015        my $code_points = ($count == 1)
7016                        ? 'single code point'
7017                        : "$string_count code points";
7018
7019        my $mapping;
7020        my $these_mappings;
7021        my $are;
7022        if (@property_aliases <= 1) {
7023            $mapping = 'mapping';
7024            $these_mappings = 'this mapping';
7025            $are = 'is'
7026        }
7027        else {
7028            $mapping = 'synonymous mappings';
7029            $these_mappings = 'these mappings';
7030            $are = 'are'
7031        }
7032        my $cp;
7033        if ($count >= $MAX_UNICODE_CODEPOINTS) {
7034            $cp = "any code point in Unicode Version $string_version";
7035        }
7036        else {
7037            my $map_to;
7038            if ($default_map eq "") {
7039                $map_to = 'the empty string';
7040            }
7041            elsif ($default_map eq $CODE_POINT) {
7042                $map_to = "itself";
7043            }
7044            else {
7045                $map_to = "'$default_map'";
7046            }
7047            if ($count == 1) {
7048                $cp = "the single code point";
7049            }
7050            else {
7051                $cp = "one of the $code_points";
7052            }
7053            $cp .= " in Unicode Version $unicode_version for which the mapping is not to $map_to";
7054        }
7055
7056        my $comment = "";
7057
7058        my $status = $self->status;
7059        if ($status ne $NORMAL) {
7060            my $warn = uc $status_past_participles{$status};
7061            $comment .= <<END;
7062
7063!!!!!!!   $warn !!!!!!!!!!!!!!!!!!!
7064 All property or property=value combinations contained in this file are $warn.
7065 See $unicode_reference_url for what this means.
7066
7067END
7068        }
7069        $comment .= "This file returns the $mapping:\n";
7070
7071        my $ucd_accessible_name = "";
7072        my $has_underscore_name = 0;
7073        my $full_name = $self->property->full_name;
7074        for my $i (0 .. @property_aliases - 1) {
7075            my $name = $property_aliases[$i]->name;
7076            $has_underscore_name = 1 if $name =~ /^_/;
7077            $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)');
7078            if ($property_aliases[$i]->ucd) {
7079                if ($name eq $full_name) {
7080                    $ucd_accessible_name = $full_name;
7081                }
7082                elsif (! $ucd_accessible_name) {
7083                    $ucd_accessible_name = $name;
7084                }
7085            }
7086        }
7087        $comment .= "\nwhere 'cp' is $cp.";
7088        if ($ucd_accessible_name) {
7089            $comment .= "  Note that $these_mappings";
7090            if ($has_underscore_name) {
7091                $comment .= " (except for the one(s) that begin with an underscore)";
7092            }
7093            $comment .= " $are accessible via the functions prop_invmap('$full_name') or charprop() in Unicode::UCD";
7094
7095        }
7096
7097        # And append any commentary already set from the actual property.
7098        $comment .= "\n\n" . $self->comment if $self->comment;
7099        if ($self->description) {
7100            $comment .= "\n\n" . join " ", $self->description;
7101        }
7102        if ($self->note) {
7103            $comment .= "\n\n" . join " ", $self->note;
7104        }
7105        $comment .= "\n";
7106
7107        if (! $self->perl_extension) {
7108            $comment .= <<END;
7109
7110For information about what this property really means, see:
7111$unicode_reference_url
7112END
7113        }
7114
7115        if ($count) {        # Format differs for empty table
7116                $comment.= "\nThe format of the ";
7117            if ($self->range_size_1) {
7118                $comment.= <<END;
7119main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
7120is in hex; MAPPING is what CODE_POINT maps to.
7121END
7122            }
7123            else {
7124
7125                # There are tables which end up only having one element per
7126                # range, but it is not worth keeping track of for making just
7127                # this comment a little better.
7128                $comment .= <<END;
7129non-comment portions of the main body of lines of this file is:
7130START\\tSTOP\\tMAPPING where START is the starting code point of the
7131range, in hex; STOP is the ending point, or if omitted, the range has just one
7132code point; MAPPING is what each code point between START and STOP maps to.
7133END
7134                if ($self->output_range_counts) {
7135                    $comment .= <<END;
7136Numbers in comments in [brackets] indicate how many code points are in the
7137range (omitted when the range is a single code point or if the mapping is to
7138the null string).
7139END
7140                }
7141            }
7142        }
7143        $self->set_comment(main::join_lines($comment));
7144        return;
7145    }
7146
7147    my %swash_keys; # Makes sure don't duplicate swash names.
7148
7149    # The remaining variables are temporaries used while writing each table,
7150    # to output special ranges.
7151    my @multi_code_point_maps;  # Map is to more than one code point.
7152
7153    sub handle_special_range($self, $range) {
7154        # Called in the middle of write when it finds a range it doesn't know
7155        # how to handle.
7156
7157        my $addr = pack 'J', refaddr $self;
7158
7159        my $type = $range->type;
7160
7161        my $low = $range->start;
7162        my $high = $range->end;
7163        my $map = $range->value;
7164
7165        # No need to output the range if it maps to the default.
7166        return if $map eq $default_map{$addr};
7167
7168        my $property = $self->property;
7169
7170        # Switch based on the map type...
7171        if ($type == $HANGUL_SYLLABLE) {
7172
7173            # These are entirely algorithmically determinable based on
7174            # some constants furnished by Unicode; for now, just set a
7175            # flag to indicate that have them.  After everything is figured
7176            # out, we will output the code that does the algorithm.  (Don't
7177            # output them if not needed because we are suppressing this
7178            # property.)
7179            $has_hangul_syllables = 1 if $property->to_output_map;
7180        }
7181        elsif ($type == $CP_IN_NAME) {
7182
7183            # Code points whose name ends in their code point are also
7184            # algorithmically determinable, but need information about the map
7185            # to do so.  Both the map and its inverse are stored in data
7186            # structures output in the file.  They are stored in the mean time
7187            # in global lists The lists will be written out later into Name.pm,
7188            # which is created only if needed.  In order to prevent duplicates
7189            # in the list, only add to them for one property, should multiple
7190            # ones need them.
7191            if ($needing_code_points_ending_in_code_point == 0) {
7192                $needing_code_points_ending_in_code_point = $property;
7193            }
7194            if ($property == $needing_code_points_ending_in_code_point) {
7195                push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
7196                push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
7197
7198                my $squeezed = $map =~ s/[-\s]+//gr;
7199                push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}},
7200                                                                          $low;
7201                push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}},
7202                                                                         $high;
7203
7204                # Calculate the set of legal characters in names of this
7205                # series.  It includes every character in the name prefix.
7206                my %legal;
7207                $legal{$_} = 1 for split //, $map;
7208
7209                # Plus the hex code point chars, blank, and minus.  Also \n
7210                # can show up as being required due to anchoring
7211                for my $i ('0' .. '9', 'A' .. 'F', '-', ' ', "\n") {
7212                    $legal{$i} = 1;
7213                }
7214                my $legal = join "", sort { $a cmp $b } keys %legal;
7215
7216                # The legal chars can be used in match optimizations
7217                push @code_points_ending_in_code_point, { low => $low,
7218                                                        high => $high,
7219                                                        name => $map,
7220                                                        legal => $legal,
7221                                                        };
7222            }
7223        }
7224        elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
7225
7226            # Multi-code point maps and null string maps have an entry
7227            # for each code point in the range.  They use the same
7228            # output format.
7229            for my $code_point ($low .. $high) {
7230
7231                # The pack() below can't cope with surrogates.  XXX This may
7232                # no longer be true
7233                if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
7234                    Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self.  No map created");
7235                    next;
7236                }
7237
7238                # Generate the hash entries for these in the form that
7239                # utf8.c understands.
7240                my $tostr = "";
7241                my $to_name = "";
7242                my $to_chr = "";
7243                foreach my $to (split " ", $map) {
7244                    if ($to !~ /^$code_point_re$/) {
7245                        Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self.  No map created");
7246                        next;
7247                    }
7248                    $tostr .= sprintf "\\x{%s}", $to;
7249                    $to = CORE::hex $to;
7250                    if ($annotate) {
7251                        $to_name .= " + " if $to_name;
7252                        $to_chr .= main::display_chr($to);
7253                        main::populate_char_info($to)
7254                                            if ! defined $viacode[$to];
7255                        $to_name .=  $viacode[$to];
7256                    }
7257                }
7258
7259                # The unpack yields a list of the bytes that comprise the
7260                # UTF-8 of $code_point, which are each placed in \xZZ format
7261                # and output in the %s to map to $tostr, so the result looks
7262                # like:
7263                # "\xC4\xB0" => "\x{0069}\x{0307}",
7264                my $utf8 = sprintf(qq["%s" => "$tostr",],
7265                        join("", map { sprintf "\\x%02X", $_ }
7266                            unpack("U0C*", chr $code_point)));
7267
7268                # Add a comment so that a human reader can more easily
7269                # see what's going on.
7270                push @multi_code_point_maps,
7271                        sprintf("%-45s # U+%04X", $utf8, $code_point);
7272                if (! $annotate) {
7273                    $multi_code_point_maps[-1] .= " => $map";
7274                }
7275                else {
7276                    main::populate_char_info($code_point)
7277                                    if ! defined $viacode[$code_point];
7278                    $multi_code_point_maps[-1] .= " '"
7279                        . main::display_chr($code_point)
7280                        . "' => '$to_chr'; $viacode[$code_point] => $to_name";
7281                }
7282            }
7283        }
7284        else {
7285            Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self.  Not written");
7286        }
7287
7288        return;
7289    }
7290
7291    sub pre_body($self) {
7292        # Returns the string that should be output in the file before the main
7293        # body of this table.  It isn't called until the main body is
7294        # calculated, saving a pass.  The string includes some hash entries
7295        # identifying the format of the body, and what the single value should
7296        # be for all ranges missing from it.  It also includes any code points
7297        # which have map_types that don't go in the main table.
7298
7299        my $addr = pack 'J', refaddr $self;
7300
7301        my $name = $self->property->swash_name;
7302
7303        # Currently there is nothing in the pre_body unless a swash is being
7304        # generated.
7305        return unless defined $name;
7306
7307        if (defined $swash_keys{$name}) {
7308            Carp::my_carp(main::join_lines(<<END
7309Already created a swash name '$name' for $swash_keys{$name}.  This means that
7310the same name desired for $self shouldn't be used.  Bad News.  This must be
7311fixed before production use, but proceeding anyway
7312END
7313            ));
7314        }
7315        $swash_keys{$name} = "$self";
7316
7317        my $pre_body = "";
7318
7319        # Here we assume we were called after have gone through the whole
7320        # file.  If we actually generated anything for each map type, add its
7321        # respective header and trailer
7322        my $specials_name = "";
7323        if (@multi_code_point_maps) {
7324            $specials_name = "Unicode::UCD::ToSpec$name";
7325            $pre_body .= <<END;
7326
7327# Some code points require special handling because their mappings are each to
7328# multiple code points.  These do not appear in the main body, but are defined
7329# in the hash below.
7330
7331# Each key is the string of N bytes that together make up the UTF-8 encoding
7332# for the code point.  (i.e. the same as looking at the code point's UTF-8
7333# under "use bytes").  Each value is the UTF-8 of the translation, for speed.
7334\%$specials_name = (
7335END
7336            $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
7337        }
7338
7339        my $format = $self->format;
7340
7341        my $return = "";
7342
7343        my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
7344        if ($output_adjusted) {
7345            if ($specials_name) {
7346                $return .= <<END;
7347# The mappings in the non-hash portion of this file must be modified to get the
7348# correct values by adding the code point ordinal number to each one that is
7349# numeric.
7350END
7351            }
7352            else {
7353                $return .= <<END;
7354# The mappings must be modified to get the correct values by adding the code
7355# point ordinal number to each one that is numeric.
7356END
7357            }
7358        }
7359
7360        $return .= <<END;
7361
7362# The name this table is to be known by, with the format of the mappings in
7363# the main body of the table, and what all code points missing from this file
7364# map to.
7365\$Unicode::UCD::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
7366END
7367        if ($specials_name) {
7368            $return .= <<END;
7369\$Unicode::UCD::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
7370END
7371        }
7372        my $default_map = $default_map{$addr};
7373
7374        # For $CODE_POINT default maps and using adjustments, instead the default
7375        # becomes zero.
7376        $return .= "\$Unicode::UCD::SwashInfo{'To$name'}{'missing'} = '"
7377                .  (($output_adjusted && $default_map eq $CODE_POINT)
7378                   ? "0"
7379                   : $default_map)
7380                . "';";
7381
7382        if ($default_map eq $CODE_POINT) {
7383            $return .= ' # code point maps to itself';
7384        }
7385        elsif ($default_map eq "") {
7386            $return .= ' # code point maps to the empty string';
7387        }
7388        $return .= "\n";
7389
7390        $return .= $pre_body;
7391
7392        return $return;
7393    }
7394
7395    sub write($self) {
7396        # Write the table to the file.
7397
7398        my $addr = pack 'J', refaddr $self;
7399
7400        # Clear the temporaries
7401        undef @multi_code_point_maps;
7402
7403        # Calculate the format of the table if not already done.
7404        my $format = $self->format;
7405        my $type = $self->property->type;
7406        my $default_map = $self->default_map;
7407        if (! defined $format) {
7408            if ($type == $BINARY) {
7409
7410                # Don't bother checking the values, because we elsewhere
7411                # verify that a binary table has only 2 values.
7412                $format = $BINARY_FORMAT;
7413            }
7414            else {
7415                my @ranges = $self->_range_list->ranges;
7416
7417                # default an empty table based on its type and default map
7418                if (! @ranges) {
7419
7420                    # But it turns out that the only one we can say is a
7421                    # non-string (besides binary, handled above) is when the
7422                    # table is a string and the default map is to a code point
7423                    if ($type == $STRING && $default_map eq $CODE_POINT) {
7424                        $format = $HEX_FORMAT;
7425                    }
7426                    else {
7427                        $format = $STRING_FORMAT;
7428                    }
7429                }
7430                else {
7431
7432                    # Start with the most restrictive format, and as we find
7433                    # something that doesn't fit with that, change to the next
7434                    # most restrictive, and so on.
7435                    $format = $DECIMAL_FORMAT;
7436                    foreach my $range (@ranges) {
7437                        next if $range->type != 0;  # Non-normal ranges don't
7438                                                    # affect the main body
7439                        my $map = $range->value;
7440                        if ($map ne $default_map) {
7441                            last if $format eq $STRING_FORMAT;  # already at
7442                                                                # least
7443                                                                # restrictive
7444                            $format = $INTEGER_FORMAT
7445                                                if $format eq $DECIMAL_FORMAT
7446                                                    && $map !~ / ^ [0-9] $ /x;
7447                            $format = $FLOAT_FORMAT
7448                                            if $format eq $INTEGER_FORMAT
7449                                                && $map !~ / ^ -? [0-9]+ $ /x;
7450                            $format = $RATIONAL_FORMAT
7451                                if $format eq $FLOAT_FORMAT
7452                                    && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
7453                            $format = $HEX_FORMAT
7454                                if ($format eq $RATIONAL_FORMAT
7455                                       && $map !~
7456                                           m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x)
7457                                        # Assume a leading zero means hex,
7458                                        # even if all digits are 0-9
7459                                    || ($format eq $INTEGER_FORMAT
7460                                        && $map =~ /^0[0-9A-F]/);
7461                            $format = $STRING_FORMAT if $format eq $HEX_FORMAT
7462                                                       && $map =~ /[^0-9A-F]/;
7463                        }
7464                    }
7465                }
7466            }
7467        } # end of calculating format
7468
7469        if ($default_map eq $CODE_POINT
7470            && $format ne $HEX_FORMAT
7471            && ! defined $self->format)    # manual settings are always
7472                                           # considered ok
7473        {
7474            Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
7475        }
7476
7477        # If the output is to be adjusted, the format of the table that gets
7478        # output is actually 'a' or 'ax' instead of whatever it is stored
7479        # internally as.
7480        my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
7481        if ($output_adjusted) {
7482            if ($default_map eq $CODE_POINT) {
7483                $format = $HEX_ADJUST_FORMAT;
7484            }
7485            else {
7486                $format = $ADJUST_FORMAT;
7487            }
7488        }
7489
7490        $self->_set_format($format);
7491
7492        return $self->SUPER::write(
7493            $output_adjusted,
7494            $default_map);   # don't write defaulteds
7495    }
7496
7497    # Accessors for the underlying list that should fail if locked.
7498    for my $sub (qw(
7499                    add_duplicate
7500                    replace_map
7501                ))
7502    {
7503        no strict "refs";
7504        *$sub = sub {
7505            use strict "refs";
7506            my $self = shift;
7507
7508            return if $self->carp_if_locked;
7509            return $self->_range_list->$sub(@_);
7510        }
7511    }
7512} # End closure for Map_Table
7513
7514package Match_Table;
7515use parent '-norequire', '_Base_Table';
7516
7517# A Match table is one which is a list of all the code points that have
7518# the same property and property value, for use in \p{property=value}
7519# constructs in regular expressions.  It adds very little data to the base
7520# structure, but many methods, as these lists can be combined in many ways to
7521# form new ones.
7522# There are only a few concepts added:
7523# 1) Equivalents and Relatedness.
7524#    Two tables can match the identical code points, but have different names.
7525#    This always happens when there is a perl single form extension
7526#    \p{IsProperty} for the Unicode compound form \P{Property=True}.  The two
7527#    tables are set to be related, with the Perl extension being a child, and
7528#    the Unicode property being the parent.
7529#
7530#    It may be that two tables match the identical code points and we don't
7531#    know if they are related or not.  This happens most frequently when the
7532#    Block and Script properties have the exact range.  But note that a
7533#    revision to Unicode could add new code points to the script, which would
7534#    now have to be in a different block (as the block was filled, or there
7535#    would have been 'Unknown' script code points in it and they wouldn't have
7536#    been identical).  So we can't rely on any two properties from Unicode
7537#    always matching the same code points from release to release, and thus
7538#    these tables are considered coincidentally equivalent--not related.  When
7539#    two tables are unrelated but equivalent, one is arbitrarily chosen as the
7540#    'leader', and the others are 'equivalents'.  This concept is useful
7541#    to minimize the number of tables written out.  Only one file is used for
7542#    any identical set of code points, with entries in UCD.pl mapping all
7543#    the involved tables to it.
7544#
7545#    Related tables will always be identical; we set them up to be so.  Thus
7546#    if the Unicode one is deprecated, the Perl one will be too.  Not so for
7547#    unrelated tables.  Relatedness makes generating the documentation easier.
7548#
7549# 2) Complement.
7550#    Like equivalents, two tables may be the inverses of each other, the
7551#    intersection between them is null, and the union is every Unicode code
7552#    point.  The two tables that occupy a binary property are necessarily like
7553#    this.  By specifying one table as the complement of another, we can avoid
7554#    storing it on disk (using the other table and performing a fast
7555#    transform), and some memory and calculations.
7556#
7557# 3) Conflicting.  It may be that there will eventually be name clashes, with
7558#    the same name meaning different things.  For a while, there actually were
7559#    conflicts, but they have so far been resolved by changing Perl's or
7560#    Unicode's definitions to match the other, but when this code was written,
7561#    it wasn't clear that that was what was going to happen.  (Unicode changed
7562#    because of protests during their beta period.)  Name clashes are warned
7563#    about during compilation, and the documentation.  The generated tables
7564#    are sane, free of name clashes, because the code suppresses the Perl
7565#    version.  But manual intervention to decide what the actual behavior
7566#    should be may be required should this happen.  The introductory comments
7567#    have more to say about this.
7568#
7569# 4) Definition.  This is a string for human consumption that specifies the
7570#    code points that this table matches.  This is used only for the generated
7571#    pod file.  It may be specified explicitly, or automatically computed.
7572#    Only the first portion of complicated definitions is computed and
7573#    displayed.
7574
7575sub standardize { return main::standardize($_[0]); }
7576sub trace { return main::trace(@_); }
7577
7578
7579{ # Closure
7580
7581    main::setup_package();
7582
7583    my %leader;
7584    # The leader table of this one; initially $self.
7585    main::set_access('leader', \%leader, 'r');
7586
7587    my %equivalents;
7588    # An array of any tables that have this one as their leader
7589    main::set_access('equivalents', \%equivalents, 'readable_array');
7590
7591    my %parent;
7592    # The parent table to this one, initially $self.  This allows us to
7593    # distinguish between equivalent tables that are related (for which this
7594    # is set to), and those which may not be, but share the same output file
7595    # because they match the exact same set of code points in the current
7596    # Unicode release.
7597    main::set_access('parent', \%parent, 'r');
7598
7599    my %children;
7600    # An array of any tables that have this one as their parent
7601    main::set_access('children', \%children, 'readable_array');
7602
7603    my %conflicting;
7604    # Array of any tables that would have the same name as this one with
7605    # a different meaning.  This is used for the generated documentation.
7606    main::set_access('conflicting', \%conflicting, 'readable_array');
7607
7608    my %matches_all;
7609    # Set in the constructor for tables that are expected to match all code
7610    # points.
7611    main::set_access('matches_all', \%matches_all, 'r');
7612
7613    my %complement;
7614    # Points to the complement that this table is expressed in terms of; 0 if
7615    # none.
7616    main::set_access('complement', \%complement, 'r');
7617
7618    my %definition;
7619    # Human readable string of the first few ranges of code points matched by
7620    # this table
7621    main::set_access('definition', \%definition, 'r', 's');
7622
7623    sub new {
7624        my $class = shift;
7625
7626        my %args = @_;
7627
7628        # The property for which this table is a listing of property values.
7629        my $property = delete $args{'_Property'};
7630
7631        my $name = delete $args{'Name'};
7632        my $full_name = delete $args{'Full_Name'};
7633        $full_name = $name if ! defined $full_name;
7634
7635        # Optional
7636        my $initialize = delete $args{'Initialize'};
7637        my $matches_all = delete $args{'Matches_All'} || 0;
7638        my $format = delete $args{'Format'};
7639        my $definition = delete $args{'Definition'} // "";
7640        # Rest of parameters passed on.
7641
7642        my $range_list = Range_List->new(Initialize => $initialize,
7643                                         Owner => $property);
7644
7645        my $complete = $full_name;
7646        $complete = '""' if $complete eq "";  # A null name shouldn't happen,
7647                                              # but this helps debug if it
7648                                              # does
7649        # The complete name for a match table includes it's property in a
7650        # compound form 'property=table', except if the property is the
7651        # pseudo-property, perl, in which case it is just the single form,
7652        # 'table' (If you change the '=' must also change the ':' in lots of
7653        # places in this program that assume an equal sign)
7654        $complete = $property->full_name . "=$complete" if $property != $perl;
7655
7656        my $self = $class->SUPER::new(%args,
7657                                      Name => $name,
7658                                      Complete_Name => $complete,
7659                                      Full_Name => $full_name,
7660                                      _Property => $property,
7661                                      _Range_List => $range_list,
7662                                      Format => $EMPTY_FORMAT,
7663                                      Write_As_Invlist => 1,
7664                                      );
7665        my $addr = pack 'J', refaddr $self;
7666
7667        $conflicting{$addr} = [ ];
7668        $equivalents{$addr} = [ ];
7669        $children{$addr} = [ ];
7670        $matches_all{$addr} = $matches_all;
7671        $leader{$addr} = $self;
7672        $parent{$addr} = $self;
7673        $complement{$addr} = 0;
7674        $definition{$addr} = $definition;
7675
7676        if (defined $format && $format ne $EMPTY_FORMAT) {
7677            Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'.  Using '$EMPTY_FORMAT'");
7678        }
7679
7680        return $self;
7681    }
7682
7683    # See this program's beginning comment block about overloading these.
7684    use overload
7685        fallback => 0,
7686        qw("") => "_operator_stringify",
7687        '=' => sub {
7688                    my $self = shift;
7689
7690                    return if $self->carp_if_locked;
7691                    return $self;
7692                },
7693
7694        '+' => sub {
7695                        my $self = shift;
7696                        my $other = shift;
7697
7698                        return $self->_range_list + $other;
7699                    },
7700        '&' => sub {
7701                        my $self = shift;
7702                        my $other = shift;
7703
7704                        return $self->_range_list & $other;
7705                    },
7706        '+=' => sub {
7707                        my $self = shift;
7708                        my $other = shift;
7709                        my $reversed = shift;
7710
7711                        if ($reversed) {
7712                            Carp::my_carp_bug("Bad news.  Can't cope with '"
7713                            . ref($other)
7714                            . ' += '
7715                            . ref($self)
7716                            . "'.  undef returned.");
7717                            return;
7718                        }
7719
7720                        return if $self->carp_if_locked;
7721
7722                        if (ref $other) {
7723
7724                            # Change the range list of this table to be the
7725                            # union of the two.
7726                            $self->_set_range_list($self->_range_list
7727                                                    + $other);
7728                        }
7729                        else {    # $other is just a simple value
7730                            $self->add_range($other, $other);
7731                        }
7732                        return $self;
7733                    },
7734        '&=' => sub {
7735                        my $self = shift;
7736                        my $other = shift;
7737                        my $reversed = shift;
7738
7739                        if ($reversed) {
7740                            Carp::my_carp_bug("Bad news.  Can't cope with '"
7741                            . ref($other)
7742                            . ' &= '
7743                            . ref($self)
7744                            . "'.  undef returned.");
7745                            return;
7746                        }
7747
7748                        return if $self->carp_if_locked;
7749                        $self->_set_range_list($self->_range_list & $other);
7750                        return $self;
7751                    },
7752        '-' => sub { my $self = shift;
7753                    my $other = shift;
7754                    my $reversed = shift;
7755                    if ($reversed) {
7756                        Carp::my_carp_bug("Bad news.  Can't cope with '"
7757                        . ref($other)
7758                        . ' - '
7759                        . ref($self)
7760                        . "'.  undef returned.");
7761                        return;
7762                    }
7763
7764                    return $self->_range_list - $other;
7765                },
7766        '~' => sub { my $self = shift;
7767                    return ~ $self->_range_list;
7768                },
7769    ;
7770
7771    sub _operator_stringify($self, $other="", $reversed=0) {
7772
7773        my $name = $self->complete_name;
7774        return "Table '$name'";
7775    }
7776
7777    sub _range_list {
7778        # Returns the range list associated with this table, which will be the
7779        # complement's if it has one.
7780
7781        my $self = shift;
7782        my $complement = $self->complement;
7783
7784        # In order to avoid re-complementing on each access, only do the
7785        # complement the first time, and store the result in this table's
7786        # range list to use henceforth.  However, this wouldn't work if the
7787        # controlling (complement) table changed after we do this, so lock it.
7788        # Currently, the value of the complement isn't needed until after it
7789        # is fully constructed, so this works.  If this were to change, the
7790        # each_range iteration functionality would no longer work on this
7791        # complement.
7792        if ($complement != 0 && $self->SUPER::_range_list->count == 0) {
7793            $self->_set_range_list($self->SUPER::_range_list
7794                                + ~ $complement->_range_list);
7795            $complement->lock;
7796        }
7797
7798        return $self->SUPER::_range_list;
7799    }
7800
7801    sub add_alias {
7802        # Add a synonym for this table.  See the comments in the base class
7803
7804        my $self = shift;
7805        my $name = shift;
7806        # Rest of parameters passed on.
7807
7808        $self->SUPER::add_alias($name, $self, @_);
7809        return;
7810    }
7811
7812    sub add_conflicting {
7813        # Add the name of some other object to the list of ones that name
7814        # clash with this match table.
7815
7816        my $self = shift;
7817        my $conflicting_name = shift;   # The name of the conflicting object
7818        my $p = shift || 'p';           # Optional, is this a \p{} or \P{} ?
7819        my $conflicting_object = shift; # Optional, the conflicting object
7820                                        # itself.  This is used to
7821                                        # disambiguate the text if the input
7822                                        # name is identical to any of the
7823                                        # aliases $self is known by.
7824                                        # Sometimes the conflicting object is
7825                                        # merely hypothetical, so this has to
7826                                        # be an optional parameter.
7827        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7828
7829        my $addr = pack 'J', refaddr $self;
7830
7831        # Check if the conflicting name is exactly the same as any existing
7832        # alias in this table (as long as there is a real object there to
7833        # disambiguate with).
7834        if (defined $conflicting_object) {
7835            foreach my $alias ($self->aliases) {
7836                if (standardize($alias->name) eq standardize($conflicting_name)) {
7837
7838                    # Here, there is an exact match.  This results in
7839                    # ambiguous comments, so disambiguate by changing the
7840                    # conflicting name to its object's complete equivalent.
7841                    $conflicting_name = $conflicting_object->complete_name;
7842                    last;
7843                }
7844            }
7845        }
7846
7847        # Convert to the \p{...} final name
7848        $conflicting_name = "\\$p" . "{$conflicting_name}";
7849
7850        # Only add once
7851        return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
7852
7853        push @{$conflicting{$addr}}, $conflicting_name;
7854
7855        return;
7856    }
7857
7858    sub is_set_equivalent_to($self, $other=undef) {
7859        # Return boolean of whether or not the other object is a table of this
7860        # type and has been marked equivalent to this one.
7861
7862        return 0 if ! defined $other; # Can happen for incomplete early
7863                                      # releases
7864        unless ($other->isa(__PACKAGE__)) {
7865            my $ref_other = ref $other;
7866            my $ref_self = ref $self;
7867            Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'.  $other not set equivalent to $self.");
7868            return 0;
7869        }
7870
7871        # Two tables are equivalent if they have the same leader.
7872        return $leader{pack 'J', refaddr $self} == $leader{pack 'J', refaddr $other};
7873        return;
7874    }
7875
7876    sub set_equivalent_to {
7877        # Set $self equivalent to the parameter table.
7878        # The required Related => 'x' parameter is a boolean indicating
7879        # whether these tables are related or not.  If related, $other becomes
7880        # the 'parent' of $self; if unrelated it becomes the 'leader'
7881        #
7882        # Related tables share all characteristics except names; equivalents
7883        # not quite so many.
7884        # If they are related, one must be a perl extension.  This is because
7885        # we can't guarantee that Unicode won't change one or the other in a
7886        # later release even if they are identical now.
7887
7888        my $self = shift;
7889        my $other = shift;
7890
7891        my %args = @_;
7892        my $related = delete $args{'Related'};
7893
7894        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
7895
7896        return if ! defined $other;     # Keep on going; happens in some early
7897                                        # Unicode releases.
7898
7899        if (! defined $related) {
7900            Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter.  Assuming $self is not related to $other");
7901            $related = 0;
7902        }
7903
7904        # If already are equivalent, no need to re-do it;  if subroutine
7905        # returns null, it found an error, also do nothing
7906        my $are_equivalent = $self->is_set_equivalent_to($other);
7907        return if ! defined $are_equivalent || $are_equivalent;
7908
7909        my $addr = pack 'J', refaddr $self;
7910        my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
7911
7912        if ($related) {
7913            if ($current_leader->perl_extension) {
7914                if ($other->perl_extension) {
7915                    Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
7916                    return;
7917                }
7918            } elsif ($self->property != $other->property    # Depending on
7919                                                            # situation, might
7920                                                            # be better to use
7921                                                            # add_alias()
7922                                                            # instead for same
7923                                                            # property
7924                     && ! $other->perl_extension
7925
7926                         # We allow the sc and scx properties to be marked as
7927                         # related.  They are in fact related, and this allows
7928                         # the pod to show that better.  This test isn't valid
7929                         # if this is an early Unicode release without the scx
7930                         # property (having that also implies the sc property
7931                         # exists, so don't have to test for no 'sc')
7932                     && (   ! defined $scx
7933                         && ! (   (   $self->property == $script
7934                                   || $self->property == $scx)
7935                               && (   $self->property == $script
7936                                   || $self->property == $scx))))
7937            {
7938                Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties.  Assuming $self is not related to $other");
7939                $related = 0;
7940            }
7941        }
7942
7943        if (! $self->is_empty && ! $self->matches_identically_to($other)) {
7944            Carp::my_carp_bug("$self should be empty or match identically to $other.  Not setting equivalent");
7945            return;
7946        }
7947
7948        my $leader = pack 'J', refaddr $current_leader;
7949        my $other_addr = pack 'J', refaddr $other;
7950
7951        # Any tables that are equivalent to or children of this table must now
7952        # instead be equivalent to or (children) to the new leader (parent),
7953        # still equivalent.  The equivalency includes their matches_all info,
7954        # and for related tables, their fate and status.
7955        # All related tables are of necessity equivalent, but the converse
7956        # isn't necessarily true
7957        my $status = $other->status;
7958        my $status_info = $other->status_info;
7959        my $fate = $other->fate;
7960        my $matches_all = $matches_all{other_addr};
7961        my $caseless_equivalent = $other->caseless_equivalent;
7962        foreach my $table ($current_leader, @{$equivalents{$leader}}) {
7963            next if $table == $other;
7964            trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
7965
7966            my $table_addr = pack 'J', refaddr $table;
7967            $leader{$table_addr} = $other;
7968            $matches_all{$table_addr} = $matches_all;
7969            $self->_set_range_list($other->_range_list);
7970            push @{$equivalents{$other_addr}}, $table;
7971            if ($related) {
7972                $parent{$table_addr} = $other;
7973                push @{$children{$other_addr}}, $table;
7974                $table->set_status($status, $status_info);
7975
7976                # This reason currently doesn't get exposed outside; otherwise
7977                # would have to look up the parent's reason and use it instead.
7978                $table->set_fate($fate, "Parent's fate");
7979
7980                $self->set_caseless_equivalent($caseless_equivalent);
7981            }
7982        }
7983
7984        # Now that we've declared these to be equivalent, any changes to one
7985        # of the tables would invalidate that equivalency.
7986        $self->lock;
7987        $other->lock;
7988        return;
7989    }
7990
7991    sub set_complement($self, $other) {
7992        # Set $self to be the complement of the parameter table.  $self is
7993        # locked, as what it contains should all come from the other table.
7994
7995        if ($other->complement != 0) {
7996            Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement);
7997            return;
7998        }
7999        $complement{pack 'J', refaddr $self} = $other;
8000
8001        # Be sure the other property knows we are depending on them; or the
8002        # other table if it is one in the current property.
8003        if ($self->property != $other->property) {
8004            $other->property->set_has_dependency(1);
8005        }
8006        else {
8007            $other->set_has_dependency(1);
8008        }
8009        $self->lock;
8010        return;
8011    }
8012
8013    sub add_range($self, @range) { # Add a range to the list for this table.
8014        # Rest of parameters passed on
8015
8016        return if $self->carp_if_locked;
8017        return $self->_range_list->add_range(@range);
8018    }
8019
8020    sub header($self) {
8021        # All match tables are to be used only by the Perl core.
8022        return $self->SUPER::header() . $INTERNAL_ONLY_HEADER;
8023    }
8024
8025    sub pre_body {  # Does nothing for match tables.
8026        return
8027    }
8028
8029    sub append_to_body {  # Does nothing for match tables.
8030        return
8031    }
8032
8033    sub set_fate($self, $fate, $reason=undef) {
8034        $self->SUPER::set_fate($fate, $reason);
8035
8036        # All children share this fate
8037        foreach my $child ($self->children) {
8038            $child->set_fate($fate, $reason);
8039        }
8040        return;
8041    }
8042
8043    sub calculate_table_definition
8044    {
8045        # Returns a human-readable string showing some or all of the code
8046        # points matched by this table.  The string will include a
8047        # bracketed-character class for all characters matched in the 00-FF
8048        # range, and the first few ranges matched beyond that.
8049        my $max_ranges = 6;
8050
8051        my $self = shift;
8052        my $definition = $self->definition || "";
8053
8054        # Skip this if already have a definition.
8055        return $definition if $definition;
8056
8057        my $lows_string = "";   # The string representation of the 0-FF
8058                                # characters
8059        my $string_range = "";  # The string rep. of the above FF ranges
8060        my $range_count = 0;    # How many ranges in $string_rage
8061
8062        my @lows_invlist;       # The inversion list of the 0-FF code points
8063        my $first_non_control = ord(" ");   # Everything below this is a
8064                                            # control, on ASCII or EBCDIC
8065        my $max_table_code_point = $self->max;
8066
8067        # On ASCII platforms, the range 80-FF contains no printables.
8068        my $highest_printable = ((main::NON_ASCII_PLATFORM) ? 255 : 126);
8069
8070
8071        # Look through the first few ranges matched by this table.
8072        $self->reset_each_range;    # Defensive programming
8073        while (defined (my $range = $self->each_range())) {
8074            my $start = $range->start;
8075            my $end = $range->end;
8076
8077            # Accumulate an inversion list of the 00-FF code points
8078            if ($start < 256 && ($start > 0 || $end < 256)) {
8079                push @lows_invlist, $start;
8080                push @lows_invlist, 1 + (($end < 256) ? $end : 255);
8081
8082                # Get next range if there are more ranges below 256
8083                next if $end < 256 && $end < $max_table_code_point;
8084
8085                # If the range straddles the 255/256 boundary, we split it
8086                # there.  We already added above the low portion to the
8087                # inversion list
8088                $start = 256 if $end > 256;
8089            }
8090
8091            # Here, @lows_invlist contains the code points below 256, and
8092            # there is no other range, or the current one starts at or above
8093            # 256.  Generate the [char class] for the 0-255 ones.
8094            while (@lows_invlist) {
8095
8096                # If this range (necessarily the first one, by the way) starts
8097                # at 0 ...
8098                if ($lows_invlist[0] == 0) {
8099
8100                    # If it ends within the block of controls, that means that
8101                    # some controls are in it and some aren't.  Since Unicode
8102                    # properties pretty much only know about a few of the
8103                    # controls, like \n, \t, this means that its one of them
8104                    # that isn't in the range.  Complement the inversion list
8105                    # which will likely cause these to be output using their
8106                    # mnemonics, hence being clearer.
8107                    if ($lows_invlist[1] < $first_non_control) {
8108                        $lows_string .= '^';
8109                        shift @lows_invlist;
8110                        push @lows_invlist, 256;
8111                    }
8112                    elsif ($lows_invlist[1] <= $highest_printable) {
8113
8114                        # Here, it extends into the printables block.  Split
8115                        # into two ranges so that the controls are separate.
8116                        $lows_string .= sprintf "\\x00-\\x%02x",
8117                                                    $first_non_control - 1;
8118                        $lows_invlist[0] = $first_non_control;
8119                    }
8120                }
8121
8122                # If the range completely contains the printables, don't
8123                # individually spell out the printables.
8124                if (    $lows_invlist[0] <= $first_non_control
8125                    && $lows_invlist[1] > $highest_printable)
8126                {
8127                    $lows_string .= sprintf "\\x%02x-\\x%02x",
8128                                        $lows_invlist[0], $lows_invlist[1] - 1;
8129                    shift @lows_invlist;
8130                    shift @lows_invlist;
8131                    next;
8132                }
8133
8134                # Here, the range may include some but not all printables.
8135                # Look at each one individually
8136                foreach my $ord (shift @lows_invlist .. shift(@lows_invlist) - 1) {
8137                    my $char = chr $ord;
8138
8139                    # If there is already something in the list, an
8140                    # alphanumeric char could be the next in sequence.  If so,
8141                    # we start or extend a range.  That is, we could have so
8142                    # far something like 'a-c', and the next char is a 'd', so
8143                    # we change it to 'a-d'.  We use native_to_unicode()
8144                    # because a-z on EBCDIC means 26 chars, and excludes the
8145                    # gap ones.
8146                    if ($lows_string ne "" && $char =~ /[[:alnum:]]/) {
8147                        my $prev = substr($lows_string, -1);
8148                        if (   $prev !~ /[[:alnum:]]/
8149                            ||   utf8::native_to_unicode(ord $prev) + 1
8150                              != utf8::native_to_unicode(ord $char))
8151                        {
8152                            # Not extending the range
8153                            $lows_string .= $char;
8154                        }
8155                        elsif (   length $lows_string > 1
8156                               && substr($lows_string, -2, 1) eq '-')
8157                        {
8158                            # We had a sequence like '-c' and the current
8159                            # character is 'd'.  Extend the range.
8160                            substr($lows_string, -1, 1) = $char;
8161                        }
8162                        else {
8163                            # We had something like 'd' and this is 'e'.
8164                            # Start a range.
8165                            $lows_string .= "-$char";
8166                        }
8167                    }
8168                    elsif ($char =~ /[[:graph:]]/) {
8169
8170                        # We output a graphic char as-is, preceded by a
8171                        # backslash if it is a metacharacter
8172                        $lows_string .= '\\'
8173                                if $char =~ /[\\\^\$\@\%\|()\[\]\{\}\-\/"']/;
8174                        $lows_string .= $char;
8175                    } # Otherwise use mnemonic for any that have them
8176                    elsif ($char =~ /[\a]/) {
8177                        $lows_string .= '\a';
8178                    }
8179                    elsif ($char =~ /[\b]/) {
8180                        $lows_string .= '\b';
8181                    }
8182                    elsif ($char eq "\e") {
8183                        $lows_string .= '\e';
8184                    }
8185                    elsif ($char eq "\f") {
8186                        $lows_string .= '\f';
8187                    }
8188                    elsif ($char eq "\cK") {
8189                        $lows_string .= '\cK';
8190                    }
8191                    elsif ($char eq "\n") {
8192                        $lows_string .= '\n';
8193                    }
8194                    elsif ($char eq "\r") {
8195                        $lows_string .= '\r';
8196                    }
8197                    elsif ($char eq "\t") {
8198                        $lows_string .= '\t';
8199                    }
8200                    else {
8201
8202                        # Here is a non-graphic without a mnemonic.  We use \x
8203                        # notation.  But if the ordinal of this is one above
8204                        # the previous, create or extend the range
8205                        my $hex_representation = sprintf("%02x", ord $char);
8206                        if (   length $lows_string >= 4
8207                            && substr($lows_string, -4, 2) eq '\\x'
8208                            && hex(substr($lows_string, -2)) + 1 == ord $char)
8209                        {
8210                            if (       length $lows_string >= 5
8211                                &&     substr($lows_string, -5, 1) eq '-'
8212                                && (   length $lows_string == 5
8213                                    || substr($lows_string, -6, 1) ne '\\'))
8214                            {
8215                                substr($lows_string, -2) = $hex_representation;
8216                            }
8217                            else {
8218                                $lows_string .= '-\\x' . $hex_representation;
8219                            }
8220                        }
8221                        else {
8222                            $lows_string .= '\\x' . $hex_representation;
8223                        }
8224                    }
8225                }
8226            }
8227
8228            # Done with assembling the string of all lows.  If there are only
8229            # lows in the property, are completely done.
8230            if ($max_table_code_point < 256) {
8231                $self->reset_each_range;
8232                last;
8233            }
8234
8235            # Otherwise, quit if reached max number of non-lows ranges.  If
8236            # there are lows, count them as one unit towards the maximum.
8237            $range_count++;
8238            if ($range_count > (($lows_string eq "") ? $max_ranges : $max_ranges - 1)) {
8239                $string_range .= " ...";
8240                $self->reset_each_range;
8241                last;
8242            }
8243
8244            # Otherwise add this range.
8245            $string_range .= ", " if $string_range ne "";
8246            if ($start == $end) {
8247                $string_range .= sprintf("U+%04X", $start);
8248            }
8249            elsif ($end >= $MAX_WORKING_CODEPOINT)  {
8250                $string_range .= sprintf("U+%04X..infinity", $start);
8251            }
8252            else  {
8253                $string_range .= sprintf("U+%04X..%04X",
8254                                        $start, $end);
8255            }
8256        }
8257
8258        # Done with all the ranges we're going to look at.  Assemble the
8259        # definition from the lows + non-lows.
8260
8261        if ($lows_string ne "" || $string_range ne "") {
8262            if ($lows_string ne "") {
8263                $definition .= "[$lows_string]";
8264                $definition .= ", " if $string_range;
8265            }
8266            $definition .= $string_range;
8267        }
8268
8269        return $definition;
8270    }
8271
8272    sub write($self) {
8273        return $self->SUPER::write(0); # No adjustments
8274    }
8275
8276    # $leader - Should only be called on the leader table of an equivalent group
8277    sub set_final_comment($leader) {
8278        # This creates a comment for the file that is to hold the match table
8279        # $self.  It is somewhat convoluted to make the English read nicely,
8280        # but, heh, it's just a comment.
8281        # This should be called only with the leader match table of all the
8282        # ones that share the same file.  It lists all such tables, ordered so
8283        # that related ones are together.
8284
8285        return unless $debugging_build;
8286
8287        my $addr = pack 'J', refaddr $leader;
8288
8289        if ($leader{$addr} != $leader) {
8290            Carp::my_carp_bug(<<END
8291set_final_comment() must be called on a leader table, which $leader is not.
8292It is equivalent to $leader{$addr}.  No comment created
8293END
8294            );
8295            return;
8296        }
8297
8298        # Get the number of code points matched by each of the tables in this
8299        # file, and add underscores for clarity.
8300        my $count = $leader->count;
8301        my $unicode_count;
8302        my $non_unicode_string;
8303        if ($count > $MAX_UNICODE_CODEPOINTS) {
8304            $unicode_count = $count - ($MAX_WORKING_CODEPOINT
8305                                       - $MAX_UNICODE_CODEPOINT);
8306            $non_unicode_string = "All above-Unicode code points match as well, and are also returned";
8307        }
8308        else {
8309            $unicode_count = $count;
8310            $non_unicode_string = "";
8311        }
8312        my $string_count = main::clarify_code_point_count($unicode_count);
8313
8314        my $loose_count = 0;        # how many aliases loosely matched
8315        my $compound_name = "";     # ? Are any names compound?, and if so, an
8316                                    # example
8317        my $properties_with_compound_names = 0;    # count of these
8318
8319
8320        my %flags;              # The status flags used in the file
8321        my $total_entries = 0;  # number of entries written in the comment
8322        my $matches_comment = ""; # The portion of the comment about the
8323                                  # \p{}'s
8324        my @global_comments;    # List of all the tables' comments that are
8325                                # there before this routine was called.
8326        my $has_ucd_alias = 0;  # If there is an alias that is accessible via
8327                                # Unicode::UCD.  If not, then don't say it is
8328                                # in the comment
8329
8330        # Get list of all the parent tables that are equivalent to this one
8331        # (including itself).
8332        my @parents = grep { $parent{main::objaddr $_} == $_ }
8333                            main::uniques($leader, @{$equivalents{$addr}});
8334        my $has_unrelated = (@parents >= 2);  # boolean, ? are there unrelated
8335                                              # tables
8336        for my $parent (@parents) {
8337
8338            my $property = $parent->property;
8339
8340            # Special case 'N' tables in properties with two match tables when
8341            # the other is a 'Y' one.  These are likely to be binary tables,
8342            # but not necessarily.  In either case, \P{} will match the
8343            # complement of \p{}, and so if something is a synonym of \p, the
8344            # complement of that something will be the synonym of \P.  This
8345            # would be true of any property with just two match tables, not
8346            # just those whose values are Y and N; but that would require a
8347            # little extra work, and there are none such so far in Unicode.
8348            my $perl_p = 'p';        # which is it?  \p{} or \P{}
8349            my @yes_perl_synonyms;   # list of any synonyms for the 'Y' table
8350
8351            if (scalar $property->tables == 2
8352                && $parent == $property->table('N')
8353                && defined (my $yes = $property->table('Y')))
8354            {
8355                my $yes_addr = pack 'J', refaddr $yes;
8356                @yes_perl_synonyms
8357                    = grep { $_->property == $perl }
8358                                    main::uniques($yes,
8359                                                $parent{$yes_addr},
8360                                                $parent{$yes_addr}->children);
8361
8362                # But these synonyms are \P{} ,not \p{}
8363                $perl_p = 'P';
8364            }
8365
8366            my @description;        # Will hold the table description
8367            my @note;               # Will hold the table notes.
8368            my @conflicting;        # Will hold the table conflicts.
8369
8370            # Look at the parent, any yes synonyms, and all the children
8371            my $parent_addr = pack 'J', refaddr $parent;
8372            for my $table ($parent,
8373                           @yes_perl_synonyms,
8374                           @{$children{$parent_addr}})
8375            {
8376                my $table_addr = pack 'J', refaddr $table;
8377                my $table_property = $table->property;
8378
8379                # Tables are separated by a blank line to create a grouping.
8380                $matches_comment .= "\n" if $matches_comment;
8381
8382                # The table is named based on the property and value
8383                # combination it is for, like script=greek.  But there may be
8384                # a number of synonyms for each side, like 'sc' for 'script',
8385                # and 'grek' for 'greek'.  Any combination of these is a valid
8386                # name for this table.  In this case, there are three more,
8387                # 'sc=grek', 'sc=greek', and 'script='grek'.  Rather than
8388                # listing all possible combinations in the comment, we make
8389                # sure that each synonym occurs at least once, and add
8390                # commentary that the other combinations are possible.
8391                # Because regular expressions don't recognize things like
8392                # \p{jsn=}, only look at non-null right-hand-sides
8393                my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table_property->aliases;
8394                my @table_aliases = grep { $_->name ne "" } $table->aliases;
8395
8396                # The alias lists above are already ordered in the order we
8397                # want to output them.  To ensure that each synonym is listed,
8398                # we must use the max of the two numbers.  But if there are no
8399                # legal synonyms (nothing in @table_aliases), then we don't
8400                # list anything.
8401                my $listed_combos = (@table_aliases)
8402                                    ?  main::max(scalar @table_aliases,
8403                                                 scalar @property_aliases)
8404                                    : 0;
8405                trace "$listed_combos, tables=", scalar @table_aliases, "; property names=", scalar @property_aliases if main::DEBUG;
8406
8407                my $property_had_compound_name = 0;
8408
8409                for my $i (0 .. $listed_combos - 1) {
8410                    $total_entries++;
8411
8412                    # The current alias for the property is the next one on
8413                    # the list, or if beyond the end, start over.  Similarly
8414                    # for the table (\p{prop=table})
8415                    my $property_alias = $property_aliases
8416                                            [$i % @property_aliases]->name;
8417                    my $table_alias_object = $table_aliases
8418                                                        [$i % @table_aliases];
8419                    my $table_alias = $table_alias_object->name;
8420                    my $loose_match = $table_alias_object->loose_match;
8421                    $has_ucd_alias |= $table_alias_object->ucd;
8422
8423                    if ($table_alias !~ /\D/) { # Clarify large numbers.
8424                        $table_alias = main::clarify_number($table_alias)
8425                    }
8426
8427                    # Add a comment for this alias combination
8428                    my $current_match_comment;
8429                    if ($table_property == $perl) {
8430                        $current_match_comment = "\\$perl_p"
8431                                                    . "{$table_alias}";
8432                    }
8433                    else {
8434                        $current_match_comment
8435                                        = "\\p{$property_alias=$table_alias}";
8436                        $property_had_compound_name = 1;
8437                    }
8438
8439                    # Flag any abnormal status for this table.
8440                    my $flag = $property->status
8441                                || $table->status
8442                                || $table_alias_object->status;
8443                    if ($flag && $flag ne $PLACEHOLDER) {
8444                        $flags{$flag} = $status_past_participles{$flag};
8445                    }
8446
8447                    $loose_count++;
8448
8449                    # Pretty up the comment.  Note the \b; it says don't make
8450                    # this line a continuation.
8451                    $matches_comment .= sprintf("\b%-1s%-s%s\n",
8452                                        $flag,
8453                                        " " x 7,
8454                                        $current_match_comment);
8455                } # End of generating the entries for this table.
8456
8457                # Save these for output after this group of related tables.
8458                push @description, $table->description;
8459                push @note, $table->note;
8460                push @conflicting, $table->conflicting;
8461
8462                # And this for output after all the tables.
8463                push @global_comments, $table->comment;
8464
8465                # Compute an alternate compound name using the final property
8466                # synonym and the first table synonym with a colon instead of
8467                # the equal sign used elsewhere.
8468                if ($property_had_compound_name) {
8469                    $properties_with_compound_names ++;
8470                    if (! $compound_name || @property_aliases > 1) {
8471                        $compound_name = $property_aliases[-1]->name
8472                                        . ': '
8473                                        . $table_aliases[0]->name;
8474                    }
8475                }
8476            } # End of looping through all children of this table
8477
8478            # Here have assembled in $matches_comment all the related tables
8479            # to the current parent (preceded by the same info for all the
8480            # previous parents).  Put out information that applies to all of
8481            # the current family.
8482            if (@conflicting) {
8483
8484                # But output the conflicting information now, as it applies to
8485                # just this table.
8486                my $conflicting = join ", ", @conflicting;
8487                if ($conflicting) {
8488                    $matches_comment .= <<END;
8489
8490    Note that contrary to what you might expect, the above is NOT the same as
8491END
8492                    $matches_comment .= "any of: " if @conflicting > 1;
8493                    $matches_comment .= "$conflicting\n";
8494                }
8495            }
8496            if (@description) {
8497                $matches_comment .= "\n    Meaning: "
8498                                    . join('; ', @description)
8499                                    . "\n";
8500            }
8501            if (@note) {
8502                $matches_comment .= "\n    Note: "
8503                                    . join("\n    ", @note)
8504                                    . "\n";
8505            }
8506        } # End of looping through all tables
8507
8508        $matches_comment .= "\n$non_unicode_string\n" if $non_unicode_string;
8509
8510
8511        my $code_points;
8512        my $match;
8513        my $any_of_these;
8514        if ($unicode_count == 1) {
8515            $match = 'matches';
8516            $code_points = 'single code point';
8517        }
8518        else {
8519            $match = 'match';
8520            $code_points = "$string_count code points";
8521        }
8522
8523        my $synonyms;
8524        my $entries;
8525        if ($total_entries == 1) {
8526            $synonyms = "";
8527            $entries = 'entry';
8528            $any_of_these = 'this'
8529        }
8530        else {
8531            $synonyms = " any of the following regular expression constructs";
8532            $entries = 'entries';
8533            $any_of_these = 'any of these'
8534        }
8535
8536        my $comment = "";
8537        if ($has_ucd_alias) {
8538            $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
8539        }
8540        if ($has_unrelated) {
8541            $comment .= <<END;
8542This file is for tables that are not necessarily related:  To conserve
8543resources, every table that matches the identical set of code points in this
8544version of Unicode uses this file.  Each one is listed in a separate group
8545below.  It could be that the tables will match the same set of code points in
8546other Unicode releases, or it could be purely coincidence that they happen to
8547be the same in Unicode $unicode_version, and hence may not in other versions.
8548
8549END
8550        }
8551
8552        if (%flags) {
8553            foreach my $flag (sort keys %flags) {
8554                $comment .= <<END;
8555'$flag' below means that this form is $flags{$flag}.
8556END
8557                if ($flag eq $INTERNAL_ALIAS) {
8558                    $comment .= "DO NOT USE!!!";
8559                }
8560                else {
8561                    $comment .= "Consult $pod_file.pod";
8562                }
8563                $comment .= "\n";
8564            }
8565            $comment .= "\n";
8566        }
8567
8568        if ($total_entries == 0) {
8569            Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string.  Creating file anyway.");
8570            $comment .= <<END;
8571This file returns the $code_points in Unicode Version
8572$unicode_version for
8573$leader, but it is inaccessible through Perl regular expressions, as
8574"\\p{prop=}" is not recognized.
8575END
8576
8577        } else {
8578            $comment .= <<END;
8579This file returns the $code_points in Unicode Version
8580$unicode_version that
8581$match$synonyms:
8582
8583$matches_comment
8584$pod_file.pod should be consulted for the syntax rules for $any_of_these,
8585including if adding or subtracting white space, underscore, and hyphen
8586characters matters or doesn't matter, and other permissible syntactic
8587variants.  Upper/lower case distinctions never matter.
8588END
8589
8590        }
8591        if ($compound_name) {
8592            $comment .= <<END;
8593
8594A colon can be substituted for the equals sign, and
8595END
8596            if ($properties_with_compound_names > 1) {
8597                $comment .= <<END;
8598within each group above,
8599END
8600            }
8601            $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
8602
8603            # Note the \b below, it says don't make that line a continuation.
8604            $comment .= <<END;
8605anything to the left of the equals (or colon) can be combined with anything to
8606the right.  Thus, for example,
8607$compound_name
8608\bis also valid.
8609END
8610        }
8611
8612        # And append any comment(s) from the actual tables.  They are all
8613        # gathered here, so may not read all that well.
8614        if (@global_comments) {
8615            $comment .= "\n" . join("\n\n", @global_comments) . "\n";
8616        }
8617
8618        if ($count) {   # The format differs if no code points, and needs no
8619                        # explanation in that case
8620            if ($leader->write_as_invlist) {
8621                $comment.= <<END;
8622
8623The first data line of this file begins with the letter V to indicate it is in
8624inversion list format.  The number following the V gives the number of lines
8625remaining.  Each of those remaining lines is a single number representing the
8626starting code point of a range which goes up to but not including the number
8627on the next line; The 0th, 2nd, 4th... ranges are for code points that match
8628the property; the 1st, 3rd, 5th... are ranges of code points that don't match
8629the property.  The final line's range extends to the platform's infinity.
8630END
8631            }
8632            else {
8633                $comment.= <<END;
8634The format of the lines of this file is:
8635START\\tSTOP\\twhere START is the starting code point of the range, in hex;
8636STOP is the ending point, or if omitted, the range has just one code point.
8637END
8638            }
8639            if ($leader->output_range_counts) {
8640                $comment .= <<END;
8641Numbers in comments in [brackets] indicate how many code points are in the
8642range.
8643END
8644            }
8645        }
8646
8647        $leader->set_comment(main::join_lines($comment));
8648        return;
8649    }
8650
8651    # Accessors for the underlying list
8652    for my $sub (qw(
8653                    get_valid_code_point
8654                    get_invalid_code_point
8655                ))
8656    {
8657        no strict "refs";
8658        *$sub = sub {
8659            use strict "refs";
8660            my $self = shift;
8661
8662            return $self->_range_list->$sub(@_);
8663        }
8664    }
8665} # End closure for Match_Table
8666
8667package Property;
8668
8669# The Property class represents a Unicode property, or the $perl
8670# pseudo-property.  It contains a map table initialized empty at construction
8671# time, and for properties accessible through regular expressions, various
8672# match tables, created through the add_match_table() method, and referenced
8673# by the table('NAME') or tables() methods, the latter returning a list of all
8674# of the match tables.  Otherwise table operations implicitly are for the map
8675# table.
8676#
8677# Most of the data in the property is actually about its map table, so it
8678# mostly just uses that table's accessors for most methods.  The two could
8679# have been combined into one object, but for clarity because of their
8680# differing semantics, they have been kept separate.  It could be argued that
8681# the 'file' and 'directory' fields should be kept with the map table.
8682#
8683# Each property has a type.  This can be set in the constructor, or in the
8684# set_type accessor, but mostly it is figured out by the data.  Every property
8685# starts with unknown type, overridden by a parameter to the constructor, or
8686# as match tables are added, or ranges added to the map table, the data is
8687# inspected, and the type changed.  After the table is mostly or entirely
8688# filled, compute_type() should be called to finalize the analysis.
8689#
8690# There are very few operations defined.  One can safely remove a range from
8691# the map table, and property_add_or_replace_non_nulls() adds the maps from another
8692# table to this one, replacing any in the intersection of the two.
8693
8694sub standardize { return main::standardize($_[0]); }
8695sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
8696
8697{   # Closure
8698
8699    # This hash will contain as keys, all the aliases of all properties, and
8700    # as values, pointers to their respective property objects.  This allows
8701    # quick look-up of a property from any of its names.
8702    my %alias_to_property_of;
8703
8704    sub dump_alias_to_property_of {
8705        # For debugging
8706
8707        print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
8708        return;
8709    }
8710
8711    sub property_ref($name) {
8712        # This is a package subroutine, not called as a method.
8713        # If the single parameter is a literal '*' it returns a list of all
8714        # defined properties.
8715        # Otherwise, the single parameter is a name, and it returns a pointer
8716        # to the corresponding property object, or undef if none.
8717        #
8718        # Properties can have several different names.  The 'standard' form of
8719        # each of them is stored in %alias_to_property_of as they are defined.
8720        # But it's possible that this subroutine will be called with some
8721        # variant, so if the initial lookup fails, it is repeated with the
8722        # standardized form of the input name.  If found, besides returning the
8723        # result, the input name is added to the list so future calls won't
8724        # have to do the conversion again.
8725
8726        if (! defined $name) {
8727            Carp::my_carp_bug("Undefined input property.  No action taken.");
8728            return;
8729        }
8730
8731        return main::uniques(values %alias_to_property_of) if $name eq '*';
8732
8733        # Return cached result if have it.
8734        my $result = $alias_to_property_of{$name};
8735        return $result if defined $result;
8736
8737        # Convert the input to standard form.
8738        my $standard_name = standardize($name);
8739
8740        $result = $alias_to_property_of{$standard_name};
8741        return unless defined $result;        # Don't cache undefs
8742
8743        # Cache the result before returning it.
8744        $alias_to_property_of{$name} = $result;
8745        return $result;
8746    }
8747
8748
8749    main::setup_package();
8750
8751    my %map;
8752    # A pointer to the map table object for this property
8753    main::set_access('map', \%map);
8754
8755    my %full_name;
8756    # The property's full name.  This is a duplicate of the copy kept in the
8757    # map table, but is needed because stringify needs it during
8758    # construction of the map table, and then would have a chicken before egg
8759    # problem.
8760    main::set_access('full_name', \%full_name, 'r');
8761
8762    my %table_ref;
8763    # This hash will contain as keys, all the aliases of any match tables
8764    # attached to this property, and as values, the pointers to their
8765    # respective tables.  This allows quick look-up of a table from any of its
8766    # names.
8767    main::set_access('table_ref', \%table_ref);
8768
8769    my %type;
8770    # The type of the property, $ENUM, $BINARY, etc
8771    main::set_access('type', \%type, 'r');
8772
8773    my %file;
8774    # The filename where the map table will go (if actually written).
8775    # Normally defaulted, but can be overridden.
8776    main::set_access('file', \%file, 'r', 's');
8777
8778    my %directory;
8779    # The directory where the map table will go (if actually written).
8780    # Normally defaulted, but can be overridden.
8781    main::set_access('directory', \%directory, 's');
8782
8783    my %pseudo_map_type;
8784    # This is used to affect the calculation of the map types for all the
8785    # ranges in the table.  It should be set to one of the values that signify
8786    # to alter the calculation.
8787    main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
8788
8789    my %has_only_code_point_maps;
8790    # A boolean used to help in computing the type of data in the map table.
8791    main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
8792
8793    my %unique_maps;
8794    # A list of the first few distinct mappings this property has.  This is
8795    # used to disambiguate between binary and enum property types, so don't
8796    # have to keep more than three.
8797    main::set_access('unique_maps', \%unique_maps);
8798
8799    my %pre_declared_maps;
8800    # A boolean that gives whether the input data should declare all the
8801    # tables used, or not.  If the former, unknown ones raise a warning.
8802    main::set_access('pre_declared_maps',
8803                                    \%pre_declared_maps, 'r', 's');
8804
8805    my %match_subdir;
8806    # For properties whose shortest names are too long for a DOS 8.3
8807    # filesystem to distinguish between, this is used to manually give short
8808    # names for the directory name immediately under $match_tables that the
8809    # match tables for this property should be placed in.
8810    main::set_access('match_subdir', \%match_subdir, 'r');
8811
8812    my %has_dependency;
8813    # A boolean that gives whether some table somewhere is defined as the
8814    # complement of a table in this property.  This is a crude, but currently
8815    # sufficient, mechanism to make this property not get destroyed before
8816    # what is dependent on it is.  Other dependencies could be added, so the
8817    # name was chosen to reflect a more general situation than actually is
8818    # currently the case.
8819    main::set_access('has_dependency', \%has_dependency, 'r', 's');
8820
8821    sub new {
8822        # The only required parameter is the positionally first, name.  All
8823        # other parameters are key => value pairs.  See the documentation just
8824        # above for the meanings of the ones not passed directly on to the map
8825        # table constructor.
8826
8827        my $class = shift;
8828        my $name = shift || "";
8829
8830        my $self = property_ref($name);
8831        if (defined $self) {
8832            my $options_string = join ", ", @_;
8833            $options_string = ".  Ignoring options $options_string" if $options_string;
8834            Carp::my_carp("$self is already in use.  Using existing one$options_string;");
8835            return $self;
8836        }
8837
8838        my %args = @_;
8839
8840        $self = bless \do { my $anonymous_scalar }, $class;
8841        my $addr = pack 'J', refaddr $self;
8842
8843        $directory{$addr} = delete $args{'Directory'};
8844        $file{$addr} = delete $args{'File'};
8845        $full_name{$addr} = delete $args{'Full_Name'} || $name;
8846        $type{$addr} = delete $args{'Type'} || $UNKNOWN;
8847        $pseudo_map_type{$addr} = delete $args{'Map_Type'};
8848        $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
8849                                    # Starting in this release, property
8850                                    # values should be defined for all
8851                                    # properties, except those overriding this
8852                                    // $v_version ge v5.1.0;
8853        $match_subdir{$addr} = delete $args{'Match_SubDir'};
8854
8855        # Rest of parameters passed on.
8856
8857        $has_only_code_point_maps{$addr} = 1;
8858        $table_ref{$addr} = { };
8859        $unique_maps{$addr} = { };
8860        $has_dependency{$addr} = 0;
8861
8862        $map{$addr} = Map_Table->new($name,
8863                                    Full_Name => $full_name{$addr},
8864                                    _Alias_Hash => \%alias_to_property_of,
8865                                    _Property => $self,
8866                                    %args);
8867        return $self;
8868    }
8869
8870    # See this program's beginning comment block about overloading the copy
8871    # constructor.  Few operations are defined on properties, but a couple are
8872    # useful.  It is safe to take the inverse of a property, and to remove a
8873    # single code point from it.
8874    use overload
8875        fallback => 0,
8876        qw("") => "_operator_stringify",
8877        "." => \&main::_operator_dot,
8878        ".=" => \&main::_operator_dot_equal,
8879        '==' => \&main::_operator_equal,
8880        '!=' => \&main::_operator_not_equal,
8881        '=' => sub { return shift },
8882        '-=' => "_minus_and_equal",
8883    ;
8884
8885    sub _operator_stringify($self, $other="", $reversed=0) {
8886        return "Property '" .  $self->full_name . "'";
8887    }
8888
8889    sub _minus_and_equal($self, $other, $reversed=0) {
8890        # Remove a single code point from the map table of a property.
8891        if (ref $other) {
8892            Carp::my_carp_bug("Bad news.  Can't cope with a "
8893                        . ref($other)
8894                        . " argument to '-='.  Subtraction ignored.");
8895            return $self;
8896        }
8897        elsif ($reversed) {   # Shouldn't happen in a -=, but just in case
8898            Carp::my_carp_bug("Bad news.  Can't cope with subtracting a "
8899            . ref $self
8900            . " from a non-object.  undef returned.");
8901            return;
8902        }
8903        else {
8904            $map{pack 'J', refaddr $self}->delete_range($other, $other);
8905        }
8906        return $self;
8907    }
8908
8909    sub add_match_table {
8910        # Add a new match table for this property, with name given by the
8911        # parameter.  It returns a pointer to the table.
8912
8913        my $self = shift;
8914        my $name = shift;
8915        my %args = @_;
8916
8917        my $addr = pack 'J', refaddr $self;
8918
8919        my $table = $table_ref{$addr}{$name};
8920        my $standard_name = main::standardize($name);
8921        if (defined $table
8922            || (defined ($table = $table_ref{$addr}{$standard_name})))
8923        {
8924            Carp::my_carp("Table '$name' in $self is already in use.  Using existing one");
8925            $table_ref{$addr}{$name} = $table;
8926            return $table;
8927        }
8928        else {
8929
8930            # See if this is a perl extension, if not passed in.
8931            my $perl_extension = delete $args{'Perl_Extension'};
8932            $perl_extension
8933                        = $self->perl_extension if ! defined $perl_extension;
8934
8935            my $fate;
8936            my $suppression_reason = "";
8937            if ($self->name =~ /^_/) {
8938                $fate = $SUPPRESSED;
8939                $suppression_reason = "Parent property is internal only";
8940            }
8941            elsif ($self->fate >= $SUPPRESSED) {
8942                $fate = $self->fate;
8943                $suppression_reason = $why_suppressed{$self->complete_name};
8944
8945            }
8946            elsif ($name =~ /^_/) {
8947                $fate = $INTERNAL_ONLY;
8948            }
8949            $table = Match_Table->new(
8950                                Name => $name,
8951                                Perl_Extension => $perl_extension,
8952                                _Alias_Hash => $table_ref{$addr},
8953                                _Property => $self,
8954                                Fate => $fate,
8955                                Suppression_Reason => $suppression_reason,
8956                                Status => $self->status,
8957                                _Status_Info => $self->status_info,
8958                                %args);
8959            return unless defined $table;
8960        }
8961
8962        # Save the names for quick look up
8963        $table_ref{$addr}{$standard_name} = $table;
8964        $table_ref{$addr}{$name} = $table;
8965
8966        # Perhaps we can figure out the type of this property based on the
8967        # fact of adding this match table.  First, string properties don't
8968        # have match tables; second, a binary property can't have 3 match
8969        # tables
8970        if ($type{$addr} == $UNKNOWN) {
8971            $type{$addr} = $NON_STRING;
8972        }
8973        elsif ($type{$addr} == $STRING) {
8974            Carp::my_carp("$self Added a match table '$name' to a string property '$self'.  Changed it to a non-string property.  Bad News.");
8975            $type{$addr} = $NON_STRING;
8976        }
8977        elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) {
8978            if (scalar main::uniques(values %{$table_ref{$addr}}) > 2) {
8979                if ($type{$addr} == $BINARY) {
8980                    Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary.  Changing its type to 'enum'.  Bad News.");
8981                }
8982                $type{$addr} = $ENUM;
8983            }
8984        }
8985
8986        return $table;
8987    }
8988
8989    sub delete_match_table($self, $table_to_remove) {
8990        # Delete the table referred to by $2 from the property $1.
8991        my $addr = pack 'J', refaddr $self;
8992
8993        # Remove all names that refer to it.
8994        foreach my $key (keys %{$table_ref{$addr}}) {
8995            delete $table_ref{$addr}{$key}
8996                                if $table_ref{$addr}{$key} == $table_to_remove;
8997        }
8998
8999        $table_to_remove->DESTROY;
9000        return;
9001    }
9002
9003    sub table($self, $name) {
9004        # Return a pointer to the match table (with name given by the
9005        # parameter) associated with this property; undef if none.
9006        my $addr = pack 'J', refaddr $self;
9007
9008        return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
9009
9010        # If quick look-up failed, try again using the standard form of the
9011        # input name.  If that succeeds, cache the result before returning so
9012        # won't have to standardize this input name again.
9013        my $standard_name = main::standardize($name);
9014        return unless defined $table_ref{$addr}{$standard_name};
9015
9016        $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
9017        return $table_ref{$addr}{$name};
9018    }
9019
9020    sub tables {
9021        # Return a list of pointers to all the match tables attached to this
9022        # property
9023
9024        return main::uniques(values %{$table_ref{pack 'J', refaddr shift}});
9025    }
9026
9027    sub directory {
9028        # Returns the directory the map table for this property should be
9029        # output in.  If a specific directory has been specified, that has
9030        # priority;  'undef' is returned if the type isn't defined;
9031        # or $map_directory for everything else.
9032
9033        my $addr = pack 'J', refaddr shift;
9034
9035        return $directory{$addr} if defined $directory{$addr};
9036        return undef if $type{$addr} == $UNKNOWN;
9037        return $map_directory;
9038    }
9039
9040    sub swash_name($self) {
9041        # Return the name that is used to both:
9042        #   1)  Name the file that the map table is written to.
9043        #   2)  The name of swash related stuff inside that file.
9044        # The reason for this is that the Perl core historically has used
9045        # certain names that aren't the same as the Unicode property names.
9046        # To continue using these, $file is hard-coded in this file for those,
9047        # but otherwise the standard name is used.  This is different from the
9048        # external_name, so that the rest of the files, like in lib can use
9049        # the standard name always, without regard to historical precedent.
9050        my $addr = pack 'J', refaddr $self;
9051
9052        # Swash names are used only on either
9053        # 1) regular or internal-only map tables
9054        # 2) otherwise there should be no access to the
9055        #    property map table from other parts of Perl.
9056        return if $map{$addr}->fate != $ORDINARY
9057                  && ! ($map{$addr}->name =~ /^_/
9058                        && $map{$addr}->fate == $INTERNAL_ONLY);
9059
9060        return $file{$addr} if defined $file{$addr};
9061        return $map{$addr}->external_name;
9062    }
9063
9064    sub to_create_match_tables($self) {
9065        # Returns a boolean as to whether or not match tables should be
9066        # created for this property.
9067
9068        # The whole point of this pseudo property is match tables.
9069        return 1 if $self == $perl;
9070
9071        my $addr = pack 'J', refaddr $self;
9072
9073        # Don't generate tables of code points that match the property values
9074        # of a string property.  Such a list would most likely have many
9075        # property values, each with just one or very few code points mapping
9076        # to it.
9077        return 0 if $type{$addr} == $STRING;
9078
9079        # Otherwise, do.
9080        return 1;
9081    }
9082
9083    sub property_add_or_replace_non_nulls($self, $other) {
9084        # This adds the mappings in the property $other to $self.  Non-null
9085        # mappings from $other override those in $self.  It essentially merges
9086        # the two properties, with the second having priority except for null
9087        # mappings.
9088
9089        if (! $other->isa(__PACKAGE__)) {
9090            Carp::my_carp_bug("$other should be a "
9091                            . __PACKAGE__
9092                            . ".  Not a '"
9093                            . ref($other)
9094                            . "'.  Not added;");
9095            return;
9096        }
9097
9098        return $map{pack 'J', refaddr $self}->map_add_or_replace_non_nulls($map{pack 'J', refaddr $other});
9099    }
9100
9101    sub set_proxy_for {
9102        # Certain tables are not generally written out to files, but
9103        # Unicode::UCD has the intelligence to know that the file for $self
9104        # can be used to reconstruct those tables.  This routine just changes
9105        # things so that UCD pod entries for those suppressed tables are
9106        # generated, so the fact that a proxy is used is invisible to the
9107        # user.
9108
9109        my $self = shift;
9110
9111        foreach my $property_name (@_) {
9112            my $ref = property_ref($property_name);
9113            next if $ref->to_output_map;
9114            $ref->set_fate($MAP_PROXIED);
9115        }
9116    }
9117
9118    sub set_type($self, $type) {
9119        # Set the type of the property.  Mostly this is figured out by the
9120        # data in the table.  But this is used to set it explicitly.  The
9121        # reason it is not a standard accessor is that when setting a binary
9122        # property, we need to make sure that all the true/false aliases are
9123        # present, as they were omitted in early Unicode releases.
9124
9125        if ($type != $ENUM
9126            && $type != $BINARY
9127            && $type != $FORCED_BINARY
9128            && $type != $STRING)
9129        {
9130            Carp::my_carp("Unrecognized type '$type'.  Type not set");
9131            return;
9132        }
9133
9134        $type{pack 'J', refaddr $self} = $type;
9135        return if $type != $BINARY && $type != $FORCED_BINARY;
9136
9137        my $yes = $self->table('Y');
9138        $yes = $self->table('Yes') if ! defined $yes;
9139        $yes = $self->add_match_table('Y', Full_Name => 'Yes')
9140                                                            if ! defined $yes;
9141
9142        # Add aliases in order wanted, duplicates will be ignored.  We use a
9143        # binary property present in all releases for its ordered lists of
9144        # true/false aliases.  Note, that could run into problems in
9145        # outputting things in that we don't distinguish between the name and
9146        # full name of these.  Hopefully, if the table was already created
9147        # before this code is executed, it was done with these set properly.
9148        my $bm = property_ref("Bidi_Mirrored");
9149        foreach my $alias ($bm->table("Y")->aliases) {
9150            $yes->add_alias($alias->name);
9151        }
9152        my $no = $self->table('N');
9153        $no = $self->table('No') if ! defined $no;
9154        $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no;
9155        foreach my $alias ($bm->table("N")->aliases) {
9156            $no->add_alias($alias->name);
9157        }
9158
9159        return;
9160    }
9161
9162    sub add_map {
9163        # Add a map to the property's map table.  This also keeps
9164        # track of the maps so that the property type can be determined from
9165        # its data.
9166
9167        my $self = shift;
9168        my $start = shift;  # First code point in range
9169        my $end = shift;    # Final code point in range
9170        my $map = shift;    # What the range maps to.
9171        # Rest of parameters passed on.
9172
9173        my $addr = pack 'J', refaddr $self;
9174
9175        # If haven't the type of the property, gather information to figure it
9176        # out.
9177        if ($type{$addr} == $UNKNOWN) {
9178
9179            # If the map contains an interior blank or dash, or most other
9180            # nonword characters, it will be a string property.  This
9181            # heuristic may actually miss some string properties.  If so, they
9182            # may need to have explicit set_types called for them.  This
9183            # happens in the Unihan properties.
9184            if ($map =~ / (?<= . ) [ -] (?= . ) /x
9185                || $map =~ / [^\w.\/\ -]  /x)
9186            {
9187                $self->set_type($STRING);
9188
9189                # $unique_maps is used for disambiguating between ENUM and
9190                # BINARY later; since we know the property is not going to be
9191                # one of those, no point in keeping the data around
9192                undef $unique_maps{$addr};
9193            }
9194            else {
9195
9196                # Not necessarily a string.  The final decision has to be
9197                # deferred until all the data are in.  We keep track of if all
9198                # the values are code points for that eventual decision.
9199                $has_only_code_point_maps{$addr} &=
9200                                            $map =~ / ^ $code_point_re $/x;
9201
9202                # For the purposes of disambiguating between binary and other
9203                # enumerations at the end, we keep track of the first three
9204                # distinct property values.  Once we get to three, we know
9205                # it's not going to be binary, so no need to track more.
9206                if (scalar keys %{$unique_maps{$addr}} < 3) {
9207                    $unique_maps{$addr}{main::standardize($map)} = 1;
9208                }
9209            }
9210        }
9211
9212        # Add the mapping by calling our map table's method
9213        return $map{$addr}->add_map($start, $end, $map, @_);
9214    }
9215
9216    sub compute_type($self) {
9217        # Compute the type of the property: $ENUM, $STRING, or $BINARY.  This
9218        # should be called after the property is mostly filled with its maps.
9219        # We have been keeping track of what the property values have been,
9220        # and now have the necessary information to figure out the type.
9221
9222        my $addr = pack 'J', refaddr $self;
9223
9224        my $type = $type{$addr};
9225
9226        # If already have figured these out, no need to do so again, but we do
9227        # a double check on ENUMS to make sure that a string property hasn't
9228        # improperly been classified as an ENUM, so continue on with those.
9229        return if $type == $STRING
9230                  || $type == $BINARY
9231                  || $type == $FORCED_BINARY;
9232
9233        # If every map is to a code point, is a string property.
9234        if ($type == $UNKNOWN
9235            && ($has_only_code_point_maps{$addr}
9236                || (defined $map{$addr}->default_map
9237                    && $map{$addr}->default_map eq "")))
9238        {
9239            $self->set_type($STRING);
9240        }
9241        else {
9242
9243            # Otherwise, it is to some sort of enumeration.  (The case where
9244            # it is a Unicode miscellaneous property, and treated like a
9245            # string in this program is handled in add_map()).  Distinguish
9246            # between binary and some other enumeration type.  Of course, if
9247            # there are more than two values, it's not binary.  But more
9248            # subtle is the test that the default mapping is defined means it
9249            # isn't binary.  This in fact may change in the future if Unicode
9250            # changes the way its data is structured.  But so far, no binary
9251            # properties ever have @missing lines for them, so the default map
9252            # isn't defined for them.  The few properties that are two-valued
9253            # and aren't considered binary have the default map defined
9254            # starting in Unicode 5.0, when the @missing lines appeared; and
9255            # this program has special code to put in a default map for them
9256            # for earlier than 5.0 releases.
9257            if ($type == $ENUM
9258                || scalar keys %{$unique_maps{$addr}} > 2
9259                || defined $self->default_map)
9260            {
9261                my $tables = $self->tables;
9262                my $count = $self->count;
9263                if ($verbosity && $tables > 500 && $tables/$count > .1) {
9264                    Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $tables\n");
9265                }
9266                $self->set_type($ENUM);
9267            }
9268            else {
9269                $self->set_type($BINARY);
9270            }
9271        }
9272        undef $unique_maps{$addr};  # Garbage collect
9273        return;
9274    }
9275
9276    # $reaons - Ignored unless suppressing
9277    sub set_fate($self, $fate, $reason=undef) {
9278        my $addr = pack 'J', refaddr $self;
9279        if ($fate >= $SUPPRESSED) {
9280            $why_suppressed{$self->complete_name} = $reason;
9281        }
9282
9283        # Each table shares the property's fate, except that MAP_PROXIED
9284        # doesn't affect match tables
9285        $map{$addr}->set_fate($fate, $reason);
9286        if ($fate != $MAP_PROXIED) {
9287            foreach my $table ($map{$addr}, $self->tables) {
9288                $table->set_fate($fate, $reason);
9289            }
9290        }
9291        return;
9292    }
9293
9294
9295    # Most of the accessors for a property actually apply to its map table.
9296    # Setup up accessor functions for those, referring to %map
9297    for my $sub (qw(
9298                    add_alias
9299                    add_anomalous_entry
9300                    add_comment
9301                    add_conflicting
9302                    add_description
9303                    add_duplicate
9304                    add_note
9305                    aliases
9306                    comment
9307                    complete_name
9308                    containing_range
9309                    count
9310                    default_map
9311                    definition
9312                    delete_range
9313                    description
9314                    each_range
9315                    external_name
9316                    fate
9317                    file_path
9318                    format
9319                    initialize
9320                    inverse_list
9321                    is_empty
9322                    name
9323                    note
9324                    perl_extension
9325                    property
9326                    range_count
9327                    ranges
9328                    range_size_1
9329                    replace_map
9330                    reset_each_range
9331                    set_comment
9332                    set_default_map
9333                    set_file_path
9334                    set_final_comment
9335                    _set_format
9336                    set_range_size_1
9337                    set_status
9338                    set_to_output_map
9339                    short_name
9340                    status
9341                    status_info
9342                    to_output_map
9343                    type_of
9344                    value_of
9345                    write
9346                ))
9347                    # 'property' above is for symmetry, so that one can take
9348                    # the property of a property and get itself, and so don't
9349                    # have to distinguish between properties and tables in
9350                    # calling code
9351    {
9352        no strict "refs";
9353        *$sub = sub {
9354            use strict "refs";
9355            my $self = shift;
9356            return $map{pack 'J', refaddr $self}->$sub(@_);
9357        }
9358    }
9359
9360
9361} # End closure
9362
9363package main;
9364
9365sub display_chr {
9366    # Converts an ordinal printable character value to a displayable string,
9367    # using a dotted circle to hold combining characters.
9368
9369    my $ord = shift;
9370    my $chr = chr $ord;
9371    return $chr if $ccc->table(0)->contains($ord);
9372    return "\x{25CC}$chr";
9373}
9374
9375sub join_lines($input) {
9376    # Returns lines of the input joined together, so that they can be folded
9377    # properly.
9378    # This causes continuation lines to be joined together into one long line
9379    # for folding.  A continuation line is any line that doesn't begin with a
9380    # space or "\b" (the latter is stripped from the output).  This is so
9381    # lines can be in a HERE document so as to fit nicely in the terminal
9382    # width, but be joined together in one long line, and then folded with
9383    # indents, '#' prefixes, etc, properly handled.
9384    # A blank separates the joined lines except if there is a break; an extra
9385    # blank is inserted after a period ending a line.
9386
9387    # Initialize the return with the first line.
9388    my ($return, @lines) = split "\n", $input;
9389
9390    # If the first line is null, it was an empty line, add the \n back in
9391    $return = "\n" if $return eq "";
9392
9393    # Now join the remainder of the physical lines.
9394    for my $line (@lines) {
9395
9396        # An empty line means wanted a blank line, so add two \n's to get that
9397        # effect, and go to the next line.
9398        if (length $line == 0) {
9399            $return .= "\n\n";
9400            next;
9401        }
9402
9403        # Look at the last character of what we have so far.
9404        my $previous_char = substr($return, -1, 1);
9405
9406        # And at the next char to be output.
9407        my $next_char = substr($line, 0, 1);
9408
9409        if ($previous_char ne "\n") {
9410
9411            # Here didn't end wth a nl.  If the next char a blank or \b, it
9412            # means that here there is a break anyway.  So add a nl to the
9413            # output.
9414            if ($next_char eq " " || $next_char eq "\b") {
9415                $previous_char = "\n";
9416                $return .= $previous_char;
9417            }
9418
9419            # Add an extra space after periods.
9420            $return .= " " if $previous_char eq '.';
9421        }
9422
9423        # Here $previous_char is still the latest character to be output.  If
9424        # it isn't a nl, it means that the next line is to be a continuation
9425        # line, with a blank inserted between them.
9426        $return .= " " if $previous_char ne "\n";
9427
9428        # Get rid of any \b
9429        substr($line, 0, 1) = "" if $next_char eq "\b";
9430
9431        # And append this next line.
9432        $return .= $line;
9433    }
9434
9435    return $return;
9436}
9437
9438sub simple_fold( $line, $prefix="", $hanging_indent=0, $right_margin=0) {
9439    # Returns a string of the input (string or an array of strings) folded
9440    # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
9441    # a \n
9442    # This is tailored for the kind of text written by this program,
9443    # especially the pod file, which can have very long names with
9444    # underscores in the middle, or words like AbcDefgHij....  We allow
9445    # breaking in the middle of such constructs if the line won't fit
9446    # otherwise.  The break in such cases will come either just after an
9447    # underscore, or just before one of the Capital letters.
9448
9449    local $to_trace = 0 if main::DEBUG;
9450
9451    # $prefix Optional string to prepend to each output line
9452    # $hanging_indent Optional number of spaces to indent
9453	# continuation lines
9454    # $right_margin  Optional number of spaces to narrow the
9455    # total width by.
9456
9457    # The space available doesn't include what's automatically prepended
9458    # to each line, or what's reserved on the right.
9459    my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
9460    # XXX Instead of using the 'nofold' perhaps better to look up the stack
9461
9462    if (DEBUG && $hanging_indent >= $max) {
9463        Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max.  Using 0", 'nofold');
9464        $hanging_indent = 0;
9465    }
9466
9467    # First, split into the current physical lines.
9468    my @line;
9469    if (ref $line) {        # Better be an array, because not bothering to
9470                            # test
9471        foreach my $line (@{$line}) {
9472            push @line, split /\n/, $line;
9473        }
9474    }
9475    else {
9476        @line = split /\n/, $line;
9477    }
9478
9479    #local $to_trace = 1 if main::DEBUG;
9480    trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
9481
9482    # Look at each current physical line.
9483    for (my $i = 0; $i < @line; $i++) {
9484        Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
9485        #local $to_trace = 1 if main::DEBUG;
9486        trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
9487
9488        # Remove prefix, because will be added back anyway, don't want
9489        # doubled prefix
9490        $line[$i] =~ s/^$prefix//;
9491
9492        # Remove trailing space
9493        $line[$i] =~ s/\s+\Z//;
9494
9495        # If the line is too long, fold it.
9496        if (length $line[$i] > $max) {
9497            my $remainder;
9498
9499            # Here needs to fold.  Save the leading space in the line for
9500            # later.
9501            $line[$i] =~ /^ ( \s* )/x;
9502            my $leading_space = $1;
9503            trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
9504
9505            # If character at final permissible position is white space,
9506            # fold there, which will delete that white space
9507            if (substr($line[$i], $max - 1, 1) =~ /\s/) {
9508                $remainder = substr($line[$i], $max);
9509                $line[$i] = substr($line[$i], 0, $max - 1);
9510            }
9511            else {
9512
9513                # Otherwise fold at an acceptable break char closest to
9514                # the max length.  Look at just the maximal initial
9515                # segment of the line
9516                my $segment = substr($line[$i], 0, $max - 1);
9517                if ($segment =~
9518                    /^ ( .{$hanging_indent}   # Don't look before the
9519                                              #  indent.
9520                        \ *                   # Don't look in leading
9521                                              #  blanks past the indent
9522                            [^ ] .*           # Find the right-most
9523                        (?:                   #  acceptable break:
9524                            [ \s = ]          # space or equal
9525                            | - (?! [.0-9] )  # or non-unary minus.
9526                            | [^\\[(] (?= \\ )# break before single backslash
9527                                              #  not immediately after opening
9528                                              #  punctuation
9529                        )                     # $1 includes the character
9530                    )/x)
9531                {
9532                    # Split into the initial part that fits, and remaining
9533                    # part of the input
9534                    $remainder = substr($line[$i], length $1);
9535                    $line[$i] = $1;
9536                    trace $line[$i] if DEBUG && $to_trace;
9537                    trace $remainder if DEBUG && $to_trace;
9538                }
9539
9540                # If didn't find a good breaking spot, see if there is a
9541                # not-so-good breaking spot.  These are just after
9542                # underscores or where the case changes from lower to
9543                # upper.  Use \a as a soft hyphen, but give up
9544                # and don't break the line if there is actually a \a
9545                # already in the input.  We use an ascii character for the
9546                # soft-hyphen to avoid any attempt by miniperl to try to
9547                # access the files that this program is creating.
9548                elsif ($segment !~ /\a/
9549                       && ($segment =~ s/_/_\a/g
9550                       || $segment =~ s/ ( (?!\\) [a-z] ) (?= [A-Z] )/$1\a/xg))
9551                {
9552                    # Here were able to find at least one place to insert
9553                    # our substitute soft hyphen.  Find the right-most one
9554                    # and replace it by a real hyphen.
9555                    trace $segment if DEBUG && $to_trace;
9556                    substr($segment,
9557                            rindex($segment, "\a"),
9558                            1) = '-';
9559
9560                    # Then remove the soft hyphen substitutes.
9561                    $segment =~ s/\a//g;
9562                    trace $segment if DEBUG && $to_trace;
9563
9564                    # And split into the initial part that fits, and
9565                    # remainder of the line
9566                    my $pos = rindex($segment, '-');
9567                    $remainder = substr($line[$i], $pos);
9568                    trace $remainder if DEBUG && $to_trace;
9569                    $line[$i] = substr($segment, 0, $pos + 1);
9570                }
9571            }
9572
9573            # Here we know if we can fold or not.  If we can, $remainder
9574            # is what remains to be processed in the next iteration.
9575            if (defined $remainder) {
9576                trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
9577
9578                # Insert the folded remainder of the line as a new element
9579                # of the array.  (It may still be too long, but we will
9580                # deal with that next time through the loop.)  Omit any
9581                # leading space in the remainder.
9582                $remainder =~ s/^\s+//;
9583                trace "remainder='$remainder'" if main::DEBUG && $to_trace;
9584
9585                # But then indent by whichever is larger of:
9586                # 1) the leading space on the input line;
9587                # 2) the hanging indent.
9588                # This preserves indentation in the original line.
9589                my $lead = ($leading_space)
9590                            ? length $leading_space
9591                            : $hanging_indent;
9592                $lead = max($lead, $hanging_indent);
9593                splice @line, $i+1, 0, (" " x $lead) . $remainder;
9594            }
9595        }
9596
9597        # Ready to output the line. Get rid of any trailing space
9598        # And prefix by the required $prefix passed in.
9599        $line[$i] =~ s/\s+$//;
9600        $line[$i] = "$prefix$line[$i]\n";
9601    } # End of looping through all the lines.
9602
9603    return join "", @line;
9604}
9605
9606sub property_ref {  # Returns a reference to a property object.
9607    return Property::property_ref(@_);
9608}
9609
9610sub force_unlink ($filename) {
9611    return unless file_exists($filename);
9612    return if CORE::unlink($filename);
9613
9614    # We might need write permission
9615    chmod 0777, $filename;
9616    CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename.  Proceeding anyway: $!");
9617    return;
9618}
9619
9620sub write ($file, $use_utf8, @lines) {
9621    # Given a filename and references to arrays of lines, write the lines of
9622    # each array to the file
9623    # Filename can be given as an arrayref of directory names
9624
9625    # Get into a single string if an array, and get rid of, in Unix terms, any
9626    # leading '.'
9627    $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
9628    $file = File::Spec->canonpath($file);
9629
9630    # If has directories, make sure that they all exist
9631    (undef, my $directories, undef) = File::Spec->splitpath($file);
9632    File::Path::mkpath($directories) if $directories && ! -d $directories;
9633
9634    push @files_actually_output, $file;
9635
9636    force_unlink ($file);
9637
9638    my $OUT;
9639    if (not open $OUT, ">", $file) {
9640        Carp::my_carp("can't open $file for output.  Skipping this file: $!");
9641        return;
9642    }
9643
9644    binmode $OUT, ":utf8" if $use_utf8;
9645
9646    foreach my $lines_ref (@lines) {
9647        unless (@$lines_ref) {
9648            Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
9649        }
9650
9651        print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
9652    }
9653    close $OUT or die Carp::my_carp("close '$file' failed: $!");
9654
9655    print "$file written.\n" if $verbosity >= $VERBOSE;
9656
9657    return;
9658}
9659
9660
9661sub Standardize($name=undef) {
9662    # This converts the input name string into a standardized equivalent to
9663    # use internally.
9664
9665    unless (defined $name) {
9666      Carp::my_carp_bug("Standardize() called with undef.  Returning undef.");
9667      return;
9668    }
9669
9670    # Remove any leading or trailing white space
9671    $name =~ s/^\s+//g;
9672    $name =~ s/\s+$//g;
9673
9674    # Convert interior white space and hyphens into underscores.
9675    $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
9676
9677    # Capitalize the letter following an underscore, and convert a sequence of
9678    # multiple underscores to a single one
9679    $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
9680
9681    # And capitalize the first letter, but not for the special cjk ones.
9682    $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
9683    return $name;
9684}
9685
9686sub standardize ($str=undef) {
9687    # Returns a lower-cased standardized name, without underscores.  This form
9688    # is chosen so that it can distinguish between any real versus superficial
9689    # Unicode name differences.  It relies on the fact that Unicode doesn't
9690    # have interior underscores, white space, nor dashes in any
9691    # stricter-matched name.  It should not be used on Unicode code point
9692    # names (the Name property), as they mostly, but not always follow these
9693    # rules.
9694
9695    my $name = Standardize($str);
9696    return if !defined $name;
9697
9698    $name =~ s/ (?<= .) _ (?= . ) //xg;
9699    return lc $name;
9700}
9701
9702sub UCD_name ($table, $alias) {
9703    # Returns the name that Unicode::UCD will use to find a table.  XXX
9704    # perhaps this function should be placed somewhere, like UCD.pm so that
9705    # Unicode::UCD can use it directly without duplicating code that can get
9706    # out-of sync.
9707
9708    my $property = $table->property;
9709    $property = ($property == $perl)
9710                ? ""                # 'perl' is never explicitly stated
9711                : standardize($property->name) . '=';
9712    if ($alias->loose_match) {
9713        return $property . standardize($alias->name);
9714    }
9715    else {
9716        return lc ($property . $alias->name);
9717    }
9718
9719    return;
9720}
9721
9722{   # Closure
9723
9724    my $indent_increment = " " x ( $debugging_build ? 2 : 0);
9725    %main::already_output = ();
9726
9727    $main::simple_dumper_nesting = 0;
9728
9729    sub simple_dumper( $item, $indent = "" ) {
9730        # Like Simple Data::Dumper. Good enough for our needs. We can't use
9731        # the real thing as we have to run under miniperl.
9732
9733        # It is designed so that on input it is at the beginning of a line,
9734        # and the final thing output in any call is a trailing ",\n".
9735
9736        $indent = "" if ! $debugging_build;
9737
9738        # nesting level is localized, so that as the call stack pops, it goes
9739        # back to the prior value.
9740        local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
9741        local %main::already_output = %main::already_output;
9742        $main::simple_dumper_nesting++;
9743        #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
9744
9745        # Determine the indent for recursive calls.
9746        my $next_indent = $indent . $indent_increment;
9747
9748        my $output;
9749        if (! ref $item) {
9750
9751            # Dump of scalar: just output it in quotes if not a number.  To do
9752            # so we must escape certain characters, and therefore need to
9753            # operate on a copy to avoid changing the original
9754            my $copy = $item;
9755            $copy = $UNDEF unless defined $copy;
9756
9757            # Quote non-integers (integers also have optional leading '-')
9758            if ($copy eq "" || $copy !~ /^ -? \d+ $/x) {
9759
9760                # Escape apostrophe and backslash
9761                $copy =~ s/ ( ['\\] ) /\\$1/xg;
9762                $copy = "'$copy'";
9763            }
9764            $output = "$indent$copy,\n";
9765        }
9766        else {
9767
9768            # Keep track of cycles in the input, and refuse to infinitely loop
9769            my $addr = pack 'J', refaddr $item;
9770            if (defined $main::already_output{$addr}) {
9771                return "${indent}ALREADY OUTPUT: $item\n";
9772            }
9773            $main::already_output{$addr} = $item;
9774
9775            if (ref $item eq 'ARRAY') {
9776                my $using_brackets;
9777                $output = $indent;
9778                if ($main::simple_dumper_nesting > 1) {
9779                    $output .= '[';
9780                    $using_brackets = 1;
9781                }
9782                else {
9783                    $using_brackets = 0;
9784                }
9785
9786                # If the array is empty, put the closing bracket on the same
9787                # line.  Otherwise, recursively add each array element
9788                if (@$item == 0) {
9789                    $output .= " ";
9790                }
9791                else {
9792                    $output .= "\n";
9793                    for (my $i = 0; $i < @$item; $i++) {
9794
9795                        # Indent array elements one level
9796                        $output .= &simple_dumper($item->[$i], $next_indent);
9797                        next if ! $debugging_build;
9798                        $output =~ s/\n$//;      # Remove any trailing nl so
9799                        $output .= " # [$i]\n";  # as to add a comment giving
9800                                                 # the array index
9801                    }
9802                    $output .= $indent;     # Indent closing ']' to orig level
9803                }
9804                $output .= ']' if $using_brackets;
9805                $output .= ",\n";
9806            }
9807            elsif (ref $item eq 'HASH') {
9808                my $is_first_line;
9809                my $using_braces;
9810                my $body_indent;
9811
9812                # No surrounding braces at top level
9813                $output .= $indent;
9814                if ($main::simple_dumper_nesting > 1) {
9815                    $output .= "{\n";
9816                    $is_first_line = 0;
9817                    $body_indent = $next_indent;
9818                    $next_indent .= $indent_increment;
9819                    $using_braces = 1;
9820                }
9821                else {
9822                    $is_first_line = 1;
9823                    $body_indent = $indent;
9824                    $using_braces = 0;
9825                }
9826
9827                # Output hashes sorted alphabetically instead of apparently
9828                # random.  Use caseless alphabetic sort
9829                foreach my $key (sort { lc $a cmp lc $b } keys %$item)
9830                {
9831                    if ($is_first_line) {
9832                        $is_first_line = 0;
9833                    }
9834                    else {
9835                        $output .= "$body_indent";
9836                    }
9837
9838                    # The key must be a scalar, but this recursive call quotes
9839                    # it
9840                    $output .= &simple_dumper($key);
9841
9842                    # And change the trailing comma and nl to the hash fat
9843                    # comma for clarity, and so the value can be on the same
9844                    # line
9845                    $output =~ s/,\n$/ => /;
9846
9847                    # Recursively call to get the value's dump.
9848                    my $next = &simple_dumper($item->{$key}, $next_indent);
9849
9850                    # If the value is all on one line, remove its indent, so
9851                    # will follow the => immediately.  If it takes more than
9852                    # one line, start it on a new line.
9853                    if ($next !~ /\n.*\n/) {
9854                        $next =~ s/^ *//;
9855                    }
9856                    else {
9857                        $output .= "\n";
9858                    }
9859                    $output .= $next;
9860                }
9861
9862                $output .= "$indent},\n" if $using_braces;
9863            }
9864            elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
9865                $output = $indent . ref($item) . "\n";
9866                # XXX see if blessed
9867            }
9868            elsif ($item->can('dump')) {
9869
9870                # By convention in this program, objects furnish a 'dump'
9871                # method.  Since not doing any output at this level, just pass
9872                # on the input indent
9873                $output = $item->dump($indent);
9874            }
9875            else {
9876                Carp::my_carp("Can't cope with dumping a " . ref($item) . ".  Skipping.");
9877            }
9878        }
9879        return $output;
9880    }
9881}
9882
9883sub dump_inside_out( $object, $fields_ref, @args ) {
9884    # Dump inside-out hashes in an object's state by converting them to a
9885    # regular hash and then calling simple_dumper on that.
9886
9887    my $addr = pack 'J', refaddr $object;
9888
9889    my %hash;
9890    foreach my $key (keys %$fields_ref) {
9891        $hash{$key} = $fields_ref->{$key}{$addr};
9892    }
9893
9894    return simple_dumper(\%hash, @args);
9895}
9896
9897sub _operator_dot($self, $other="", $reversed=0) {
9898    # Overloaded '.' method that is common to all packages.  It uses the
9899    # package's stringify method.
9900
9901    foreach my $which (\$self, \$other) {
9902        next unless ref $$which;
9903        if ($$which->can('_operator_stringify')) {
9904            $$which = $$which->_operator_stringify;
9905        }
9906        else {
9907            my $ref = ref $$which;
9908            my $addr = pack 'J', refaddr $$which;
9909            $$which = "$ref ($addr)";
9910        }
9911    }
9912    return ($reversed)
9913            ? "$other$self"
9914            : "$self$other";
9915}
9916
9917sub _operator_dot_equal($self, $other="", $reversed=0) {
9918    # Overloaded '.=' method that is common to all packages.
9919
9920    if ($reversed) {
9921        return $other .= "$self";
9922    }
9923    else {
9924        return "$self" . "$other";
9925    }
9926}
9927
9928sub _operator_equal($self, $other, @) {
9929    # Generic overloaded '==' routine.  To be equal, they must be the exact
9930    # same object
9931
9932    return 0 unless defined $other;
9933    return 0 unless ref $other;
9934    no overloading;
9935    return $self == $other;
9936}
9937
9938sub _operator_not_equal($self, $other, @) {
9939    return ! _operator_equal($self, $other);
9940}
9941
9942sub substitute_PropertyAliases($file_object) {
9943    # Deal with early releases that don't have the crucial PropertyAliases.txt
9944    # file.
9945
9946    $file_object->insert_lines(get_old_property_aliases());
9947
9948    process_PropertyAliases($file_object);
9949}
9950
9951
9952sub process_PropertyAliases($file) {
9953    # This reads in the PropertyAliases.txt file, which contains almost all
9954    # the character properties in Unicode and their equivalent aliases:
9955    # scf       ; Simple_Case_Folding         ; sfc
9956    #
9957    # Field 0 is the preferred short name for the property.
9958    # Field 1 is the full name.
9959    # Any succeeding ones are other accepted names.
9960
9961    # Add any cjk properties that may have been defined.
9962    $file->insert_lines(@cjk_properties);
9963
9964    while ($file->next_line) {
9965
9966        my @data = split /\s*;\s*/;
9967
9968        my $full = $data[1];
9969
9970        # This line is defective in early Perls.  The property in Unihan.txt
9971        # is kRSUnicode.
9972        if ($full eq 'Unicode_Radical_Stroke' && @data < 3) {
9973            push @data, qw(cjkRSUnicode kRSUnicode);
9974        }
9975
9976        my $this = Property->new($data[0], Full_Name => $full);
9977
9978        $this->set_fate($SUPPRESSED, $why_suppressed{$full})
9979                                                    if $why_suppressed{$full};
9980
9981        # Start looking for more aliases after these two.
9982        for my $i (2 .. @data - 1) {
9983            $this->add_alias($data[$i]);
9984        }
9985
9986    }
9987
9988    my $scf = property_ref("Simple_Case_Folding");
9989    $scf->add_alias("scf");
9990    $scf->add_alias("sfc");
9991
9992    return;
9993}
9994
9995sub finish_property_setup($file) {
9996    # Finishes setting up after PropertyAliases.
9997
9998    # This entry was missing from this file in earlier Unicode versions
9999    if (-e 'Jamo.txt' && ! defined property_ref('JSN')) {
10000        Property->new('JSN', Full_Name => 'Jamo_Short_Name');
10001    }
10002
10003    # These are used so much, that we set globals for them.
10004    $gc = property_ref('General_Category');
10005    $block = property_ref('Block');
10006    $script = property_ref('Script');
10007    $age = property_ref('Age');
10008
10009    # Perl adds this alias.
10010    $gc->add_alias('Category');
10011
10012    # Unicode::Normalize expects this file with this name and directory.
10013    $ccc = property_ref('Canonical_Combining_Class');
10014    if (defined $ccc) {
10015        $ccc->set_file('CombiningClass');
10016        $ccc->set_directory(File::Spec->curdir());
10017    }
10018
10019    # These two properties aren't actually used in the core, but unfortunately
10020    # the names just above that are in the core interfere with these, so
10021    # choose different names.  These aren't a problem unless the map tables
10022    # for these files get written out.
10023    my $lowercase = property_ref('Lowercase');
10024    $lowercase->set_file('IsLower') if defined $lowercase;
10025    my $uppercase = property_ref('Uppercase');
10026    $uppercase->set_file('IsUpper') if defined $uppercase;
10027
10028    # Set up the hard-coded default mappings, but only on properties defined
10029    # for this release
10030    foreach my $property (keys %default_mapping) {
10031        my $property_object = property_ref($property);
10032        next if ! defined $property_object;
10033        my $default_map = $default_mapping{$property};
10034        $property_object->set_default_map($default_map);
10035
10036        # A map of <code point> implies the property is string.
10037        if ($property_object->type == $UNKNOWN
10038            && $default_map eq $CODE_POINT)
10039        {
10040            $property_object->set_type($STRING);
10041        }
10042    }
10043
10044    # For backwards compatibility with applications that may read the mapping
10045    # file directly (it was documented in 5.12 and 5.14 as being thusly
10046    # usable), keep it from being adjusted.  (range_size_1 is
10047    # used to force the traditional format.)
10048    if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) {
10049        $nfkc_cf->set_to_output_map($EXTERNAL_MAP);
10050        $nfkc_cf->set_range_size_1(1);
10051    }
10052    if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) {
10053        $bmg->set_to_output_map($EXTERNAL_MAP);
10054        $bmg->set_range_size_1(1);
10055    }
10056
10057    property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED);
10058
10059    # The rest of this sub is for properties that need the Multi_Default class
10060    # to create objects for defaults.  As of v15.0, this is no longer needed.
10061
10062    return if $v_version ge v15.0.0;
10063
10064    # Bidi class has a complicated default, but the derived file takes care of
10065    # the complications, leaving just 'L'.
10066    if (file_exists("${EXTRACTED}DBidiClass.txt")) {
10067        property_ref('Bidi_Class')->set_default_map('L');
10068    }
10069    else {
10070        my $default;
10071
10072        # The derived file was introduced in 3.1.1.  The values below are
10073        # taken from table 3-8, TUS 3.0
10074        my $default_R =
10075            'my $default = Range_List->new;
10076             $default->add_range(0x0590, 0x05FF);
10077             $default->add_range(0xFB1D, 0xFB4F);'
10078        ;
10079
10080        # The defaults apply only to unassigned characters
10081        $default_R .= '$gc->table("Unassigned") & $default;';
10082
10083        if ($v_version lt v3.0.0) {
10084            $default = Multi_Default->new(R => $default_R, 'L');
10085        }
10086        else {
10087
10088            # AL apparently not introduced until 3.0:  TUS 2.x references are
10089            # not on-line to check it out
10090            my $default_AL =
10091                'my $default = Range_List->new;
10092                 $default->add_range(0x0600, 0x07BF);
10093                 $default->add_range(0xFB50, 0xFDFF);
10094                 $default->add_range(0xFE70, 0xFEFF);'
10095            ;
10096
10097            # Non-character code points introduced in this release; aren't AL
10098            if ($v_version ge 3.1.0) {
10099                $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
10100            }
10101            $default_AL .= '$gc->table("Unassigned") & $default';
10102            $default = Multi_Default->new(AL => $default_AL,
10103                                          R => $default_R,
10104                                          'L');
10105        }
10106        property_ref('Bidi_Class')->set_default_map($default);
10107    }
10108
10109    # Joining type has a complicated default, but the derived file takes care
10110    # of the complications, leaving just 'U' (or Non_Joining), except the file
10111    # is bad in 3.1.0
10112    if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
10113        if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
10114            property_ref('Joining_Type')->set_default_map('Non_Joining');
10115        }
10116        else {
10117
10118            # Otherwise, there are not one, but two possibilities for the
10119            # missing defaults: T and U.
10120            # The missing defaults that evaluate to T are given by:
10121            # T = Mn + Cf - ZWNJ - ZWJ
10122            # where Mn and Cf are the general category values. In other words,
10123            # any non-spacing mark or any format control character, except
10124            # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
10125            # WIDTH JOINER (joining type C).
10126            my $default = Multi_Default->new(
10127               'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
10128               'Non_Joining');
10129            property_ref('Joining_Type')->set_default_map($default);
10130        }
10131    }
10132
10133    # Line break has a complicated default in early releases. It is 'Unknown'
10134    # for non-assigned code points; 'AL' for assigned.
10135    if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
10136        my $lb = property_ref('Line_Break');
10137        if (file_exists("${EXTRACTED}DLineBreak.txt")) {
10138            $lb->set_default_map('Unknown');
10139        }
10140        else {
10141            my $default = Multi_Default->new('AL' => '~ $gc->table("Cn")',
10142                                             'Unknown',
10143                                            );
10144            $lb->set_default_map($default);
10145        }
10146    }
10147
10148    return;
10149}
10150
10151sub get_old_property_aliases() {
10152    # Returns what would be in PropertyAliases.txt if it existed in very old
10153    # versions of Unicode.  It was derived from the one in 3.2, and pared
10154    # down based on the data that was actually in the older releases.
10155    # An attempt was made to use the existence of files to mean inclusion or
10156    # not of various aliases, but if this was not sufficient, using version
10157    # numbers was resorted to.
10158
10159    my @return;
10160
10161    # These are to be used in all versions (though some are constructed by
10162    # this program if missing)
10163    push @return, split /\n/, <<'END';
10164bc        ; Bidi_Class
10165Bidi_M    ; Bidi_Mirrored
10166cf        ; Case_Folding
10167ccc       ; Canonical_Combining_Class
10168dm        ; Decomposition_Mapping
10169dt        ; Decomposition_Type
10170gc        ; General_Category
10171isc       ; ISO_Comment
10172lc        ; Lowercase_Mapping
10173na        ; Name
10174na1       ; Unicode_1_Name
10175nt        ; Numeric_Type
10176nv        ; Numeric_Value
10177scf       ; Simple_Case_Folding
10178slc       ; Simple_Lowercase_Mapping
10179stc       ; Simple_Titlecase_Mapping
10180suc       ; Simple_Uppercase_Mapping
10181tc        ; Titlecase_Mapping
10182uc        ; Uppercase_Mapping
10183END
10184
10185    if (-e 'Blocks.txt') {
10186        push @return, "blk       ; Block\n";
10187    }
10188    if (-e 'ArabicShaping.txt') {
10189        push @return, split /\n/, <<'END';
10190jg        ; Joining_Group
10191jt        ; Joining_Type
10192END
10193    }
10194    if (-e 'PropList.txt') {
10195
10196        # This first set is in the original old-style proplist.
10197        push @return, split /\n/, <<'END';
10198Bidi_C    ; Bidi_Control
10199Dash      ; Dash
10200Dia       ; Diacritic
10201Ext       ; Extender
10202Hex       ; Hex_Digit
10203Hyphen    ; Hyphen
10204IDC       ; ID_Continue
10205Ideo      ; Ideographic
10206Join_C    ; Join_Control
10207Math      ; Math
10208QMark     ; Quotation_Mark
10209Term      ; Terminal_Punctuation
10210WSpace    ; White_Space
10211END
10212        # The next sets were added later
10213        if ($v_version ge v3.0.0) {
10214            push @return, split /\n/, <<'END';
10215Upper     ; Uppercase
10216Lower     ; Lowercase
10217END
10218        }
10219        if ($v_version ge v3.0.1) {
10220            push @return, split /\n/, <<'END';
10221NChar     ; Noncharacter_Code_Point
10222END
10223        }
10224        # The next sets were added in the new-style
10225        if ($v_version ge v3.1.0) {
10226            push @return, split /\n/, <<'END';
10227OAlpha    ; Other_Alphabetic
10228OLower    ; Other_Lowercase
10229OMath     ; Other_Math
10230OUpper    ; Other_Uppercase
10231END
10232        }
10233        if ($v_version ge v3.1.1) {
10234            push @return, "AHex      ; ASCII_Hex_Digit\n";
10235        }
10236    }
10237    if (-e 'EastAsianWidth.txt') {
10238        push @return, "ea        ; East_Asian_Width\n";
10239    }
10240    if (-e 'CompositionExclusions.txt') {
10241        push @return, "CE        ; Composition_Exclusion\n";
10242    }
10243    if (-e 'LineBreak.txt') {
10244        push @return, "lb        ; Line_Break\n";
10245    }
10246    if (-e 'BidiMirroring.txt') {
10247        push @return, "bmg       ; Bidi_Mirroring_Glyph\n";
10248    }
10249    if (-e 'Scripts.txt') {
10250        push @return, "sc        ; Script\n";
10251    }
10252    if (-e 'DNormalizationProps.txt') {
10253        push @return, split /\n/, <<'END';
10254Comp_Ex   ; Full_Composition_Exclusion
10255FC_NFKC   ; FC_NFKC_Closure
10256NFC_QC    ; NFC_Quick_Check
10257NFD_QC    ; NFD_Quick_Check
10258NFKC_QC   ; NFKC_Quick_Check
10259NFKD_QC   ; NFKD_Quick_Check
10260XO_NFC    ; Expands_On_NFC
10261XO_NFD    ; Expands_On_NFD
10262XO_NFKC   ; Expands_On_NFKC
10263XO_NFKD   ; Expands_On_NFKD
10264END
10265    }
10266    if (-e 'DCoreProperties.txt') {
10267        push @return, split /\n/, <<'END';
10268Alpha     ; Alphabetic
10269IDS       ; ID_Start
10270XIDC      ; XID_Continue
10271XIDS      ; XID_Start
10272END
10273        # These can also appear in some versions of PropList.txt
10274        push @return, "Lower     ; Lowercase\n"
10275                                    unless grep { $_ =~ /^Lower\b/} @return;
10276        push @return, "Upper     ; Uppercase\n"
10277                                    unless grep { $_ =~ /^Upper\b/} @return;
10278    }
10279
10280    # This flag requires the DAge.txt file to be copied into the directory.
10281    if (DEBUG && $compare_versions) {
10282        push @return, 'age       ; Age';
10283    }
10284
10285    return @return;
10286}
10287
10288sub substitute_PropValueAliases($file_object) {
10289    # Deal with early releases that don't have the crucial
10290    # PropValueAliases.txt file.
10291
10292    $file_object->insert_lines(get_old_property_value_aliases());
10293
10294    process_PropValueAliases($file_object);
10295}
10296
10297sub process_PropValueAliases($file) {
10298    # This file contains values that properties look like:
10299    # bc ; AL        ; Arabic_Letter
10300    # blk; n/a       ; Greek_And_Coptic                 ; Greek
10301    #
10302    # Field 0 is the property.
10303    # Field 1 is the short name of a property value or 'n/a' if no
10304    #                short name exists;
10305    # Field 2 is the full property value name;
10306    # Any other fields are more synonyms for the property value.
10307    # Purely numeric property values are omitted from the file; as are some
10308    # others, fewer and fewer in later releases
10309
10310    # Entries for the ccc property have an extra field before the
10311    # abbreviation:
10312    # ccc;   0; NR   ; Not_Reordered
10313    # It is the numeric value that the names are synonyms for.
10314
10315    # There are comment entries for values missing from this file:
10316    # # @missing: 0000..10FFFF; ISO_Comment; <none>
10317    # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
10318
10319    if ($v_version lt 4.0.0) {
10320        $file->insert_lines(split /\n/, <<'END'
10321Hangul_Syllable_Type; L                                ; Leading_Jamo
10322Hangul_Syllable_Type; LV                               ; LV_Syllable
10323Hangul_Syllable_Type; LVT                              ; LVT_Syllable
10324Hangul_Syllable_Type; NA                               ; Not_Applicable
10325Hangul_Syllable_Type; T                                ; Trailing_Jamo
10326Hangul_Syllable_Type; V                                ; Vowel_Jamo
10327END
10328        );
10329    }
10330    if ($v_version lt 4.1.0) {
10331        $file->insert_lines(split /\n/, <<'END'
10332_Perl_GCB; CN                               ; Control
10333_Perl_GCB; CR                               ; CR
10334_Perl_GCB; EX                               ; Extend
10335_Perl_GCB; L                                ; L
10336_Perl_GCB; LF                               ; LF
10337_Perl_GCB; LV                               ; LV
10338_Perl_GCB; LVT                              ; LVT
10339_Perl_GCB; T                                ; T
10340_Perl_GCB; V                                ; V
10341_Perl_GCB; XX                               ; Other
10342END
10343        );
10344    }
10345
10346    # Add any explicit cjk values
10347    $file->insert_lines(@cjk_property_values);
10348
10349    # This line is used only for testing the code that checks for name
10350    # conflicts.  There is a script Inherited, and when this line is executed
10351    # it causes there to be a name conflict with the 'Inherited' that this
10352    # program generates for this block property value
10353    #$file->insert_lines('blk; n/a; Herited');
10354
10355    # Process each line of the file ...
10356    while ($file->next_line) {
10357
10358        # Fix typo in input file
10359        s/CCC133/CCC132/g if $v_version eq v6.1.0;
10360
10361        my ($property, @data) = split /\s*;\s*/;
10362
10363        # The ccc property has an extra field at the beginning, which is the
10364        # numeric value.  Move it to be after the other two, mnemonic, fields,
10365        # so that those will be used as the property value's names, and the
10366        # number will be an extra alias.  (Rightmost splice removes field 1-2,
10367        # returning them in a slice; left splice inserts that before anything,
10368        # thus shifting the former field 0 to after them.)
10369        splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
10370
10371        if ($v_version le v5.0.0 && $property eq 'blk' && $data[1] =~ /-/) {
10372            my $new_style = $data[1] =~ s/-/_/gr;
10373            splice @data, 1, 0, $new_style;
10374        }
10375
10376        # Field 0 is a short name unless "n/a"; field 1 is the full name.  If
10377        # there is no short name, use the full one in element 1
10378        if ($data[0] eq "n/a") {
10379            $data[0] = $data[1];
10380        }
10381        elsif ($data[0] ne $data[1]
10382               && standardize($data[0]) eq standardize($data[1])
10383               && $data[1] !~ /[[:upper:]]/)
10384        {
10385            # Also, there is a bug in the file in which "n/a" is omitted, and
10386            # the two fields are identical except for case, and the full name
10387            # is all lower case.  Copy the "short" name unto the full one to
10388            # give it some upper case.
10389
10390            $data[1] = $data[0];
10391        }
10392
10393        # Earlier releases had the pseudo property 'qc' that should expand to
10394        # the ones that replace it below.
10395        if ($property eq 'qc') {
10396            if (lc $data[0] eq 'y') {
10397                $file->insert_lines('NFC_QC; Y      ; Yes',
10398                                    'NFD_QC; Y      ; Yes',
10399                                    'NFKC_QC; Y     ; Yes',
10400                                    'NFKD_QC; Y     ; Yes',
10401                                    );
10402            }
10403            elsif (lc $data[0] eq 'n') {
10404                $file->insert_lines('NFC_QC; N      ; No',
10405                                    'NFD_QC; N      ; No',
10406                                    'NFKC_QC; N     ; No',
10407                                    'NFKD_QC; N     ; No',
10408                                    );
10409            }
10410            elsif (lc $data[0] eq 'm') {
10411                $file->insert_lines('NFC_QC; M      ; Maybe',
10412                                    'NFKC_QC; M     ; Maybe',
10413                                    );
10414            }
10415            else {
10416                $file->carp_bad_line("qc followed by unexpected '$data[0]");
10417            }
10418            next;
10419        }
10420
10421        # The first field is the short name, 2nd is the full one.
10422        my $property_object = property_ref($property);
10423        my $table = $property_object->add_match_table($data[0],
10424                                                Full_Name => $data[1]);
10425
10426        # Start looking for more aliases after these two.
10427        for my $i (2 .. @data - 1) {
10428            $table->add_alias($data[$i]);
10429        }
10430    } # End of looping through the file
10431
10432    # As noted in the comments early in the program, it generates tables for
10433    # the default values for all releases, even those for which the concept
10434    # didn't exist at the time.  Here we add those if missing.
10435    if (defined $age && ! defined $age->table('Unassigned')) {
10436        $age->add_match_table('Unassigned');
10437    }
10438    $block->add_match_table('No_Block') if -e 'Blocks.txt'
10439                                    && ! defined $block->table('No_Block');
10440
10441
10442    # Now set the default mappings of the properties from the file.  This is
10443    # done after the loop because a number of properties have only @missings
10444    # entries in the file, and may not show up until the end.
10445    my @defaults = $file->get_missings;
10446    foreach my $default_ref (@defaults) {
10447        my $default = $default_ref->{default};
10448        my $property = property_ref($default_ref->{property});
10449        $property->set_default_map($default);
10450    }
10451
10452    return;
10453}
10454
10455sub get_old_property_value_aliases () {
10456    # Returns what would be in PropValueAliases.txt if it existed in very old
10457    # versions of Unicode.  It was derived from the one in 3.2, and pared
10458    # down.  An attempt was made to use the existence of files to mean
10459    # inclusion or not of various aliases, but if this was not sufficient,
10460    # using version numbers was resorted to.
10461
10462    my @return = split /\n/, <<'END';
10463bc ; AN        ; Arabic_Number
10464bc ; B         ; Paragraph_Separator
10465bc ; CS        ; Common_Separator
10466bc ; EN        ; European_Number
10467bc ; ES        ; European_Separator
10468bc ; ET        ; European_Terminator
10469bc ; L         ; Left_To_Right
10470bc ; ON        ; Other_Neutral
10471bc ; R         ; Right_To_Left
10472bc ; WS        ; White_Space
10473
10474Bidi_M; N; No; F; False
10475Bidi_M; Y; Yes; T; True
10476
10477# The standard combining classes are very much different in v1, so only use
10478# ones that look right (not checked thoroughly)
10479ccc;   0; NR   ; Not_Reordered
10480ccc;   1; OV   ; Overlay
10481ccc;   7; NK   ; Nukta
10482ccc;   8; KV   ; Kana_Voicing
10483ccc;   9; VR   ; Virama
10484ccc; 202; ATBL ; Attached_Below_Left
10485ccc; 216; ATAR ; Attached_Above_Right
10486ccc; 218; BL   ; Below_Left
10487ccc; 220; B    ; Below
10488ccc; 222; BR   ; Below_Right
10489ccc; 224; L    ; Left
10490ccc; 228; AL   ; Above_Left
10491ccc; 230; A    ; Above
10492ccc; 232; AR   ; Above_Right
10493ccc; 234; DA   ; Double_Above
10494
10495dt ; can       ; canonical
10496dt ; enc       ; circle
10497dt ; fin       ; final
10498dt ; font      ; font
10499dt ; fra       ; fraction
10500dt ; init      ; initial
10501dt ; iso       ; isolated
10502dt ; med       ; medial
10503dt ; n/a       ; none
10504dt ; nb        ; noBreak
10505dt ; sqr       ; square
10506dt ; sub       ; sub
10507dt ; sup       ; super
10508
10509gc ; C         ; Other                            # Cc | Cf | Cn | Co | Cs
10510gc ; Cc        ; Control
10511gc ; Cn        ; Unassigned
10512gc ; Co        ; Private_Use
10513gc ; L         ; Letter                           # Ll | Lm | Lo | Lt | Lu
10514gc ; LC        ; Cased_Letter                     # Ll | Lt | Lu
10515gc ; Ll        ; Lowercase_Letter
10516gc ; Lm        ; Modifier_Letter
10517gc ; Lo        ; Other_Letter
10518gc ; Lu        ; Uppercase_Letter
10519gc ; M         ; Mark                             # Mc | Me | Mn
10520gc ; Mc        ; Spacing_Mark
10521gc ; Mn        ; Nonspacing_Mark
10522gc ; N         ; Number                           # Nd | Nl | No
10523gc ; Nd        ; Decimal_Number
10524gc ; No        ; Other_Number
10525gc ; P         ; Punctuation                      # Pc | Pd | Pe | Pf | Pi | Po | Ps
10526gc ; Pd        ; Dash_Punctuation
10527gc ; Pe        ; Close_Punctuation
10528gc ; Po        ; Other_Punctuation
10529gc ; Ps        ; Open_Punctuation
10530gc ; S         ; Symbol                           # Sc | Sk | Sm | So
10531gc ; Sc        ; Currency_Symbol
10532gc ; Sm        ; Math_Symbol
10533gc ; So        ; Other_Symbol
10534gc ; Z         ; Separator                        # Zl | Zp | Zs
10535gc ; Zl        ; Line_Separator
10536gc ; Zp        ; Paragraph_Separator
10537gc ; Zs        ; Space_Separator
10538
10539nt ; de        ; Decimal
10540nt ; di        ; Digit
10541nt ; n/a       ; None
10542nt ; nu        ; Numeric
10543END
10544
10545    if (-e 'ArabicShaping.txt') {
10546        push @return, split /\n/, <<'END';
10547jg ; n/a       ; AIN
10548jg ; n/a       ; ALEF
10549jg ; n/a       ; DAL
10550jg ; n/a       ; GAF
10551jg ; n/a       ; LAM
10552jg ; n/a       ; MEEM
10553jg ; n/a       ; NO_JOINING_GROUP
10554jg ; n/a       ; NOON
10555jg ; n/a       ; QAF
10556jg ; n/a       ; SAD
10557jg ; n/a       ; SEEN
10558jg ; n/a       ; TAH
10559jg ; n/a       ; WAW
10560
10561jt ; C         ; Join_Causing
10562jt ; D         ; Dual_Joining
10563jt ; L         ; Left_Joining
10564jt ; R         ; Right_Joining
10565jt ; U         ; Non_Joining
10566jt ; T         ; Transparent
10567END
10568        if ($v_version ge v3.0.0) {
10569            push @return, split /\n/, <<'END';
10570jg ; n/a       ; ALAPH
10571jg ; n/a       ; BEH
10572jg ; n/a       ; BETH
10573jg ; n/a       ; DALATH_RISH
10574jg ; n/a       ; E
10575jg ; n/a       ; FEH
10576jg ; n/a       ; FINAL_SEMKATH
10577jg ; n/a       ; GAMAL
10578jg ; n/a       ; HAH
10579jg ; n/a       ; HAMZA_ON_HEH_GOAL
10580jg ; n/a       ; HE
10581jg ; n/a       ; HEH
10582jg ; n/a       ; HEH_GOAL
10583jg ; n/a       ; HETH
10584jg ; n/a       ; KAF
10585jg ; n/a       ; KAPH
10586jg ; n/a       ; KNOTTED_HEH
10587jg ; n/a       ; LAMADH
10588jg ; n/a       ; MIM
10589jg ; n/a       ; NUN
10590jg ; n/a       ; PE
10591jg ; n/a       ; QAPH
10592jg ; n/a       ; REH
10593jg ; n/a       ; REVERSED_PE
10594jg ; n/a       ; SADHE
10595jg ; n/a       ; SEMKATH
10596jg ; n/a       ; SHIN
10597jg ; n/a       ; SWASH_KAF
10598jg ; n/a       ; TAW
10599jg ; n/a       ; TEH_MARBUTA
10600jg ; n/a       ; TETH
10601jg ; n/a       ; YEH
10602jg ; n/a       ; YEH_BARREE
10603jg ; n/a       ; YEH_WITH_TAIL
10604jg ; n/a       ; YUDH
10605jg ; n/a       ; YUDH_HE
10606jg ; n/a       ; ZAIN
10607END
10608        }
10609    }
10610
10611
10612    if (-e 'EastAsianWidth.txt') {
10613        push @return, split /\n/, <<'END';
10614ea ; A         ; Ambiguous
10615ea ; F         ; Fullwidth
10616ea ; H         ; Halfwidth
10617ea ; N         ; Neutral
10618ea ; Na        ; Narrow
10619ea ; W         ; Wide
10620END
10621    }
10622
10623    if (-e 'LineBreak.txt' || -e 'LBsubst.txt') {
10624        my @lb = split /\n/, <<'END';
10625lb ; AI        ; Ambiguous
10626lb ; AL        ; Alphabetic
10627lb ; B2        ; Break_Both
10628lb ; BA        ; Break_After
10629lb ; BB        ; Break_Before
10630lb ; BK        ; Mandatory_Break
10631lb ; CB        ; Contingent_Break
10632lb ; CL        ; Close_Punctuation
10633lb ; CM        ; Combining_Mark
10634lb ; CR        ; Carriage_Return
10635lb ; EX        ; Exclamation
10636lb ; GL        ; Glue
10637lb ; HY        ; Hyphen
10638lb ; ID        ; Ideographic
10639lb ; IN        ; Inseperable
10640lb ; IS        ; Infix_Numeric
10641lb ; LF        ; Line_Feed
10642lb ; NS        ; Nonstarter
10643lb ; NU        ; Numeric
10644lb ; OP        ; Open_Punctuation
10645lb ; PO        ; Postfix_Numeric
10646lb ; PR        ; Prefix_Numeric
10647lb ; QU        ; Quotation
10648lb ; SA        ; Complex_Context
10649lb ; SG        ; Surrogate
10650lb ; SP        ; Space
10651lb ; SY        ; Break_Symbols
10652lb ; XX        ; Unknown
10653lb ; ZW        ; ZWSpace
10654END
10655        # If this Unicode version predates the lb property, we use our
10656        # substitute one
10657        if (-e 'LBsubst.txt') {
10658            $_ = s/^lb/_Perl_LB/r for @lb;
10659        }
10660        push @return, @lb;
10661    }
10662
10663    if (-e 'DNormalizationProps.txt') {
10664        push @return, split /\n/, <<'END';
10665qc ; M         ; Maybe
10666qc ; N         ; No
10667qc ; Y         ; Yes
10668END
10669    }
10670
10671    if (-e 'Scripts.txt') {
10672        push @return, split /\n/, <<'END';
10673sc ; Arab      ; Arabic
10674sc ; Armn      ; Armenian
10675sc ; Beng      ; Bengali
10676sc ; Bopo      ; Bopomofo
10677sc ; Cans      ; Canadian_Aboriginal
10678sc ; Cher      ; Cherokee
10679sc ; Cyrl      ; Cyrillic
10680sc ; Deva      ; Devanagari
10681sc ; Dsrt      ; Deseret
10682sc ; Ethi      ; Ethiopic
10683sc ; Geor      ; Georgian
10684sc ; Goth      ; Gothic
10685sc ; Grek      ; Greek
10686sc ; Gujr      ; Gujarati
10687sc ; Guru      ; Gurmukhi
10688sc ; Hang      ; Hangul
10689sc ; Hani      ; Han
10690sc ; Hebr      ; Hebrew
10691sc ; Hira      ; Hiragana
10692sc ; Ital      ; Old_Italic
10693sc ; Kana      ; Katakana
10694sc ; Khmr      ; Khmer
10695sc ; Knda      ; Kannada
10696sc ; Laoo      ; Lao
10697sc ; Latn      ; Latin
10698sc ; Mlym      ; Malayalam
10699sc ; Mong      ; Mongolian
10700sc ; Mymr      ; Myanmar
10701sc ; Ogam      ; Ogham
10702sc ; Orya      ; Oriya
10703sc ; Qaai      ; Inherited
10704sc ; Runr      ; Runic
10705sc ; Sinh      ; Sinhala
10706sc ; Syrc      ; Syriac
10707sc ; Taml      ; Tamil
10708sc ; Telu      ; Telugu
10709sc ; Thaa      ; Thaana
10710sc ; Thai      ; Thai
10711sc ; Tibt      ; Tibetan
10712sc ; Yiii      ; Yi
10713sc ; Zyyy      ; Common
10714END
10715    }
10716
10717    if ($v_version ge v2.0.0) {
10718        push @return, split /\n/, <<'END';
10719dt ; com       ; compat
10720dt ; nar       ; narrow
10721dt ; sml       ; small
10722dt ; vert      ; vertical
10723dt ; wide      ; wide
10724
10725gc ; Cf        ; Format
10726gc ; Cs        ; Surrogate
10727gc ; Lt        ; Titlecase_Letter
10728gc ; Me        ; Enclosing_Mark
10729gc ; Nl        ; Letter_Number
10730gc ; Pc        ; Connector_Punctuation
10731gc ; Sk        ; Modifier_Symbol
10732END
10733    }
10734    if ($v_version ge v2.1.2) {
10735        push @return, "bc ; S         ; Segment_Separator\n";
10736    }
10737    if ($v_version ge v2.1.5) {
10738        push @return, split /\n/, <<'END';
10739gc ; Pf        ; Final_Punctuation
10740gc ; Pi        ; Initial_Punctuation
10741END
10742    }
10743    if ($v_version ge v2.1.8) {
10744        push @return, "ccc; 240; IS   ; Iota_Subscript\n";
10745    }
10746
10747    if ($v_version ge v3.0.0) {
10748        push @return, split /\n/, <<'END';
10749bc ; AL        ; Arabic_Letter
10750bc ; BN        ; Boundary_Neutral
10751bc ; LRE       ; Left_To_Right_Embedding
10752bc ; LRO       ; Left_To_Right_Override
10753bc ; NSM       ; Nonspacing_Mark
10754bc ; PDF       ; Pop_Directional_Format
10755bc ; RLE       ; Right_To_Left_Embedding
10756bc ; RLO       ; Right_To_Left_Override
10757
10758ccc; 233; DB   ; Double_Below
10759END
10760    }
10761
10762    if ($v_version ge v3.1.0) {
10763        push @return, "ccc; 226; R    ; Right\n";
10764    }
10765
10766    return @return;
10767}
10768
10769sub process_NormalizationsTest($file) {
10770
10771    # Each line looks like:
10772    #      source code point; NFC; NFD; NFKC; NFKD
10773    # e.g.
10774    #       1E0A;1E0A;0044 0307;1E0A;0044 0307;
10775
10776    # Process each line of the file ...
10777    while ($file->next_line) {
10778
10779        next if /^@/;
10780
10781        my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/;
10782
10783        foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) {
10784            $$var = pack "U0U*", map { hex } split " ", $$var;
10785            $$var =~ s/(\\)/$1$1/g;
10786        }
10787
10788        push @normalization_tests,
10789                "Test_N(q$c1, q$c2, q$c3, q$c4, q$c5);\n";
10790    } # End of looping through the file
10791}
10792
10793sub output_perl_charnames_line ($code_point, $name) {
10794
10795    # Output the entries in Perl_charnames specially, using 5 digits instead
10796    # of four.  This makes the entries a constant length, and simplifies
10797    # charnames.pm which this table is for.  Unicode can have 6 digit
10798    # ordinals, but they are all private use or noncharacters which do not
10799    # have names, so won't be in this table.
10800
10801    return sprintf "%05X\n%s\n\n", $code_point, $name;
10802}
10803
10804{ # Closure
10805
10806    # These are constants to the $property_info hash in this subroutine, to
10807    # avoid using a quoted-string which might have a typo.
10808    my $TYPE  = 'type';
10809    my $DEFAULT_MAP = 'default_map';
10810    my $DEFAULT_TABLE = 'default_table';
10811    my $PSEUDO_MAP_TYPE = 'pseudo_map_type';
10812    my $MISSINGS = 'missings';
10813
10814    sub process_generic_property_file($file) {
10815        # This processes a file containing property mappings and puts them
10816        # into internal map tables.  It should be used to handle any property
10817        # files that have mappings from a code point or range thereof to
10818        # something else.  This means almost all the UCD .txt files.
10819        # each_line_handlers() should be set to adjust the lines of these
10820        # files, if necessary, to what this routine understands:
10821        #
10822        # 0374          ; NFD_QC; N
10823        # 003C..003E    ; Math
10824        #
10825        # the fields are: "codepoint-range ; property; map"
10826        #
10827        # meaning the codepoints in the range all have the value 'map' under
10828        # 'property'.
10829        # Beginning and trailing white space in each field are not significant.
10830        # Note there is not a trailing semi-colon in the above.  A trailing
10831        # semi-colon means the map is a null-string.  An omitted map, as
10832        # opposed to a null-string, is assumed to be 'Y', based on Unicode
10833        # table syntax.  (This could have been hidden from this routine by
10834        # doing it in the $file object, but that would require parsing of the
10835        # line there, so would have to parse it twice, or change the interface
10836        # to pass this an array.  So not done.)
10837        #
10838        # The map field may begin with a sequence of commands that apply to
10839        # this range.  Each such command begins and ends with $CMD_DELIM.
10840        # These are used to indicate, for example, that the mapping for a
10841        # range has a non-default type.
10842        #
10843        # This loops through the file, calling its next_line() method, and
10844        # then taking the map and adding it to the property's table.
10845        # Complications arise because any number of properties can be in the
10846        # file, in any order, interspersed in any way.  The first time a
10847        # property is seen, it gets information about that property and
10848        # caches it for quick retrieval later.  It also normalizes the maps
10849        # so that only one of many synonyms is stored.  The Unicode input
10850        # files do use some multiple synonyms.
10851
10852        my %property_info;               # To keep track of what properties
10853                                         # have already had entries in the
10854                                         # current file, and info about each,
10855                                         # so don't have to recompute.
10856        my $property_name;               # property currently being worked on
10857        my $property_type;               # and its type
10858        my $previous_property_name = ""; # name from last time through loop
10859        my $property_object;             # pointer to the current property's
10860                                         # object
10861        my $property_addr;               # the address of that object
10862        my $default_map;                 # the string that code points missing
10863                                         # from the file map to
10864        my $default_table;               # For non-string properties, a
10865                                         # reference to the match table that
10866                                         # will contain the list of code
10867                                         # points that map to $default_map.
10868
10869        # Get the next real non-comment line
10870        LINE:
10871        while ($file->next_line) {
10872
10873            # Default replacement type; means that if parts of the range have
10874            # already been stored in our tables, the new map overrides them if
10875            # they differ more than cosmetically
10876            my $replace = $IF_NOT_EQUIVALENT;
10877            my $map_type;            # Default type for the map of this range
10878
10879            #local $to_trace = 1 if main::DEBUG;
10880            trace $_ if main::DEBUG && $to_trace;
10881
10882            # Split the line into components
10883            my ($range, $property_name, $map, @remainder)
10884                = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10885
10886            # If more or less on the line than we are expecting, warn and skip
10887            # the line
10888            if (@remainder) {
10889                $file->carp_bad_line('Extra fields');
10890                next LINE;
10891            }
10892            elsif ( ! defined $property_name) {
10893                $file->carp_bad_line('Missing property');
10894                next LINE;
10895            }
10896
10897            # Examine the range.
10898            if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
10899            {
10900                $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
10901                next LINE;
10902            }
10903            my $low = hex $1;
10904            my $high = (defined $2) ? hex $2 : $low;
10905
10906            # If changing to a new property, get the things constant per
10907            # property
10908            if ($previous_property_name ne $property_name) {
10909
10910                $property_object = property_ref($property_name);
10911                if (! defined $property_object) {
10912                    $file->carp_bad_line("Unexpected property '$property_name'.  Skipped");
10913                    next LINE;
10914                }
10915                $property_addr = pack 'J', refaddr $property_object;
10916
10917                # Defer changing names until have a line that is acceptable
10918                # (the 'next' statement above means is unacceptable)
10919                $previous_property_name = $property_name;
10920
10921                # If not the first time for this property, retrieve info about
10922                # it from the cache
10923                my $this_property_info = $property_info{$property_addr};
10924                if (defined ($this_property_info->{$TYPE})) {
10925                    $property_type = $this_property_info->{$TYPE};
10926                    $default_map = $this_property_info->{$DEFAULT_MAP};
10927                    $map_type = $this_property_info->{$PSEUDO_MAP_TYPE};
10928                    $default_table = $this_property_info->{$DEFAULT_TABLE};
10929                }
10930                else {
10931
10932                    # Here, is the first time for this property.  Set up the
10933                    # cache.
10934                    $property_type = $this_property_info->{$TYPE}
10935                                   = $property_object->type;
10936                    $map_type
10937                        = $this_property_info->{$PSEUDO_MAP_TYPE}
10938                        = $property_object->pseudo_map_type;
10939
10940                    # The Unicode files are set up so that if the map is not
10941                    # defined, it is a binary property
10942                    if (! defined $map && $property_type != $BINARY) {
10943                        if ($property_type != $UNKNOWN
10944                            && $property_type != $NON_STRING)
10945                        {
10946                            $file->carp_bad_line("No mapping defined on a non-binary property.  Using 'Y' for the map");
10947                        }
10948                        else {
10949                            $property_object->set_type($BINARY);
10950                            $property_type = $this_property_info->{$TYPE}
10951                                           = $BINARY;
10952                        }
10953                    }
10954
10955                    # Get any @missings default for this property.  This
10956                    # should precede the first entry for the property in the
10957                    # input file, and is located in a comment that has been
10958                    # stored by the Input_file class until we access it here.
10959                    # It's possible that there is more than one such line
10960                    # waiting for us; collect them all, and parse
10961                    my @missings_list;
10962                    @missings_list = $file->get_missings
10963                                            if $file->has_missings_defaults;
10964
10965                    foreach my $default_ref (@missings_list) {
10966
10967                        # For now, we are only interested in the fallback
10968                        # default for the entire property. i.e., an @missing
10969                        # line that is for the whole Unicode range.
10970                        next if $default_ref->{start} != 0
10971                             || $default_ref->{end} != $MAX_UNICODE_CODEPOINT;
10972
10973                        $default_map = $default_ref->{default};
10974
10975                        # For string properties, the default is just what the
10976                        # file says, but non-string properties should already
10977                        # have set up a table for the default property value;
10978                        # use the table for these, so can resolve synonyms
10979                        # later to a single standard one.
10980                        if ($property_type == $STRING
10981                            || $property_type == $UNKNOWN)
10982                        {
10983                            $this_property_info->{$MISSINGS} = $default_map;
10984                        }
10985                        else {
10986                            $default_map =
10987                               $property_object->table($default_map)->full_name;
10988                            $this_property_info->{$MISSINGS} = $default_map;
10989                            $this_property_info->{$DEFAULT_MAP} = $default_map;
10990                            if (! defined $property_object->default_map) {
10991                                $property_object->set_default_map($default_map);
10992                            }
10993                        }
10994                    }
10995
10996                    # For later Unicode versions, multiple @missing lines for
10997                    # a single property can appear in the files.  The first
10998                    # always applies to the entire Unicode range, and was
10999                    # handled above.  The subsequent ones are for smaller
11000                    # ranges, and can be read as "But for this range, the
11001                    # default is ...".  So each overrides all the preceding
11002                    # ones for the range it applies to.  Typically they apply
11003                    # to disjoint ranges, but don't have to.  What we do is to
11004                    # set them up to work in reverse order, so that after the
11005                    # rest of the table is filled, the highest priority
11006                    # default range fills in any code points that haven't been
11007                    # specified; then the next highest priority one is
11008                    # applied, and so forth.
11009                    if (@missings_list > 1 && $v_version ge v15.0.0) {
11010                        if ($property_type != $ENUM) {
11011                            Carp::my_carp_bug("Multiple \@missings lines only"
11012                                            . " make sense for ENUM-type"
11013                                            . " properties.  Changing type to"
11014                                            . " that");
11015                            $property_type = $this_property_info->{$TYPE}
11016                                                                        = $ENUM;
11017                            $property_object->set_type($ENUM);
11018                        }
11019
11020                        my $multi = Multi_Default->new();
11021
11022                        # The overall default should be first on this list,
11023                        # and is handled differently than the rest.
11024                        $default_map = shift @missings_list;
11025                        Carp::my_carp_bug("\@missings needs to be entire range")
11026                            if $default_map->{start} != 0
11027                            || $default_map->{end} != $MAX_UNICODE_CODEPOINT;
11028
11029                        # We already have looked at this line above.  Use that
11030                        # result
11031                        $multi->set_final_default($this_property_info->
11032                                                                  {$MISSINGS});
11033
11034                        # Now get the individual range elements, and add them
11035                        # to Multi_Default object
11036                        while (@missings_list) {
11037                            my $this_entry = pop @missings_list;
11038                            my $subrange_default = $this_entry->{default};
11039
11040                            # Use the short name as a standard
11041                            $subrange_default = $property_object->
11042                                        table($subrange_default)->short_name;
11043                            $multi->append_default($subrange_default,
11044                                "Range_List->new(Initialize => Range->new("
11045                              . "$this_entry->{start}, $this_entry->{end}))");
11046                        }
11047
11048                        # Override the property's simple default with this.
11049                        $property_object->set_default_map($multi);
11050                    }
11051
11052                    if (! $default_map || $property_type != $ENUM) {
11053
11054                        # Finished storing all the @missings defaults in the
11055                        # input file so far.  Get the one for the current
11056                        # property.
11057                        my $missings = $this_property_info->{$MISSINGS};
11058
11059                        # But we likely have separately stored what the
11060                        # default should be.  (This is to accommodate versions
11061                        # of the standard where the @missings lines are absent
11062                        # or incomplete.)  Hopefully the two will match.  But
11063                        # check it out.
11064                        $default_map = $property_object->default_map;
11065
11066                        # If the map is a ref, it means that the default won't
11067                        # be processed until later, so undef it, so next few
11068                        # lines will redefine it to something that nothing
11069                        # will match
11070                        undef $default_map if ref $default_map;
11071
11072                        # Create a $default_map if don't have one; maybe a
11073                        # dummy that won't match anything.
11074                        if (! defined $default_map) {
11075
11076                            # Use any @missings line in the file.
11077                            if (defined $missings) {
11078                                if (ref $missings) {
11079                                    $default_map = $missings->full_name;
11080                                    $default_table = $missings;
11081                                }
11082                                else {
11083                                    $default_map = $missings;
11084                                }
11085
11086                                # And store it with the property for outside
11087                                # use.
11088                                $property_object->set_default_map($default_map);
11089                            }
11090                            else {
11091
11092                                # Neither an @missings nor a default map.
11093                                # Create a dummy one, so won't have to test
11094                                # definedness in the main loop.
11095                                $default_map = '_Perl This will never be in a'
11096                                             . ' file from Unicode';
11097                            }
11098                        }
11099
11100                        # Here, we have $default_map defined, possibly in
11101                        # terms of $missings, but maybe not, and possibly is a
11102                        # dummy one.
11103                        if (defined $missings) {
11104
11105                            # Make sure there is no conflict between the two.
11106                            # $missings has priority.
11107                            if (ref $missings) {
11108                                $default_table
11109                                        = $property_object->table($default_map);
11110                                if ( ! defined $default_table
11111                                    || $default_table != $missings)
11112                                {
11113                                    if (! defined $default_table) {
11114                                        $default_table = $UNDEF;
11115                                    }
11116                                    $file->carp_bad_line(<<END
11117The \@missings line for $property_name in $file says that missings default to
11118$missings, but we expect it to be $default_table.  $missings used.
11119END
11120                                    );
11121                                    $default_table = $missings;
11122                                    $default_map = $missings->full_name;
11123                                }
11124                                $this_property_info->{$DEFAULT_TABLE}
11125                                                            = $default_table;
11126                            }
11127                            elsif ($default_map ne $missings) {
11128                                $file->carp_bad_line(<<END
11129The \@missings line for $property_name in $file says that missings default to
11130$missings, but we expect it to be $default_map.  $missings used.
11131END
11132                                );
11133                                $default_map = $missings;
11134                            }
11135                        }
11136
11137                        $this_property_info->{$DEFAULT_MAP} = $default_map;
11138
11139                        # If haven't done so already, find the table
11140                        # corresponding to this map for non-string properties.
11141                        if (! defined $default_table
11142                            && $property_type != $STRING
11143                            && $property_type != $UNKNOWN)
11144                        {
11145                            $default_table
11146                                        = $this_property_info->{$DEFAULT_TABLE}
11147                                        = $property_object->table($default_map);
11148                        }
11149                    }
11150                } # End of is first time for this property
11151            } # End of switching properties.
11152
11153            # Ready to process the line.
11154            # The Unicode files are set up so that if the map is not defined,
11155            # it is a binary property with value 'Y'
11156            if (! defined $map) {
11157                $map = 'Y';
11158            }
11159            else {
11160
11161                # If the map begins with a special command to us (enclosed in
11162                # delimiters), extract the command(s).
11163                while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
11164                    my $command = $1;
11165                    if ($command =~  / ^ $REPLACE_CMD= (.*) /x) {
11166                        $replace = $1;
11167                    }
11168                    elsif ($command =~  / ^ $MAP_TYPE_CMD= (.*) /x) {
11169                        $map_type = $1;
11170                    }
11171                    else {
11172                        $file->carp_bad_line("Unknown command line: '$1'");
11173                        next LINE;
11174                    }
11175                }
11176            }
11177
11178            if (   $default_map eq $CODE_POINT
11179                && $map =~ / ^ $code_point_re $/x)
11180            {
11181
11182                # Here, we have a map to a particular code point, and the
11183                # default map is to a code point itself.  If the range
11184                # includes the particular code point, change that portion of
11185                # the range to the default.  This makes sure that in the final
11186                # table only the non-defaults are listed.
11187                my $decimal_map = hex $map;
11188                if ($low <= $decimal_map && $decimal_map <= $high) {
11189
11190                    # If the range includes stuff before or after the map
11191                    # we're changing, split it and process the split-off parts
11192                    # later.
11193                    if ($low < $decimal_map) {
11194                        $file->insert_adjusted_lines(
11195                                            sprintf("%04X..%04X; %s; %s",
11196                                                    $low,
11197                                                    $decimal_map - 1,
11198                                                    $property_name,
11199                                                    $map));
11200                    }
11201                    if ($high > $decimal_map) {
11202                        $file->insert_adjusted_lines(
11203                                            sprintf("%04X..%04X; %s; %s",
11204                                                    $decimal_map + 1,
11205                                                    $high,
11206                                                    $property_name,
11207                                                    $map));
11208                    }
11209                    $low = $high = $decimal_map;
11210                    $map = $CODE_POINT;
11211                }
11212            }
11213
11214            if ($property_type != $STRING && $property_type != $UNKNOWN) {
11215                my $table = $property_object->table($map);
11216                if (defined $table) {
11217
11218                    # Unicode isn't very consistent about which synonym they
11219                    # use in their .txt files, even within the same file, or
11220                    # two files that are for the same property.  For enum
11221                    # properties, we know already what all the synonyms are
11222                    # (because we processed PropValueAliases already).
11223                    # Therefore we can take the input and map it to a uniform
11224                    # value now, saving us trouble later.
11225                    #
11226                    # Only if the map is well-behaved do we try this:
11227                    # non-empty, all non-blank.
11228                    if ($property_type == $ENUM && $map =~ / ^ \S+ $ /x) {
11229
11230                        # Use existing practice as much as easily practicable,
11231                        # so that code that has assumptions about spelling
11232                        # doesn't have to change
11233                        my $short_name = $property_object->short_name;
11234                        if ($short_name =~ / ^ (BC | EA | GC  |HST | JT |
11235                                                Lb | BT | BPT | NFCQC |
11236                                                NFKCQC) $ /ix)
11237                        {
11238                            $map = $table->short_name;
11239                        }
11240                        elsif ($short_name !~ / ^ ( Ccc | Age | InSC | JG |
11241                                                    SB) $ /ix)
11242                        {
11243                            $map = $table->full_name;
11244                        }
11245                    }
11246                    elsif ($table == $default_table) {
11247
11248                        # When it isn't an ENUM, we we can still tell if
11249                        # this is a synonym for the default map.  If so, use
11250                        # the default one instead.
11251                        $map = $default_map;
11252                    }
11253                }
11254            }
11255
11256            # And figure out the map type if not known.
11257            if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
11258                if ($map eq "") {   # Nulls are always $NULL map type
11259                    $map_type = $NULL;
11260                } # Otherwise, non-strings, and those that don't allow
11261                  # $MULTI_CP, and those that aren't multiple code points are
11262                  # 0
11263                elsif
11264                   (($property_type != $STRING && $property_type != $UNKNOWN)
11265                   || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
11266                   || $map !~ /^ $code_point_re ( \  $code_point_re )+ $ /x)
11267                {
11268                    $map_type = 0;
11269                }
11270                else {
11271                    $map_type = $MULTI_CP;
11272                }
11273            }
11274
11275            $property_object->add_map($low, $high,
11276                                        $map,
11277                                        Type => $map_type,
11278                                        Replace => $replace);
11279        } # End of loop through file's lines
11280
11281        return;
11282    }
11283}
11284
11285{ # Closure for UnicodeData.txt handling
11286
11287    # This file was the first one in the UCD; its design leads to some
11288    # awkwardness in processing.  Here is a sample line:
11289    # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
11290    # The fields in order are:
11291    my $i = 0;            # The code point is in field 0, and is shifted off.
11292    my $CHARNAME = $i++;  # character name (e.g. "LATIN CAPITAL LETTER A")
11293    my $CATEGORY = $i++;  # category (e.g. "Lu")
11294    my $CCC = $i++;       # Canonical combining class (e.g. "230")
11295    my $BIDI = $i++;      # directional class (e.g. "L")
11296    my $PERL_DECOMPOSITION = $i++;  # decomposition mapping
11297    my $PERL_DECIMAL_DIGIT = $i++;   # decimal digit value
11298    my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
11299                                         # Dual-use in this program; see below
11300    my $NUMERIC = $i++;   # numeric value
11301    my $MIRRORED = $i++;  # ? mirrored
11302    my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
11303    my $COMMENT = $i++;   # iso comment
11304    my $UPPER = $i++;     # simple uppercase mapping
11305    my $LOWER = $i++;     # simple lowercase mapping
11306    my $TITLE = $i++;     # simple titlecase mapping
11307    my $input_field_count = $i;
11308
11309    # This routine in addition outputs these extra fields:
11310
11311    my $DECOMP_TYPE = $i++; # Decomposition type
11312
11313    # These fields are modifications of ones above, and are usually
11314    # suppressed; they must come last, as for speed, the loop upper bound is
11315    # normally set to ignore them
11316    my $NAME = $i++;        # This is the strict name field, not the one that
11317                            # charnames uses.
11318    my $DECOMP_MAP = $i++;  # Strict decomposition mapping; not the one used
11319                            # by Unicode::Normalize
11320    my $last_field = $i - 1;
11321
11322    # All these are read into an array for each line, with the indices defined
11323    # above.  The empty fields in the example line above indicate that the
11324    # value is defaulted.  The handler called for each line of the input
11325    # changes these to their defaults.
11326
11327    # Here are the official names of the properties, in a parallel array:
11328    my @field_names;
11329    $field_names[$BIDI] = 'Bidi_Class';
11330    $field_names[$CATEGORY] = 'General_Category';
11331    $field_names[$CCC] = 'Canonical_Combining_Class';
11332    $field_names[$CHARNAME] = 'Perl_Charnames';
11333    $field_names[$COMMENT] = 'ISO_Comment';
11334    $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
11335    $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
11336    $field_names[$LOWER] = 'Lowercase_Mapping';
11337    $field_names[$MIRRORED] = 'Bidi_Mirrored';
11338    $field_names[$NAME] = 'Name';
11339    $field_names[$NUMERIC] = 'Numeric_Value';
11340    $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
11341    $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
11342    $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
11343    $field_names[$TITLE] = 'Titlecase_Mapping';
11344    $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
11345    $field_names[$UPPER] = 'Uppercase_Mapping';
11346
11347    # Some of these need a little more explanation:
11348    # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
11349    #   property, but is used in calculating the Numeric_Type.  Perl however,
11350    #   creates a file from this field, so a Perl property is created from it.
11351    # Similarly, the Other_Digit field is used only for calculating the
11352    #   Numeric_Type, and so it can be safely re-used as the place to store
11353    #   the value for Numeric_Type; hence it is referred to as
11354    #   $NUMERIC_TYPE_OTHER_DIGIT.
11355    # The input field named $PERL_DECOMPOSITION is a combination of both the
11356    #   decomposition mapping and its type.  Perl creates a file containing
11357    #   exactly this field, so it is used for that.  The two properties are
11358    #   separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
11359    #   $DECOMP_MAP is usually suppressed (unless the lists are changed to
11360    #   output it), as Perl doesn't use it directly.
11361    # The input field named here $CHARNAME is used to construct the
11362    #   Perl_Charnames property, which is a combination of the Name property
11363    #   (which the input field contains), and the Unicode_1_Name property, and
11364    #   others from other files.  Since, the strict Name property is not used
11365    #   by Perl, this field is used for the table that Perl does use.  The
11366    #   strict Name property table is usually suppressed (unless the lists are
11367    #   changed to output it), so it is accumulated in a separate field,
11368    #   $NAME, which to save time is discarded unless the table is actually to
11369    #   be output
11370
11371    # This file is processed like most in this program.  Control is passed to
11372    # process_generic_property_file() which calls filter_UnicodeData_line()
11373    # for each input line.  This filter converts the input into line(s) that
11374    # process_generic_property_file() understands.  There is also a setup
11375    # routine called before any of the file is processed, and a handler for
11376    # EOF processing, all in this closure.
11377
11378    # A huge speed-up occurred at the cost of some added complexity when these
11379    # routines were altered to buffer the outputs into ranges.  Almost all the
11380    # lines of the input file apply to just one code point, and for most
11381    # properties, the map for the next code point up is the same as the
11382    # current one.  So instead of creating a line for each property for each
11383    # input line, filter_UnicodeData_line() remembers what the previous map
11384    # of a property was, and doesn't generate a line to pass on until it has
11385    # to, as when the map changes; and that passed-on line encompasses the
11386    # whole contiguous range of code points that have the same map for that
11387    # property.  This means a slight amount of extra setup, and having to
11388    # flush these buffers on EOF, testing if the maps have changed, plus
11389    # remembering state information in the closure.  But it means a lot less
11390    # real time in not having to change the data base for each property on
11391    # each line.
11392
11393    # Another complication is that there are already a few ranges designated
11394    # in the input.  There are two lines for each, with the same maps except
11395    # the code point and name on each line.  This was actually the hardest
11396    # thing to design around.  The code points in those ranges may actually
11397    # have real maps not given by these two lines.  These maps will either
11398    # be algorithmically determinable, or be in the extracted files furnished
11399    # with the UCD.  In the event of conflicts between these extracted files,
11400    # and this one, Unicode says that this one prevails.  But it shouldn't
11401    # prevail for conflicts that occur in these ranges.  The data from the
11402    # extracted files prevails in those cases.  So, this program is structured
11403    # so that those files are processed first, storing maps.  Then the other
11404    # files are processed, generally overwriting what the extracted files
11405    # stored.  But just the range lines in this input file are processed
11406    # without overwriting.  This is accomplished by adding a special string to
11407    # the lines output to tell process_generic_property_file() to turn off the
11408    # overwriting for just this one line.
11409    # A similar mechanism is used to tell it that the map is of a non-default
11410    # type.
11411
11412    sub setup_UnicodeData($file) { # Called before any lines of the input are read
11413
11414        # Create a new property specially located that is a combination of
11415        # various Name properties: Name, Unicode_1_Name, Named Sequences, and
11416        # _Perl_Name_Alias properties.  (The final one duplicates elements of the
11417        # first, and starting in v6.1, is the same as the 'Name_Alias
11418        # property.)  A comment for the new property will later be constructed
11419        # based on the actual properties present and used
11420        $perl_charname = Property->new('Perl_Charnames',
11421                       Default_Map => "",
11422                       Directory => File::Spec->curdir(),
11423                       File => 'Name',
11424                       Fate => $INTERNAL_ONLY,
11425                       Perl_Extension => 1,
11426                       Range_Size_1 => \&output_perl_charnames_line,
11427                       Type => $STRING,
11428                       );
11429        $perl_charname->set_proxy_for('Name');
11430
11431        my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
11432                                        Directory => File::Spec->curdir(),
11433                                        File => 'Decomposition',
11434                                        Format => $DECOMP_STRING_FORMAT,
11435                                        Fate => $INTERNAL_ONLY,
11436                                        Perl_Extension => 1,
11437                                        Default_Map => $CODE_POINT,
11438
11439                                        # normalize.pm can't cope with these
11440                                        Output_Range_Counts => 0,
11441
11442                                        # This is a specially formatted table
11443                                        # explicitly for normalize.pm, which
11444                                        # is expecting a particular format,
11445                                        # which means that mappings containing
11446                                        # multiple code points are in the main
11447                                        # body of the table
11448                                        Map_Type => $COMPUTE_NO_MULTI_CP,
11449                                        Type => $STRING,
11450                                        To_Output_Map => $INTERNAL_MAP,
11451                                        );
11452        $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
11453        $Perl_decomp->add_comment(join_lines(<<END
11454This mapping is a combination of the Unicode 'Decomposition_Type' and
11455'Decomposition_Mapping' properties, formatted for use by normalize.pm.  It is
11456identical to the official Unicode 'Decomposition_Mapping' property except for
11457two things:
11458 1) It omits the algorithmically determinable Hangul syllable decompositions,
11459which normalize.pm handles algorithmically.
11460 2) It contains the decomposition type as well.  Non-canonical decompositions
11461begin with a word in angle brackets, like <super>, which denotes the
11462compatible decomposition type.  If the map does not begin with the <angle
11463brackets>, the decomposition is canonical.
11464END
11465        ));
11466
11467        my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
11468                                        Default_Map => "",
11469                                        Perl_Extension => 1,
11470                                        Directory => $map_directory,
11471                                        Type => $STRING,
11472                                        To_Output_Map => $OUTPUT_ADJUSTED,
11473                                        );
11474        $Decimal_Digit->add_comment(join_lines(<<END
11475This file gives the mapping of all code points which represent a single
11476decimal digit [0-9] to their respective digits, but it has ranges of 10 code
11477points, and the mapping of each non-initial element of each range is actually
11478not to "0", but to the offset that element has from its corresponding DIGIT 0.
11479These code points are those that have Numeric_Type=Decimal; not special
11480things, like subscripts nor Roman numerals.
11481END
11482        ));
11483
11484        # These properties are not used for generating anything else, and are
11485        # usually not output.  By making them last in the list, we can just
11486        # change the high end of the loop downwards to avoid the work of
11487        # generating a table(s) that is/are just going to get thrown away.
11488        if (! property_ref('Decomposition_Mapping')->to_output_map
11489            && ! property_ref('Name')->to_output_map)
11490        {
11491            $last_field = min($NAME, $DECOMP_MAP) - 1;
11492        } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
11493            $last_field = $DECOMP_MAP;
11494        } elsif (property_ref('Name')->to_output_map) {
11495            $last_field = $NAME;
11496        }
11497        return;
11498    }
11499
11500    my $first_time = 1;                 # ? Is this the first line of the file
11501    my $in_range = 0;                   # ? Are we in one of the file's ranges
11502    my $previous_cp;                    # hex code point of previous line
11503    my $decimal_previous_cp = -1;       # And its decimal equivalent
11504    my @start;                          # For each field, the current starting
11505                                        # code point in hex for the range
11506                                        # being accumulated.
11507    my @fields;                         # The input fields;
11508    my @previous_fields;                # And those from the previous call
11509
11510    sub filter_UnicodeData_line($file) {
11511        # Handle a single input line from UnicodeData.txt; see comments above
11512        # Conceptually this takes a single line from the file containing N
11513        # properties, and converts it into N lines with one property per line,
11514        # which is what the final handler expects.  But there are
11515        # complications due to the quirkiness of the input file, and to save
11516        # time, it accumulates ranges where the property values don't change
11517        # and only emits lines when necessary.  This is about an order of
11518        # magnitude fewer lines emitted.
11519
11520        # $_ contains the input line.
11521        # -1 in split means retain trailing null fields
11522        (my $cp, @fields) = split /\s*;\s*/, $_, -1;
11523
11524        #local $to_trace = 1 if main::DEBUG;
11525        trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
11526        if (@fields > $input_field_count) {
11527            $file->carp_bad_line('Extra fields');
11528            $_ = "";
11529            return;
11530        }
11531
11532        my $decimal_cp = hex $cp;
11533
11534        # We have to output all the buffered ranges when the next code point
11535        # is not exactly one after the previous one, which means there is a
11536        # gap in the ranges.
11537        my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
11538
11539        # The decomposition mapping field requires special handling.  It looks
11540        # like either:
11541        #
11542        # <compat> 0032 0020
11543        # 0041 0300
11544        #
11545        # The decomposition type is enclosed in <brackets>; if missing, it
11546        # means the type is canonical.  There are two decomposition mapping
11547        # tables: the one for use by Perl's normalize.pm has a special format
11548        # which is this field intact; the other, for general use is of
11549        # standard format.  In either case we have to find the decomposition
11550        # type.  Empty fields have None as their type, and map to the code
11551        # point itself
11552        if ($fields[$PERL_DECOMPOSITION] eq "") {
11553            $fields[$DECOMP_TYPE] = 'None';
11554            $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
11555        }
11556        else {
11557            ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
11558                                            =~ / < ( .+? ) > \s* ( .+ ) /x;
11559            if (! defined $fields[$DECOMP_TYPE]) {
11560                $fields[$DECOMP_TYPE] = 'Canonical';
11561                $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
11562            }
11563            else {
11564                $fields[$DECOMP_MAP] = $map;
11565            }
11566        }
11567
11568        # The 3 numeric fields also require special handling.  The 2 digit
11569        # fields must be either empty or match the number field.  This means
11570        # that if it is empty, they must be as well, and the numeric type is
11571        # None, and the numeric value is 'Nan'.
11572        # The decimal digit field must be empty or match the other digit
11573        # field.  If the decimal digit field is non-empty, the code point is
11574        # a decimal digit, and the other two fields will have the same value.
11575        # If it is empty, but the other digit field is non-empty, the code
11576        # point is an 'other digit', and the number field will have the same
11577        # value as the other digit field.  If the other digit field is empty,
11578        # but the number field is non-empty, the code point is a generic
11579        # numeric type.
11580        if ($fields[$NUMERIC] eq "") {
11581            if ($fields[$PERL_DECIMAL_DIGIT] ne ""
11582                || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
11583            ) {
11584                $file->carp_bad_line("Numeric values inconsistent.  Trying to process anyway");
11585            }
11586            $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
11587            $fields[$NUMERIC] = 'NaN';
11588        }
11589        else {
11590            $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number.  Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
11591            if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
11592                $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
11593                $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'.  Processing as Decimal") if $fields[$CATEGORY] ne "Nd";
11594                $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
11595            }
11596            elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
11597                $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
11598                $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
11599            }
11600            else {
11601                $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
11602
11603                # Rationals require extra effort.
11604                if ($fields[$NUMERIC] =~ qr{/}) {
11605                    reduce_fraction(\$fields[$NUMERIC]);
11606                    register_fraction($fields[$NUMERIC])
11607                }
11608            }
11609        }
11610
11611        # For the properties that have empty fields in the file, and which
11612        # mean something different from empty, change them to that default.
11613        # Certain fields just haven't been empty so far in any Unicode
11614        # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
11615        # $CATEGORY.  This leaves just the two fields, and so we hard-code in
11616        # the defaults; which are very unlikely to ever change.
11617        $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
11618        $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
11619
11620        # UAX44 says that if title is empty, it is the same as whatever upper
11621        # is,
11622        $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
11623
11624        # There are a few pairs of lines like:
11625        #   AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
11626        #   D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
11627        # that define ranges.  These should be processed after the fields are
11628        # adjusted above, as they may override some of them; but mostly what
11629        # is left is to possibly adjust the $CHARNAME field.  The names of all the
11630        # paired lines start with a '<', but this is also true of '<control>,
11631        # which isn't one of these special ones.
11632        if ($fields[$CHARNAME] eq '<control>') {
11633
11634            # Some code points in this file have the pseudo-name
11635            # '<control>', but the official name for such ones is the null
11636            # string.
11637            $fields[$NAME] = $fields[$CHARNAME] = "";
11638
11639            # We had better not be in between range lines.
11640            if ($in_range) {
11641                $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
11642                $in_range = 0;
11643            }
11644        }
11645        elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
11646
11647            # Here is a non-range line.  We had better not be in between range
11648            # lines.
11649            if ($in_range) {
11650                $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
11651                $in_range = 0;
11652            }
11653            if ($fields[$CHARNAME] =~ s/- $cp $//x) {
11654
11655                # These are code points whose names end in their code points,
11656                # which means the names are algorithmically derivable from the
11657                # code points.  To shorten the output Name file, the algorithm
11658                # for deriving these is placed in the file instead of each
11659                # code point, so they have map type $CP_IN_NAME
11660                $fields[$CHARNAME] = $CMD_DELIM
11661                                 . $MAP_TYPE_CMD
11662                                 . '='
11663                                 . $CP_IN_NAME
11664                                 . $CMD_DELIM
11665                                 . $fields[$CHARNAME];
11666            }
11667            $fields[$NAME] = $fields[$CHARNAME];
11668        }
11669        elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
11670            $fields[$CHARNAME] = $fields[$NAME] = $1;
11671
11672            # Here we are at the beginning of a range pair.
11673            if ($in_range) {
11674                $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'.  Trying anyway");
11675            }
11676            $in_range = 1;
11677
11678            # Because the properties in the range do not overwrite any already
11679            # in the db, we must flush the buffers of what's already there, so
11680            # they get handled in the normal scheme.
11681            $force_output = 1;
11682
11683        }
11684        elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
11685            $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME].  Ignoring this line.");
11686            $_ = "";
11687            return;
11688        }
11689        else { # Here, we are at the last line of a range pair.
11690
11691            if (! $in_range) {
11692                $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one.  Ignoring this line.");
11693                $_ = "";
11694                return;
11695            }
11696            $in_range = 0;
11697
11698            $fields[$NAME] = $fields[$CHARNAME];
11699
11700            # Check that the input is valid: that the closing of the range is
11701            # the same as the beginning.
11702            foreach my $i (0 .. $last_field) {
11703                next if $fields[$i] eq $previous_fields[$i];
11704                $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'.  Bad News.  Trying anyway");
11705            }
11706
11707            # The processing differs depending on the type of range,
11708            # determined by its $CHARNAME
11709            if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
11710
11711                # Check that the data looks right.
11712                if ($decimal_previous_cp != $SBase) {
11713                    $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp.  Bad News.  Results will be wrong");
11714                }
11715                if ($decimal_cp != $SBase + $SCount - 1) {
11716                    $file->carp_bad_line("Unexpected Hangul syllable end = $cp.  Bad News.  Results will be wrong");
11717                }
11718
11719                # The Hangul syllable range has a somewhat complicated name
11720                # generation algorithm.  Each code point in it has a canonical
11721                # decomposition also computable by an algorithm.  The
11722                # perl decomposition map table built from these is used only
11723                # by normalize.pm, which has the algorithm built in it, so the
11724                # decomposition maps are not needed, and are large, so are
11725                # omitted from it.  If the full decomposition map table is to
11726                # be output, the decompositions are generated for it, in the
11727                # EOF handling code for this input file.
11728
11729                $previous_fields[$DECOMP_TYPE] = 'Canonical';
11730
11731                # This range is stored in our internal structure with its
11732                # own map type, different from all others.
11733                $previous_fields[$CHARNAME] = $previous_fields[$NAME]
11734                                        = $CMD_DELIM
11735                                          . $MAP_TYPE_CMD
11736                                          . '='
11737                                          . $HANGUL_SYLLABLE
11738                                          . $CMD_DELIM
11739                                          . $fields[$CHARNAME];
11740            }
11741            elsif ($fields[$CATEGORY] eq 'Lo') {    # Is a letter
11742
11743                # All the CJK ranges like this have the name given as a
11744                # special case in the next code line.  And for the others, we
11745                # hope that Unicode continues to use the correct name in
11746                # future releases, so we don't have to make further special
11747                # cases.
11748                my $name = ($fields[$CHARNAME] =~ /^CJK/)
11749                           ? 'CJK UNIFIED IDEOGRAPH'
11750                           : uc $fields[$CHARNAME];
11751
11752                # The name for these contains the code point itself, and all
11753                # are defined to have the same base name, regardless of what
11754                # is in the file.  They are stored in our internal structure
11755                # with a map type of $CP_IN_NAME
11756                $previous_fields[$CHARNAME] = $previous_fields[$NAME]
11757                                        = $CMD_DELIM
11758                                           . $MAP_TYPE_CMD
11759                                           . '='
11760                                           . $CP_IN_NAME
11761                                           . $CMD_DELIM
11762                                           . $name;
11763
11764            }
11765            elsif ($fields[$CATEGORY] eq 'Co'
11766                     || $fields[$CATEGORY] eq 'Cs')
11767            {
11768                # The names of all the code points in these ranges are set to
11769                # null, as there are no names for the private use and
11770                # surrogate code points.
11771
11772                $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
11773            }
11774            else {
11775                $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY].  Attempting to process it.");
11776            }
11777
11778            # The first line of the range caused everything else to be output,
11779            # and then its values were stored as the beginning values for the
11780            # next set of ranges, which this one ends.  Now, for each value,
11781            # add a command to tell the handler that these values should not
11782            # replace any existing ones in our database.
11783            foreach my $i (0 .. $last_field) {
11784                $previous_fields[$i] = $CMD_DELIM
11785                                        . $REPLACE_CMD
11786                                        . '='
11787                                        . $NO
11788                                        . $CMD_DELIM
11789                                        . $previous_fields[$i];
11790            }
11791
11792            # And change things so it looks like the entire range has been
11793            # gone through with this being the final part of it.  Adding the
11794            # command above to each field will cause this range to be flushed
11795            # during the next iteration, as it guaranteed that the stored
11796            # field won't match whatever value the next one has.
11797            $previous_cp = $cp;
11798            $decimal_previous_cp = $decimal_cp;
11799
11800            # We are now set up for the next iteration; so skip the remaining
11801            # code in this subroutine that does the same thing, but doesn't
11802            # know about these ranges.
11803            $_ = "";
11804
11805            return;
11806        }
11807
11808        # On the very first line, we fake it so the code below thinks there is
11809        # nothing to output, and initialize so that when it does get output it
11810        # uses the first line's values for the lowest part of the range.
11811        # (One could avoid this by using peek(), but then one would need to
11812        # know the adjustments done above and do the same ones in the setup
11813        # routine; not worth it)
11814        if ($first_time) {
11815            $first_time = 0;
11816            @previous_fields = @fields;
11817            @start = ($cp) x scalar @fields;
11818            $decimal_previous_cp = $decimal_cp - 1;
11819        }
11820
11821        # For each field, output the stored up ranges that this code point
11822        # doesn't fit in.  Earlier we figured out if all ranges should be
11823        # terminated because of changing the replace or map type styles, or if
11824        # there is a gap between this new code point and the previous one, and
11825        # that is stored in $force_output.  But even if those aren't true, we
11826        # need to output the range if this new code point's value for the
11827        # given property doesn't match the stored range's.
11828        #local $to_trace = 1 if main::DEBUG;
11829        foreach my $i (0 .. $last_field) {
11830            my $field = $fields[$i];
11831            if ($force_output || $field ne $previous_fields[$i]) {
11832
11833                # Flush the buffer of stored values.
11834                $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
11835
11836                # Start a new range with this code point and its value
11837                $start[$i] = $cp;
11838                $previous_fields[$i] = $field;
11839            }
11840        }
11841
11842        # Set the values for the next time.
11843        $previous_cp = $cp;
11844        $decimal_previous_cp = $decimal_cp;
11845
11846        # The input line has generated whatever adjusted lines are needed, and
11847        # should not be looked at further.
11848        $_ = "";
11849        return;
11850    }
11851
11852    sub EOF_UnicodeData($file) {
11853        # Called upon EOF to flush the buffers, and create the Hangul
11854        # decomposition mappings if needed.
11855
11856        # Flush the buffers.
11857        foreach my $i (0 .. $last_field) {
11858            $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
11859        }
11860
11861        if (-e 'Jamo.txt') {
11862
11863            # The algorithm is published by Unicode, based on values in
11864            # Jamo.txt, (which should have been processed before this
11865            # subroutine), and the results left in %Jamo
11866            unless (%Jamo) {
11867                Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt.  Hangul syllables not generated.");
11868                return;
11869            }
11870
11871            # If the full decomposition map table is being output, insert
11872            # into it the Hangul syllable mappings.  This is to avoid having
11873            # to publish a subroutine in it to compute them.  (which would
11874            # essentially be this code.)  This uses the algorithm published by
11875            # Unicode.  (No hangul syllables in version 1)
11876            if ($v_version ge v2.0.0
11877                && property_ref('Decomposition_Mapping')->to_output_map) {
11878                for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
11879                    use integer;
11880                    my $SIndex = $S - $SBase;
11881                    my $L = $LBase + $SIndex / $NCount;
11882                    my $V = $VBase + ($SIndex % $NCount) / $TCount;
11883                    my $T = $TBase + $SIndex % $TCount;
11884
11885                    trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
11886                    my $decomposition = sprintf("%04X %04X", $L, $V);
11887                    $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
11888                    $file->insert_adjusted_lines(
11889                                sprintf("%04X; Decomposition_Mapping; %s",
11890                                        $S,
11891                                        $decomposition));
11892                }
11893            }
11894        }
11895
11896        return;
11897    }
11898
11899    sub filter_v1_ucd($file) {
11900        # Fix UCD lines in version 1.  This is probably overkill, but this
11901        # fixes some glaring errors in Version 1 UnicodeData.txt.  That file:
11902        # 1)    had many Hangul (U+3400 - U+4DFF) code points that were later
11903        #       removed.  This program retains them
11904        # 2)    didn't include ranges, which it should have, and which are now
11905        #       added in @corrected_lines below.  It was hand populated by
11906        #       taking the data from Version 2, verified by analyzing
11907        #       DAge.txt.
11908        # 3)    There is a syntax error in the entry for U+09F8 which could
11909        #       cause problems for Unicode::UCD, and so is changed.  It's
11910        #       numeric value was simply a minus sign, without any number.
11911        #       (Eventually Unicode changed the code point to non-numeric.)
11912        # 4)    The decomposition types often don't match later versions
11913        #       exactly, and the whole syntax of that field is different; so
11914        #       the syntax is changed as well as the types to their later
11915        #       terminology.  Otherwise normalize.pm would be very unhappy
11916        # 5)    Many ccc classes are different.  These are left intact.
11917        # 6)    U+FF10..U+FF19 are missing their numeric values in all three
11918        #       fields.  These are unchanged because it doesn't really cause
11919        #       problems for Perl.
11920        # 7)    A number of code points, such as controls, don't have their
11921        #       Unicode Version 1 Names in this file.  These are added.
11922        # 8)    A number of Symbols were marked as Lm.  This changes those in
11923        #       the Latin1 range, so that regexes work.
11924        # 9)    The odd characters U+03DB .. U+03E1 weren't encoded but are
11925        #       referred to by their lc equivalents.  Not fixed.
11926
11927        my @corrected_lines = split /\n/, <<'END';
119284E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
119299FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
11930E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
11931F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
11932F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
11933FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
11934END
11935
11936        #local $to_trace = 1 if main::DEBUG;
11937        trace $_ if main::DEBUG && $to_trace;
11938
11939        # -1 => retain trailing null fields
11940        my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11941
11942        # At the first place that is wrong in the input, insert all the
11943        # corrections, replacing the wrong line.
11944        if ($code_point eq '4E00') {
11945            my @copy = @corrected_lines;
11946            $_ = shift @copy;
11947            ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11948
11949            $file->insert_lines(@copy);
11950        }
11951        elsif ($code_point =~ /^00/ && $fields[$CATEGORY] eq 'Lm') {
11952
11953            # There are no Lm characters in Latin1; these should be 'Sk', but
11954            # there isn't that in V1.
11955            $fields[$CATEGORY] = 'So';
11956        }
11957
11958        if ($fields[$NUMERIC] eq '-') {
11959            $fields[$NUMERIC] = '-1';  # This is what 2.0 made it.
11960        }
11961
11962        if  ($fields[$PERL_DECOMPOSITION] ne "") {
11963
11964            # Several entries have this change to superscript 2 or 3 in the
11965            # middle.  Convert these to the modern version, which is to use
11966            # the actual U+00B2 and U+00B3 (the superscript forms) instead.
11967            # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
11968            # 'HHHH HHHH 00B3 HHHH'.
11969            # It turns out that all of these that don't have another
11970            # decomposition defined at the beginning of the line have the
11971            # <square> decomposition in later releases.
11972            if ($code_point ne '00B2' && $code_point ne '00B3') {
11973                if  ($fields[$PERL_DECOMPOSITION]
11974                                    =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
11975                {
11976                    if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
11977                        $fields[$PERL_DECOMPOSITION] = '<square> '
11978                        . $fields[$PERL_DECOMPOSITION];
11979                    }
11980                }
11981            }
11982
11983            # If is like '<+circled> 0052 <-circled>', convert to
11984            # '<circled> 0052'
11985            $fields[$PERL_DECOMPOSITION] =~
11986                            s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/xg;
11987
11988            # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
11989            $fields[$PERL_DECOMPOSITION] =~
11990                            s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
11991            or $fields[$PERL_DECOMPOSITION] =~
11992                            s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
11993            or $fields[$PERL_DECOMPOSITION] =~
11994                            s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
11995            or $fields[$PERL_DECOMPOSITION] =~
11996                        s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
11997
11998            # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
11999            $fields[$PERL_DECOMPOSITION] =~
12000                    s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
12001
12002            # Change names to modern form.
12003            $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
12004            $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
12005            $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
12006            $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
12007
12008            # One entry has weird braces
12009            $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
12010
12011            # One entry at U+2116 has an extra <sup>
12012            $fields[$PERL_DECOMPOSITION] =~ s/( < .*? > .* ) < .*? > \ * /$1/x;
12013        }
12014
12015        $_ = join ';', $code_point, @fields;
12016        trace $_ if main::DEBUG && $to_trace;
12017        return;
12018    }
12019
12020    sub filter_bad_Nd_ucd {
12021        # Early versions specified a value in the decimal digit field even
12022        # though the code point wasn't a decimal digit.  Clear the field in
12023        # that situation, so that the main code doesn't think it is a decimal
12024        # digit.
12025
12026        my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12027        if ($fields[$PERL_DECIMAL_DIGIT] ne "" && $fields[$CATEGORY] ne 'Nd') {
12028            $fields[$PERL_DECIMAL_DIGIT] = "";
12029            $_ = join ';', $code_point, @fields;
12030        }
12031        return;
12032    }
12033
12034    my @U1_control_names = split /\n/, <<'END';
12035NULL
12036START OF HEADING
12037START OF TEXT
12038END OF TEXT
12039END OF TRANSMISSION
12040ENQUIRY
12041ACKNOWLEDGE
12042BELL
12043BACKSPACE
12044HORIZONTAL TABULATION
12045LINE FEED
12046VERTICAL TABULATION
12047FORM FEED
12048CARRIAGE RETURN
12049SHIFT OUT
12050SHIFT IN
12051DATA LINK ESCAPE
12052DEVICE CONTROL ONE
12053DEVICE CONTROL TWO
12054DEVICE CONTROL THREE
12055DEVICE CONTROL FOUR
12056NEGATIVE ACKNOWLEDGE
12057SYNCHRONOUS IDLE
12058END OF TRANSMISSION BLOCK
12059CANCEL
12060END OF MEDIUM
12061SUBSTITUTE
12062ESCAPE
12063FILE SEPARATOR
12064GROUP SEPARATOR
12065RECORD SEPARATOR
12066UNIT SEPARATOR
12067DELETE
12068BREAK PERMITTED HERE
12069NO BREAK HERE
12070INDEX
12071NEXT LINE
12072START OF SELECTED AREA
12073END OF SELECTED AREA
12074CHARACTER TABULATION SET
12075CHARACTER TABULATION WITH JUSTIFICATION
12076LINE TABULATION SET
12077PARTIAL LINE DOWN
12078PARTIAL LINE UP
12079REVERSE LINE FEED
12080SINGLE SHIFT TWO
12081SINGLE SHIFT THREE
12082DEVICE CONTROL STRING
12083PRIVATE USE ONE
12084PRIVATE USE TWO
12085SET TRANSMIT STATE
12086CANCEL CHARACTER
12087MESSAGE WAITING
12088START OF GUARDED AREA
12089END OF GUARDED AREA
12090START OF STRING
12091SINGLE CHARACTER INTRODUCER
12092CONTROL SEQUENCE INTRODUCER
12093STRING TERMINATOR
12094OPERATING SYSTEM COMMAND
12095PRIVACY MESSAGE
12096APPLICATION PROGRAM COMMAND
12097END
12098
12099    sub filter_early_U1_names {
12100        # Very early versions did not have the Unicode_1_name field specified.
12101        # They differed in which ones were present; make sure a U1 name
12102        # exists, so that Unicode::UCD::charinfo will work
12103
12104        my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12105
12106
12107        # @U1_control names above are entirely positional, so we pull them out
12108        # in the exact order required, with gaps for the ones that don't have
12109        # names.
12110        if ($code_point =~ /^00[01]/
12111            || $code_point eq '007F'
12112            || $code_point =~ /^008[2-9A-F]/
12113            || $code_point =~ /^009[0-8A-F]/)
12114        {
12115            my $u1_name = shift @U1_control_names;
12116            $fields[$UNICODE_1_NAME] = $u1_name unless $fields[$UNICODE_1_NAME];
12117            $_ = join ';', $code_point, @fields;
12118        }
12119        return;
12120    }
12121
12122    sub filter_v2_1_5_ucd {
12123        # A dozen entries in this 2.1.5 file had the mirrored and numeric
12124        # columns swapped;  These all had mirrored be 'N'.  So if the numeric
12125        # column appears to be N, swap it back.
12126
12127        my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12128        if ($fields[$NUMERIC] eq 'N') {
12129            $fields[$NUMERIC] = $fields[$MIRRORED];
12130            $fields[$MIRRORED] = 'N';
12131            $_ = join ';', $code_point, @fields;
12132        }
12133        return;
12134    }
12135
12136    sub filter_v6_ucd {
12137
12138        # Unicode 6.0 co-opted the name BELL for U+1F514, but until 5.17,
12139        # it wasn't accepted, to allow for some deprecation cycles.  This
12140        # function is not called after 5.16
12141
12142        return if $_ !~ /^(?:0007|1F514|070F);/;
12143
12144        my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12145        if ($code_point eq '0007') {
12146            $fields[$CHARNAME] = "";
12147        }
12148        elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
12149                            # http://www.unicode.org/versions/corrigendum8.html
12150            $fields[$BIDI] = "AL";
12151        }
12152        elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name
12153            $fields[$CHARNAME] = "";
12154        }
12155
12156        $_ = join ';', $code_point, @fields;
12157
12158        return;
12159    }
12160} # End closure for UnicodeData
12161
12162sub process_GCB_test($file) {
12163
12164    while ($file->next_line) {
12165        push @backslash_X_tests, $_;
12166    }
12167
12168    return;
12169}
12170
12171sub process_LB_test($file) {
12172
12173    while ($file->next_line) {
12174        push @LB_tests, $_;
12175    }
12176
12177    return;
12178}
12179
12180sub process_SB_test($file) {
12181
12182    while ($file->next_line) {
12183        push @SB_tests, $_;
12184    }
12185
12186    return;
12187}
12188
12189sub process_WB_test($file) {
12190
12191    while ($file->next_line) {
12192        push @WB_tests, $_;
12193    }
12194
12195    return;
12196}
12197
12198sub process_NamedSequences($file) {
12199    # NamedSequences.txt entries are just added to an array.  Because these
12200    # don't look like the other tables, they have their own handler.
12201    # An example:
12202    # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
12203    #
12204    # This just adds the sequence to an array for later handling
12205
12206    while ($file->next_line) {
12207        my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
12208        if (@remainder) {
12209            $file->carp_bad_line(
12210                "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
12211            next;
12212        }
12213
12214        # Code points need to be 5 digits long like the other entries in
12215        # Name.pl, for regcomp.c parsing; and the ones below 0x0100 need to be
12216        # converted to native
12217        $sequence = join " ", map { sprintf("%05X",
12218                                    utf8::unicode_to_native(hex $_))
12219                                  } split / /, $sequence;
12220        push @named_sequences, "$sequence\n$name\n";
12221    }
12222    return;
12223}
12224
12225{ # Closure
12226
12227    my $first_range;
12228
12229    sub  filter_early_ea_lb {
12230        # Fixes early EastAsianWidth.txt and LineBreak.txt files.  These had a
12231        # third field be the name of the code point, which can be ignored in
12232        # most cases.  But it can be meaningful if it marks a range:
12233        # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
12234        # 3400;W;<CJK Ideograph Extension A, First>
12235        #
12236        # We need to see the First in the example above to know it's a range.
12237        # They did not use the later range syntaxes.  This routine changes it
12238        # to use the modern syntax.
12239        # $1 is the Input_file object.
12240
12241        my @fields = split /\s*;\s*/;
12242        if ($fields[2] =~ /^<.*, First>/) {
12243            $first_range = $fields[0];
12244            $_ = "";
12245        }
12246        elsif ($fields[2] =~ /^<.*, Last>/) {
12247            $_ = $_ = "$first_range..$fields[0]; $fields[1]";
12248        }
12249        else {
12250            undef $first_range;
12251            $_ = "$fields[0]; $fields[1]";
12252        }
12253
12254        return;
12255    }
12256}
12257
12258sub filter_substitute_lb {
12259    # Used on Unicodes that predate the LB property, where there is a
12260    # substitute file.  This just does the regular ea_lb handling for such
12261    # files, and then substitutes the long property value name for the short
12262    # one that comes with the file.  (The other break files have the long
12263    # names in them, so this is the odd one out.)  The reason for doing this
12264    # kludge is that regen/mk_invlists.pl is expecting the long name.  This
12265    # also fixes the typo 'Inseperable' that leads to problems.
12266
12267    filter_early_ea_lb;
12268    return unless $_;
12269
12270    my @fields = split /\s*;\s*/;
12271    $fields[1] = property_ref('_Perl_LB')->table($fields[1])->full_name;
12272    $fields[1] = 'Inseparable' if lc $fields[1] eq 'inseperable';
12273    $_ = join '; ', @fields;
12274}
12275
12276sub filter_old_style_arabic_shaping {
12277    # Early versions used a different term for the later one.
12278
12279    my @fields = split /\s*;\s*/;
12280    $fields[3] =~ s/<no shaping>/No_Joining_Group/;
12281    $fields[3] =~ s/\s+/_/g;                # Change spaces to underscores
12282    $_ = join ';', @fields;
12283    return;
12284}
12285
12286{ # Closure
12287    my $lc; # Table for lowercase mapping
12288    my $tc;
12289    my $uc;
12290    my %special_casing_code_points;
12291
12292    sub setup_special_casing($file) {
12293        # SpecialCasing.txt contains the non-simple case change mappings.  The
12294        # simple ones are in UnicodeData.txt, which should already have been
12295        # read in to the full property data structures, so as to initialize
12296        # these with the simple ones.  Then the SpecialCasing.txt entries
12297        # add or overwrite the ones which have different full mappings.
12298
12299        # This routine sees if the simple mappings are to be output, and if
12300        # so, copies what has already been put into the full mapping tables,
12301        # while they still contain only the simple mappings.
12302
12303        # The reason it is done this way is that the simple mappings are
12304        # probably not going to be output, so it saves work to initialize the
12305        # full tables with the simple mappings, and then overwrite those
12306        # relatively few entries in them that have different full mappings,
12307        # and thus skip the simple mapping tables altogether.
12308
12309        $lc = property_ref('lc');
12310        $tc = property_ref('tc');
12311        $uc = property_ref('uc');
12312
12313        # For each of the case change mappings...
12314        foreach my $full_casing_table ($lc, $tc, $uc) {
12315            my $full_casing_name = $full_casing_table->name;
12316            my $full_casing_full_name = $full_casing_table->full_name;
12317            unless (defined $full_casing_table
12318                    && ! $full_casing_table->is_empty)
12319            {
12320                Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing.  Only special casing will be generated.");
12321            }
12322
12323            $full_casing_table->add_comment(join_lines( <<END
12324This file includes both the simple and full case changing maps.  The simple
12325ones are in the main body of the table below, and the full ones adding to or
12326overriding them are in the hash.
12327END
12328            ));
12329
12330            # The simple version's name in each mapping merely has an 's' in
12331            # front of the full one's
12332            my $simple_name = 's' . $full_casing_name;
12333            my $simple = property_ref($simple_name);
12334            $simple->initialize($full_casing_table) if $simple->to_output_map();
12335        }
12336
12337        return;
12338    }
12339
12340    sub filter_2_1_8_special_casing_line {
12341
12342        # This version had duplicate entries in this file.  Delete all but the
12343        # first one
12344        my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
12345                                              # fields
12346        if (exists $special_casing_code_points{$fields[0]}) {
12347            $_ = "";
12348            return;
12349        }
12350
12351        $special_casing_code_points{$fields[0]} = 1;
12352        filter_special_casing_line(@_);
12353    }
12354
12355    sub filter_special_casing_line($file) {
12356        # Change the format of $_ from SpecialCasing.txt into something that
12357        # the generic handler understands.  Each input line contains three
12358        # case mappings.  This will generate three lines to pass to the
12359        # generic handler for each of those.
12360
12361        # The input syntax (after stripping comments and trailing white space
12362        # is like one of the following (with the final two being entries that
12363        # we ignore):
12364        # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
12365        # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
12366        # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
12367        # Note the trailing semi-colon, unlike many of the input files.  That
12368        # means that there will be an extra null field generated by the split
12369
12370        my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
12371                                              # fields
12372
12373        # field #4 is when this mapping is conditional.  If any of these get
12374        # implemented, it would be by hard-coding in the casing functions in
12375        # the Perl core, not through tables.  But if there is a new condition
12376        # we don't know about, output a warning.  We know about all the
12377        # conditions through 6.0
12378        if ($fields[4] ne "") {
12379            my @conditions = split ' ', $fields[4];
12380            if ($conditions[0] ne 'tr'  # We know that these languages have
12381                                        # conditions, and some are multiple
12382                && $conditions[0] ne 'az'
12383                && $conditions[0] ne 'lt'
12384
12385                # And, we know about a single condition Final_Sigma, but
12386                # nothing else.
12387                && ($v_version gt v5.2.0
12388                    && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
12389            {
12390                $file->carp_bad_line("Unknown condition '$fields[4]'.  You should inspect it and either add code to handle it, or add to list of those that are to ignore");
12391            }
12392            elsif ($conditions[0] ne 'Final_Sigma') {
12393
12394                    # Don't print out a message for Final_Sigma, because we
12395                    # have hard-coded handling for it.  (But the standard
12396                    # could change what the rule should be, but it wouldn't
12397                    # show up here anyway.
12398
12399                    print "# SKIPPING Special Casing: $_\n"
12400                                                    if $verbosity >= $VERBOSE;
12401            }
12402            $_ = "";
12403            return;
12404        }
12405        elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
12406            $file->carp_bad_line('Extra fields');
12407            $_ = "";
12408            return;
12409        }
12410
12411        my $decimal_code_point = hex $fields[0];
12412
12413        # Loop to handle each of the three mappings in the input line, in
12414        # order, with $i indicating the current field number.
12415        my $i = 0;
12416        for my $object ($lc, $tc, $uc) {
12417            $i++;   # First time through, $i = 0 ... 3rd time = 3
12418
12419            my $value = $object->value_of($decimal_code_point);
12420            $value = ($value eq $CODE_POINT)
12421                      ? $decimal_code_point
12422                      : hex $value;
12423
12424            # If this isn't a multi-character mapping, it should already have
12425            # been read in.
12426            if ($fields[$i] !~ / /) {
12427                if ($value != hex $fields[$i]) {
12428                    Carp::my_carp("Bad news. UnicodeData.txt thinks "
12429                                  . $object->name
12430                                  . "(0x$fields[0]) is $value"
12431                                  . " and SpecialCasing.txt thinks it is "
12432                                  . hex($fields[$i])
12433                                  . ".  Good luck.  Retaining UnicodeData value, and proceeding anyway.");
12434                }
12435            }
12436            else {
12437
12438                # The mapping is additional, beyond the simple mapping.
12439                $file->insert_adjusted_lines("$fields[0]; "
12440                                             . $object->name
12441                                            . "; "
12442                                            . $CMD_DELIM
12443                                            . "$REPLACE_CMD=$MULTIPLE_BEFORE"
12444                                            . $CMD_DELIM
12445                                            . $fields[$i]);
12446            }
12447        }
12448
12449        # Everything has been handled by the insert_adjusted_lines()
12450        $_ = "";
12451
12452        return;
12453    }
12454}
12455
12456sub filter_old_style_case_folding($file) {
12457    # This transforms $_ containing the case folding style of 3.0.1, to 3.1
12458    # and later style.  Different letters were used in the earlier.
12459
12460    my @fields = split /\s*;\s*/;
12461
12462    if ($fields[1] eq 'L') {
12463        $fields[1] = 'C';             # L => C always
12464    }
12465    elsif ($fields[1] eq 'E') {
12466        if ($fields[2] =~ / /) {      # E => C if one code point; F otherwise
12467            $fields[1] = 'F'
12468        }
12469        else {
12470            $fields[1] = 'C'
12471        }
12472    }
12473    else {
12474        $file->carp_bad_line("Expecting L or E in second field");
12475        $_ = "";
12476        return;
12477    }
12478    $_ = join("; ", @fields) . ';';
12479    return;
12480}
12481
12482{ # Closure for case folding
12483
12484    # Create the map for simple only if are going to output it, for otherwise
12485    # it takes no part in anything we do.
12486    my $to_output_simple;
12487
12488    sub setup_case_folding {
12489        # Read in the case foldings in CaseFolding.txt.  This handles both
12490        # simple and full case folding.
12491
12492        $to_output_simple
12493                        = property_ref('Simple_Case_Folding')->to_output_map;
12494
12495        if (! $to_output_simple) {
12496            property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
12497        }
12498
12499        # If we ever wanted to show that these tables were combined, a new
12500        # property method could be created, like set_combined_props()
12501        property_ref('Case_Folding')->add_comment(join_lines( <<END
12502This file includes both the simple and full case folding maps.  The simple
12503ones are in the main body of the table below, and the full ones adding to or
12504overriding them are in the hash.
12505END
12506        ));
12507        return;
12508    }
12509
12510    sub filter_case_folding_line($file) {
12511        # Called for each line in CaseFolding.txt
12512        # Input lines look like:
12513        # 0041; C; 0061; # LATIN CAPITAL LETTER A
12514        # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
12515        # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
12516        #
12517        # 'C' means that folding is the same for both simple and full
12518        # 'F' that it is only for full folding
12519        # 'S' that it is only for simple folding
12520        # 'T' is locale-dependent, and ignored
12521        # 'I' is a type of 'F' used in some early releases.
12522        # Note the trailing semi-colon, unlike many of the input files.  That
12523        # means that there will be an extra null field generated by the split
12524        # below, which we ignore and hence is not an error.
12525
12526        my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
12527        if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
12528            $file->carp_bad_line('Extra fields');
12529            $_ = "";
12530            return;
12531        }
12532
12533        if ($type =~ / ^ [IT] $/x) {   # Skip Turkic case folding, is locale dependent
12534            $_ = "";
12535            return;
12536        }
12537
12538        # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
12539        # I are all full foldings; S is single-char.  For S, there is always
12540        # an F entry, so we must allow multiple values for the same code
12541        # point.  Fortunately this table doesn't need further manipulation
12542        # which would preclude using multiple-values.  The S is now included
12543        # so that _swash_inversion_hash() is able to construct closures
12544        # without having to worry about F mappings.
12545        if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
12546            $_ = "$range; Case_Folding; "
12547                 . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
12548        }
12549        else {
12550            $_ = "";
12551            $file->carp_bad_line('Expecting C F I S or T in second field');
12552        }
12553
12554        # C and S are simple foldings, but simple case folding is not needed
12555        # unless we explicitly want its map table output.
12556        if ($to_output_simple && $type eq 'C' || $type eq 'S') {
12557            $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
12558        }
12559
12560        return;
12561    }
12562
12563} # End case fold closure
12564
12565sub filter_jamo_line {
12566    # Filter Jamo.txt lines.  This routine mainly is used to populate hashes
12567    # from this file that is used in generating the Name property for Jamo
12568    # code points.  But, it also is used to convert early versions' syntax
12569    # into the modern form.  Here are two examples:
12570    # 1100; G   # HANGUL CHOSEONG KIYEOK            # Modern syntax
12571    # U+1100; G; HANGUL CHOSEONG KIYEOK             # 2.0 syntax
12572    #
12573    # The input is $_, the output is $_ filtered.
12574
12575    my @fields = split /\s*;\s*/, $_, -1;  # -1 => retain trailing null fields
12576
12577    # Let the caller handle unexpected input.  In earlier versions, there was
12578    # a third field which is supposed to be a comment, but did not have a '#'
12579    # before it.
12580    return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
12581
12582    $fields[0] =~ s/^U\+//;     # Also, early versions had this extraneous
12583                                # beginning.
12584
12585    # Some 2.1 versions had this wrong.  Causes havoc with the algorithm.
12586    $fields[1] = 'R' if $fields[0] eq '1105';
12587
12588    # Add to structure so can generate Names from it.
12589    my $cp = hex $fields[0];
12590    my $short_name = $fields[1];
12591    $Jamo{$cp} = $short_name;
12592    if ($cp <= $LBase + $LCount) {
12593        $Jamo_L{$short_name} = $cp - $LBase;
12594    }
12595    elsif ($cp <= $VBase + $VCount) {
12596        $Jamo_V{$short_name} = $cp - $VBase;
12597    }
12598    elsif ($cp <= $TBase + $TCount) {
12599        $Jamo_T{$short_name} = $cp - $TBase;
12600    }
12601    else {
12602        Carp::my_carp_bug("Unexpected Jamo code point in $_");
12603    }
12604
12605
12606    # Reassemble using just the first two fields to look like a typical
12607    # property file line
12608    $_ = "$fields[0]; $fields[1]";
12609
12610    return;
12611}
12612
12613sub register_fraction($rational) {
12614    # This registers the input rational number so that it can be passed on to
12615    # Unicode::UCD, both in rational and floating forms.
12616
12617    my $floating = eval $rational;
12618
12619    my @floats = sprintf "%.*e", $E_FLOAT_PRECISION, $floating;
12620
12621    # See if the denominator is a power of 2.
12622    $rational =~ m!.*/(.*)!;
12623    my $denominator = $1;
12624    if (defined $denominator && (($denominator & ($denominator - 1)) == 0)) {
12625
12626        # Here the denominator is a power of 2.  This means it has an exact
12627        # representation in binary, so rounding could go either way.  It turns
12628        # out that Windows doesn't necessarily round towards even, so output
12629        # an extra entry.  This happens when the final digit we output is even
12630        # and the next digits would be 50* to the precision of the machine.
12631        my $extra_digit_float = sprintf "%e", $floating;
12632        my $q = $E_FLOAT_PRECISION - 1;
12633        if ($extra_digit_float =~ / ( .* \. \d{$q} )
12634                                    ( [02468] ) 5 0* ( e .*)
12635                                  /ix)
12636        {
12637            push @floats, $1 . ($2 + 1) . $3;
12638        }
12639    }
12640
12641    foreach my $float (@floats) {
12642        # Strip off any leading zeros beyond 2 digits to make it C99
12643        # compliant.  (Windows has 3 digit exponents, contrary to C99)
12644        $float =~ s/ ( .* e [-+] ) 0* ( \d{2,}? ) /$1$2/x;
12645
12646        if (   defined $nv_floating_to_rational{$float}
12647            && $nv_floating_to_rational{$float} ne $rational)
12648        {
12649            die Carp::my_carp_bug("Both '$rational' and"
12650                            . " '$nv_floating_to_rational{$float}' evaluate to"
12651                            . " the same floating point number."
12652                            . "  \$E_FLOAT_PRECISION must be increased");
12653        }
12654        $nv_floating_to_rational{$float} = $rational;
12655    }
12656    return;
12657}
12658
12659sub gcd($a, $b) {   # Greatest-common-divisor; from
12660                # http://en.wikipedia.org/wiki/Euclidean_algorithm
12661    use integer;
12662
12663    while ($b != 0) {
12664       my $temp = $b;
12665       $b = $a % $b;
12666       $a = $temp;
12667    }
12668    return $a;
12669}
12670
12671sub reduce_fraction($fraction_ref) {
12672    # Reduce a fraction to lowest terms.  The Unicode data may be reducible,
12673    # hence this is needed.  The argument is a reference to the
12674    # string denoting the fraction, which must be of the form:
12675    if ($$fraction_ref !~ / ^ (-?) (\d+) \/ (\d+) $ /ax) {
12676        Carp::my_carp_bug("Non-fraction input '$$fraction_ref'.  Unchanged");
12677        return;
12678    }
12679
12680    my $sign = $1;
12681    my $numerator = $2;
12682    my $denominator = $3;
12683
12684    use integer;
12685
12686    # Find greatest common divisor
12687    my $gcd = gcd($numerator, $denominator);
12688
12689    # And reduce using the gcd.
12690    if ($gcd != 1) {
12691        $numerator    /= $gcd;
12692        $denominator  /= $gcd;
12693        $$fraction_ref = "$sign$numerator/$denominator";
12694    }
12695
12696    return;
12697}
12698
12699sub filter_numeric_value_line($file) {
12700    # DNumValues contains lines of a different syntax than the typical
12701    # property file:
12702    # 0F33          ; -0.5 ; ; -1/2 # No       TIBETAN DIGIT HALF ZERO
12703    #
12704    # This routine transforms $_ containing the anomalous syntax to the
12705    # typical, by filtering out the extra columns, and convert early version
12706    # decimal numbers to strings that look like rational numbers.
12707
12708    # Starting in 5.1, there is a rational field.  Just use that, omitting the
12709    # extra columns.  Otherwise convert the decimal number in the second field
12710    # to a rational, and omit extraneous columns.
12711    my @fields = split /\s*;\s*/, $_, -1;
12712    my $rational;
12713
12714    if ($v_version ge v5.1.0) {
12715        if (@fields != 4) {
12716            $file->carp_bad_line('Not 4 semi-colon separated fields');
12717            $_ = "";
12718            return;
12719        }
12720        reduce_fraction(\$fields[3]) if $fields[3] =~ qr{/};
12721        $rational = $fields[3];
12722
12723        $_ = join '; ', @fields[ 0, 3 ];
12724    }
12725    else {
12726
12727        # Here, is an older Unicode file, which has decimal numbers instead of
12728        # rationals in it.  Use the fraction to calculate the denominator and
12729        # convert to rational.
12730
12731        if (@fields != 2 && @fields != 3) {
12732            $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
12733            $_ = "";
12734            return;
12735        }
12736
12737        my $codepoints = $fields[0];
12738        my $decimal = $fields[1];
12739        if ($decimal =~ s/\.0+$//) {
12740
12741            # Anything ending with a decimal followed by nothing but 0's is an
12742            # integer
12743            $_ = "$codepoints; $decimal";
12744            $rational = $decimal;
12745        }
12746        else {
12747
12748            my $denominator;
12749            if ($decimal =~ /\.50*$/) {
12750                $denominator = 2;
12751            }
12752
12753            # Here have the hardcoded repeating decimals in the fraction, and
12754            # the denominator they imply.  There were only a few denominators
12755            # in the older Unicode versions of this file which this code
12756            # handles, so it is easy to convert them.
12757
12758            # The 4 is because of a round-off error in the Unicode 3.2 files
12759            elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
12760                $denominator = 3;
12761            }
12762            elsif ($decimal =~ /\.[27]50*$/) {
12763                $denominator = 4;
12764            }
12765            elsif ($decimal =~ /\.[2468]0*$/) {
12766                $denominator = 5;
12767            }
12768            elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
12769                $denominator = 6;
12770            }
12771            elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
12772                $denominator = 8;
12773            }
12774            if ($denominator) {
12775                my $sign = ($decimal < 0) ? "-" : "";
12776                my $numerator = int((abs($decimal) * $denominator) + .5);
12777                $rational = "$sign$numerator/$denominator";
12778                $_ = "$codepoints; $rational";
12779            }
12780            else {
12781                $file->carp_bad_line("Can't cope with number '$decimal'.");
12782                $_ = "";
12783                return;
12784            }
12785        }
12786    }
12787
12788    register_fraction($rational) if $rational =~ qr{/};
12789    return;
12790}
12791
12792{ # Closure
12793    my %unihan_properties;
12794
12795    sub construct_unihan($file_object) {
12796
12797        return unless file_exists($file_object->file);
12798
12799        if ($v_version lt v4.0.0) {
12800            push @cjk_properties, 'URS ; Unicode_Radical_Stroke';
12801            push @cjk_property_values, split "\n", <<'END';
12802# @missing: 0000..10FFFF; Unicode_Radical_Stroke; <none>
12803END
12804        }
12805
12806        if ($v_version ge v3.0.0) {
12807            push @cjk_properties, split "\n", <<'END';
12808cjkIRG_GSource; kIRG_GSource
12809cjkIRG_JSource; kIRG_JSource
12810cjkIRG_KSource; kIRG_KSource
12811cjkIRG_TSource; kIRG_TSource
12812cjkIRG_VSource; kIRG_VSource
12813END
12814        push @cjk_property_values, split "\n", <<'END';
12815# @missing: 0000..10FFFF; cjkIRG_GSource; <none>
12816# @missing: 0000..10FFFF; cjkIRG_JSource; <none>
12817# @missing: 0000..10FFFF; cjkIRG_KSource; <none>
12818# @missing: 0000..10FFFF; cjkIRG_TSource; <none>
12819# @missing: 0000..10FFFF; cjkIRG_VSource; <none>
12820END
12821        }
12822        if ($v_version ge v3.1.0) {
12823            push @cjk_properties, 'cjkIRG_HSource; kIRG_HSource';
12824            push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_HSource; <none>';
12825        }
12826        if ($v_version ge v3.1.1) {
12827            push @cjk_properties, 'cjkIRG_KPSource; kIRG_KPSource';
12828            push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_KPSource; <none>';
12829        }
12830        if ($v_version ge v3.2.0) {
12831            push @cjk_properties, split "\n", <<'END';
12832cjkAccountingNumeric; kAccountingNumeric
12833cjkCompatibilityVariant; kCompatibilityVariant
12834cjkOtherNumeric; kOtherNumeric
12835cjkPrimaryNumeric; kPrimaryNumeric
12836END
12837            push @cjk_property_values, split "\n", <<'END';
12838# @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
12839# @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
12840# @missing: 0000..10FFFF; cjkOtherNumeric; NaN
12841# @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
12842END
12843        }
12844        if ($v_version gt v4.0.0) {
12845            push @cjk_properties, 'cjkIRG_USource; kIRG_USource';
12846            push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_USource; <none>';
12847        }
12848
12849        if ($v_version ge v4.1.0) {
12850            push @cjk_properties, 'cjkIICore ; kIICore';
12851            push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIICore; <none>';
12852        }
12853    }
12854
12855    sub setup_unihan {
12856        # Do any special setup for Unihan properties.
12857
12858        # This property gives the wrong computed type, so override.
12859        my $usource = property_ref('kIRG_USource');
12860        $usource->set_type($STRING) if defined $usource;
12861
12862        # This property is to be considered binary (it says so in
12863        # http://www.unicode.org/reports/tr38/)
12864        my $iicore = property_ref('kIICore');
12865        if (defined $iicore) {
12866            $iicore->set_type($FORCED_BINARY);
12867            $iicore->table("Y")->add_note("Matches any code point which has a non-null value for this property; see unicode.org UAX #38.");
12868
12869            # Unicode doesn't include the maps for this property, so don't
12870            # warn that they are missing.
12871            $iicore->set_pre_declared_maps(0);
12872            $iicore->add_comment(join_lines( <<END
12873This property contains string values, but any non-empty ones are considered to
12874be 'core', so Perl creates tables for both: 1) its string values, plus 2)
12875tables so that \\p{kIICore} matches any code point which has a non-empty
12876value for this property.
12877END
12878            ));
12879        }
12880
12881        return;
12882    }
12883
12884    sub filter_unihan_line {
12885        # Change unihan db lines to look like the others in the db.  Here is
12886        # an input sample:
12887        #   U+341C        kCangjie        IEKN
12888
12889        # Tabs are used instead of semi-colons to separate fields; therefore
12890        # they may have semi-colons embedded in them.  Change these to periods
12891        # so won't screw up the rest of the code.
12892        s/;/./g;
12893
12894        # Remove lines that don't look like ones we accept.
12895        if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
12896            $_ = "";
12897            return;
12898        }
12899
12900        # Extract the property, and save a reference to its object.
12901        my $property = $1;
12902        if (! exists $unihan_properties{$property}) {
12903            $unihan_properties{$property} = property_ref($property);
12904        }
12905
12906        # Don't do anything unless the property is one we're handling, which
12907        # we determine by seeing if there is an object defined for it or not
12908        if (! defined $unihan_properties{$property}) {
12909            $_ = "";
12910            return;
12911        }
12912
12913        # Convert the tab separators to our standard semi-colons, and convert
12914        # the U+HHHH notation to the rest of the standard's HHHH
12915        s/\t/;/g;
12916        s/\b U \+ (?= $code_point_re )//xg;
12917
12918        #local $to_trace = 1 if main::DEBUG;
12919        trace $_ if main::DEBUG && $to_trace;
12920
12921        return;
12922    }
12923}
12924
12925sub filter_blocks_lines($file) {
12926    # In the Blocks.txt file, the names of the blocks don't quite match the
12927    # names given in PropertyValueAliases.txt, so this changes them so they
12928    # do match:  Blanks and hyphens are changed into underscores.  Also makes
12929    # early release versions look like later ones
12930    #
12931    # $_ is transformed to the correct value.
12932
12933    if ($v_version lt v3.2.0) {
12934        if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
12935            $_ = "";
12936            return;
12937        }
12938
12939        # Old versions used a different syntax to mark the range.
12940        $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
12941    }
12942
12943    my @fields = split /\s*;\s*/, $_, -1;
12944    if (@fields != 2) {
12945        $file->carp_bad_line("Expecting exactly two fields");
12946        $_ = "";
12947        return;
12948    }
12949
12950    # Change hyphens and blanks in the block name field only
12951    $fields[1] =~ s/[ -]/_/g;
12952    $fields[1] =~ s/_ ( [a-z] ) /_\u$1/xg;   # Capitalize first letter of word
12953
12954    $_ = join("; ", @fields);
12955    return;
12956}
12957
12958{ # Closure
12959    my $current_property;
12960
12961    sub filter_old_style_proplist {
12962        # PropList.txt has been in Unicode since version 2.0.  Until 3.1, it
12963        # was in a completely different syntax.  Ken Whistler of Unicode says
12964        # that it was something he used as an aid for his own purposes, but
12965        # was never an official part of the standard.  Many of the properties
12966        # in it were incorporated into the later PropList.txt, but some were
12967        # not.  This program uses this early file to generate property tables
12968        # that are otherwise not accessible in the early UCD's.  It does this
12969        # for the ones that eventually became official, and don't appear to be
12970        # too different in their contents from the later official version, and
12971        # throws away the rest.  It could be argued that the ones it generates
12972        # were probably not really official at that time, so should be
12973        # ignored.  You can easily modify things to skip all of them by
12974        # changing this function to just set $_ to "", and return; and to skip
12975        # certain of them by simply removing their declarations from
12976        # get_old_property_aliases().
12977        #
12978        # Here is a list of all the ones that are thrown away:
12979        #   Alphabetic                   The definitions for this are very
12980        #                                defective, so better to not mislead
12981        #                                people into thinking it works.
12982        #                                Instead the Perl extension of the
12983        #                                same name is constructed from first
12984        #                                principles.
12985        #   Bidi=*                       duplicates UnicodeData.txt
12986        #   Combining                    never made into official property;
12987        #                                is \P{ccc=0}
12988        #   Composite                    never made into official property.
12989        #   Currency Symbol              duplicates UnicodeData.txt: gc=sc
12990        #   Decimal Digit                duplicates UnicodeData.txt: gc=nd
12991        #   Delimiter                    never made into official property;
12992        #                                removed in 3.0.1
12993        #   Format Control               never made into official property;
12994        #                                similar to gc=cf
12995        #   High Surrogate               duplicates Blocks.txt
12996        #   Ignorable Control            never made into official property;
12997        #                                similar to di=y
12998        #   ISO Control                  duplicates UnicodeData.txt: gc=cc
12999        #   Left of Pair                 never made into official property;
13000        #   Line Separator               duplicates UnicodeData.txt: gc=zl
13001        #   Low Surrogate                duplicates Blocks.txt
13002        #   Non-break                    was actually listed as a property
13003        #                                in 3.2, but without any code
13004        #                                points.  Unicode denies that this
13005        #                                was ever an official property
13006        #   Non-spacing                  duplicate UnicodeData.txt: gc=mn
13007        #   Numeric                      duplicates UnicodeData.txt: gc=cc
13008        #   Paired Punctuation           never made into official property;
13009        #                                appears to be gc=ps + gc=pe
13010        #   Paragraph Separator          duplicates UnicodeData.txt: gc=cc
13011        #   Private Use                  duplicates UnicodeData.txt: gc=co
13012        #   Private Use High Surrogate   duplicates Blocks.txt
13013        #   Punctuation                  duplicates UnicodeData.txt: gc=p
13014        #   Space                        different definition than eventual
13015        #                                one.
13016        #   Titlecase                    duplicates UnicodeData.txt: gc=lt
13017        #   Unassigned Code Value        duplicates UnicodeData.txt: gc=cn
13018        #   Zero-width                   never made into official property;
13019        #                                subset of gc=cf
13020        # Most of the properties have the same names in this file as in later
13021        # versions, but a couple do not.
13022        #
13023        # This subroutine filters $_, converting it from the old style into
13024        # the new style.  Here's a sample of the old-style
13025        #
13026        #   *******************************************
13027        #
13028        #   Property dump for: 0x100000A0 (Join Control)
13029        #
13030        #   200C..200D  (2 chars)
13031        #
13032        # In the example, the property is "Join Control".  It is kept in this
13033        # closure between calls to the subroutine.  The numbers beginning with
13034        # 0x were internal to Ken's program that generated this file.
13035
13036        # If this line contains the property name, extract it.
13037        if (/^Property dump for: [^(]*\((.*)\)/) {
13038            $_ = $1;
13039
13040            # Convert white space to underscores.
13041            s/ /_/g;
13042
13043            # Convert the few properties that don't have the same name as
13044            # their modern counterparts
13045            s/Identifier_Part/ID_Continue/
13046            or s/Not_a_Character/NChar/;
13047
13048            # If the name matches an existing property, use it.
13049            if (defined property_ref($_)) {
13050                trace "new property=", $_ if main::DEBUG && $to_trace;
13051                $current_property = $_;
13052            }
13053            else {        # Otherwise discard it
13054                trace "rejected property=", $_ if main::DEBUG && $to_trace;
13055                undef $current_property;
13056            }
13057            $_ = "";    # The property is saved for the next lines of the
13058                        # file, but this defining line is of no further use,
13059                        # so clear it so that the caller won't process it
13060                        # further.
13061        }
13062        elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
13063
13064            # Here, the input line isn't a header defining a property for the
13065            # following section, and either we aren't in such a section, or
13066            # the line doesn't look like one that defines the code points in
13067            # such a section.  Ignore this line.
13068            $_ = "";
13069        }
13070        else {
13071
13072            # Here, we have a line defining the code points for the current
13073            # stashed property.  Anything starting with the first blank is
13074            # extraneous.  Otherwise, it should look like a normal range to
13075            # the caller.  Append the property name so that it looks just like
13076            # a modern PropList entry.
13077
13078            $_ =~ s/\s.*//;
13079            $_ .= "; $current_property";
13080        }
13081        trace $_ if main::DEBUG && $to_trace;
13082        return;
13083    }
13084} # End closure for old style proplist
13085
13086sub filter_old_style_normalization_lines {
13087    # For early releases of Unicode, the lines were like:
13088    #        74..2A76    ; NFKD_NO
13089    # For later releases this became:
13090    #        74..2A76    ; NFKD_QC; N
13091    # Filter $_ to look like those in later releases.
13092    # Similarly for MAYBEs
13093
13094    s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
13095
13096    # Also, the property FC_NFKC was abbreviated to FNC
13097    s/FNC/FC_NFKC/;
13098    return;
13099}
13100
13101sub setup_script_extensions {
13102    # The Script_Extensions property starts out with a clone of the Script
13103    # property.
13104
13105    $scx = property_ref("Script_Extensions");
13106    return unless defined $scx;
13107
13108    $scx->_set_format($STRING_WHITE_SPACE_LIST);
13109    $scx->initialize($script);
13110    $scx->set_default_map($script->default_map);
13111    $scx->set_pre_declared_maps(0);     # PropValueAliases doesn't list these
13112    $scx->add_comment(join_lines( <<END
13113The values for code points that appear in one script are just the same as for
13114the 'Script' property.  Likewise the values for those that appear in many
13115scripts are either 'Common' or 'Inherited', same as with 'Script'.  But the
13116values of code points that appear in a few scripts are a space separated list
13117of those scripts.
13118END
13119    ));
13120
13121    # Initialize scx's tables and the aliases for them to be the same as sc's
13122    foreach my $table ($script->tables) {
13123        my $scx_table = $scx->add_match_table($table->name,
13124                                Full_Name => $table->full_name);
13125        foreach my $alias ($table->aliases) {
13126            $scx_table->add_alias($alias->name);
13127        }
13128    }
13129}
13130
13131sub  filter_script_extensions_line {
13132    # The Scripts file comes with the full name for the scripts; the
13133    # ScriptExtensions, with the short name.  The final mapping file is a
13134    # combination of these, and without adjustment, would have inconsistent
13135    # entries.  This filters the latter file to convert to full names.
13136    # Entries look like this:
13137    # 064B..0655    ; Arab Syrc # Mn  [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
13138
13139    my @fields = split /\s*;\s*/;
13140
13141    # This script was erroneously omitted in this Unicode version.
13142    $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/;
13143
13144    my @full_names;
13145    foreach my $short_name (split " ", $fields[1]) {
13146        push @full_names, $script->table($short_name)->full_name;
13147    }
13148    $fields[1] = join " ", @full_names;
13149    $_ = join "; ", @fields;
13150
13151    return;
13152}
13153
13154sub setup_emojidata {
13155    my $prop_ref = Property->new('ExtPict',
13156                                 Full_Name => 'Extended_Pictographic',
13157    );
13158    $prop_ref->set_fate($PLACEHOLDER,
13159                        "Not part of the Unicode Character Database");
13160}
13161
13162sub filter_emojidata_line {
13163    # We only are interested in this single property from this non-UCD data
13164    # file, and we turn it into a Perl property, so that it isn't accessible
13165    # to the users
13166
13167    $_ = "" unless /\bExtended_Pictographic\b/;
13168
13169    return;
13170}
13171
13172sub setup_IdStatus {
13173    my $ids = Property->new('Identifier_Status',
13174                            Match_SubDir => 'IdStatus',
13175                            Default_Map => 'Restricted',
13176                           );
13177    $ids->add_match_table('Allowed');
13178}
13179
13180sub setup_IdType {
13181    $idt = Property->new('Identifier_Type',
13182                            Match_SubDir => 'IdType',
13183                            Default_Map => 'Not_Character',
13184                            Format => $STRING_WHITE_SPACE_LIST,
13185                           );
13186}
13187
13188sub  filter_IdType_line {
13189
13190    # Some code points have more than one type, separated by spaces on the
13191    # input.  For now, we just add everything as a property value.  Later when
13192    # we look for properties with format $STRING_WHITE_SPACE_LIST, we resolve
13193    # things
13194
13195    my @fields = split /\s*;\s*/;
13196    my $types = $fields[1];
13197    $idt->add_match_table($types) unless defined $idt->table($types);
13198
13199    return;
13200}
13201
13202sub generate_hst($file) {
13203
13204    # Populates the Hangul Syllable Type property from first principles
13205
13206    # These few ranges are hard-coded in.
13207    $file->insert_lines(split /\n/, <<'END'
132081100..1159    ; L
13209115F          ; L
132101160..11A2    ; V
1321111A8..11F9    ; T
13212END
13213);
13214
13215    # The Hangul syllables in version 1 are at different code points than
13216    # those that came along starting in version 2, and have different names;
13217    # they comprise about 60% of the code points of the later version.
13218    # From my (khw) research on them (see <558493EB.4000807@att.net>), the
13219    # initial set is a subset of the later version, with different English
13220    # transliterations.  I did not see an easy mapping between them.  The
13221    # later set includes essentially all possibilities, even ones that aren't
13222    # in modern use (if they ever were), and over 96% of the new ones are type
13223    # LVT.  Mathematically, the early set must also contain a preponderance of
13224    # LVT values.  In lieu of doing nothing, we just set them all to LVT, and
13225    # expect that this will be right most of the time, which is better than
13226    # not being right at all.
13227    if ($v_version lt v2.0.0) {
13228        my $property = property_ref($file->property);
13229        $file->insert_lines(sprintf("%04X..%04X; LVT\n",
13230                                    $FIRST_REMOVED_HANGUL_SYLLABLE,
13231                                    $FINAL_REMOVED_HANGUL_SYLLABLE));
13232        push @tables_that_may_be_empty, $property->table('LV')->complete_name;
13233        return;
13234    }
13235
13236    # The algorithmically derived syllables are almost all LVT ones, so
13237    # initialize the whole range with that.
13238    $file->insert_lines(sprintf "%04X..%04X; LVT\n",
13239                        $SBase, $SBase + $SCount -1);
13240
13241    # Those ones that aren't LVT are LV, and they occur at intervals of
13242    # $TCount code points, starting with the first code point, at $SBase.
13243    for (my $i = $SBase; $i < $SBase + $SCount; $i += $TCount) {
13244        $file->insert_lines(sprintf "%04X..%04X; LV\n", $i, $i);
13245    }
13246
13247    return;
13248}
13249
13250sub generate_GCB($file) {
13251
13252    # Populates the Grapheme Cluster Break property from first principles
13253
13254    # All these definitions are from
13255    # http://www.unicode.org/reports/tr29/tr29-3.html with confirmation
13256    # from http://www.unicode.org/reports/tr29/tr29-4.html
13257
13258    foreach my $range ($gc->ranges) {
13259
13260        # Extend includes gc=Me and gc=Mn, while Control includes gc=Cc
13261        # and gc=Cf
13262        if ($range->value =~ / ^ M [en] $ /x) {
13263            $file->insert_lines(sprintf "%04X..%04X; Extend",
13264                                $range->start,  $range->end);
13265        }
13266        elsif ($range->value =~ / ^ C [cf] $ /x) {
13267            $file->insert_lines(sprintf "%04X..%04X; Control",
13268                                $range->start,  $range->end);
13269        }
13270    }
13271    $file->insert_lines("2028; Control"); # Line Separator
13272    $file->insert_lines("2029; Control"); # Paragraph Separator
13273
13274    $file->insert_lines("000D; CR");
13275    $file->insert_lines("000A; LF");
13276
13277    # Also from http://www.unicode.org/reports/tr29/tr29-3.html.
13278    foreach my $code_point ( qw{
13279                                09BE 09D7 0B3E 0B57 0BBE 0BD7 0CC2 0CD5 0CD6
13280                                0D3E 0D57 0DCF 0DDF FF9E FF9F 1D165 1D16E 1D16F
13281                                }
13282    ) {
13283        my $category = $gc->value_of(hex $code_point);
13284        next if ! defined $category || $category eq 'Cn'; # But not if
13285                                                          # unassigned in this
13286                                                          # release
13287        $file->insert_lines("$code_point; Extend");
13288    }
13289
13290    my $hst = property_ref('Hangul_Syllable_Type');
13291    if ($hst->count > 0) {
13292        foreach my $range ($hst->ranges) {
13293            $file->insert_lines(sprintf "%04X..%04X; %s",
13294                                    $range->start, $range->end, $range->value);
13295        }
13296    }
13297    else {
13298        generate_hst($file);
13299    }
13300
13301    main::process_generic_property_file($file);
13302}
13303
13304
13305sub fixup_early_perl_name_alias($file) {
13306
13307    # Different versions of Unicode have varying support for the name synonyms
13308    # below.  Just include everything.  As of 6.1, all these are correct in
13309    # the Unicode-supplied file.
13310
13311    # ALERT did not come along until 6.0, at which point it became preferred
13312    # over BELL.  By inserting it last in early releases, BELL is preferred
13313    # over it; and vice-vers in 6.0
13314    my $type_for_bell = ($v_version lt v6.0.0)
13315               ? 'correction'
13316               : 'alternate';
13317    $file->insert_lines(split /\n/, <<END
133180007;BELL; $type_for_bell
13319000A;LINE FEED (LF);alternate
13320000C;FORM FEED (FF);alternate
13321000D;CARRIAGE RETURN (CR);alternate
133220085;NEXT LINE (NEL);alternate
13323END
13324
13325    );
13326
13327    # One might think that the 'Unicode_1_Name' field, could work for most
13328    # of the above names, but sadly that field varies depending on the
13329    # release.  Version 1.1.5 had no names for any of the controls; Version
13330    # 2.0 introduced names for the C0 controls, and 3.0 introduced C1 names.
13331    # 3.0.1 removed the name INDEX; and 3.2 changed some names:
13332    #   changed to parenthesized versions like "NEXT LINE" to
13333    #       "NEXT LINE (NEL)";
13334    #   changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD
13335    #   changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;;
13336    #   changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR
13337    #
13338    # All these are present in the 6.1 NameAliases.txt
13339
13340    return;
13341}
13342
13343sub filter_later_version_name_alias_line {
13344
13345    # This file has an extra entry per line for the alias type.  This is
13346    # handled by creating a compound entry: "$alias: $type";  First, split
13347    # the line into components.
13348    my ($range, $alias, $type, @remainder)
13349        = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
13350
13351    # This file contains multiple entries for some components, so tell the
13352    # downstream code to allow this in our internal tables; the
13353    # $MULTIPLE_AFTER preserves the input ordering.
13354    $_ = join ";", $range, $CMD_DELIM
13355                           . $REPLACE_CMD
13356                           . '='
13357                           . $MULTIPLE_AFTER
13358                           . $CMD_DELIM
13359                           . "$alias: $type",
13360                   @remainder;
13361    return;
13362}
13363
13364sub filter_early_version_name_alias_line {
13365
13366    # Early versions did not have the trailing alias type field; implicitly it
13367    # was 'correction'.
13368    $_ .= "; correction";
13369
13370    filter_later_version_name_alias_line;
13371    return;
13372}
13373
13374sub filter_all_caps_script_names {
13375
13376    # Some early Unicode releases had the script names in all CAPS.  This
13377    # converts them to just the first letter of each word being capital.
13378
13379    my ($range, $script, @remainder)
13380        = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
13381    my @words = split /[_-]/, $script;
13382    for my $word (@words) {
13383        $word =
13384            ucfirst(lc($word)) if $word ne 'CJK';
13385    }
13386    $script = join "_", @words;
13387    $_ = join ";", $range, $script, @remainder;
13388}
13389
13390sub finish_Unicode() {
13391    # This routine should be called after all the Unicode files have been read
13392    # in.  It:
13393    # 1) Creates properties that are missing from the version of Unicode being
13394    #    compiled, and which, for whatever reason, are needed for the Perl
13395    #    core to function properly.  These are minimally populated as
13396    #    necessary.
13397    # 2) Adds the mappings for code points missing from the files which have
13398    #    defaults specified for them.
13399    # 3) At this point all mappings are known, so it computes the type of
13400    #    each property whose type hasn't been determined yet.
13401    # 4) Calculates all the regular expression match tables based on the
13402    #    mappings.
13403    # 5) Calculates and adds the tables which are defined by Unicode, but
13404    #    which aren't derived by them, and certain derived tables that Perl
13405    #    uses.
13406
13407    # Folding information was introduced later into Unicode data.  To get
13408    # Perl's case ignore (/i) to work at all in releases that don't have
13409    # folding, use the best available alternative, which is lower casing.
13410    my $fold = property_ref('Case_Folding');
13411    if ($fold->is_empty) {
13412        $fold->initialize(property_ref('Lowercase_Mapping'));
13413        $fold->add_note(join_lines(<<END
13414WARNING: This table uses lower case as a substitute for missing fold
13415information
13416END
13417        ));
13418    }
13419
13420    # Multiple-character mapping was introduced later into Unicode data, so it
13421    # is by default the simple version.  If to output the simple versions and
13422    # not present, just use the regular (which in these Unicode versions is
13423    # the simple as well).
13424    foreach my $map (qw {   Uppercase_Mapping
13425                            Lowercase_Mapping
13426                            Titlecase_Mapping
13427                            Case_Folding
13428                        } )
13429    {
13430        my $comment = <<END;
13431
13432Note that although the Perl core uses this file, it has the standard values
13433for code points from U+0000 to U+00FF compiled in, so changing this table will
13434not change the core's behavior with respect to these code points.  Use
13435Unicode::Casing to override this table.
13436END
13437        if ($map eq 'Case_Folding') {
13438            $comment .= <<END;
13439(/i regex matching is not overridable except by using a custom regex engine)
13440END
13441        }
13442        property_ref($map)->add_comment(join_lines($comment));
13443        my $simple = property_ref("Simple_$map");
13444        next if ! $simple->is_empty;
13445        if ($simple->to_output_map) {
13446            $simple->initialize(property_ref($map));
13447        }
13448        else {
13449            property_ref($map)->set_proxy_for($simple->name);
13450        }
13451    }
13452
13453    # For each property, fill in any missing mappings, and calculate the re
13454    # match tables.  If a property has more than one missing mapping, the
13455    # default is a reference to a data structure, and may require data from
13456    # other properties to resolve.  The sort is used to cause these to be
13457    # processed last, after all the other properties have been calculated.
13458    # (Fortunately, the missing properties so far don't depend on each other.)
13459    foreach my $property
13460        (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
13461        property_ref('*'))
13462    {
13463        # $perl has been defined, but isn't one of the Unicode properties that
13464        # need to be finished up.
13465        next if $property == $perl;
13466
13467        # Nor do we need to do anything with properties that aren't going to
13468        # be output.
13469        next if $property->fate == $SUPPRESSED;
13470
13471        # Handle the properties that have more than one possible default
13472        if (ref $property->default_map) {
13473            my $default_map = $property->default_map;
13474
13475            # These properties have stored in the default_map:
13476            # One or more of:
13477            #   1)  A default map which applies to all code points in a
13478            #       certain class
13479            #   2)  an expression which will evaluate to the list of code
13480            #       points in that class
13481            # And
13482            #   3) the default map which applies to every other missing code
13483            #      point.
13484            #
13485            # Go through each list.
13486            while (my ($default, $eval) = $default_map->get_next_defaults) {
13487                last unless defined $eval;
13488
13489                # Get the class list, and intersect it with all the so-far
13490                # unspecified code points yielding all the code points
13491                # in the class that haven't been specified.
13492                my $list = eval $eval;
13493                if ($@) {
13494                    Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
13495                    last;
13496                }
13497
13498                # Narrow down the list to just those code points we don't have
13499                # maps for yet.
13500                $list = $list & $property->inverse_list;
13501
13502                # Add mappings to the property for each code point in the list
13503                foreach my $range ($list->ranges) {
13504                    $property->add_map($range->start, $range->end, $default,
13505                    Replace => $NO);
13506                }
13507            }
13508
13509            # All remaining code points have the other mapping.  Set that up
13510            # so the normal single-default mapping code will work on them
13511            $property->set_default_map($default_map->other_default);
13512
13513            # And fall through to do that
13514        }
13515
13516        # We should have enough data now to compute the type of the property.
13517        my $property_name = $property->name;
13518        $property->compute_type;
13519        my $property_type = $property->type;
13520
13521        next if ! $property->to_create_match_tables;
13522
13523        # Here want to create match tables for this property
13524
13525        # The Unicode db always (so far, and they claim into the future) have
13526        # the default for missing entries in binary properties be 'N' (unless
13527        # there is a '@missing' line that specifies otherwise)
13528        if (! defined $property->default_map) {
13529            if ($property_type == $BINARY) {
13530                $property->set_default_map('N');
13531            }
13532            elsif ($property_type == $ENUM) {
13533                Carp::my_carp("Property '$property_name doesn't have a default mapping.  Using a fake one");
13534                $property->set_default_map('XXX This makes sure there is a default map');
13535            }
13536        }
13537
13538        # Add any remaining code points to the mapping, using the default for
13539        # missing code points.
13540        my $default_table;
13541        my $default_map = $property->default_map;
13542        if ($property_type == $FORCED_BINARY) {
13543
13544            # A forced binary property creates a 'Y' table that matches all
13545            # non-default values.  The actual string values are also written out
13546            # as a map table.  (The default value will almost certainly be the
13547            # empty string, so the pod glosses over the distinction, and just
13548            # talks about empty vs non-empty.)
13549            my $yes = $property->table("Y");
13550            foreach my $range ($property->ranges) {
13551                next if $range->value eq $default_map;
13552                $yes->add_range($range->start, $range->end);
13553            }
13554            $property->table("N")->set_complement($yes);
13555        }
13556        else {
13557            if (defined $default_map) {
13558
13559                # Make sure there is a match table for the default
13560                if (! defined ($default_table = $property->table($default_map)))
13561                {
13562                    $default_table = $property->add_match_table($default_map);
13563                }
13564
13565                # And, if the property is binary, the default table will just
13566                # be the complement of the other table.
13567                if ($property_type == $BINARY) {
13568                    my $non_default_table;
13569
13570                    # Find the non-default table.
13571                    for my $table ($property->tables) {
13572                        if ($table == $default_table) {
13573                            if ($v_version le v5.0.0) {
13574                                $table->add_alias($_) for qw(N No F False);
13575                            }
13576                            next;
13577                        } elsif ($v_version le v5.0.0) {
13578                            $table->add_alias($_) for qw(Y Yes T True);
13579                        }
13580                        $non_default_table = $table;
13581                    }
13582                    $default_table->set_complement($non_default_table);
13583                }
13584                else {
13585
13586                    # This fills in any missing values with the default.  It's
13587                    # not necessary to do this with binary properties, as the
13588                    # default is defined completely in terms of the Y table.
13589                    $property->add_map(0, $MAX_WORKING_CODEPOINT,
13590                                    $default_map, Replace => $NO);
13591                }
13592            }
13593
13594            # Have all we need to populate the match tables.
13595            my $maps_should_be_defined = $property->pre_declared_maps;
13596            foreach my $range ($property->ranges) {
13597                my $map = $range->value;
13598                my $table = $property->table($map);
13599                if (! defined $table) {
13600
13601                    # Integral and rational property values are not
13602                    # necessarily defined in PropValueAliases, but whether all
13603                    # the other ones should be depends on the property.
13604                    if ($maps_should_be_defined
13605                        && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
13606                    {
13607                        Carp::my_carp("Table '$property_name=$map' should "
13608                                    . "have been defined.  Defining it now.")
13609                    }
13610                    $table = $property->add_match_table($map);
13611                }
13612
13613                next if $table->complement != 0; # Don't need to populate these
13614                $table->add_range($range->start, $range->end);
13615            }
13616        }
13617
13618        # For Perl 5.6 compatibility, all properties matchable in regexes can
13619        # have an optional 'Is_' prefix.  This is now done in Unicode::UCD.
13620        # But warn if this creates a conflict with a (new) Unicode property
13621        # name, although it appears that Unicode has made a decision never to
13622        # begin a property name with 'Is_', so this shouldn't happen.
13623        foreach my $alias ($property->aliases) {
13624            my $Is_name = 'Is_' . $alias->name;
13625            if (defined (my $pre_existing = property_ref($Is_name))) {
13626                Carp::my_carp(<<END
13627There is already an alias named $Is_name (from " . $pre_existing . "), so
13628creating one for $property won't work.  This is bad news.  If it is not too
13629late, get Unicode to back off.  Otherwise go back to the old scheme (findable
13630from the git blame log for this area of the code that suppressed individual
13631aliases that conflict with the new Unicode names.  Proceeding anyway.
13632END
13633                );
13634            }
13635        } # End of loop through aliases for this property
13636
13637
13638        # Properties that have sets of values for some characters are now
13639        # converted.  For example, the Script_Extensions property started out
13640        # as a clone of the Script property.  But processing its data file
13641        # caused some elements to be replaced with different data.  (These
13642        # elements were for the Common and Inherited properties.)  This data
13643        # is a qw() list of all the scripts that the code points in the given
13644        # range are in.  An example line is:
13645        #
13646        # 060C          ; Arab Syrc Thaa # Po       ARABIC COMMA
13647        #
13648        # Code executed earlier has created a new match table named "Arab Syrc
13649        # Thaa" which contains 060C.  (The cloned table started out with this
13650        # code point mapping to "Common".)  Now we add 060C to each of the
13651        # Arab, Syrc, and Thaa match tables.  Then we delete the now spurious
13652        # "Arab Syrc Thaa" match table.  This is repeated for all these tables
13653        # and ranges.  The map data is retained in the map table for
13654        # reference, but the spurious match tables are deleted.
13655        my $format = $property->format;
13656        if (defined $format && $format eq $STRING_WHITE_SPACE_LIST) {
13657            foreach my $table ($property->tables) {
13658
13659                # Space separates the entries which should go in multiple
13660                # tables
13661                next unless $table->name =~ /\s/;
13662
13663                # The list of the entries, hence the names of the tables that
13664                # everything in this combo table should be added to.
13665                my @list = split /\s+/, $table->name;
13666
13667                # Add the entries from the combo table to each individual
13668                # table
13669                foreach my $individual (@list) {
13670                    my $existing_table = $property->table($individual);
13671
13672                    # This should only be necessary if this particular entry
13673                    # occurs only in combo with others.
13674                    $existing_table = $property->add_match_table($individual)
13675                                                unless defined $existing_table;
13676                    $existing_table += $table;
13677                }
13678                $property->delete_match_table($table);
13679            }
13680        }
13681    } # End of loop through all Unicode properties.
13682
13683    # Fill in the mappings that Unicode doesn't completely furnish.  First the
13684    # single letter major general categories.  If Unicode were to start
13685    # delivering the values, this would be redundant, but better that than to
13686    # try to figure out if should skip and not get it right.  Ths could happen
13687    # if a new major category were to be introduced, and the hard-coded test
13688    # wouldn't know about it.
13689    # This routine depends on the standard names for the general categories
13690    # being what it thinks they are, like 'Cn'.  The major categories are the
13691    # union of all the general category tables which have the same first
13692    # letters. eg. L = Lu + Lt + Ll + Lo + Lm
13693    foreach my $minor_table ($gc->tables) {
13694        my $minor_name = $minor_table->name;
13695        next if length $minor_name == 1;
13696        if (length $minor_name != 2) {
13697            Carp::my_carp_bug("Unexpected general category '$minor_name'.  Skipped.");
13698            next;
13699        }
13700
13701        my $major_name = uc(substr($minor_name, 0, 1));
13702        my $major_table = $gc->table($major_name);
13703        $major_table += $minor_table;
13704    }
13705
13706    # LC is Ll, Lu, and Lt.  (used to be L& or L_, but PropValueAliases.txt
13707    # defines it as LC)
13708    my $LC = $gc->table('LC');
13709    $LC->add_alias('L_', Status => $DISCOURAGED);   # For backwards...
13710    $LC->add_alias('L&', Status => $DISCOURAGED);   # compatibility.
13711
13712
13713    if ($LC->is_empty) { # Assume if not empty that Unicode has started to
13714                         # deliver the correct values in it
13715        $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
13716
13717        # Lt not in release 1.
13718        if (defined $gc->table('Lt')) {
13719            $LC += $gc->table('Lt');
13720            $gc->table('Lt')->set_caseless_equivalent($LC);
13721        }
13722    }
13723    $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
13724
13725    $gc->table('Ll')->set_caseless_equivalent($LC);
13726    $gc->table('Lu')->set_caseless_equivalent($LC);
13727
13728    # Make sure this assumption in perl core code is valid in this Unicode
13729    # release, with known exceptions
13730    foreach my $range (property_ref('Numeric-Type')->table('Decimal')->ranges) {
13731        next if $range->end - $range->start == 9;
13732        next if $range->start == 0x1D7CE;   # This whole range was added in 3.1
13733        next if $range->end == 0x19DA && $v_version eq v5.2.0;
13734        next if $range->end - $range->start < 9 && $v_version le 4.0.0;
13735        Carp::my_carp("Range $range unexpectedly doesn't contain 10"
13736                    . " decimal digits.  Code in regcomp.c assumes it does,"
13737                    . " and will have to be fixed.  Proceeding anyway.");
13738    }
13739
13740    # Mark the scx table as the parent of the corresponding sc table for those
13741    # which are identical.  This causes the pod for the script table to refer
13742    # to the corresponding scx one.  This is done after everything, so as to
13743    # wait until the tables are stabilized before checking for equivalency.
13744    if (defined $scx) {
13745        if (defined $pod_directory) {
13746            foreach my $table ($scx->tables) {
13747                my $plain_sc_equiv = $script->table($table->name);
13748                if ($table->matches_identically_to($plain_sc_equiv)) {
13749                    $plain_sc_equiv->set_equivalent_to($table, Related => 1);
13750                }
13751            }
13752        }
13753    }
13754
13755    return;
13756}
13757
13758sub pre_3_dot_1_Nl () {
13759
13760    # Return a range list for gc=nl for Unicode versions prior to 3.1, which
13761    # is when Unicode's became fully usable.  These code points were
13762    # determined by inspection and experimentation.  gc=nl is important for
13763    # certain Perl-extension properties that should be available in all
13764    # releases.
13765
13766    my $Nl = Range_List->new();
13767    if (defined (my $official = $gc->table('Nl'))) {
13768        $Nl += $official;
13769    }
13770    else {
13771        $Nl->add_range(0x2160, 0x2182);
13772        $Nl->add_range(0x3007, 0x3007);
13773        $Nl->add_range(0x3021, 0x3029);
13774    }
13775    $Nl->add_range(0xFE20, 0xFE23);
13776    $Nl->add_range(0x16EE, 0x16F0) if $v_version ge v3.0.0; # 3.0 was when
13777                                                            # these were added
13778    return $Nl;
13779}
13780
13781sub calculate_Assigned() {  # Set $Assigned to the gc != Cn code points; may be
13782                            # called before the Cn's are completely filled.
13783                            # Works on Unicodes earlier than ones that
13784                            # explicitly specify Cn.
13785    return if defined $Assigned;
13786
13787    if (! defined $gc || $gc->is_empty()) {
13788        Carp::my_carp_bug("calculate_Assigned() called before $gc is populated");
13789    }
13790
13791    $Assigned = $perl->add_match_table('Assigned',
13792                                Description  => "All assigned code points",
13793                                );
13794    while (defined (my $range = $gc->each_range())) {
13795        my $standard_value = standardize($range->value);
13796        next if $standard_value eq 'cn' || $standard_value eq 'unassigned';
13797        $Assigned->add_range($range->start, $range->end);
13798    }
13799}
13800
13801sub calculate_DI() {    # Set $DI to a Range_List equivalent to the
13802                        # Default_Ignorable_Code_Point property.  Works on
13803                        # Unicodes earlier than ones that explicitly specify
13804                        # DI.
13805    return if defined $DI;
13806
13807    if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) {
13808        $DI = $di->table('Y');
13809    }
13810    else {
13811        $DI = Range_List->new(Initialize => [ 0x180B .. 0x180D,
13812                                              0x2060 .. 0x206F,
13813                                              0xFE00 .. 0xFE0F,
13814                                              0xFFF0 .. 0xFFFB,
13815                                            ]);
13816        if ($v_version ge v2.0) {
13817            $DI += $gc->table('Cf')
13818                +  $gc->table('Cs');
13819
13820            # These are above the Unicode version 1 max
13821            $DI->add_range(0xE0000, 0xE0FFF);
13822        }
13823        $DI += $gc->table('Cc')
13824             - ord("\t")
13825             - utf8::unicode_to_native(0x0A)  # LINE FEED
13826             - utf8::unicode_to_native(0x0B)  # VERTICAL TAB
13827             - ord("\f")
13828             - utf8::unicode_to_native(0x0D)  # CARRIAGE RETURN
13829             - utf8::unicode_to_native(0x85); # NEL
13830    }
13831}
13832
13833sub calculate_NChar() {  # Create a Perl extension match table which is the
13834                         # same as the Noncharacter_Code_Point property, and
13835                         # set $NChar to point to it.  Works on Unicodes
13836                         # earlier than ones that explicitly specify NChar
13837    return if defined $NChar;
13838
13839    $NChar = $perl->add_match_table('_Perl_Nchar',
13840                                    Perl_Extension => 1,
13841                                    Fate => $INTERNAL_ONLY);
13842    if (defined (my $off_nchar = property_ref('NChar'))) {
13843        $NChar->initialize($off_nchar->table('Y'));
13844    }
13845    else {
13846        $NChar->initialize([ 0xFFFE .. 0xFFFF ]);
13847        if ($v_version ge v2.0) {   # First release with these nchars
13848            for (my $i = 0x1FFFE; $i <= 0x10FFFE; $i += 0x10000) {
13849                $NChar += [ $i .. $i+1 ];
13850            }
13851        }
13852    }
13853}
13854
13855sub handle_compare_versions () {
13856    # This fixes things up for the $compare_versions capability, where we
13857    # compare Unicode version X with version Y (with Y > X), and we are
13858    # running it on the Unicode Data for version Y.
13859    #
13860    # It works by calculating the code points whose meaning has been specified
13861    # after release X, by using the Age property.  The complement of this set
13862    # is the set of code points whose meaning is unchanged between the
13863    # releases.  This is the set the program restricts itself to.  It includes
13864    # everything whose meaning has been specified by the time version X came
13865    # along, plus those still unassigned by the time of version Y.  (We will
13866    # continue to use the word 'assigned' to mean 'meaning has been
13867    # specified', as it's shorter and is accurate in all cases except the
13868    # Noncharacter code points.)
13869    #
13870    # This function is run after all the properties specified by Unicode have
13871    # been calculated for release Y.  This makes sure we get all the nuances
13872    # of Y's rules.  (It is done before the Perl extensions are calculated, as
13873    # those are based entirely on the Unicode ones.)  But doing it after the
13874    # Unicode table calculations means we have to fix up the Unicode tables.
13875    # We do this by subtracting the code points that have been assigned since
13876    # X (which is actually done by ANDing each table of assigned code points
13877    # with the set of unchanged code points).  Most Unicode properties are of
13878    # the form such that all unassigned code points have a default, grab-bag,
13879    # property value which is changed when the code point gets assigned.  For
13880    # these, we just remove the changed code points from the table for the
13881    # latter property value, and add them back in to the grab-bag one.  A few
13882    # other properties are not entirely of this form and have values for some
13883    # or all unassigned code points that are not the grab-bag one.  These have
13884    # to be handled specially, and are hard-coded in to this routine based on
13885    # manual inspection of the Unicode character database.  A list of the
13886    # outlier code points is made for each of these properties, and those
13887    # outliers are excluded from adding and removing from tables.
13888    #
13889    # Note that there are glitches when comparing against Unicode 1.1, as some
13890    # Hangul syllables in it were later ripped out and eventually replaced
13891    # with other things.
13892
13893    print "Fixing up for version comparison\n" if $verbosity >= $PROGRESS;
13894
13895    my $after_first_version = "All matching code points were added after "
13896                            . "Unicode $string_compare_versions";
13897
13898    # Calculate the delta as those code points that have been newly assigned
13899    # since the first compare version.
13900    my $delta = Range_List->new();
13901    foreach my $table ($age->tables) {
13902        use version;
13903        next if $table == $age->table('Unassigned');
13904        next if version->parse($table->name)
13905             le version->parse($string_compare_versions);
13906        $delta += $table;
13907    }
13908    if ($delta->is_empty) {
13909        die ("No changes; perhaps you need a 'DAge.txt' file?");
13910    }
13911
13912    my $unchanged = ~ $delta;
13913
13914    calculate_Assigned() if ! defined $Assigned;
13915    $Assigned &= $unchanged;
13916
13917    # $Assigned now contains the code points that were assigned as of Unicode
13918    # version X.
13919
13920    # A block is all or nothing.  If nothing is assigned in it, it all goes
13921    # back to the No_Block pool; but if even one code point is assigned, the
13922    # block is retained.
13923    my $no_block = $block->table('No_Block');
13924    foreach my $this_block ($block->tables) {
13925        next if     $this_block == $no_block
13926                ||  ! ($this_block & $Assigned)->is_empty;
13927        $this_block->set_fate($SUPPRESSED, $after_first_version);
13928        foreach my $range ($this_block->ranges) {
13929            $block->replace_map($range->start, $range->end, 'No_Block')
13930        }
13931        $no_block += $this_block;
13932    }
13933
13934    my @special_delta_properties;   # List of properties that have to be
13935                                    # handled specially.
13936    my %restricted_delta;           # Keys are the entries in
13937                                    # @special_delta_properties;  values
13938                                    # are the range list of the code points
13939                                    # that behave normally when they get
13940                                    # assigned.
13941
13942    # In the next three properties, the Default Ignorable code points are
13943    # outliers.
13944    calculate_DI();
13945    $DI &= $unchanged;
13946
13947    push @special_delta_properties, property_ref('_Perl_GCB');
13948    $restricted_delta{$special_delta_properties[-1]} = ~ $DI;
13949
13950    if (defined (my $cwnfkcc = property_ref('Changes_When_NFKC_Casefolded')))
13951    {
13952        push @special_delta_properties, $cwnfkcc;
13953        $restricted_delta{$special_delta_properties[-1]} = ~ $DI;
13954    }
13955
13956    calculate_NChar();      # Non-character code points
13957    $NChar &= $unchanged;
13958
13959    # This may have to be updated from time-to-time to get the most accurate
13960    # results.
13961    my $default_BC_non_LtoR = Range_List->new(Initialize =>
13962                        # These came from the comments in v8.0 DBidiClass.txt
13963                                                        [ # AL
13964                                                            0x0600 .. 0x07BF,
13965                                                            0x08A0 .. 0x08FF,
13966                                                            0xFB50 .. 0xFDCF,
13967                                                            0xFDF0 .. 0xFDFF,
13968                                                            0xFE70 .. 0xFEFF,
13969                                                            0x1EE00 .. 0x1EEFF,
13970                                                           # R
13971                                                            0x0590 .. 0x05FF,
13972                                                            0x07C0 .. 0x089F,
13973                                                            0xFB1D .. 0xFB4F,
13974                                                            0x10800 .. 0x10FFF,
13975                                                            0x1E800 .. 0x1EDFF,
13976                                                            0x1EF00 .. 0x1EFFF,
13977                                                           # ET
13978                                                            0x20A0 .. 0x20CF,
13979                                                         ]
13980                                          );
13981    $default_BC_non_LtoR += $DI + $NChar;
13982    push @special_delta_properties, property_ref('BidiClass');
13983    $restricted_delta{$special_delta_properties[-1]} = ~ $default_BC_non_LtoR;
13984
13985    if (defined (my $eaw = property_ref('East_Asian_Width'))) {
13986
13987        my $default_EA_width_W = Range_List->new(Initialize =>
13988                                    # From comments in v8.0 EastAsianWidth.txt
13989                                                [
13990                                                    0x3400 .. 0x4DBF,
13991                                                    0x4E00 .. 0x9FFF,
13992                                                    0xF900 .. 0xFAFF,
13993                                                    0x20000 .. 0x2A6DF,
13994                                                    0x2A700 .. 0x2B73F,
13995                                                    0x2B740 .. 0x2B81F,
13996                                                    0x2B820 .. 0x2CEAF,
13997                                                    0x2F800 .. 0x2FA1F,
13998                                                    0x20000 .. 0x2FFFD,
13999                                                    0x30000 .. 0x3FFFD,
14000                                                ]
14001                                             );
14002        push @special_delta_properties, $eaw;
14003        $restricted_delta{$special_delta_properties[-1]}
14004                                                       = ~ $default_EA_width_W;
14005
14006        # Line break came along in the same release as East_Asian_Width, and
14007        # the non-grab-bag default set is a superset of the EAW one.
14008        if (defined (my $lb = property_ref('Line_Break'))) {
14009            my $default_LB_non_XX = Range_List->new(Initialize =>
14010                                        # From comments in v8.0 LineBreak.txt
14011                                                        [ 0x20A0 .. 0x20CF ]);
14012            $default_LB_non_XX += $default_EA_width_W;
14013            push @special_delta_properties, $lb;
14014            $restricted_delta{$special_delta_properties[-1]}
14015                                                        = ~ $default_LB_non_XX;
14016        }
14017    }
14018
14019    # Go through every property, skipping those we've already worked on, those
14020    # that are immutable, and the perl ones that will be calculated after this
14021    # routine has done its fixup.
14022    foreach my $property (property_ref('*')) {
14023        next if    $property == $perl     # Done later in the program
14024                || $property == $block    # Done just above
14025                || $property == $DI       # Done just above
14026                || $property == $NChar    # Done just above
14027
14028                   # The next two are invariant across Unicode versions
14029                || $property == property_ref('Pattern_Syntax')
14030                || $property == property_ref('Pattern_White_Space');
14031
14032        #  Find the grab-bag value.
14033        my $default_map = $property->default_map;
14034
14035        if (! $property->to_create_match_tables) {
14036
14037            # Here there aren't any match tables.  So far, all such properties
14038            # have a default map, and don't require special handling.  Just
14039            # change each newly assigned code point back to the default map,
14040            # as if they were unassigned.
14041            foreach my $range ($delta->ranges) {
14042                $property->add_map($range->start,
14043                                $range->end,
14044                                $default_map,
14045                                Replace => $UNCONDITIONALLY);
14046            }
14047        }
14048        else {  # Here there are match tables.  Find the one (if any) for the
14049                # grab-bag value that unassigned code points go to.
14050            my $default_table;
14051            if (defined $default_map) {
14052                $default_table = $property->table($default_map);
14053            }
14054
14055            # If some code points don't go back to the grab-bag when they
14056            # are considered unassigned, exclude them from the list that does
14057            # that.
14058            my $this_delta = $delta;
14059            my $this_unchanged = $unchanged;
14060            if (grep { $_ == $property } @special_delta_properties) {
14061                $this_delta = $delta & $restricted_delta{$property};
14062                $this_unchanged = ~ $this_delta;
14063            }
14064
14065            # Fix up each match table for this property.
14066            foreach my $table ($property->tables) {
14067                if (defined $default_table && $table == $default_table) {
14068
14069                    # The code points assigned after release X (the ones we
14070                    # are excluding in this routine) go back on to the default
14071                    # (grab-bag) table.  However, some of these tables don't
14072                    # actually exist, but are specified solely by the other
14073                    # tables.  (In a binary property, we don't need to
14074                    # actually have an 'N' table, as it's just the complement
14075                    # of the 'Y' table.)  Such tables will be locked, so just
14076                    # skip those.
14077                    $table += $this_delta unless $table->locked;
14078                }
14079                else {
14080
14081                    # Here the table is not for the default value.  We need to
14082                    # subtract the code points we are ignoring for this
14083                    # comparison (the deltas) from it.  But if the table
14084                    # started out with nothing, no need to exclude anything,
14085                    # and want to skip it here anyway, so it gets listed
14086                    # properly in the pod.
14087                    next if $table->is_empty;
14088
14089                    # Save the deltas for later, before we do the subtraction
14090                    my $deltas = $table & $this_delta;
14091
14092                    $table &= $this_unchanged;
14093
14094                    # Suppress the table if the subtraction left it with
14095                    # nothing in it
14096                    if ($table->is_empty) {
14097                        if ($property->type == $BINARY) {
14098                            push @tables_that_may_be_empty, $table->complete_name;
14099                        }
14100                        else {
14101                            $table->set_fate($SUPPRESSED, $after_first_version);
14102                        }
14103                    }
14104
14105                    # Now we add the removed code points to the property's
14106                    # map, as they should now map to the grab-bag default
14107                    # property (which they did in the first comparison
14108                    # version).  But we don't have to do this if the map is
14109                    # only for internal use.
14110                    if (defined $default_map && $property->to_output_map) {
14111
14112                        # The gc property has pseudo property values whose names
14113                        # have length 1.  These are the union of all the
14114                        # property values whose name is longer than 1 and
14115                        # whose first letter is all the same.  The replacement
14116                        # is done once for the longer-named tables.
14117                        next if $property == $gc && length $table->name == 1;
14118
14119                        foreach my $range ($deltas->ranges) {
14120                            $property->add_map($range->start,
14121                                            $range->end,
14122                                            $default_map,
14123                                            Replace => $UNCONDITIONALLY);
14124                        }
14125                    }
14126                }
14127            }
14128        }
14129    }
14130
14131    # The above code doesn't work on 'gc=C', as it is a superset of the default
14132    # ('Cn') table.  It's easiest to just special case it here.
14133    my $C = $gc->table('C');
14134    $C += $gc->table('Cn');
14135
14136    return;
14137}
14138
14139sub compile_perl() {
14140    # Create perl-defined tables.  Almost all are part of the pseudo-property
14141    # named 'perl' internally to this program.  Many of these are recommended
14142    # in UTS#18 "Unicode Regular Expressions", and their derivations are based
14143    # on those found there.
14144    # Almost all of these are equivalent to some Unicode property.
14145    # A number of these properties have equivalents restricted to the ASCII
14146    # range, with their names prefaced by 'Posix', to signify that these match
14147    # what the Posix standard says they should match.  A couple are
14148    # effectively this, but the name doesn't have 'Posix' in it because there
14149    # just isn't any Posix equivalent.  'XPosix' are the Posix tables extended
14150    # to the full Unicode range, by our guesses as to what is appropriate.
14151
14152    # 'All' is all code points.  As an error check, instead of just setting it
14153    # to be that, construct it to be the union of all the major categories
14154    $All = $perl->add_match_table('All',
14155      Description
14156        => "All code points, including those above Unicode.  Same as qr/./s",
14157      Matches_All => 1);
14158
14159    foreach my $major_table ($gc->tables) {
14160
14161        # Major categories are the ones with single letter names.
14162        next if length($major_table->name) != 1;
14163
14164        $All += $major_table;
14165    }
14166
14167    if ($All->max != $MAX_WORKING_CODEPOINT) {
14168        Carp::my_carp_bug("Generated highest code point ("
14169           . sprintf("%X", $All->max)
14170           . ") doesn't match expected value $MAX_WORKING_CODEPOINT_STRING.")
14171    }
14172    if ($All->range_count != 1 || $All->min != 0) {
14173     Carp::my_carp_bug("Generated table 'All' doesn't match all code points.")
14174    }
14175
14176    my $Any = $perl->add_match_table('Any',
14177                                    Description  => "All Unicode code points");
14178    $Any->add_range(0, $MAX_UNICODE_CODEPOINT);
14179    $Any->add_alias('Unicode');
14180
14181    calculate_Assigned();
14182
14183    my $ASCII = $perl->add_match_table('ASCII');
14184    if (defined $block) {   # This is equivalent to the block if have it.
14185        my $Unicode_ASCII = $block->table('Basic_Latin');
14186        if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
14187            $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
14188        }
14189    }
14190
14191    # Very early releases didn't have blocks, so initialize ASCII ourselves if
14192    # necessary
14193    if ($ASCII->is_empty) {
14194        if (! NON_ASCII_PLATFORM) {
14195            $ASCII->add_range(0, 127);
14196        }
14197        else {
14198            for my $i (0 .. 127) {
14199                $ASCII->add_range(utf8::unicode_to_native($i),
14200                                  utf8::unicode_to_native($i));
14201            }
14202        }
14203    }
14204
14205    # Get the best available case definitions.  Early Unicode versions didn't
14206    # have Uppercase and Lowercase defined, so use the general category
14207    # instead for them, modified by hard-coding in the code points each is
14208    # missing.
14209    my $Lower = $perl->add_match_table('XPosixLower');
14210    my $Unicode_Lower = property_ref('Lowercase');
14211    if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
14212        $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
14213
14214    }
14215    else {
14216        $Lower += $gc->table('Lowercase_Letter');
14217
14218        # There are quite a few code points in Lower, that aren't in gc=lc,
14219        # and not all are in all releases.
14220        my $temp = Range_List->new(Initialize => [
14221                                                utf8::unicode_to_native(0xAA),
14222                                                utf8::unicode_to_native(0xBA),
14223                                                0x02B0 .. 0x02B8,
14224                                                0x02C0 .. 0x02C1,
14225                                                0x02E0 .. 0x02E4,
14226                                                0x0345,
14227                                                0x037A,
14228                                                0x1D2C .. 0x1D6A,
14229                                                0x1D78,
14230                                                0x1D9B .. 0x1DBF,
14231                                                0x2071,
14232                                                0x207F,
14233                                                0x2090 .. 0x209C,
14234                                                0x2170 .. 0x217F,
14235                                                0x24D0 .. 0x24E9,
14236                                                0x2C7C .. 0x2C7D,
14237                                                0xA770,
14238                                                0xA7F8 .. 0xA7F9,
14239                                ]);
14240        $Lower += $temp & $Assigned;
14241    }
14242    my $Posix_Lower = $perl->add_match_table("PosixLower",
14243                            Initialize => $Lower & $ASCII,
14244                            );
14245
14246    my $Upper = $perl->add_match_table("XPosixUpper");
14247    my $Unicode_Upper = property_ref('Uppercase');
14248    if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
14249        $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
14250    }
14251    else {
14252
14253        # Unlike Lower, there are only two ranges in Upper that aren't in
14254        # gc=Lu, and all code points were assigned in all releases.
14255        $Upper += $gc->table('Uppercase_Letter');
14256        $Upper->add_range(0x2160, 0x216F);  # Uppercase Roman numerals
14257        $Upper->add_range(0x24B6, 0x24CF);  # Circled Latin upper case letters
14258    }
14259    my $Posix_Upper = $perl->add_match_table("PosixUpper",
14260                            Initialize => $Upper & $ASCII,
14261                            );
14262
14263    # Earliest releases didn't have title case.  Initialize it to empty if not
14264    # otherwise present
14265    my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
14266                                       Description => '(= \p{Gc=Lt})');
14267    my $lt = $gc->table('Lt');
14268
14269    # Earlier versions of mktables had this related to $lt since they have
14270    # identical code points, but their caseless equivalents are not the same,
14271    # one being 'Cased' and the other being 'LC', and so now must be kept as
14272    # separate entities.
14273    if (defined $lt) {
14274        $Title += $lt;
14275    }
14276    else {
14277        push @tables_that_may_be_empty, $Title->complete_name;
14278    }
14279
14280    my $Unicode_Cased = property_ref('Cased');
14281    if (defined $Unicode_Cased) {
14282        my $yes = $Unicode_Cased->table('Y');
14283        my $no = $Unicode_Cased->table('N');
14284        $Title->set_caseless_equivalent($yes);
14285        if (defined $Unicode_Upper) {
14286            $Unicode_Upper->table('Y')->set_caseless_equivalent($yes);
14287            $Unicode_Upper->table('N')->set_caseless_equivalent($no);
14288        }
14289        $Upper->set_caseless_equivalent($yes);
14290        if (defined $Unicode_Lower) {
14291            $Unicode_Lower->table('Y')->set_caseless_equivalent($yes);
14292            $Unicode_Lower->table('N')->set_caseless_equivalent($no);
14293        }
14294        $Lower->set_caseless_equivalent($yes);
14295    }
14296    else {
14297        # If this Unicode version doesn't have Cased, set up the Perl
14298        # extension from first principles.  From Unicode 5.1: Definition D120:
14299        # A character C is defined to be cased if and only if C has the
14300        # Lowercase or Uppercase property or has a General_Category value of
14301        # Titlecase_Letter.
14302        my $cased = $perl->add_match_table('Cased',
14303                        Initialize => $Lower + $Upper + $Title,
14304                        Description => 'Uppercase or Lowercase or Titlecase',
14305                        );
14306        # $notcased is purely for the caseless equivalents below
14307        my $notcased = $perl->add_match_table('_Not_Cased',
14308                                Initialize => ~ $cased,
14309                                Fate => $INTERNAL_ONLY,
14310                                Description => 'All not-cased code points');
14311        $Title->set_caseless_equivalent($cased);
14312        if (defined $Unicode_Upper) {
14313            $Unicode_Upper->table('Y')->set_caseless_equivalent($cased);
14314            $Unicode_Upper->table('N')->set_caseless_equivalent($notcased);
14315        }
14316        $Upper->set_caseless_equivalent($cased);
14317        if (defined $Unicode_Lower) {
14318            $Unicode_Lower->table('Y')->set_caseless_equivalent($cased);
14319            $Unicode_Lower->table('N')->set_caseless_equivalent($notcased);
14320        }
14321        $Lower->set_caseless_equivalent($cased);
14322    }
14323
14324    # The remaining perl defined tables are mostly based on Unicode TR 18,
14325    # "Annex C: Compatibility Properties".  All of these have two versions,
14326    # one whose name generally begins with Posix that is posix-compliant, and
14327    # one that matches Unicode characters beyond the Posix, ASCII range
14328
14329    my $Alpha = $perl->add_match_table('XPosixAlpha');
14330
14331    # Alphabetic was not present in early releases
14332    my $Alphabetic = property_ref('Alphabetic');
14333    if (defined $Alphabetic && ! $Alphabetic->is_empty) {
14334        $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
14335    }
14336    else {
14337
14338        # The Alphabetic property doesn't exist for early releases, so
14339        # generate it.  The actual definition, in 5.2 terms is:
14340        #
14341        # gc=L + gc=Nl + Other_Alphabetic
14342        #
14343        # Other_Alphabetic is also not defined in these early releases, but it
14344        # contains one gc=So range plus most of gc=Mn and gc=Mc, so we add
14345        # those last two as well, then subtract the relatively few of them that
14346        # shouldn't have been added.  (The gc=So range is the circled capital
14347        # Latin characters.  Early releases mistakenly didn't also include the
14348        # lower-case versions of these characters, and so we don't either, to
14349        # maintain consistency with those releases that first had this
14350        # property.
14351        $Alpha->initialize($gc->table('Letter')
14352                           + pre_3_dot_1_Nl()
14353                           + $gc->table('Mn')
14354                           + $gc->table('Mc')
14355                        );
14356        $Alpha->add_range(0x24D0, 0x24E9);  # gc=So
14357        foreach my $range (     [ 0x0300, 0x0344 ],
14358                                [ 0x0346, 0x034E ],
14359                                [ 0x0360, 0x0362 ],
14360                                [ 0x0483, 0x0486 ],
14361                                [ 0x0591, 0x05AF ],
14362                                [ 0x06DF, 0x06E0 ],
14363                                [ 0x06EA, 0x06EC ],
14364                                [ 0x0740, 0x074A ],
14365                                0x093C,
14366                                0x094D,
14367                                [ 0x0951, 0x0954 ],
14368                                0x09BC,
14369                                0x09CD,
14370                                0x0A3C,
14371                                0x0A4D,
14372                                0x0ABC,
14373                                0x0ACD,
14374                                0x0B3C,
14375                                0x0B4D,
14376                                0x0BCD,
14377                                0x0C4D,
14378                                0x0CCD,
14379                                0x0D4D,
14380                                0x0DCA,
14381                                [ 0x0E47, 0x0E4C ],
14382                                0x0E4E,
14383                                [ 0x0EC8, 0x0ECC ],
14384                                [ 0x0F18, 0x0F19 ],
14385                                0x0F35,
14386                                0x0F37,
14387                                0x0F39,
14388                                [ 0x0F3E, 0x0F3F ],
14389                                [ 0x0F82, 0x0F84 ],
14390                                [ 0x0F86, 0x0F87 ],
14391                                0x0FC6,
14392                                0x1037,
14393                                0x1039,
14394                                [ 0x17C9, 0x17D3 ],
14395                                [ 0x20D0, 0x20DC ],
14396                                0x20E1,
14397                                [ 0x302A, 0x302F ],
14398                                [ 0x3099, 0x309A ],
14399                                [ 0xFE20, 0xFE23 ],
14400                                [ 0x1D165, 0x1D169 ],
14401                                [ 0x1D16D, 0x1D172 ],
14402                                [ 0x1D17B, 0x1D182 ],
14403                                [ 0x1D185, 0x1D18B ],
14404                                [ 0x1D1AA, 0x1D1AD ],
14405        ) {
14406            if (ref $range) {
14407                $Alpha->delete_range($range->[0], $range->[1]);
14408            }
14409            else {
14410                $Alpha->delete_range($range, $range);
14411            }
14412        }
14413        $Alpha->add_description('Alphabetic');
14414        $Alpha->add_alias('Alphabetic');
14415    }
14416    my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
14417                            Initialize => $Alpha & $ASCII,
14418                            );
14419    $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
14420    $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
14421
14422    my $Alnum = $perl->add_match_table('Alnum', Full_Name => 'XPosixAlnum',
14423                        Description => 'Alphabetic and (decimal) Numeric',
14424                        Initialize => $Alpha + $gc->table('Decimal_Number'),
14425                        );
14426    $perl->add_match_table("PosixAlnum",
14427                            Initialize => $Alnum & $ASCII,
14428                            );
14429
14430    my $Word = $perl->add_match_table('Word', Full_Name => 'XPosixWord',
14431                                Description => '\w, including beyond ASCII;'
14432                                            . ' = \p{Alnum} + \pM + \p{Pc}'
14433                                            . ' + \p{Join_Control}',
14434                                Initialize => $Alnum + $gc->table('Mark'),
14435                                );
14436    my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
14437    if (defined $Pc) {
14438        $Word += $Pc;
14439    }
14440    else {
14441        $Word += ord('_');  # Make sure this is a $Word
14442    }
14443    my $JC = property_ref('Join_Control');  # Wasn't in release 1
14444    if (defined $JC) {
14445        $Word += $JC->table('Y');
14446    }
14447    else {
14448        $Word += 0x200C + 0x200D;
14449    }
14450
14451    # This is a Perl extension, so the name doesn't begin with Posix.
14452    my $PerlWord = $perl->add_match_table('PosixWord',
14453                    Description => '\w, restricted to ASCII',
14454                    Initialize => $Word & $ASCII,
14455                    );
14456    $PerlWord->add_alias('PerlWord');
14457
14458    my $Blank = $perl->add_match_table('Blank', Full_Name => 'XPosixBlank',
14459                                Description => '\h, Horizontal white space',
14460
14461                                # 200B is Zero Width Space which is for line
14462                                # break control, and was listed as
14463                                # Space_Separator in early releases
14464                                Initialize => $gc->table('Space_Separator')
14465                                            +   ord("\t")
14466                                            -   0x200B, # ZWSP
14467                                );
14468    $Blank->add_alias('HorizSpace');        # Another name for it.
14469    $perl->add_match_table("PosixBlank",
14470                            Initialize => $Blank & $ASCII,
14471                            );
14472
14473    my $VertSpace = $perl->add_match_table('VertSpace',
14474                            Description => '\v',
14475                            Initialize =>
14476                               $gc->table('Line_Separator')
14477                             + $gc->table('Paragraph_Separator')
14478                             + utf8::unicode_to_native(0x0A)  # LINE FEED
14479                             + utf8::unicode_to_native(0x0B)  # VERTICAL TAB
14480                             + ord("\f")
14481                             + utf8::unicode_to_native(0x0D)  # CARRIAGE RETURN
14482                             + utf8::unicode_to_native(0x85)  # NEL
14483                    );
14484    # No Posix equivalent for vertical space
14485
14486    my $Space = $perl->add_match_table('XPosixSpace',
14487                Description => '\s including beyond ASCII and vertical tab',
14488                Initialize => $Blank + $VertSpace,
14489    );
14490    $Space->add_alias('XPerlSpace');    # Pre-existing synonyms
14491    $Space->add_alias('SpacePerl');
14492    $Space->add_alias('Space') if $v_version lt v4.1.0;
14493
14494    my $Posix_space = $perl->add_match_table("PosixSpace",
14495                            Initialize => $Space & $ASCII,
14496                            );
14497    $Posix_space->add_alias('PerlSpace'); # A pre-existing synonym
14498
14499    my $Cntrl = $perl->add_match_table('Cntrl', Full_Name => 'XPosixCntrl',
14500                                        Description => 'Control characters');
14501    $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
14502    $perl->add_match_table("PosixCntrl",
14503                            Description => "ASCII control characters",
14504                            Definition =>  "ACK, BEL, BS, CAN, CR, DC1, DC2,"
14505                                         . " DC3, DC4, DEL, DLE, ENQ, EOM,"
14506                                         . " EOT, ESC, ETB, ETX, FF, FS, GS,"
14507                                         . " HT, LF, NAK, NUL, RS, SI, SO,"
14508                                         . " SOH, STX, SUB, SYN, US, VT",
14509                            Initialize => $Cntrl & $ASCII,
14510                            );
14511
14512    my $perl_surrogate = $perl->add_match_table('_Perl_Surrogate');
14513    my $Cs = $gc->table('Cs');
14514    if (defined $Cs && ! $Cs->is_empty) {
14515        $perl_surrogate += $Cs;
14516    }
14517    else {
14518        push @tables_that_may_be_empty, '_Perl_Surrogate';
14519    }
14520
14521    # $controls is a temporary used to construct Graph.
14522    my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
14523                                                + $gc->table('Control')
14524                                                + $perl_surrogate);
14525
14526    # Graph is  ~space &  ~(Cc|Cs|Cn) = ~(space + $controls)
14527    my $Graph = $perl->add_match_table('Graph', Full_Name => 'XPosixGraph',
14528                        Description => 'Characters that are graphical',
14529                        Initialize => ~ ($Space + $controls),
14530                        );
14531    $perl->add_match_table("PosixGraph",
14532                            Initialize => $Graph & $ASCII,
14533                            );
14534
14535    $print = $perl->add_match_table('Print', Full_Name => 'XPosixPrint',
14536                        Description => 'Characters that are graphical plus space characters (but no controls)',
14537                        Initialize => $Blank + $Graph - $gc->table('Control'),
14538                        );
14539    $perl->add_match_table("PosixPrint",
14540                            Initialize => $print & $ASCII,
14541                            );
14542
14543    my $Punct = $perl->add_match_table('Punct');
14544    $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
14545
14546    # \p{punct} doesn't include the symbols, which posix does
14547    my $XPosixPunct = $perl->add_match_table('XPosixPunct',
14548                    Description => '\p{Punct} + ASCII-range \p{Symbol}',
14549                    Initialize => $gc->table('Punctuation')
14550                                + ($ASCII & $gc->table('Symbol')),
14551                                Perl_Extension => 1
14552        );
14553    $perl->add_match_table('PosixPunct', Perl_Extension => 1,
14554        Initialize => $ASCII & $XPosixPunct,
14555        );
14556
14557    my $Digit = $perl->add_match_table('Digit', Full_Name => 'XPosixDigit',
14558                            Description => '[0-9] + all other decimal digits');
14559    $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
14560    my $PosixDigit = $perl->add_match_table("PosixDigit",
14561                                            Initialize => $Digit & $ASCII,
14562                                            );
14563
14564    # Hex_Digit was not present in first release
14565    my $Xdigit = $perl->add_match_table('XDigit', Full_Name => 'XPosixXDigit');
14566    my $Hex = property_ref('Hex_Digit');
14567    if (defined $Hex && ! $Hex->is_empty) {
14568        $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
14569    }
14570    else {
14571        $Xdigit->initialize([ ord('0') .. ord('9'),
14572                              ord('A') .. ord('F'),
14573                              ord('a') .. ord('f'),
14574                              0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
14575    }
14576
14577    # AHex was not present in early releases
14578    my $PosixXDigit = $perl->add_match_table('PosixXDigit');
14579    my $AHex = property_ref('ASCII_Hex_Digit');
14580    if (defined $AHex && ! $AHex->is_empty) {
14581        $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
14582    }
14583    else {
14584        $PosixXDigit->initialize($Xdigit & $ASCII);
14585        $PosixXDigit->add_alias('AHex');
14586        $PosixXDigit->add_alias('Ascii_Hex_Digit');
14587    }
14588
14589    my $any_folds = $perl->add_match_table("_Perl_Any_Folds",
14590                    Description => "Code points that particpate in some fold",
14591                    );
14592    my $loc_problem_folds = $perl->add_match_table(
14593               "_Perl_Problematic_Locale_Folds",
14594               Description =>
14595                   "Code points that are in some way problematic under locale",
14596    );
14597
14598    # This allows regexec.c to skip some work when appropriate.  Some of the
14599    # entries in _Perl_Problematic_Locale_Folds are multi-character folds,
14600    my $loc_problem_folds_start = $perl->add_match_table(
14601               "_Perl_Problematic_Locale_Foldeds_Start",
14602               Description =>
14603                   "The first character of every sequence in _Perl_Problematic_Locale_Folds",
14604    );
14605
14606    my $cf = property_ref('Case_Folding');
14607
14608    # Every character 0-255 is problematic because what each folds to depends
14609    # on the current locale
14610    $loc_problem_folds->add_range(0, 255);
14611    $loc_problem_folds->add_range(0x130, 0x131);    # These are problematic in
14612                                                    # Turkic locales
14613    $loc_problem_folds_start += $loc_problem_folds;
14614
14615    # Also problematic are anything these fold to outside the range.  Likely
14616    # forever the only thing folded to by these outside the 0-255 range is the
14617    # GREEK SMALL MU (from the MICRO SIGN), but it's easy to make the code
14618    # completely general, which should catch any unexpected changes or errors.
14619    # We look at each code point 0-255, and add its fold (including each part
14620    # of a multi-char fold) to the list.  See commit message
14621    # 31f05a37c4e9c37a7263491f2fc0237d836e1a80 for a more complete description
14622    # of the MU issue.
14623    foreach my $range ($loc_problem_folds->ranges) {
14624        foreach my $code_point ($range->start .. $range->end) {
14625            my $fold_range = $cf->containing_range($code_point);
14626            next unless defined $fold_range;
14627
14628            # Skip if folds to itself
14629            next if $fold_range->value eq $CODE_POINT;
14630
14631            my @hex_folds = split " ", $fold_range->value;
14632            my $start_cp = $hex_folds[0];
14633            next if $start_cp eq $CODE_POINT;
14634            $start_cp = hex $start_cp;
14635            foreach my $i (0 .. @hex_folds - 1) {
14636                my $cp = $hex_folds[$i];
14637                next if $cp eq $CODE_POINT;
14638                $cp = hex $cp;
14639                next unless $cp > 255;    # Already have the < 256 ones
14640
14641                $loc_problem_folds->add_range($cp, $cp);
14642                $loc_problem_folds_start->add_range($start_cp, $start_cp);
14643            }
14644        }
14645    }
14646
14647    my $folds_to_multi_char = $perl->add_match_table(
14648         "_Perl_Folds_To_Multi_Char",
14649         Description =>
14650              "Code points whose fold is a string of more than one character",
14651    );
14652    my $in_multi_fold = $perl->add_match_table(
14653               "_Perl_Is_In_Multi_Char_Fold",
14654               Description =>
14655                   "Code points that are in some multiple character fold",
14656    );
14657    if ($v_version lt v3.0.1) {
14658        push @tables_that_may_be_empty, '_Perl_Folds_To_Multi_Char',
14659                                        '_Perl_Is_In_Multi_Char_Fold',
14660                                        '_Perl_Non_Final_Folds';
14661    }
14662
14663    # Look through all the known folds to populate these tables.
14664    foreach my $range ($cf->ranges) {
14665        next if $range->value eq $CODE_POINT;
14666        my $start = $range->start;
14667        my $end = $range->end;
14668        $any_folds->add_range($start, $end);
14669
14670        my @hex_folds = split " ", $range->value;
14671        if (@hex_folds > 1) {   # Is multi-char fold
14672            $folds_to_multi_char->add_range($start, $end);
14673        }
14674
14675        my $found_locale_problematic = 0;
14676
14677        my $folded_count = @hex_folds;
14678        if ($folded_count > 3) {
14679            die Carp::my_carp("Maximum number of characters in a fold should be 3: Instead, it's  $folded_count for U+" . sprintf "%04X", $range->start);
14680        }
14681
14682        # Look at each of the folded-to characters...
14683        foreach my $i (1 .. $folded_count) {
14684            my $cp = hex $hex_folds[$i-1];
14685            $any_folds->add_range($cp, $cp);
14686
14687            # The fold is problematic if any of the folded-to characters is
14688            # already considered problematic.
14689            if ($loc_problem_folds->contains($cp)) {
14690                $loc_problem_folds->add_range($start, $end);
14691                $found_locale_problematic = 1;
14692            }
14693
14694            if ($folded_count > 1) {
14695                $in_multi_fold->add_range($cp, $cp);
14696            }
14697        }
14698
14699        # If this is a problematic fold, add to the start chars the
14700        # folding-from characters and first folded-to character.
14701        if ($found_locale_problematic) {
14702            $loc_problem_folds_start->add_range($start, $end);
14703            my $cp = hex $hex_folds[0];
14704            $loc_problem_folds_start->add_range($cp, $cp);
14705        }
14706    }
14707
14708    my $dt = property_ref('Decomposition_Type');
14709    $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
14710        Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
14711        Perl_Extension => 1,
14712        Note => 'Union of all non-canonical decompositions',
14713        );
14714
14715    # For backward compatibility, Perl has its own definition for IDStart.
14716    # It is regular XID_Start plus the underscore, but all characters must be
14717    # Word characters as well
14718    my $XID_Start = property_ref('XID_Start');
14719    my $perl_xids = $perl->add_match_table('_Perl_IDStart',
14720                                            Perl_Extension => 1,
14721                                            Fate => $INTERNAL_ONLY,
14722                                            Initialize => ord('_')
14723                                            );
14724    if (defined $XID_Start
14725        || defined ($XID_Start = property_ref('ID_Start')))
14726    {
14727        $perl_xids += $XID_Start->table('Y');
14728    }
14729    else {
14730        # For Unicode versions that don't have the property, construct our own
14731        # from first principles.  The actual definition is:
14732        #     Letters
14733        #   + letter numbers (Nl)
14734        #   - Pattern_Syntax
14735        #   - Pattern_White_Space
14736        #   + stability extensions
14737        #   - NKFC modifications
14738        #
14739        # What we do in the code below is to include the identical code points
14740        # that are in the first release that had Unicode's version of this
14741        # property, essentially extrapolating backwards.  There were no
14742        # stability extensions until v4.1, so none are included; likewise in
14743        # no Unicode version so far do subtracting PatSyn and PatWS make any
14744        # difference, so those also are ignored.
14745        $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl();
14746
14747        # We do subtract the NFKC modifications that are in the first version
14748        # that had this property.  We don't bother to test if they are in the
14749        # version in question, because if they aren't, the operation is a
14750        # no-op.  The NKFC modifications are discussed in
14751        # http://www.unicode.org/reports/tr31/#NFKC_Modifications
14752        foreach my $range ( 0x037A,
14753                            0x0E33,
14754                            0x0EB3,
14755                            [ 0xFC5E, 0xFC63 ],
14756                            [ 0xFDFA, 0xFE70 ],
14757                            [ 0xFE72, 0xFE76 ],
14758                            0xFE78,
14759                            0xFE7A,
14760                            0xFE7C,
14761                            0xFE7E,
14762                            [ 0xFF9E, 0xFF9F ],
14763        ) {
14764            if (ref $range) {
14765                $perl_xids->delete_range($range->[0], $range->[1]);
14766            }
14767            else {
14768                $perl_xids->delete_range($range, $range);
14769            }
14770        }
14771    }
14772
14773    $perl_xids &= $Word;
14774
14775    my $perl_xidc = $perl->add_match_table('_Perl_IDCont',
14776                                        Perl_Extension => 1,
14777                                        Fate => $INTERNAL_ONLY);
14778    my $XIDC = property_ref('XID_Continue');
14779    if (defined $XIDC
14780        || defined ($XIDC = property_ref('ID_Continue')))
14781    {
14782        $perl_xidc += $XIDC->table('Y');
14783    }
14784    else {
14785        # Similarly, we construct our own XIDC if necessary for early Unicode
14786        # versions.  The definition is:
14787        #     everything in XIDS
14788        #   + Gc=Mn
14789        #   + Gc=Mc
14790        #   + Gc=Nd
14791        #   + Gc=Pc
14792        #   - Pattern_Syntax
14793        #   - Pattern_White_Space
14794        #   + stability extensions
14795        #   - NFKC modifications
14796        #
14797        # The same thing applies to this as with XIDS for the PatSyn, PatWS,
14798        # and stability extensions.  There is a somewhat different set of NFKC
14799        # mods to remove (and add in this case).  The ones below make this
14800        # have identical code points as in the first release that defined it.
14801        $perl_xidc += $perl_xids
14802                    + $gc->table('L')
14803                    + $gc->table('Mn')
14804                    + $gc->table('Mc')
14805                    + $gc->table('Nd')
14806                    + utf8::unicode_to_native(0xB7)
14807                    ;
14808        if (defined (my $pc = $gc->table('Pc'))) {
14809            $perl_xidc += $pc;
14810        }
14811        else {  # 1.1.5 didn't have Pc, but these should have been in it
14812            $perl_xidc += 0xFF3F;
14813            $perl_xidc->add_range(0x203F, 0x2040);
14814            $perl_xidc->add_range(0xFE33, 0xFE34);
14815            $perl_xidc->add_range(0xFE4D, 0xFE4F);
14816        }
14817
14818        # Subtract the NFKC mods
14819        foreach my $range ( 0x037A,
14820                            [ 0xFC5E, 0xFC63 ],
14821                            [ 0xFDFA, 0xFE1F ],
14822                            0xFE70,
14823                            [ 0xFE72, 0xFE76 ],
14824                            0xFE78,
14825                            0xFE7A,
14826                            0xFE7C,
14827                            0xFE7E,
14828        ) {
14829            if (ref $range) {
14830                $perl_xidc->delete_range($range->[0], $range->[1]);
14831            }
14832            else {
14833                $perl_xidc->delete_range($range, $range);
14834            }
14835        }
14836    }
14837
14838    $perl_xidc &= $Word;
14839
14840    my $charname_begin = $perl->add_match_table('_Perl_Charname_Begin',
14841                    Perl_Extension => 1,
14842                    Fate => $INTERNAL_ONLY,
14843                    Initialize => $gc->table('Letter') & $Alpha & $perl_xids,
14844                    );
14845
14846    my $charname_continue = $perl->add_match_table('_Perl_Charname_Continue',
14847                        Perl_Extension => 1,
14848                        Fate => $INTERNAL_ONLY,
14849                        Initialize => $perl_xidc
14850                                    + ord(" ")
14851                                    + ord("(")
14852                                    + ord(")")
14853                                    + ord("-")
14854                        );
14855
14856    my @composition = ('Name', 'Unicode_1_Name', '_Perl_Name_Alias');
14857
14858    if (@named_sequences) {
14859        push @composition, 'Named_Sequence';
14860        foreach my $sequence (@named_sequences) {
14861            $perl_charname->add_anomalous_entry($sequence);
14862        }
14863    }
14864
14865    my $alias_sentence = "";
14866    my %abbreviations;
14867    my $alias = property_ref('_Perl_Name_Alias');
14868    $perl_charname->set_proxy_for('_Perl_Name_Alias');
14869
14870    # Add each entry in _Perl_Name_Alias to Perl_Charnames.  Where these go
14871    # with respect to any existing entry depends on the entry type.
14872    # Corrections go before said entry, as they should be returned in
14873    # preference over the existing entry.  (A correction to a correction
14874    # should be later in the _Perl_Name_Alias table, so it will correctly
14875    # precede the erroneous correction in Perl_Charnames.)
14876    #
14877    # Abbreviations go after everything else, so they are saved temporarily in
14878    # a hash for later.
14879    #
14880    # Everything else is added afterwards, which preserves the input
14881    # ordering
14882
14883    foreach my $range ($alias->ranges) {
14884        next if $range->value eq "";
14885        my $code_point = $range->start;
14886        if ($code_point != $range->end) {
14887            Carp::my_carp_bug("Bad News.  Expecting only one code point in the range $range.  Just to keep going, using only the first code point;");
14888        }
14889        my ($value, $type) = split ': ', $range->value;
14890        my $replace_type;
14891        if ($type eq 'correction') {
14892            $replace_type = $MULTIPLE_BEFORE;
14893        }
14894        elsif ($type eq 'abbreviation') {
14895
14896            # Save for later
14897            $abbreviations{$value} = $code_point;
14898            next;
14899        }
14900        else {
14901            $replace_type = $MULTIPLE_AFTER;
14902        }
14903
14904        # Actually add; before or after current entry(ies) as determined
14905        # above.
14906
14907        $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
14908    }
14909    $alias_sentence = <<END;
14910The _Perl_Name_Alias property adds duplicate code point entries that are
14911alternatives to the original name.  If an addition is a corrected
14912name, it will be physically first in the table.  The original (less correct,
14913but still valid) name will be next; then any alternatives, in no particular
14914order; and finally any abbreviations, again in no particular order.
14915END
14916
14917    # Now add the Unicode_1 names for the controls.  The Unicode_1 names had
14918    # precedence before 6.1, including the awful ones like "LINE FEED (LF)",
14919    # so should be first in the file; the other names have precedence starting
14920    # in 6.1,
14921    my $before_or_after = ($v_version lt v6.1.0)
14922                          ? $MULTIPLE_BEFORE
14923                          : $MULTIPLE_AFTER;
14924
14925    foreach my $range (property_ref('Unicode_1_Name')->ranges) {
14926        my $code_point = $range->start;
14927        my $unicode_1_value = $range->value;
14928        next if $unicode_1_value eq "";     # Skip if name doesn't exist.
14929
14930        if ($code_point != $range->end) {
14931            Carp::my_carp_bug("Bad News.  Expecting only one code point in the range $range.  Just to keep going, using only the first code point;");
14932        }
14933
14934        # To handle EBCDIC, we don't hard code in the code points of the
14935        # controls; instead realizing that all of them are below 256.
14936        last if $code_point > 255;
14937
14938        # We only add in the controls.
14939        next if $gc->value_of($code_point) ne 'Cc';
14940
14941        # We reject this Unicode1 name for later Perls, as it is used for
14942        # another code point
14943        next if $unicode_1_value eq 'BELL' && $^V ge v5.17.0;
14944
14945        # This won't add an exact duplicate.
14946        $perl_charname->add_duplicate($code_point, $unicode_1_value,
14947                                        Replace => $before_or_after);
14948    }
14949
14950    # Now that have everything added, add in abbreviations after
14951    # everything else.  Sort so results don't change between runs of this
14952    # program
14953    foreach my $value (sort keys %abbreviations) {
14954        $perl_charname->add_duplicate($abbreviations{$value}, $value,
14955                                        Replace => $MULTIPLE_AFTER);
14956    }
14957
14958    my $comment;
14959    if (@composition <= 2) { # Always at least 2
14960        $comment = join " and ", @composition;
14961    }
14962    else {
14963        $comment = join ", ", @composition[0 .. scalar @composition - 2];
14964        $comment .= ", and $composition[-1]";
14965    }
14966
14967    $perl_charname->add_comment(join_lines( <<END
14968This file is for charnames.pm.  It is the union of the $comment properties.
14969Unicode_1_Name entries are used only for nameless code points in the Name
14970property.
14971$alias_sentence
14972This file doesn't include the algorithmically determinable names.  For those,
14973use 'unicore/Name.pm'
14974END
14975    ));
14976    property_ref('Name')->add_comment(join_lines( <<END
14977This file doesn't include the algorithmically determinable names.  For those,
14978use 'unicore/Name.pm'
14979END
14980    ));
14981
14982    # Construct the Present_In property from the Age property.
14983    if (-e 'DAge.txt' && defined $age) {
14984        my $default_map = $age->default_map;
14985        my $in = Property->new('In',
14986                                Default_Map => $default_map,
14987                                Full_Name => "Present_In",
14988                                Perl_Extension => 1,
14989                                Type => $ENUM,
14990                                Initialize => $age,
14991                                );
14992        $in->add_comment(join_lines(<<END
14993THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE.  The values in this file are the
14994same as for $age, and not for what $in really means.  This is because anything
14995defined in a given release should have multiple values: that release and all
14996higher ones.  But only one value per code point can be represented in a table
14997like this.
14998END
14999        ));
15000
15001        # The Age tables are named like 1.5, 2.0, 2.1, ....  Sort so that the
15002        # lowest numbered (earliest) come first, with the non-numeric one
15003        # last.
15004        my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
15005                                            ? 1
15006                                            : ($b->name !~ /^[\d.]*$/)
15007                                                ? -1
15008                                                : $a->name <=> $b->name
15009                                            } $age->tables;
15010
15011        # The Present_In property is the cumulative age properties.  The first
15012        # one hence is identical to the first age one.
15013        my $first_in = $in->add_match_table($first_age->name);
15014        $first_in->set_equivalent_to($first_age, Related => 1);
15015
15016        my $description_start = "Code point's usage introduced in version ";
15017        $first_age->add_description($description_start . $first_age->name);
15018        foreach my $alias ($first_age->aliases) {   # Include its aliases
15019            $first_in->add_alias($alias->name);
15020        }
15021
15022        # To construct the accumulated values, for each of the age tables
15023        # starting with the 2nd earliest, merge the earliest with it, to get
15024        # all those code points existing in the 2nd earliest.  Repeat merging
15025        # the new 2nd earliest with the 3rd earliest to get all those existing
15026        # in the 3rd earliest, and so on.
15027        my $previous_in = $first_in;
15028        foreach my $current_age (@rest_ages) {
15029            next if $current_age->name !~ /^[\d.]*$/;   # Skip the non-numeric
15030
15031            my $current_in = $in->add_match_table(
15032                                    $current_age->name,
15033                                    Initialize => $current_age + $previous_in,
15034                                    Description => $description_start
15035                                                    . $current_age->name
15036                                                    . ' or earlier',
15037                                    );
15038            foreach my $alias ($current_age->aliases) {
15039                $current_in->add_alias($alias->name);
15040            }
15041            $previous_in = $current_in;
15042
15043            # Add clarifying material for the corresponding age file.  This is
15044            # in part because of the confusing and contradictory information
15045            # given in the Standard's documentation itself, as of 5.2.
15046            $current_age->add_description(
15047                            "Code point's usage was introduced in version "
15048                            . $current_age->name);
15049            $current_age->add_note("See also $in");
15050
15051        }
15052
15053        # And finally the code points whose usages have yet to be decided are
15054        # the same in both properties.  Note that permanently unassigned code
15055        # points actually have their usage assigned (as being permanently
15056        # unassigned), so that these tables are not the same as gc=cn.
15057        my $unassigned = $in->add_match_table($default_map);
15058        my $age_default = $age->table($default_map);
15059        $age_default->add_description(<<END
15060Code point's usage has not been assigned in any Unicode release thus far.
15061END
15062        );
15063        $unassigned->set_equivalent_to($age_default, Related => 1);
15064        foreach my $alias ($age_default->aliases) {
15065            $unassigned->add_alias($alias->name);
15066        }
15067    }
15068
15069    my $patws = $perl->add_match_table('_Perl_PatWS',
15070                                       Perl_Extension => 1,
15071                                       Fate => $INTERNAL_ONLY);
15072    if (defined (my $off_patws = property_ref('Pattern_White_Space'))) {
15073        $patws->initialize($off_patws->table('Y'));
15074    }
15075    else {
15076        $patws->initialize([ ord("\t"),
15077                             ord("\n"),
15078                             utf8::unicode_to_native(0x0B), # VT
15079                             ord("\f"),
15080                             ord("\r"),
15081                             ord(" "),
15082                             utf8::unicode_to_native(0x85), # NEL
15083                             0x200E..0x200F,             # Left, Right marks
15084                             0x2028..0x2029              # Line, Paragraph seps
15085                           ] );
15086    }
15087
15088    # See L<perlfunc/quotemeta>
15089    my $quotemeta = $perl->add_match_table('_Perl_Quotemeta',
15090                                           Perl_Extension => 1,
15091                                           Fate => $INTERNAL_ONLY,
15092
15093                                           # Initialize to what's common in
15094                                           # all Unicode releases.
15095                                           Initialize =>
15096                                                  $gc->table('Control')
15097                                                + $Space
15098                                                + $patws
15099                                                + ((~ $Word) & $ASCII)
15100                           );
15101
15102    if (defined (my $patsyn = property_ref('Pattern_Syntax'))) {
15103        $quotemeta += $patsyn->table('Y');
15104    }
15105    else {
15106        $quotemeta += ((~ $Word) & Range->new(0, 255))
15107                    - utf8::unicode_to_native(0xA8)
15108                    - utf8::unicode_to_native(0xAF)
15109                    - utf8::unicode_to_native(0xB2)
15110                    - utf8::unicode_to_native(0xB3)
15111                    - utf8::unicode_to_native(0xB4)
15112                    - utf8::unicode_to_native(0xB7)
15113                    - utf8::unicode_to_native(0xB8)
15114                    - utf8::unicode_to_native(0xB9)
15115                    - utf8::unicode_to_native(0xBC)
15116                    - utf8::unicode_to_native(0xBD)
15117                    - utf8::unicode_to_native(0xBE);
15118        $quotemeta += [ # These are above-Latin1 patsyn; hence should be the
15119                        # same in all releases
15120                        0x2010 .. 0x2027,
15121                        0x2030 .. 0x203E,
15122                        0x2041 .. 0x2053,
15123                        0x2055 .. 0x205E,
15124                        0x2190 .. 0x245F,
15125                        0x2500 .. 0x2775,
15126                        0x2794 .. 0x2BFF,
15127                        0x2E00 .. 0x2E7F,
15128                        0x3001 .. 0x3003,
15129                        0x3008 .. 0x3020,
15130                        0x3030 .. 0x3030,
15131                        0xFD3E .. 0xFD3F,
15132                        0xFE45 .. 0xFE46
15133                      ];
15134    }
15135
15136    if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) {
15137        $quotemeta += $di->table('Y')
15138    }
15139    else {
15140        if ($v_version ge v2.0) {
15141            $quotemeta += $gc->table('Cf')
15142                       +  $gc->table('Cs');
15143
15144            # These are above the Unicode version 1 max
15145            $quotemeta->add_range(0xE0000, 0xE0FFF);
15146        }
15147        $quotemeta += $gc->table('Cc')
15148                    - $Space;
15149        my $temp = Range_List->new(Initialize => [ 0x180B .. 0x180D,
15150                                                   0x2060 .. 0x206F,
15151                                                   0xFE00 .. 0xFE0F,
15152                                                   0xFFF0 .. 0xFFFB,
15153                                                  ]);
15154        $temp->add_range(0xE0000, 0xE0FFF) if $v_version ge v2.0;
15155        $quotemeta += $temp;
15156    }
15157    calculate_DI();
15158    $quotemeta += $DI;
15159
15160    calculate_NChar();
15161
15162    # Finished creating all the perl properties.  All non-internal non-string
15163    # ones have a synonym of 'Is_' prefixed.  (Internal properties begin with
15164    # an underscore.)  These do not get a separate entry in the pod file
15165    foreach my $table ($perl->tables) {
15166        foreach my $alias ($table->aliases) {
15167            next if $alias->name =~ /^_/;
15168            $table->add_alias('Is_' . $alias->name,
15169                               Re_Pod_Entry => 0,
15170                               UCD => 0,
15171                               Status => $alias->status,
15172                               OK_as_Filename => 0);
15173        }
15174    }
15175
15176    # Perl tailors the WordBreak property so that \b{wb} doesn't split
15177    # adjacent spaces into separate words.  Unicode 11.0 moved in that
15178    # direction, but left TAB,  FIGURE SPACE (U+2007), and (ironically) NO
15179    # BREAK SPACE as breaking, so we retained the original Perl customization.
15180    # To do this, in the Perl copy of WB, simply replace the mappings of
15181    # horizontal space characters that otherwise would map to the default or
15182    # the 11.0 'WSegSpace' to instead map to our tailoring.
15183    my $perl_wb = property_ref('_Perl_WB');
15184    my $default = $perl_wb->default_map;
15185    for my $range ($Blank->ranges) {
15186        for my $i ($range->start .. $range->end) {
15187            my $value = $perl_wb->value_of($i);
15188
15189            next unless $value eq $default || $value eq 'WSegSpace';
15190            $perl_wb->add_map($i, $i, 'Perl_Tailored_HSpace',
15191                              Replace => $UNCONDITIONALLY);
15192        }
15193    }
15194
15195    # Also starting in Unicode 11.0, rules for some of the boundary types are
15196    # based on a non-UCD property (which we have read in if it exists).
15197    # Recall that these boundary properties partition the code points into
15198    # equivalence classes (represented as enums).
15199    #
15200    # The loop below goes through each code point that matches the non-UCD
15201    # property, and for each current equivalence class containing such a code
15202    # point, splits it so that those that are in both are now in a newly
15203    # created equivalence class whose name is a combination of the property
15204    # and the old class name, leaving unchanged everything that doesn't match
15205    # the non-UCD property.
15206    my $ep = property_ref('ExtPict');
15207    $ep = $ep->table('Y') if defined $ep;
15208    if (defined $ep) {
15209        foreach my $base_property (property_ref('GCB'),
15210                                   property_ref('WB'))
15211        {
15212            my $property = property_ref('_Perl_' . $base_property->name);
15213            foreach my $range ($ep->ranges) {
15214                foreach my $i ($range->start .. $range->end) {
15215                    my $current = $property->value_of($i);
15216                    $current = $property->table($current)->short_name;
15217                    $property->add_map($i, $i, 'ExtPict_' . $current,
15218                                       Replace => $UNCONDITIONALLY);
15219                }
15220            }
15221        }
15222    }
15223
15224    # Create a version of the LineBreak property with the mappings that are
15225    # omitted in the default algorithm remapped to what
15226    # http://www.unicode.org/reports/tr14 says they should be.
15227    #
15228    # First, create a plain copy, but with all property values written out in
15229    # their long form, as regen/mk_invlist.pl expects that, and also fix
15230    # occurrences of the typo in early Unicode versions: 'inseperable'.
15231    my $perl_lb = property_ref('_Perl_LB');
15232    if (! defined $perl_lb) {
15233        $perl_lb = Property->new('_Perl_LB',
15234                                 Fate => $INTERNAL_ONLY,
15235                                 Perl_Extension => 1,
15236                                 Directory => $map_directory,
15237                                 Type => $STRING);
15238        my $lb = property_ref('Line_Break');
15239
15240        # Populate from $lb, but use full name and fix typo.
15241        foreach my $range ($lb->ranges) {
15242            my $full_name = $lb->table($range->value)->full_name;
15243            $full_name = 'Inseparable'
15244                                if standardize($full_name) eq 'inseperable';
15245            $perl_lb->add_map($range->start, $range->end, $full_name);
15246        }
15247    }
15248
15249    # What tr14 says is this:
15250
15251    # Original 	   Resolved  General_Category
15252    # AI, SG, XX      AL      Any
15253    # SA              CM      Only Mn or Mc
15254    # SA              AL      Any except Mn and Mc
15255    # CJ              NS      Any
15256
15257    $perl_lb->set_default_map('Alphabetic', 'full_name');    # XX -> AL
15258
15259    my $ea = property_ref('East_Asian_Width');
15260    my $Cn_EP;
15261    $Cn_EP = $ep & $gc->table('Unassigned') if defined $ep;
15262
15263    for my $range ($perl_lb->ranges) {
15264        my $value = standardize($range->value);
15265        if (   $value eq standardize('Unknown')
15266            || $value eq standardize('Ambiguous')
15267            || $value eq standardize('Surrogate'))
15268        {
15269            $perl_lb->add_map($range->start, $range->end, 'Alphabetic',
15270                              Replace => $UNCONDITIONALLY);
15271        }
15272        elsif ($value eq standardize('Conditional_Japanese_Starter')) {
15273            $perl_lb->add_map($range->start, $range->end, 'Nonstarter',
15274                              Replace => $UNCONDITIONALLY);
15275        }
15276        elsif ($value eq standardize('Complex_Context')) {
15277            for my $i ($range->start .. $range->end) {
15278                my $gc_val = $gc->value_of($i);
15279                if ($gc_val eq 'Mn' || $gc_val eq 'Mc') {
15280                    $perl_lb->add_map($i, $i, 'Combining_Mark',
15281                                      Replace => $UNCONDITIONALLY);
15282                }
15283                else {
15284                    $perl_lb->add_map($i, $i, 'Alphabetic',
15285                                      Replace => $UNCONDITIONALLY);
15286                }
15287            }
15288        }
15289        elsif (defined $ep && $value eq standardize('Ideographic')) {
15290
15291            # Unicode 14 adds a rule to not break lines before any potential
15292            # EBase,  They say that any unassigned code point that is ExtPict,
15293            # is potentially an EBase.  In 14.0, all such ones are in the
15294            # ExtPict=ID category.  We must split that category for the
15295            # pairwise rule table to work.
15296            for my $i ($range->start .. $range->end) {
15297                if ($Cn_EP->contains($i)) {
15298                    $perl_lb->add_map($i, $i,
15299                                'Unassigned_Extended_Pictographic_Ideographic',
15300                                Replace => $UNCONDITIONALLY);
15301                }
15302            }
15303        }
15304        elsif (    defined $ea
15305               && (   $value eq standardize('Close_Parenthesis')
15306                   || $value eq standardize('Open_Punctuation')))
15307        {
15308            # Unicode 13 splits the OP and CP properties each into East Asian,
15309            # and non-.  We retain the (now somewhat misleading) names OP and
15310            # CP for the non-East Asian variety, as there are very few East
15311            # Asian ones.
15312            my $replace = ($value eq standardize('Open_Punctuation'))
15313                          ? 'East_Asian_OP'
15314                          : 'East_Asian_CP';
15315            for my $i ($range->start .. $range->end) {
15316                my $ea_val = $ea->value_of($i);
15317                if ($ea_val eq 'F' || $ea_val eq 'W' || $ea_val eq 'H') {
15318                    $perl_lb->add_map($i, $i, $replace,
15319                                                Replace => $UNCONDITIONALLY);
15320                }
15321            }
15322        }
15323    }
15324
15325    # This property is a modification of the scx property
15326    my $perl_scx = Property->new('_Perl_SCX',
15327                                 Fate => $INTERNAL_ONLY,
15328                                 Perl_Extension => 1,
15329                                 Directory => $map_directory,
15330                                 Type => $ENUM);
15331    my $source;
15332
15333    # Use scx if available; otherwise sc;  if neither is there (a very old
15334    # Unicode version, just say that everything is 'Common'
15335    if (defined $scx) {
15336        $source = $scx;
15337        $perl_scx->set_default_map('Unknown');
15338    }
15339    elsif (defined $script) {
15340        $source = $script;
15341
15342        # Early versions of 'sc', had everything be 'Common'
15343        if (defined $script->table('Unknown')) {
15344            $perl_scx->set_default_map('Unknown');
15345        }
15346        else {
15347            $perl_scx->set_default_map('Common');
15348        }
15349    } else {
15350        $perl_scx->add_match_table('Common');
15351        $perl_scx->add_map(0, $MAX_UNICODE_CODEPOINT, 'Common');
15352
15353        $perl_scx->add_match_table('Unknown');
15354        $perl_scx->set_default_map('Unknown');
15355    }
15356
15357    $perl_scx->_set_format($STRING_WHITE_SPACE_LIST);
15358    $perl_scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these
15359
15360    if (defined $source) {
15361        $perl_scx->initialize($source);
15362
15363        # UTS 39 says that the scx property should be modified for these
15364        # countries where certain mixed scripts are commonly used.
15365        for my $range ($perl_scx->ranges) {
15366            my $value = $range->value;
15367            my $changed = $value =~ s/ ( \b Han i? \b ) /$1 Hanb Jpan Kore/xi;
15368             $changed |=  $value =~ s/ ( \b Hira (gana)? \b ) /$1 Jpan/xi;
15369             $changed |=  $value =~ s/ ( \b Kata (kana)? \b ) /$1 Jpan/xi;
15370             $changed |=  $value =~ s{ ( \b Katakana_or_Hiragana \b ) }
15371                                     {$1 Katakana Hiragana Jpan}xi;
15372             $changed |=  $value =~ s/ ( \b Hang (ul)? \b ) /$1 Kore/xi;
15373             $changed |=  $value =~ s/ ( \b Bopo (mofo)? \b ) /$1 Hanb/xi;
15374
15375            if ($changed) {
15376                $value = join " ", uniques split " ", $value;
15377                $range->set_value($value)
15378            }
15379        }
15380
15381        foreach my $table ($source->tables) {
15382            my $scx_table = $perl_scx->add_match_table($table->name,
15383                                    Full_Name => $table->full_name);
15384            foreach my $alias ($table->aliases) {
15385                $scx_table->add_alias($alias->name);
15386            }
15387        }
15388    }
15389
15390    # Here done with all the basic stuff.  Ready to populate the information
15391    # about each character if annotating them.
15392    if ($annotate) {
15393
15394        # See comments at its declaration
15395        $annotate_ranges = Range_Map->new;
15396
15397        # This separates out the non-characters from the other unassigneds, so
15398        # can give different annotations for each.
15399        $unassigned_sans_noncharacters = Range_List->new(
15400                                    Initialize => $gc->table('Unassigned'));
15401        $unassigned_sans_noncharacters &= (~ $NChar);
15402
15403        for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT + 1; $i++ ) {
15404            $i = populate_char_info($i);    # Note sets $i so may cause skips
15405
15406        }
15407    }
15408
15409    return;
15410}
15411
15412sub add_perl_synonyms() {
15413    # A number of Unicode tables have Perl synonyms that are expressed in
15414    # the single-form, \p{name}.  These are:
15415    #   All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
15416    #       \p{Is_Name} as synonyms
15417    #   \p{Script_Extensions=Value} gets \p{Value}, \p{Is_Value} as synonyms
15418    #   \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
15419    #   \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
15420    #       conflict, \p{Value} and \p{Is_Value} as well
15421    #
15422    # This routine generates these synonyms, warning of any unexpected
15423    # conflicts.
15424
15425    # Construct the list of tables to get synonyms for.  Start with all the
15426    # binary and the General_Category ones.
15427    my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY }
15428                                                            property_ref('*');
15429    push @tables, $gc->tables;
15430
15431    # If the version of Unicode includes the Script Extensions (preferably),
15432    # or Script property, add its tables
15433    if (defined $scx) {
15434        push @tables, $scx->tables;
15435    }
15436    else {
15437        push @tables, $script->tables if defined $script;
15438    }
15439
15440    # The Block tables are kept separate because they are treated differently.
15441    # And the earliest versions of Unicode didn't include them, so add only if
15442    # there are some.
15443    my @blocks;
15444    push @blocks, $block->tables if defined $block;
15445
15446    # Here, have the lists of tables constructed.  Process blocks last so that
15447    # if there are name collisions with them, blocks have lowest priority.
15448    # Should there ever be other collisions, manual intervention would be
15449    # required.  See the comments at the beginning of the program for a
15450    # possible way to handle those semi-automatically.
15451    foreach my $table (@tables,  @blocks) {
15452
15453        # For non-binary properties, the synonym is just the name of the
15454        # table, like Greek, but for binary properties the synonym is the name
15455        # of the property, and means the code points in its 'Y' table.
15456        my $nominal = $table;
15457        my $nominal_property = $nominal->property;
15458        my $actual;
15459        if (! $nominal->isa('Property')) {
15460            $actual = $table;
15461        }
15462        else {
15463
15464            # Here is a binary property.  Use the 'Y' table.  Verify that is
15465            # there
15466            my $yes = $nominal->table('Y');
15467            unless (defined $yes) {  # Must be defined, but is permissible to
15468                                     # be empty.
15469                Carp::my_carp_bug("Undefined $nominal, 'Y'.  Skipping.");
15470                next;
15471            }
15472            $actual = $yes;
15473        }
15474
15475        foreach my $alias ($nominal->aliases) {
15476
15477            # Attempt to create a table in the perl directory for the
15478            # candidate table, using whatever aliases in it that don't
15479            # conflict.  Also add non-conflicting aliases for all these
15480            # prefixed by 'Is_' (and/or 'In_' for Block property tables)
15481            PREFIX:
15482            foreach my $prefix ("", 'Is_', 'In_') {
15483
15484                # Only Block properties can have added 'In_' aliases.
15485                next if $prefix eq 'In_' and $nominal_property != $block;
15486
15487                my $proposed_name = $prefix . $alias->name;
15488
15489                # No Is_Is, In_In, nor combinations thereof
15490                trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
15491                next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
15492
15493                trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
15494
15495                # Get a reference to any existing table in the perl
15496                # directory with the desired name.
15497                my $pre_existing = $perl->table($proposed_name);
15498
15499                if (! defined $pre_existing) {
15500
15501                    # No name collision, so OK to add the perl synonym.
15502
15503                    my $make_re_pod_entry;
15504                    my $ok_as_filename;
15505                    my $status = $alias->status;
15506                    if ($nominal_property == $block) {
15507
15508                        # For block properties, only the compound form is
15509                        # preferred for external use; the others are
15510                        # discouraged.  The pod file contains wild cards for
15511                        # the 'In' and 'Is' forms so no entries for those; and
15512                        # we don't want people using the name without any
15513                        # prefix, so discourage that.
15514                        if ($prefix eq "") {
15515                            $make_re_pod_entry = 1;
15516                            $status = $status || $DISCOURAGED;
15517                            $ok_as_filename = 0;
15518                        }
15519                        elsif ($prefix eq 'In_') {
15520                            $make_re_pod_entry = 0;
15521                            $status = $status || $DISCOURAGED;
15522                            $ok_as_filename = 1;
15523                        }
15524                        else {
15525                            $make_re_pod_entry = 0;
15526                            $status = $status || $DISCOURAGED;
15527                            $ok_as_filename = 0;
15528                        }
15529                    }
15530                    elsif ($prefix ne "") {
15531
15532                        # The 'Is' prefix is handled in the pod by a wild
15533                        # card, and we won't use it for an external name
15534                        $make_re_pod_entry = 0;
15535                        $status = $status || $NORMAL;
15536                        $ok_as_filename = 0;
15537                    }
15538                    else {
15539
15540                        # Here, is an empty prefix, non block.  This gets its
15541                        # own pod entry and can be used for an external name.
15542                        $make_re_pod_entry = 1;
15543                        $status = $status || $NORMAL;
15544                        $ok_as_filename = 1;
15545                    }
15546
15547                    # Here, there isn't a perl pre-existing table with the
15548                    # name.  Look through the list of equivalents of this
15549                    # table to see if one is a perl table.
15550                    foreach my $equivalent ($actual->leader->equivalents) {
15551                        next if $equivalent->property != $perl;
15552
15553                        # Here, have found a table for $perl.  Add this alias
15554                        # to it, and are done with this prefix.
15555                        $equivalent->add_alias($proposed_name,
15556                                        Re_Pod_Entry => $make_re_pod_entry,
15557
15558                                        # Currently don't output these in the
15559                                        # ucd pod, as are strongly discouraged
15560                                        # from being used
15561                                        UCD => 0,
15562
15563                                        Status => $status,
15564                                        OK_as_Filename => $ok_as_filename);
15565                        trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
15566                        next PREFIX;
15567                    }
15568
15569                    # Here, $perl doesn't already have a table that is a
15570                    # synonym for this property, add one.
15571                    my $added_table = $perl->add_match_table($proposed_name,
15572                                            Re_Pod_Entry => $make_re_pod_entry,
15573
15574                                            # See UCD comment just above
15575                                            UCD => 0,
15576
15577                                            Status => $status,
15578                                            OK_as_Filename => $ok_as_filename);
15579                    # And it will be related to the actual table, since it is
15580                    # based on it.
15581                    $added_table->set_equivalent_to($actual, Related => 1);
15582                    trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
15583                    next;
15584                } # End of no pre-existing.
15585
15586                # Here, there is a pre-existing table that has the proposed
15587                # name.  We could be in trouble, but not if this is just a
15588                # synonym for another table that we have already made a child
15589                # of the pre-existing one.
15590                if ($pre_existing->is_set_equivalent_to($actual)) {
15591                    trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
15592                    $pre_existing->add_alias($proposed_name);
15593                    next;
15594                }
15595
15596                # Here, there is a name collision, but it still could be OK if
15597                # the tables match the identical set of code points, in which
15598                # case, we can combine the names.  Compare each table's code
15599                # point list to see if they are identical.
15600                trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
15601                if ($pre_existing->matches_identically_to($actual)) {
15602
15603                    # Here, they do match identically.  Not a real conflict.
15604                    # Make the perl version a child of the Unicode one, except
15605                    # in the non-obvious case of where the perl name is
15606                    # already a synonym of another Unicode property.  (This is
15607                    # excluded by the test for it being its own parent.)  The
15608                    # reason for this exclusion is that then the two Unicode
15609                    # properties become related; and we don't really know if
15610                    # they are or not.  We generate documentation based on
15611                    # relatedness, and this would be misleading.  Code
15612                    # later executed in the process will cause the tables to
15613                    # be represented by a single file anyway, without making
15614                    # it look in the pod like they are necessarily related.
15615                    if ($pre_existing->parent == $pre_existing
15616                        && ($pre_existing->property == $perl
15617                            || $actual->property == $perl))
15618                    {
15619                        trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
15620                        $pre_existing->set_equivalent_to($actual, Related => 1);
15621                    }
15622                    elsif (main::DEBUG && $to_trace) {
15623                        trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
15624                        trace $pre_existing->parent;
15625                    }
15626                    next PREFIX;
15627                }
15628
15629                # Here they didn't match identically, there is a real conflict
15630                # between our new name and a pre-existing property.
15631                $actual->add_conflicting($proposed_name, 'p', $pre_existing);
15632                $pre_existing->add_conflicting($nominal->full_name,
15633                                               'p',
15634                                               $actual);
15635
15636                # Don't output a warning for aliases for the block
15637                # properties (unless they start with 'In_') as it is
15638                # expected that there will be conflicts and the block
15639                # form loses.
15640                if ($verbosity >= $NORMAL_VERBOSITY
15641                    && ($actual->property != $block || $prefix eq 'In_'))
15642                {
15643                    print simple_fold(join_lines(<<END
15644There is already an alias named $proposed_name (from $pre_existing),
15645so not creating this alias for $actual
15646END
15647                    ), "", 4);
15648                }
15649
15650                # Keep track for documentation purposes.
15651                $has_In_conflicts++ if $prefix eq 'In_';
15652                $has_Is_conflicts++ if $prefix eq 'Is_';
15653            }
15654        }
15655    }
15656
15657    # There are some properties which have No and Yes (and N and Y) as
15658    # property values, but aren't binary, and could possibly be confused with
15659    # binary ones.  So create caveats for them.  There are tables that are
15660    # named 'No', and tables that are named 'N', but confusion is not likely
15661    # unless they are the same table.  For example, N meaning Number or
15662    # Neutral is not likely to cause confusion, so don't add caveats to things
15663    # like them.
15664    foreach my $property (grep { $_->type != $BINARY
15665                                 && $_->type != $FORCED_BINARY }
15666                                                            property_ref('*'))
15667    {
15668        my $yes = $property->table('Yes');
15669        if (defined $yes) {
15670            my $y = $property->table('Y');
15671            if (defined $y && $yes == $y) {
15672                foreach my $alias ($property->aliases) {
15673                    $yes->add_conflicting($alias->name);
15674                }
15675            }
15676        }
15677        my $no = $property->table('No');
15678        if (defined $no) {
15679            my $n = $property->table('N');
15680            if (defined $n && $no == $n) {
15681                foreach my $alias ($property->aliases) {
15682                    $no->add_conflicting($alias->name, 'P');
15683                }
15684            }
15685        }
15686    }
15687
15688    return;
15689}
15690
15691sub register_file_for_name($table, $directory_ref, $file) {
15692    # Given info about a table and a datafile that it should be associated
15693    # with, register that association
15694
15695    # $directory_ref    # Array of the directory path for the file
15696    # $file             # The file name in the final directory.
15697
15698    trace "table=$table, file=$file, directory=@$directory_ref, fate=", $table->fate if main::DEBUG && $to_trace;
15699
15700    if ($table->isa('Property')) {
15701        $table->set_file_path(@$directory_ref, $file);
15702        push @map_properties, $table;
15703
15704        # No swash means don't do the rest of this.
15705        return if $table->fate != $ORDINARY
15706                  && ! ($table->name =~ /^_/ && $table->fate == $INTERNAL_ONLY);
15707
15708        # Get the path to the file
15709        my @path = $table->file_path;
15710
15711        # Use just the file name if no subdirectory.
15712        shift @path if $path[0] eq File::Spec->curdir();
15713
15714        my $file = join '/', @path;
15715
15716        # Create a hash entry for Unicode::UCD to get the file that stores this
15717        # property's map table
15718        foreach my $alias ($table->aliases) {
15719            my $name = $alias->name;
15720            if ($name =~ /^_/) {
15721                $strict_property_to_file_of{lc $name} = $file;
15722            }
15723            else {
15724                $loose_property_to_file_of{standardize($name)} = $file;
15725            }
15726        }
15727
15728        # And a way for Unicode::UCD to find the proper key in the SwashInfo
15729        # hash for this property.
15730        $file_to_swash_name{$file} = "To" . $table->swash_name;
15731        return;
15732    }
15733
15734    # Do all of the work for all equivalent tables when called with the leader
15735    # table, so skip if isn't the leader.
15736    return if $table->leader != $table;
15737
15738    # If this is a complement of another file, use that other file instead,
15739    # with a ! prepended to it.
15740    my $complement;
15741    if (($complement = $table->complement) != 0) {
15742        my @directories = $complement->file_path;
15743
15744        # This assumes that the 0th element is something like 'lib',
15745        # the 1th element the property name (in its own directory), like
15746        # 'AHex', and the 2th element the file like 'Y' which will have a .pl
15747        # appended to it later.
15748        $directories[1] =~ s/^/!/;
15749        $file = pop @directories;
15750        $directory_ref =\@directories;
15751    }
15752
15753    # Join all the file path components together, using slashes.
15754    my $full_filename = join('/', @$directory_ref, $file);
15755
15756    # All go in the same subdirectory of unicore, or the special
15757    # pseudo-directory '#'
15758    if ($directory_ref->[0] !~ / ^ $matches_directory | \# $ /x) {
15759        Carp::my_carp("Unexpected directory in "
15760                .  join('/', @{$directory_ref}, $file));
15761    }
15762
15763    # For this table and all its equivalents ...
15764    foreach my $table ($table, $table->equivalents) {
15765
15766        # Associate it with its file internally.  Don't include the
15767        # $matches_directory first component
15768        $table->set_file_path(@$directory_ref, $file);
15769
15770        # No swash means don't do the rest of this.
15771        next if $table->isa('Map_Table') && $table->fate != $ORDINARY;
15772
15773        my $sub_filename = join('/', $directory_ref->[1, -1], $file);
15774
15775        my $property = $table->property;
15776        my $property_name = ($property == $perl)
15777                             ? ""  # 'perl' is never explicitly stated
15778                             : standardize($property->name) . '=';
15779
15780        my $is_default = 0; # Is this table the default one for the property?
15781
15782        # To calculate $is_default, we find if this table is the same as the
15783        # default one for the property.  But this is complicated by the
15784        # possibility that there is a master table for this one, and the
15785        # information is stored there instead of here.
15786        my $parent = $table->parent;
15787        my $leader_prop = $parent->property;
15788        my $default_map = $leader_prop->default_map;
15789        if (defined $default_map) {
15790            my $default_table = $leader_prop->table($default_map);
15791            $is_default = 1 if defined $default_table && $parent == $default_table;
15792        }
15793
15794        # Calculate the loose name for this table.  Mostly it's just its name,
15795        # standardized.  But in the case of Perl tables that are single-form
15796        # equivalents to Unicode properties, it is the latter's name.
15797        my $loose_table_name =
15798                        ($property != $perl || $leader_prop == $perl)
15799                        ? standardize($table->name)
15800                        : standardize($parent->name);
15801
15802        my $deprecated = ($table->status eq $DEPRECATED)
15803                         ? $table->status_info
15804                         : "";
15805        my $caseless_equivalent = $table->caseless_equivalent;
15806
15807        # And for each of the table's aliases...  This inner loop eventually
15808        # goes through all aliases in the UCD that we generate regex match
15809        # files for
15810        foreach my $alias ($table->aliases) {
15811            my $standard = UCD_name($table, $alias);
15812
15813            # Generate an entry in either the loose or strict hashes, which
15814            # will translate the property and alias names combination into the
15815            # file where the table for them is stored.
15816            if ($alias->loose_match) {
15817                if (exists $loose_to_file_of{$standard}) {
15818                    Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
15819                }
15820                else {
15821                    $loose_to_file_of{$standard} = $sub_filename;
15822                }
15823            }
15824            else {
15825                if (exists $stricter_to_file_of{$standard}) {
15826                    Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
15827                }
15828                else {
15829                    $stricter_to_file_of{$standard} = $sub_filename;
15830
15831                    # Tightly coupled with how Unicode::UCD works, for a
15832                    # floating point number that is a whole number, get rid of
15833                    # the trailing decimal point and 0's, so that Unicode::UCD
15834                    # will work.  Also note that this assumes that such a
15835                    # number is matched strictly; so if that were to change,
15836                    # this would be wrong.
15837                    if ((my $integer_name = $alias->name)
15838                            =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
15839                    {
15840                        $stricter_to_file_of{$property_name . $integer_name}
15841                                                            = $sub_filename;
15842                    }
15843                }
15844            }
15845
15846            # For Unicode::UCD, create a mapping of the prop=value to the
15847            # canonical =value for that property.
15848            if ($standard =~ /=/) {
15849
15850                # This could happen if a strict name mapped into an existing
15851                # loose name.  In that event, the strict names would have to
15852                # be moved to a new hash.
15853                if (exists($loose_to_standard_value{$standard})) {
15854                    Carp::my_carp_bug("'$standard' conflicts with a pre-existing use.  Bad News.  Continuing anyway");
15855                }
15856                $loose_to_standard_value{$standard} = $loose_table_name;
15857            }
15858
15859            # Keep a list of the deprecated properties and their filenames
15860            if ($deprecated && $complement == 0) {
15861                $Unicode::UCD::why_deprecated{$sub_filename} = $deprecated;
15862            }
15863
15864            # And a substitute table, if any, for case-insensitive matching
15865            if ($caseless_equivalent != 0) {
15866                $caseless_equivalent_to{$standard} = $caseless_equivalent;
15867            }
15868
15869            # Add to defaults list if the table this alias belongs to is the
15870            # default one
15871            $loose_defaults{$standard} = 1 if $is_default;
15872        }
15873    }
15874
15875    return;
15876}
15877
15878{   # Closure
15879    my %base_names;  # Names already used for avoiding DOS 8.3 filesystem
15880                     # conflicts
15881    my %full_dir_name_of;   # Full length names of directories used.
15882
15883    sub construct_filename($name, $mutable, $directories_ref) {
15884        # Return a file name for a table, based on the table name, but perhaps
15885        # changed to get rid of non-portable characters in it, and to make
15886        # sure that it is unique on a file system that allows the names before
15887        # any period to be at most 8 characters (DOS).  While we're at it
15888        # check and complain if there are any directory conflicts.
15889
15890        # $name                 # The name to start with
15891        # $mutable              # Boolean: can it be changed?  If no, but
15892                                # yet it must be to work properly, a warning
15893                                # is given
15894        # $directories_ref      # A reference to an array containing the
15895                                # path to the file, with each element one path
15896                                # component.  This is used because the same
15897                                # name can be used in different directories.
15898
15899        my $warn = ! defined wantarray;  # If true, then if the name is
15900                                # changed, a warning is issued as well.
15901
15902        if (! defined $name) {
15903            Carp::my_carp("Undefined name in directory "
15904                          . File::Spec->join(@$directories_ref)
15905                          . ". '_' used");
15906            return '_';
15907        }
15908
15909        # Make sure that no directory names conflict with each other.  Look at
15910        # each directory in the input file's path.  If it is already in use,
15911        # assume it is correct, and is merely being re-used, but if we
15912        # truncate it to 8 characters, and find that there are two directories
15913        # that are the same for the first 8 characters, but differ after that,
15914        # then that is a problem.
15915        foreach my $directory (@$directories_ref) {
15916            my $short_dir = substr($directory, 0, 8);
15917            if (defined $full_dir_name_of{$short_dir}) {
15918                next if $full_dir_name_of{$short_dir} eq $directory;
15919                Carp::my_carp("Directory $directory conflicts with directory $full_dir_name_of{$short_dir}.  Bad News.  Continuing anyway");
15920            }
15921            else {
15922                $full_dir_name_of{$short_dir} = $directory;
15923            }
15924        }
15925
15926        my $path = join '/', @$directories_ref;
15927        $path .= '/' if $path;
15928
15929        # Remove interior underscores.
15930        (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
15931
15932        # Convert the dot in floating point numbers to an underscore
15933        $filename =~ s/\./_/ if $filename =~ / ^ \d+ \. \d+ $ /x;
15934
15935        my $suffix = "";
15936
15937        # Extract any suffix, delete any non-word character, and truncate to 3
15938        # after the dot
15939        if ($filename =~ m/ ( .*? ) ( \. .* ) /x) {
15940            $filename = $1;
15941            $suffix = $2;
15942            $suffix =~ s/\W+//g;
15943            substr($suffix, 4) = "" if length($suffix) > 4;
15944        }
15945
15946        # Change any non-word character outside the suffix into an underscore,
15947        # and truncate to 8.
15948        $filename =~ s/\W+/_/g;   # eg., "L&" -> "L_"
15949        substr($filename, 8) = "" if length($filename) > 8;
15950
15951        # Make sure the basename doesn't conflict with something we
15952        # might have already written. If we have, say,
15953        #     InGreekExtended1
15954        #     InGreekExtended2
15955        # they become
15956        #     InGreekE
15957        #     InGreek2
15958        my $warned = 0;
15959        while (my $num = $base_names{$path}{lc "$filename$suffix"}++) {
15960            $num++; # so basenames with numbers start with '2', which
15961                    # just looks more natural.
15962
15963            # Want to append $num, but if it'll make the basename longer
15964            # than 8 characters, pre-truncate $filename so that the result
15965            # is acceptable.
15966            my $delta = length($filename) + length($num) - 8;
15967            if ($delta > 0) {
15968                substr($filename, -$delta) = $num;
15969            }
15970            else {
15971                $filename .= $num;
15972            }
15973            if ($warn && ! $warned) {
15974                $warned = 1;
15975                Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS).  Proceeding anyway.");
15976            }
15977        }
15978
15979        return $filename if $mutable;
15980
15981        # If not changeable, must return the input name, but warn if needed to
15982        # change it beyond shortening it.
15983        if ($name ne $filename
15984            && substr($name, 0, length($filename)) ne $filename) {
15985            Carp::my_carp("'$path$name' had to be changed into '$filename'.  Bad News.  Proceeding anyway.");
15986        }
15987        return $name;
15988    }
15989}
15990
15991# The pod file contains a very large table.  Many of the lines in that table
15992# would exceed a typical output window's size, and so need to be wrapped with
15993# a hanging indent to make them look good.  The pod language is really
15994# insufficient here.  There is no general construct to do that in pod, so it
15995# is done here by beginning each such line with a space to cause the result to
15996# be output without formatting, and doing all the formatting here.  This leads
15997# to the result that if the eventual display window is too narrow it won't
15998# look good, and if the window is too wide, no advantage is taken of that
15999# extra width.  A further complication is that the output may be indented by
16000# the formatter so that there is less space than expected.  What I (khw) have
16001# done is to assume that that indent is a particular number of spaces based on
16002# what it is in my Linux system;  people can always resize their windows if
16003# necessary, but this is obviously less than desirable, but the best that can
16004# be expected.
16005my $automatic_pod_indent = 8;
16006
16007# Try to format so that uses fewest lines, but few long left column entries
16008# slide into the right column.  An experiment on 5.1 data yielded the
16009# following percentages that didn't cut into the other side along with the
16010# associated first-column widths
16011# 69% = 24
16012# 80% not too bad except for a few blocks
16013# 90% = 33; # , cuts 353/3053 lines from 37 = 12%
16014# 95% = 37;
16015my $indent_info_column = 27;    # 75% of lines didn't have overlap
16016
16017my $FILLER = 3;     # Length of initial boiler-plate columns in a pod line
16018                    # The 3 is because of:
16019                    #   1   for the leading space to tell the pod formatter to
16020                    #       output as-is
16021                    #   1   for the flag
16022                    #   1   for the space between the flag and the main data
16023
16024sub format_pod_line($first_column_width, $entry, $info, $status = "", $loose_match = 1 ) {
16025    # Take a pod line and return it, formatted properly
16026
16027    # $entry Contents of left column
16028    # $info Contents of right column
16029
16030    my $flags = "";
16031    $flags .= $STRICTER if ! $loose_match;
16032
16033    $flags .= $status if $status;
16034
16035    # There is a blank in the left column to cause the pod formatter to
16036    # output the line as-is.
16037    return sprintf " %-*s%-*s %s\n",
16038                    # The first * in the format is replaced by this, the -1 is
16039                    # to account for the leading blank.  There isn't a
16040                    # hard-coded blank after this to separate the flags from
16041                    # the rest of the line, so that in the unlikely event that
16042                    # multiple flags are shown on the same line, they both
16043                    # will get displayed at the expense of that separation,
16044                    # but since they are left justified, a blank will be
16045                    # inserted in the normal case.
16046                    $FILLER - 1,
16047                    $flags,
16048
16049                    # The other * in the format is replaced by this number to
16050                    # cause the first main column to right fill with blanks.
16051                    # The -1 is for the guaranteed blank following it.
16052                    $first_column_width - $FILLER - 1,
16053                    $entry,
16054                    $info;
16055}
16056
16057my @zero_match_tables;  # List of tables that have no matches in this release
16058
16059sub make_re_pod_entries($input_table) {
16060    # This generates the entries for the pod file for a given table.
16061    # Also done at this time are any children tables.  The output looks like:
16062    # \p{Common}              \p{Script=Common} (Short: \p{Zyyy}) (5178)
16063
16064    # Generate parent and all its children at the same time.
16065    return if $input_table->parent != $input_table;
16066
16067    my $property = $input_table->property;
16068    my $type = $property->type;
16069    my $full_name = $property->full_name;
16070
16071    my $count = $input_table->count;
16072    my $unicode_count;
16073    my $non_unicode_string;
16074    if ($count > $MAX_UNICODE_CODEPOINTS) {
16075        $unicode_count = $count - ($MAX_WORKING_CODEPOINT
16076                                    - $MAX_UNICODE_CODEPOINT);
16077        $non_unicode_string = " plus all above-Unicode code points";
16078    }
16079    else {
16080        $unicode_count = $count;
16081        $non_unicode_string = "";
16082    }
16083
16084    my $string_count = clarify_number($unicode_count) . $non_unicode_string;
16085
16086    my $definition = $input_table->calculate_table_definition;
16087    if ($definition) {
16088
16089        # Save the definition for later use.
16090        $input_table->set_definition($definition);
16091
16092        $definition = ": $definition";
16093    }
16094
16095    my $status = $input_table->status;
16096    my $status_info = $input_table->status_info;
16097    my $caseless_equivalent = $input_table->caseless_equivalent;
16098
16099    # Don't mention a placeholder equivalent as it isn't to be listed in the
16100    # pod
16101    $caseless_equivalent = 0 if $caseless_equivalent != 0
16102                                && $caseless_equivalent->fate > $ORDINARY;
16103
16104    my $entry_for_first_table; # The entry for the first table output.
16105                           # Almost certainly, it is the parent.
16106
16107    # For each related table (including itself), we will generate a pod entry
16108    # for each name each table goes by
16109    foreach my $table ($input_table, $input_table->children) {
16110
16111        # Unicode::UCD cannot deal with null string property values, so skip
16112        # any tables that have no non-null names.
16113        next if ! grep { $_->name ne "" } $table->aliases;
16114
16115        # First, gather all the info that applies to this table as a whole.
16116
16117        push @zero_match_tables, $table if $count == 0
16118                                            # Don't mention special tables
16119                                            # as being zero length
16120                                           && $table->fate == $ORDINARY;
16121
16122        my $table_property = $table->property;
16123
16124        # The short name has all the underscores removed, while the full name
16125        # retains them.  Later, we decide whether to output a short synonym
16126        # for the full one, we need to compare apples to apples, so we use the
16127        # short name's length including underscores.
16128        my $table_property_short_name_length;
16129        my $table_property_short_name
16130            = $table_property->short_name(\$table_property_short_name_length);
16131        my $table_property_full_name = $table_property->full_name;
16132
16133        # Get how much savings there is in the short name over the full one
16134        # (delta will always be <= 0)
16135        my $table_property_short_delta = $table_property_short_name_length
16136                                         - length($table_property_full_name);
16137        my @table_description = $table->description;
16138        my @table_note = $table->note;
16139
16140        # Generate an entry for each alias in this table.
16141        my $entry_for_first_alias;  # saves the first one encountered.
16142        foreach my $alias ($table->aliases) {
16143
16144            # Skip if not to go in pod.
16145            next unless $alias->make_re_pod_entry;
16146
16147            # Start gathering all the components for the entry
16148            my $name = $alias->name;
16149
16150            # Skip if name is empty, as can't be accessed by regexes.
16151            next if $name eq "";
16152
16153            my $entry;      # Holds the left column, may include extras
16154            my $entry_ref;  # To refer to the left column's contents from
16155                            # another entry; has no extras
16156
16157            # First the left column of the pod entry.  Tables for the $perl
16158            # property always use the single form.
16159            if ($table_property == $perl) {
16160                $entry = "\\p{$name}";
16161                $entry .= " \\p$name" if length $name == 1; # Show non-braced
16162                                                            # form too
16163                $entry_ref = "\\p{$name}";
16164            }
16165            else {    # Compound form.
16166
16167                # Only generate one entry for all the aliases that mean true
16168                # or false in binary properties.  Append a '*' to indicate
16169                # some are missing.  (The heading comment notes this.)
16170                my $rhs;
16171                if ($type == $BINARY) {
16172                    next if $name ne 'N' && $name ne 'Y';
16173                    $rhs = "$name*";
16174                }
16175                elsif ($type != $FORCED_BINARY) {
16176                    $rhs = $name;
16177                }
16178                else {
16179
16180                    # Forced binary properties require special handling.  It
16181                    # has two sets of tables, one set is true/false; and the
16182                    # other set is everything else.  Entries are generated for
16183                    # each set.  Use the Bidi_Mirrored property (which appears
16184                    # in all Unicode versions) to get a list of the aliases
16185                    # for the true/false tables.  Of these, only output the N
16186                    # and Y ones, the same as, a regular binary property.  And
16187                    # output all the rest, same as a non-binary property.
16188                    my $bm = property_ref("Bidi_Mirrored");
16189                    if ($name eq 'N' || $name eq 'Y') {
16190                        $rhs = "$name*";
16191                    } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases,
16192                                                        $bm->table("N")->aliases)
16193                    {
16194                        next;
16195                    }
16196                    else {
16197                        $rhs = $name;
16198                    }
16199                }
16200
16201                # Colon-space is used to give a little more space to be easier
16202                # to read;
16203                $entry = "\\p{"
16204                        . $table_property_full_name
16205                        . ": $rhs}";
16206
16207                # But for the reference to this entry, which will go in the
16208                # right column, where space is at a premium, use equals
16209                # without a space
16210                $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
16211            }
16212
16213            # Then the right (info) column.  This is stored as components of
16214            # an array for the moment, then joined into a string later.  For
16215            # non-internal only properties, begin the info with the entry for
16216            # the first table we encountered (if any), as things are ordered
16217            # so that that one is the most descriptive.  This leads to the
16218            # info column of an entry being a more descriptive version of the
16219            # name column
16220            my @info;
16221            if ($name =~ /^_/) {
16222                push @info,
16223                        '(For internal use by Perl, not necessarily stable)';
16224            }
16225            elsif ($entry_for_first_alias) {
16226                push @info, $entry_for_first_alias;
16227            }
16228
16229            # If this entry is equivalent to another, add that to the info,
16230            # using the first such table we encountered
16231            if ($entry_for_first_table) {
16232                if (@info) {
16233                    push @info, "(= $entry_for_first_table)";
16234                }
16235                else {
16236                    push @info, $entry_for_first_table;
16237                }
16238            }
16239
16240            # If the name is a large integer, add an equivalent with an
16241            # exponent for better readability
16242            if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
16243                push @info, sprintf "(= %.1e)", $name
16244            }
16245
16246            my $parenthesized = "";
16247            if (! $entry_for_first_alias) {
16248
16249                # This is the first alias for the current table.  The alias
16250                # array is ordered so that this is the fullest, most
16251                # descriptive alias, so it gets the fullest info.  The other
16252                # aliases are mostly merely pointers to this one, using the
16253                # information already added above.
16254
16255                # Display any status message, but only on the parent table
16256                if ($status && ! $entry_for_first_table) {
16257                    push @info, $status_info;
16258                }
16259
16260                # Put out any descriptive info
16261                if (@table_description || @table_note) {
16262                    push @info, join "; ", @table_description, @table_note;
16263                }
16264
16265                # Look to see if there is a shorter name we can point people
16266                # at
16267                my $standard_name = standardize($name);
16268                my $short_name;
16269                my $proposed_short = $table->short_name;
16270                if (defined $proposed_short) {
16271                    my $standard_short = standardize($proposed_short);
16272
16273                    # If the short name is shorter than the standard one, or
16274                    # even if it's not, but the combination of it and its
16275                    # short property name (as in \p{prop=short} ($perl doesn't
16276                    # have this form)) saves at least two characters, then,
16277                    # cause it to be listed as a shorter synonym.
16278                    if (length $standard_short < length $standard_name
16279                        || ($table_property != $perl
16280                            && (length($standard_short)
16281                                - length($standard_name)
16282                                + $table_property_short_delta)  # (<= 0)
16283                                < -2))
16284                    {
16285                        $short_name = $proposed_short;
16286                        if ($table_property != $perl) {
16287                            $short_name = $table_property_short_name
16288                                          . "=$short_name";
16289                        }
16290                        $short_name = "\\p{$short_name}";
16291                    }
16292                }
16293
16294                # And if this is a compound form name, see if there is a
16295                # single form equivalent
16296                my $single_form;
16297                if ($table_property != $perl && $table_property != $block) {
16298
16299                    # Special case the binary N tables, so that will print
16300                    # \P{single}, but use the Y table values to populate
16301                    # 'single', as we haven't likewise populated the N table.
16302                    # For forced binary tables, we can't just look at the N
16303                    # table, but must see if this table is equivalent to the N
16304                    # one, as there are two equivalent beasts in these
16305                    # properties.
16306                    my $test_table;
16307                    my $p;
16308                    if (   ($type == $BINARY
16309                            && $input_table == $property->table('No'))
16310                        || ($type == $FORCED_BINARY
16311                            && $property->table('No')->
16312                                        is_set_equivalent_to($input_table)))
16313                    {
16314                        $test_table = $property->table('Yes');
16315                        $p = 'P';
16316                    }
16317                    else {
16318                        $test_table = $input_table;
16319                        $p = 'p';
16320                    }
16321
16322                    # Look for a single form amongst all the children.
16323                    foreach my $table ($test_table->children) {
16324                        next if $table->property != $perl;
16325                        my $proposed_name = $table->short_name;
16326                        next if ! defined $proposed_name;
16327
16328                        # Don't mention internal-only properties as a possible
16329                        # single form synonym
16330                        next if substr($proposed_name, 0, 1) eq '_';
16331
16332                        $proposed_name = "\\$p\{$proposed_name}";
16333                        if (! defined $single_form
16334                            || length($proposed_name) < length $single_form)
16335                        {
16336                            $single_form = $proposed_name;
16337
16338                            # The goal here is to find a single form; not the
16339                            # shortest possible one.  We've already found a
16340                            # short name.  So, stop at the first single form
16341                            # found, which is likely to be closer to the
16342                            # original.
16343                            last;
16344                        }
16345                    }
16346                }
16347
16348                # Output both short and single in the same parenthesized
16349                # expression, but with only one of 'Single', 'Short' if there
16350                # are both items.
16351                if ($short_name || $single_form || $table->conflicting) {
16352                    $parenthesized .= "Short: $short_name" if $short_name;
16353                    if ($short_name && $single_form) {
16354                        $parenthesized .= ', ';
16355                    }
16356                    elsif ($single_form) {
16357                        $parenthesized .= 'Single: ';
16358                    }
16359                    $parenthesized .= $single_form if $single_form;
16360                }
16361            }
16362
16363            if ($caseless_equivalent != 0) {
16364                $parenthesized .=  '; ' if $parenthesized ne "";
16365                $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
16366            }
16367
16368
16369            # Warn if this property isn't the same as one that a
16370            # semi-casual user might expect.  The other components of this
16371            # parenthesized structure are calculated only for the first entry
16372            # for this table, but the conflicting is deemed important enough
16373            # to go on every entry.
16374            my $conflicting = join " NOR ", $table->conflicting;
16375            if ($conflicting) {
16376                $parenthesized .=  '; ' if $parenthesized ne "";
16377                $parenthesized .= "NOT $conflicting";
16378            }
16379
16380            push @info, "($parenthesized)" if $parenthesized;
16381
16382            if ($name =~ /_$/ && $alias->loose_match) {
16383                push @info, "Note the trailing '_' matters in spite of loose matching rules.";
16384            }
16385
16386            if ($table_property != $perl && $table->perl_extension) {
16387                push @info, '(Perl extension)';
16388            }
16389            my $definition = $table->definition // "";
16390            $definition = "" if $entry_for_first_alias;
16391            $definition = ": $definition" if $definition;
16392            push @info, "($string_count$definition)";
16393
16394            # Now, we have both the entry and info so add them to the
16395            # list of all the properties.
16396            push @match_properties,
16397                format_pod_line($indent_info_column,
16398                                $entry,
16399                                join( " ", @info),
16400                                $alias->status,
16401                                $alias->loose_match);
16402
16403            $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
16404        } # End of looping through the aliases for this table.
16405
16406        if (! $entry_for_first_table) {
16407            $entry_for_first_table = $entry_for_first_alias;
16408        }
16409    } # End of looping through all the related tables
16410    return;
16411}
16412
16413sub make_ucd_table_pod_entries($table) {
16414    # Generate the entries for the UCD section of the pod for $table.  This
16415    # also calculates if names are ambiguous, so has to be called even if the
16416    # pod is not being output
16417
16418    my $short_name = $table->name;
16419    my $standard_short_name = standardize($short_name);
16420    my $full_name = $table->full_name;
16421    my $standard_full_name = standardize($full_name);
16422
16423    my $full_info = "";     # Text of info column for full-name entries
16424    my $other_info = "";    # Text of info column for short-name entries
16425    my $short_info = "";    # Text of info column for other entries
16426    my $meaning = "";       # Synonym of this table
16427
16428    my $property = ($table->isa('Property'))
16429                   ? $table
16430                   : $table->parent->property;
16431
16432    my $perl_extension = $table->perl_extension;
16433    my $is_perl_extension_match_table_but_not_dollar_perl
16434                                                        = $property != $perl
16435                                                       && $perl_extension
16436                                                       && $property != $table;
16437
16438    # Get the more official name for perl extensions that aren't
16439    # stand-alone properties
16440    if ($is_perl_extension_match_table_but_not_dollar_perl) {
16441        if ($property->type == $BINARY) {
16442            $meaning = $property->full_name;
16443        }
16444        else {
16445            $meaning = $table->parent->complete_name;
16446        }
16447    }
16448
16449    # There are three types of info column.  One for the short name, one for
16450    # the full name, and one for everything else.  They mostly are the same,
16451    # so initialize in the same loop.
16452
16453    foreach my $info_ref (\$full_info, \$short_info, \$other_info) {
16454        if ($info_ref != \$full_info) {
16455
16456            # The non-full name columns include the full name
16457            $$info_ref .= $full_name;
16458        }
16459
16460
16461        if ($is_perl_extension_match_table_but_not_dollar_perl) {
16462
16463            # Add the synonymous name for the non-full name entries; and to
16464            # the full-name entry if it adds extra information
16465            if (   standardize($meaning) ne $standard_full_name
16466                || $info_ref == \$other_info
16467                || $info_ref == \$short_info)
16468            {
16469                my $parenthesized =  $info_ref != \$full_info;
16470                $$info_ref .= " " if $$info_ref && $parenthesized;
16471                $$info_ref .= "(=" if $parenthesized;
16472                $$info_ref .= "$meaning";
16473                $$info_ref .= ")" if $parenthesized;
16474                $$info_ref .= ".";
16475            }
16476        }
16477
16478        # And the full-name entry includes the short name, if shorter
16479        if ($info_ref == \$full_info
16480            && length $standard_short_name < length $standard_full_name)
16481        {
16482            $full_info =~ s/\.\Z//;
16483            $full_info .= "  " if $full_info;
16484            $full_info .= "(Short: $short_name)";
16485        }
16486
16487        if ($table->perl_extension) {
16488            $$info_ref =~ s/\.\Z//;
16489            $$info_ref .= ".  " if $$info_ref;
16490            $$info_ref .= "(Perl extension)";
16491        }
16492    }
16493
16494    my $definition;
16495    my $definition_table;
16496    my $type = $table->property->type;
16497    if ($type == $BINARY || $type == $FORCED_BINARY) {
16498        $definition_table = $table->property->table('Y');
16499    }
16500    elsif ($table->isa('Match_Table')) {
16501        $definition_table = $table;
16502    }
16503
16504    $definition = $definition_table->calculate_table_definition
16505                                            if defined $definition_table
16506                                                    && $definition_table != 0;
16507
16508    # Add any extra annotations to the full name entry
16509    foreach my $more_info ($table->description,
16510                            $definition,
16511                            $table->note,
16512                            $table->status_info)
16513    {
16514        next unless $more_info;
16515        $full_info =~ s/\.\Z//;
16516        $full_info .= ".  " if $full_info;
16517        $full_info .= $more_info;
16518    }
16519    if ($table->property->type == $FORCED_BINARY) {
16520        if ($full_info) {
16521            $full_info =~ s/\.\Z//;
16522            $full_info .= ".  ";
16523        }
16524        $full_info .= "This is a combination property which has both:"
16525                    . " 1) a map to various string values; and"
16526                    . " 2) a map to boolean Y/N, where 'Y' means the"
16527                    . " string value is non-empty.  Add the prefix 'is'"
16528                    . " to the prop_invmap() call to get the latter";
16529    }
16530
16531    # These keep track if have created full and short name pod entries for the
16532    # property
16533    my $done_full = 0;
16534    my $done_short = 0;
16535
16536    # Every possible name is kept track of, even those that aren't going to be
16537    # output.  This way we can be sure to find the ambiguities.
16538    foreach my $alias ($table->aliases) {
16539        my $name = $alias->name;
16540        my $standard = standardize($name);
16541        my $info;
16542        my $output_this = $alias->ucd;
16543
16544        # If the full and short names are the same, we want to output the full
16545        # one's entry, so it has priority.
16546        if ($standard eq $standard_full_name) {
16547            next if $done_full;
16548            $done_full = 1;
16549            $info = $full_info;
16550        }
16551        elsif ($standard eq $standard_short_name) {
16552            next if $done_short;
16553            $done_short = 1;
16554            next if $standard_short_name eq $standard_full_name;
16555            $info = $short_info;
16556        }
16557        else {
16558            $info = $other_info;
16559        }
16560
16561        $combination_property{$standard} = 1
16562                                  if $table->property->type == $FORCED_BINARY;
16563
16564        # Here, we have set up the two columns for this entry.  But if an
16565        # entry already exists for this name, we have to decide which one
16566        # we're going to later output.
16567        if (exists $ucd_pod{$standard}) {
16568
16569            # If the two entries refer to the same property, it's not going to
16570            # be ambiguous.  (Likely it's because the names when standardized
16571            # are the same.)  But that means if they are different properties,
16572            # there is ambiguity.
16573            if ($ucd_pod{$standard}->{'property'} != $property) {
16574
16575                # Here, we have an ambiguity.  This code assumes that one is
16576                # scheduled to be output and one not and that one is a perl
16577                # extension (which is not to be output) and the other isn't.
16578                # If those assumptions are wrong, things have to be rethought.
16579                if ($ucd_pod{$standard}{'output_this'} == $output_this
16580                    || $ucd_pod{$standard}{'perl_extension'} == $perl_extension
16581                    || $output_this == $perl_extension)
16582                {
16583                    Carp::my_carp("Bad news.  $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations.  Proceeding anyway.");
16584                }
16585
16586                # We modify the info column of the one being output to
16587                # indicate the ambiguity.  Set $which to point to that one's
16588                # info.
16589                my $which;
16590                if ($ucd_pod{$standard}{'output_this'}) {
16591                    $which = \$ucd_pod{$standard}->{'info'};
16592                }
16593                else {
16594                    $which = \$info;
16595                    $meaning = $ucd_pod{$standard}{'meaning'};
16596                }
16597
16598                chomp $$which;
16599                $$which =~ s/\.\Z//;
16600                $$which .= "; NOT '$standard' meaning '$meaning'";
16601
16602                $ambiguous_names{$standard} = 1;
16603            }
16604
16605            # Use the non-perl-extension variant
16606            next unless $ucd_pod{$standard}{'perl_extension'};
16607        }
16608
16609        # Store enough information about this entry that we can later look for
16610        # ambiguities, and output it properly.
16611        $ucd_pod{$standard} = { 'name' => $name,
16612                                'info' => $info,
16613                                'meaning' => $meaning,
16614                                'output_this' => $output_this,
16615                                'perl_extension' => $perl_extension,
16616                                'property' => $property,
16617                                'status' => $alias->status,
16618        };
16619    } # End of looping through all this table's aliases
16620
16621    return;
16622}
16623
16624sub pod_alphanumeric_sort {
16625    # Sort pod entries alphanumerically.
16626
16627    # The first few character columns are filler, plus the '\p{'; and get rid
16628    # of all the trailing stuff, starting with the trailing '}', so as to sort
16629    # on just 'Name=Value'
16630    (my $a = lc $a) =~ s/^ .*? \{ //x;
16631    $a =~ s/}.*//;
16632    (my $b = lc $b) =~ s/^ .*? \{ //x;
16633    $b =~ s/}.*//;
16634
16635    # Determine if the two operands are both internal only or both not.
16636    # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
16637    # should be the underscore that begins internal only
16638    my $a_is_internal = (substr($a, 0, 1) eq '_');
16639    my $b_is_internal = (substr($b, 0, 1) eq '_');
16640
16641    # Sort so the internals come last in the table instead of first (which the
16642    # leading underscore would otherwise indicate).
16643    if ($a_is_internal != $b_is_internal) {
16644        return 1 if $a_is_internal;
16645        return -1
16646    }
16647
16648    # Determine if the two operands are compound or not, and if so if are
16649    # "numeric" property values or not, like \p{Age: 3.0}.  But there are also
16650    # things like \p{Canonical_Combining_Class: CCC133} and \p{Age: V10_0},
16651    # all of which this considers numeric, and for sorting, looks just at the
16652    # numeric parts.  It can also be a rational like \p{Numeric Value=-1/2}.
16653    my $split_re = qr/
16654        ^ ( [^:=]+ ) # $1 is undef if not a compound form, otherwise is the
16655                     # property name
16656        [:=] \s*     # The syntax for the compound form
16657        (?:          # followed by ...
16658            (        # $2 gets defined if what follows is a "numeric"
16659                     # expression, which is ...
16660              ( -? \d+ (?: [.\/] \d+)?  # An integer, float, or rational
16661                                        # number, optionally signed
16662               | [[:alpha:]]{2,} \d+ $ ) # or something like CCC131.  Either
16663                                         # of these go into $3
16664             | ( V \d+ _ \d+ )           # or a Unicode's Age property version
16665                                         # number, into $4
16666            )
16667            | .* $    # If not "numeric", accept anything so that $1 gets
16668                      # defined if it is any compound form
16669        ) /ix;
16670    my ($a_initial, $a_numeric, $a_number, $a_version) = ($a =~ $split_re);
16671    my ($b_initial, $b_numeric, $b_number, $b_version) = ($b =~ $split_re);
16672
16673    # Sort alphabeticlly on the whole property name if either operand isn't
16674    # compound, or they differ.
16675    return $a cmp $b if   ! defined $a_initial
16676                       || ! defined $b_initial
16677                       || $a_initial ne $b_initial;
16678
16679    if (! defined $a_numeric) {
16680
16681        # If neither is numeric, use alpha sort
16682        return $a cmp $b if ! defined $b_numeric;
16683        return 1;  # Sort numeric ahead of alpha
16684    }
16685
16686    # Here $a is numeric
16687    return -1 if ! defined $b_numeric;  # Numeric sorts before alpha
16688
16689    # Here they are both numeric in the same property.
16690    # Convert version numbers into regular numbers
16691    if (defined $a_version) {
16692        ($a_number = $a_version) =~ s/^V//i;
16693        $a_number =~ s/_/./;
16694    }
16695    else {  # Otherwise get rid of the, e.g., CCC in CCC9 */
16696        $a_number =~ s/ ^ [[:alpha:]]+ //x;
16697    }
16698    if (defined $b_version) {
16699        ($b_number = $b_version) =~ s/^V//i;
16700        $b_number =~ s/_/./;
16701    }
16702    else {
16703        $b_number =~ s/ ^ [[:alpha:]]+ //x;
16704    }
16705
16706    # Convert rationals to floating for the comparison.
16707    $a_number = eval $a_number if $a_number =~ qr{/};
16708    $b_number = eval $b_number if $b_number =~ qr{/};
16709
16710    return $a_number <=> $b_number || $a cmp $b;
16711}
16712
16713sub make_pod () {
16714    # Create the .pod file.  This generates the various subsections and then
16715    # combines them in one big HERE document.
16716
16717    my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
16718
16719    return unless defined $pod_directory;
16720    print "Making pod file\n" if $verbosity >= $PROGRESS;
16721
16722    my $exception_message =
16723    '(Any exceptions are individually noted beginning with the word NOT.)';
16724    my @block_warning;
16725    if (-e 'Blocks.txt') {
16726
16727        # Add the line: '\p{In_*}    \p{Block: *}', with the warning message
16728        # if the global $has_In_conflicts indicates we have them.
16729        push @match_properties, format_pod_line($indent_info_column,
16730                                                '\p{In_*}',
16731                                                '\p{Block: *}'
16732                                                    . (($has_In_conflicts)
16733                                                      ? " $exception_message"
16734                                                      : ""),
16735                                                 $DISCOURAGED);
16736        @block_warning = << "END";
16737
16738In particular, matches in the Block property have single forms
16739defined by Perl that begin with C<"In_">, C<"Is_>, or even with no prefix at
16740all,  Like all B<DISCOURAGED> forms, these are not stable.  For example,
16741C<\\p{Block=Deseret}> can currently be written as C<\\p{In_Deseret}>,
16742C<\\p{Is_Deseret}>, or C<\\p{Deseret}>.  But, a new Unicode version may
16743come along that would force Perl to change the meaning of one or more of
16744these, and your program would no longer be correct.  Currently there are no
16745such conflicts with the form that begins C<"In_">, but there are many with the
16746other two shortcuts, and Unicode continues to define new properties that begin
16747with C<"In">, so it's quite possible that a conflict will occur in the future.
16748The compound form is guaranteed to not become obsolete, and its meaning is
16749clearer anyway.  See L<perlunicode/"Blocks"> for more information about this.
16750
16751User-defined properties must begin with "In" or "Is".  These override any
16752Unicode property of the same name.
16753END
16754    }
16755    my $text = $Is_flags_text;
16756    $text = "$exception_message $text" if $has_Is_conflicts;
16757
16758    # And the 'Is_ line';
16759    push @match_properties, format_pod_line($indent_info_column,
16760                                            '\p{Is_*}',
16761                                            "\\p{*} $text");
16762    push @match_properties, format_pod_line($indent_info_column,
16763            '\p{Name=*}',
16764            "Combination of Name and Name_Alias properties; has special"
16765          . " loose matching rules, for which see Unicode UAX #44");
16766    push @match_properties, format_pod_line($indent_info_column,
16767                                            '\p{Na=*}',
16768                                            '\p{Name=*}');
16769
16770    # Sort the properties array for output.  It is sorted alphabetically
16771    # except numerically for numeric properties, and only output unique lines.
16772    @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
16773
16774    my $formatted_properties = simple_fold(\@match_properties,
16775                                        "",
16776                                        # indent succeeding lines by two extra
16777                                        # which looks better
16778                                        $indent_info_column + 2,
16779
16780                                        # shorten the line length by how much
16781                                        # the formatter indents, so the folded
16782                                        # line will fit in the space
16783                                        # presumably available
16784                                        $automatic_pod_indent);
16785    # Add column headings, indented to be a little more centered, but not
16786    # exactly
16787    $formatted_properties =  format_pod_line($indent_info_column,
16788                                                    '    NAME',
16789                                                    '           INFO')
16790                                    . "\n"
16791                                    . $formatted_properties;
16792
16793    # Generate pod documentation lines for the tables that match nothing
16794    my $zero_matches = "";
16795    if (@zero_match_tables) {
16796        @zero_match_tables = uniques(@zero_match_tables);
16797        $zero_matches = join "\n\n",
16798                        map { $_ = '=item \p{' . $_->complete_name . "}" }
16799                            sort { $a->complete_name cmp $b->complete_name }
16800                            @zero_match_tables;
16801
16802        $zero_matches = <<END;
16803
16804=head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
16805
16806Unicode has some property-value pairs that currently don't match anything.
16807This happens generally either because they are obsolete, or they exist for
16808symmetry with other forms, but no language has yet been encoded that uses
16809them.  In this version of Unicode, the following match zero code points:
16810
16811=over 4
16812
16813$zero_matches
16814
16815=back
16816
16817END
16818    }
16819
16820    # Generate list of properties that we don't accept, grouped by the reasons
16821    # why.  This is so only put out the 'why' once, and then list all the
16822    # properties that have that reason under it.
16823
16824    my %why_list;   # The keys are the reasons; the values are lists of
16825                    # properties that have the key as their reason
16826
16827    # For each property, add it to the list that are suppressed for its reason
16828    # The sort will cause the alphabetically first properties to be added to
16829    # each list first, so each list will be sorted.
16830    foreach my $property (sort keys %why_suppressed) {
16831        next unless $why_suppressed{$property};
16832        push @{$why_list{$why_suppressed{$property}}}, $property;
16833    }
16834
16835    # For each reason (sorted by the first property that has that reason)...
16836    my @bad_re_properties;
16837    foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
16838                     keys %why_list)
16839    {
16840        # Add to the output, all the properties that have that reason.
16841        my $has_item = 0;   # Flag if actually output anything.
16842        foreach my $name (@{$why_list{$why}}) {
16843
16844            # Split compound names into $property and $table components
16845            my $property = $name;
16846            my $table;
16847            if ($property =~ / (.*) = (.*) /x) {
16848                $property = $1;
16849                $table = $2;
16850            }
16851
16852            # This release of Unicode may not have a property that is
16853            # suppressed, so don't reference a non-existent one.
16854            $property = property_ref($property);
16855            next if ! defined $property;
16856
16857            # And since this list is only for match tables, don't list the
16858            # ones that don't have match tables.
16859            next if ! $property->to_create_match_tables;
16860
16861            # Find any abbreviation, and turn it into a compound name if this
16862            # is a property=value pair.
16863            my $short_name = $property->name;
16864            $short_name .= '=' . $property->table($table)->name if $table;
16865
16866            # Start with an empty line.
16867            push @bad_re_properties, "\n\n" unless $has_item;
16868
16869            # And add the property as an item for the reason.
16870            push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
16871            $has_item = 1;
16872        }
16873
16874        # And add the reason under the list of properties, if such a list
16875        # actually got generated.  Note that the header got added
16876        # unconditionally before.  But pod ignores extra blank lines, so no
16877        # harm.
16878        push @bad_re_properties, "\n$why\n" if $has_item;
16879
16880    } # End of looping through each reason.
16881
16882    if (! @bad_re_properties) {
16883        push @bad_re_properties,
16884                "*** This installation accepts ALL non-Unihan properties ***";
16885    }
16886    else {
16887        # Add =over only if non-empty to avoid an empty =over/=back section,
16888        # which is considered bad form.
16889        unshift @bad_re_properties, "\n=over 4\n";
16890        push @bad_re_properties, "\n=back\n";
16891    }
16892
16893    # Similarly, generate a list of files that we don't use, grouped by the
16894    # reasons why (Don't output if the reason is empty).  First, create a hash
16895    # whose keys are the reasons, and whose values are anonymous arrays of all
16896    # the files that share that reason.
16897    my %grouped_by_reason;
16898    foreach my $file (keys %skipped_files) {
16899        next unless $skipped_files{$file};
16900        push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
16901    }
16902
16903    # Then, sort each group.
16904    foreach my $group (keys %grouped_by_reason) {
16905        @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b }
16906                                        @{$grouped_by_reason{$group}} ;
16907    }
16908
16909    # Finally, create the output text.  For each reason (sorted by the
16910    # alphabetically first file that has that reason)...
16911    my @unused_files;
16912    foreach my $reason (sort { lc $grouped_by_reason{$a}->[0]
16913                               cmp lc $grouped_by_reason{$b}->[0]
16914                              }
16915                         keys %grouped_by_reason)
16916    {
16917        # Add all the files that have that reason to the output.  Start
16918        # with an empty line.
16919        push @unused_files, "\n\n";
16920        push @unused_files, map { "\n=item F<$_> \n" }
16921                            @{$grouped_by_reason{$reason}};
16922        # And add the reason under the list of files
16923        push @unused_files, "\n$reason\n";
16924    }
16925
16926    # Similarly, create the output text for the UCD section of the pod
16927    my @ucd_pod;
16928    foreach my $key (keys %ucd_pod) {
16929        next unless $ucd_pod{$key}->{'output_this'};
16930        push @ucd_pod, format_pod_line($indent_info_column,
16931                                       $ucd_pod{$key}->{'name'},
16932                                       $ucd_pod{$key}->{'info'},
16933                                       $ucd_pod{$key}->{'status'},
16934                                      );
16935    }
16936
16937    # Sort alphabetically, and fold for output
16938    @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod;
16939    my $ucd_pod = simple_fold(\@ucd_pod,
16940                           ' ',
16941                           $indent_info_column,
16942                           $automatic_pod_indent);
16943    $ucd_pod =  format_pod_line($indent_info_column, 'NAME', '  INFO')
16944                . "\n"
16945                . $ucd_pod;
16946    my $space_hex = sprintf("%02x", ord " ");
16947    local $" = "";
16948
16949    # Everything is ready to assemble.
16950    my @OUT = << "END";
16951=begin comment
16952
16953$HEADER
16954
16955To change this file, edit $0 instead.
16956
16957=end comment
16958
16959=head1 NAME
16960
16961$pod_file - Index of Unicode Version $unicode_version character properties in Perl
16962
16963=head1 DESCRIPTION
16964
16965This document provides information about the portion of the Unicode database
16966that deals with character properties, that is the portion that is defined on
16967single code points.  (L</Other information in the Unicode data base>
16968below briefly mentions other data that Unicode provides.)
16969
16970Perl can provide access to all non-provisional Unicode character properties,
16971though not all are enabled by default.  The omitted ones are the Unihan
16972properties and certain
16973deprecated or Unicode-internal properties.  (An installation may choose to
16974recompile Perl's tables to change this.  See L</Unicode character
16975properties that are NOT accepted by Perl>.)
16976
16977For most purposes, access to Unicode properties from the Perl core is through
16978regular expression matches, as described in the next section.
16979For some special purposes, and to access the properties that are not suitable
16980for regular expression matching, all the Unicode character properties that
16981Perl handles are accessible via the standard L<Unicode::UCD> module, as
16982described in the section L</Properties accessible through Unicode::UCD>.
16983
16984Perl also provides some additional extensions and short-cut synonyms
16985for Unicode properties.
16986
16987This document merely lists all available properties and does not attempt to
16988explain what each property really means.  There is a brief description of each
16989Perl extension; see L<perlunicode/Other Properties> for more information on
16990these.  There is some detail about Blocks, Scripts, General_Category,
16991and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
16992official Unicode properties, refer to the Unicode standard.  A good starting
16993place is L<$unicode_reference_url>.
16994
16995Note that you can define your own properties; see
16996L<perlunicode/"User-Defined Character Properties">.
16997
16998=head1 Properties accessible through C<\\p{}> and C<\\P{}>
16999
17000The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
17001most of the Unicode character properties.  The table below shows all these
17002constructs, both single and compound forms.
17003
17004B<Compound forms> consist of two components, separated by an equals sign or a
17005colon.  The first component is the property name, and the second component is
17006the particular value of the property to match against, for example,
17007C<\\p{Script_Extensions: Greek}> and C<\\p{Script_Extensions=Greek}> both mean
17008to match characters whose Script_Extensions property value is Greek.
17009(C<Script_Extensions> is an improved version of the C<Script> property.)
17010
17011B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
17012their equivalent compound forms.  The table shows these equivalences.  (In our
17013example, C<\\p{Greek}> is a just a shortcut for
17014C<\\p{Script_Extensions=Greek}>).  There are also a few Perl-defined single
17015forms that are not shortcuts for a compound form.  One such is C<\\p{Word}>.
17016These are also listed in the table.
17017
17018In parsing these constructs, Perl always ignores Upper/lower case differences
17019everywhere within the {braces}.  Thus C<\\p{Greek}> means the same thing as
17020C<\\p{greek}>.  But note that changing the case of the C<"p"> or C<"P"> before
17021the left brace completely changes the meaning of the construct, from "match"
17022(for C<\\p{}>) to "doesn't match" (for C<\\P{}>).  Casing in this document is
17023for improved legibility.
17024
17025Also, white space, hyphens, and underscores are normally ignored
17026everywhere between the {braces}, and hence can be freely added or removed
17027even if the C</x> modifier hasn't been specified on the regular expression.
17028But in the table below $a_bold_stricter at the beginning of an entry
17029means that tighter (stricter) rules are used for that entry:
17030
17031=over 4
17032
17033=over 4
17034
17035=item Single form (C<\\p{name}>) tighter rules:
17036
17037White space, hyphens, and underscores ARE significant
17038except for:
17039
17040=over 4
17041
17042=item * white space adjacent to a non-word character
17043
17044=item * underscores separating digits in numbers
17045
17046=back
17047
17048That means, for example, that you can freely add or remove white space
17049adjacent to (but within) the braces without affecting the meaning.
17050
17051=item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
17052
17053The tighter rules given above for the single form apply to everything to the
17054right of the colon or equals; the looser rules still apply to everything to
17055the left.
17056
17057That means, for example, that you can freely add or remove white space
17058adjacent to (but within) the braces and the colon or equal sign.
17059
17060=back
17061
17062=back
17063
17064Some properties are considered obsolete by Unicode, but still available.
17065There are several varieties of obsolescence:
17066
17067=over 4
17068
17069=over 4
17070
17071=item Stabilized
17072
17073A property may be stabilized.  Such a determination does not indicate
17074that the property should or should not be used; instead it is a declaration
17075that the property will not be maintained nor extended for newly encoded
17076characters.  Such properties are marked with $a_bold_stabilized in the
17077table.
17078
17079=item Deprecated
17080
17081A property may be deprecated, perhaps because its original intent
17082has been replaced by another property, or because its specification was
17083somehow defective.  This means that its use is strongly
17084discouraged, so much so that a warning will be issued if used, unless the
17085regular expression is in the scope of a C<S<no warnings 'deprecated'>>
17086statement.  $A_bold_deprecated flags each such entry in the table, and
17087the entry there for the longest, most descriptive version of the property will
17088give the reason it is deprecated, and perhaps advice.  Perl may issue such a
17089warning, even for properties that aren't officially deprecated by Unicode,
17090when there used to be characters or code points that were matched by them, but
17091no longer.  This is to warn you that your program may not work like it did on
17092earlier Unicode releases.
17093
17094A deprecated property may be made unavailable in a future Perl version, so it
17095is best to move away from them.
17096
17097A deprecated property may also be stabilized, but this fact is not shown.
17098
17099=item Obsolete
17100
17101Properties marked with $a_bold_obsolete in the table are considered (plain)
17102obsolete.  Generally this designation is given to properties that Unicode once
17103used for internal purposes (but not any longer).
17104
17105=item Discouraged
17106
17107This is not actually a Unicode-specified obsolescence, but applies to certain
17108Perl extensions that are present for backwards compatibility, but are
17109discouraged from being used.  These are not obsolete, but their meanings are
17110not stable.  Future Unicode versions could force any of these extensions to be
17111removed without warning, replaced by another property with the same name that
17112means something different.  $A_bold_discouraged flags each such entry in the
17113table.  Use the equivalent shown instead.
17114
17115@block_warning
17116
17117=back
17118
17119=back
17120
17121The table below has two columns.  The left column contains the C<\\p{}>
17122constructs to look up, possibly preceded by the flags mentioned above; and
17123the right column contains information about them, like a description, or
17124synonyms.  The table shows both the single and compound forms for each
17125property that has them.  If the left column is a short name for a property,
17126the right column will give its longer, more descriptive name; and if the left
17127column is the longest name, the right column will show any equivalent shortest
17128name, in both single and compound forms if applicable.
17129
17130If braces are not needed to specify a property (e.g., C<\\pL>), the left
17131column contains both forms, with and without braces.
17132
17133The right column will also caution you if a property means something different
17134than what might normally be expected.
17135
17136All single forms are Perl extensions; a few compound forms are as well, and
17137are noted as such.
17138
17139Numbers in (parentheses) indicate the total number of Unicode code points
17140matched by the property.  For the entries that give the longest, most
17141descriptive version of the property, the count is followed by a list of some
17142of the code points matched by it.  The list includes all the matched
17143characters in the 0-255 range, enclosed in the familiar [brackets] the same as
17144a regular expression bracketed character class.  Following that, the next few
17145higher matching ranges are also given.  To avoid visual ambiguity, the SPACE
17146character is represented as C<\\x$space_hex>.
17147
17148For emphasis, those properties that match no code points at all are listed as
17149well in a separate section following the table.
17150
17151Most properties match the same code points regardless of whether C<"/i">
17152case-insensitive matching is specified or not.  But a few properties are
17153affected.  These are shown with the notation S<C<(/i= I<other_property>)>>
17154in the second column.  Under case-insensitive matching they match the
17155same code pode points as the property I<other_property>.
17156
17157There is no description given for most non-Perl defined properties (See
17158L<$unicode_reference_url> for that).
17159
17160For compactness, 'B<*>' is used as a wildcard instead of showing all possible
17161combinations.  For example, entries like:
17162
17163 \\p{Gc: *}                                  \\p{General_Category: *}
17164
17165mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
17166for the latter is also valid for the former.  Similarly,
17167
17168 \\p{Is_*}                                   \\p{*}
17169
17170means that if and only if, for example, C<\\p{Foo}> exists, then
17171C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
17172And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
17173C<\\p{IsFoo=Bar}>.  "*" here is restricted to something not beginning with an
17174underscore.
17175
17176Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
17177And 'No', 'F', and 'False' are all synonyms for 'N'.  The table shows 'Y*' and
17178'N*' to indicate this, and doesn't have separate entries for the other
17179possibilities.  Note that not all properties which have values 'Yes' and 'No'
17180are binary, and they have all their values spelled out without using this wild
17181card, and a C<NOT> clause in their description that highlights their not being
17182binary.  These also require the compound form to match them, whereas true
17183binary properties have both single and compound forms available.
17184
17185Note that all non-essential underscores are removed in the display of the
17186short names below.
17187
17188B<Legend summary:>
17189
17190=over 4
17191
17192=item Z<>B<*> is a wild-card
17193
17194=item B<(\\d+)> in the info column gives the number of Unicode code points matched
17195by this property.
17196
17197=item B<$DEPRECATED> means this is deprecated.
17198
17199=item B<$OBSOLETE> means this is obsolete.
17200
17201=item B<$STABILIZED> means this is stabilized.
17202
17203=item B<$STRICTER> means tighter (stricter) name matching applies.
17204
17205=item B<$DISCOURAGED> means use of this form is discouraged, and may not be
17206stable.
17207
17208=back
17209
17210$formatted_properties
17211
17212$zero_matches
17213
17214=head1 Properties accessible through Unicode::UCD
17215
17216The value of any Unicode (not including Perl extensions) character
17217property mentioned above for any single code point is available through
17218L<Unicode::UCD/charprop()>.  L<Unicode::UCD/charprops_all()> returns the
17219values of all the Unicode properties for a given code point.
17220
17221Besides these, all the Unicode character properties mentioned above
17222(except for those marked as for internal use by Perl) are also
17223accessible by L<Unicode::UCD/prop_invlist()>.
17224
17225Due to their nature, not all Unicode character properties are suitable for
17226regular expression matches, nor C<prop_invlist()>.  The remaining
17227non-provisional, non-internal ones are accessible via
17228L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation
17229hasn't included; see L<below for which those are|/Unicode character properties
17230that are NOT accepted by Perl>).
17231
17232For compatibility with other parts of Perl, all the single forms given in the
17233table in the L<section above|/Properties accessible through \\p{} and \\P{}>
17234are recognized.  BUT, there are some ambiguities between some Perl extensions
17235and the Unicode properties, all of which are silently resolved in favor of the
17236official Unicode property.  To avoid surprises, you should only use
17237C<prop_invmap()> for forms listed in the table below, which omits the
17238non-recommended ones.  The affected forms are the Perl single form equivalents
17239of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of
17240C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property,
17241whose short name is C<sc>.  The table indicates the current ambiguities in the
17242INFO column, beginning with the word C<"NOT">.
17243
17244The standard Unicode properties listed below are documented in
17245L<$unicode_reference_url>; Perl_Decimal_Digit is documented in
17246L<Unicode::UCD/prop_invmap()>.  The other Perl extensions are in
17247L<perlunicode/Other Properties>;
17248
17249The first column in the table is a name for the property; the second column is
17250an alternative name, if any, plus possibly some annotations.  The alternative
17251name is the property's full name, unless that would simply repeat the first
17252column, in which case the second column indicates the property's short name
17253(if different).  The annotations are given only in the entry for the full
17254name.  The annotations for binary properties include a list of the first few
17255ranges that the property matches.  To avoid any ambiguity, the SPACE character
17256is represented as C<\\x$space_hex>.
17257
17258If a property is obsolete, etc, the entry will be flagged with the same
17259characters used in the table in the L<section above|/Properties accessible
17260through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>.
17261
17262$ucd_pod
17263
17264=head1 Properties accessible through other means
17265
17266Certain properties are accessible also via core function calls.  These are:
17267
17268 Lowercase_Mapping          lc() and lcfirst()
17269 Titlecase_Mapping          ucfirst()
17270 Uppercase_Mapping          uc()
17271
17272Also, Case_Folding is accessible through the C</i> modifier in regular
17273expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>>
17274operator.
17275
17276Besides being able to say C<\\p{Name=...}>, the Name and Name_Aliases
17277properties are accessible through the C<\\N{}> interpolation in double-quoted
17278strings and regular expressions; and functions C<charnames::viacode()>,
17279C<charnames::vianame()>, and C<charnames::string_vianame()> (which require a
17280C<use charnames ();> to be specified.
17281
17282Finally, most properties related to decomposition are accessible via
17283L<Unicode::Normalize>.
17284
17285=head1 Unicode character properties that are NOT accepted by Perl
17286
17287Perl will generate an error for a few character properties in Unicode when
17288used in a regular expression.  The non-Unihan ones are listed below, with the
17289reasons they are not accepted, perhaps with work-arounds.  The short names for
17290the properties are listed enclosed in (parentheses).
17291As described after the list, an installation can change the defaults and choose
17292to accept any of these.  The list is machine generated based on the
17293choices made for the installation that generated this document.
17294
17295@bad_re_properties
17296
17297An installation can choose to allow any of these to be matched by downloading
17298the Unicode database from L<http://www.unicode.org/Public/> to
17299C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
17300controlling lists contained in the program
17301C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
17302(C<\%Config> is available from the Config module).
17303
17304Also, perl can be recompiled to operate on an earlier version of the Unicode
17305standard.  Further information is at
17306C<\$Config{privlib}>/F<unicore/README.perl>.
17307
17308=head1 Other information in the Unicode data base
17309
17310The Unicode data base is delivered in two different formats.  The XML version
17311is valid for more modern Unicode releases.  The other version is a collection
17312of files.  The two are intended to give equivalent information.  Perl uses the
17313older form; this allows you to recompile Perl to use early Unicode releases.
17314
17315The only non-character property that Perl currently supports is Named
17316Sequences, in which a sequence of code points
17317is given a name and generally treated as a single entity.  (Perl supports
17318these via the C<\\N{...}> double-quotish construct,
17319L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>.
17320
17321Below is a list of the files in the Unicode data base that Perl doesn't
17322currently use, along with very brief descriptions of their purposes.
17323Some of the names of the files have been shortened from those that Unicode
17324uses, in order to allow them to be distinguishable from similarly named files
17325on file systems for which only the first 8 characters of a name are
17326significant.
17327
17328=over 4
17329
17330@unused_files
17331
17332=back
17333
17334=head1 SEE ALSO
17335
17336L<$unicode_reference_url>
17337
17338L<perlrecharclass>
17339
17340L<perlunicode>
17341
17342END
17343
17344    # And write it.  The 0 means no utf8.
17345    main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
17346    return;
17347}
17348
17349sub make_Name_pm () {
17350    # Create and write Name.pm, which contains subroutines and data to use in
17351    # conjunction with Name.pl
17352
17353    # Maybe there's nothing to do.
17354    return unless $has_hangul_syllables || @code_points_ending_in_code_point;
17355
17356    my @name = <<END;
17357$HEADER
17358$INTERNAL_ONLY_HEADER
17359
17360END
17361
17362    # Convert these structures to output format.
17363    my $code_points_ending_in_code_point =
17364        main::simple_dumper(\@code_points_ending_in_code_point,
17365                            ' ' x 8);
17366    my $names = main::simple_dumper(\%names_ending_in_code_point,
17367                                    ' ' x 8);
17368    my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
17369                                    ' ' x 8);
17370
17371    # Do the same with the Hangul names,
17372    my $jamo;
17373    my $jamo_l;
17374    my $jamo_v;
17375    my $jamo_t;
17376    my $jamo_re;
17377    if ($has_hangul_syllables) {
17378
17379        # Construct a regular expression of all the possible
17380        # combinations of the Hangul syllables.
17381        my @L_re;   # Leading consonants
17382        for my $i ($LBase .. $LBase + $LCount - 1) {
17383            push @L_re, $Jamo{$i}
17384        }
17385        my @V_re;   # Middle vowels
17386        for my $i ($VBase .. $VBase + $VCount - 1) {
17387            push @V_re, $Jamo{$i}
17388        }
17389        my @T_re;   # Trailing consonants
17390        for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
17391            push @T_re, $Jamo{$i}
17392        }
17393
17394        # The whole re is made up of the L V T combination.
17395        $jamo_re = '('
17396                    . join ('|', sort @L_re)
17397                    . ')('
17398                    . join ('|', sort @V_re)
17399                    . ')('
17400                    . join ('|', sort @T_re)
17401                    . ')?';
17402
17403        # These hashes needed by the algorithm were generated
17404        # during reading of the Jamo.txt file
17405        $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
17406        $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
17407        $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
17408        $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
17409    }
17410
17411    push @name, <<END;
17412
17413package charnames;
17414
17415# This module contains machine-generated tables and code for the
17416# algorithmically-determinable Unicode character names.  The following
17417# routines can be used to translate between name and code point and vice versa
17418
17419{ # Closure
17420
17421    # Matches legal code point.  4-6 hex numbers, If there are 6, the first
17422    # two must be 10; if there are 5, the first must not be a 0.  Written this
17423    # way to decrease backtracking.  The first regex allows the code point to
17424    # be at the end of a word, but to work properly, the word shouldn't end
17425    # with a valid hex character.  The second one won't match a code point at
17426    # the end of a word, and doesn't have the run-on issue
17427    my \$run_on_code_point_re = qr/$run_on_code_point_re/;
17428    my \$code_point_re = qr/$code_point_re/;
17429
17430    # In the following hash, the keys are the bases of names which include
17431    # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The value
17432    # of each key is another hash which is used to get the low and high ends
17433    # for each range of code points that apply to the name.
17434    my %names_ending_in_code_point = (
17435$names
17436    );
17437
17438    # The following hash is a copy of the previous one, except is for loose
17439    # matching, so each name has blanks and dashes squeezed out
17440    my %loose_names_ending_in_code_point = (
17441$loose_names
17442    );
17443
17444    # And the following array gives the inverse mapping from code points to
17445    # names.  Lowest code points are first
17446    \@code_points_ending_in_code_point = (
17447$code_points_ending_in_code_point
17448    );
17449
17450    # Is exportable, make read-only
17451    Internals::SvREADONLY(\@code_points_ending_in_code_point, 1);
17452END
17453    # Earlier releases didn't have Jamos.  No sense outputting
17454    # them unless will be used.
17455    if ($has_hangul_syllables) {
17456        push @name, <<END;
17457
17458    # Convert from code point to Jamo short name for use in composing Hangul
17459    # syllable names
17460    my %Jamo = (
17461$jamo
17462    );
17463
17464    # Leading consonant (can be null)
17465    my %Jamo_L = (
17466$jamo_l
17467    );
17468
17469    # Vowel
17470    my %Jamo_V = (
17471$jamo_v
17472    );
17473
17474    # Optional trailing consonant
17475    my %Jamo_T = (
17476$jamo_t
17477    );
17478
17479    # Computed re that splits up a Hangul name into LVT or LV syllables
17480    my \$syllable_re = qr/$jamo_re/;
17481
17482    my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
17483    my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
17484
17485    # These constants names and values were taken from the Unicode standard,
17486    # version 5.1, section 3.12.  They are used in conjunction with Hangul
17487    # syllables
17488    my \$SBase = $SBase_string;
17489    my \$LBase = $LBase_string;
17490    my \$VBase = $VBase_string;
17491    my \$TBase = $TBase_string;
17492    my \$SCount = $SCount;
17493    my \$LCount = $LCount;
17494    my \$VCount = $VCount;
17495    my \$TCount = $TCount;
17496    my \$NCount = \$VCount * \$TCount;
17497END
17498    } # End of has Jamos
17499
17500    push @name, << 'END';
17501
17502    sub name_to_code_point_special {
17503        my ($name, $loose) = @_;
17504
17505        # Returns undef if not one of the specially handled names; otherwise
17506        # returns the code point equivalent to the input name
17507        # $loose is non-zero if to use loose matching, 'name' in that case
17508        # must be input as upper case with all blanks and dashes squeezed out.
17509END
17510    if ($has_hangul_syllables) {
17511        push @name, << 'END';
17512
17513        if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
17514            || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
17515        {
17516            return if $name !~ qr/^$syllable_re$/;
17517            my $L = $Jamo_L{$1};
17518            my $V = $Jamo_V{$2};
17519            my $T = (defined $3) ? $Jamo_T{$3} : 0;
17520            return ($L * $VCount + $V) * $TCount + $T + $SBase;
17521        }
17522END
17523    }
17524    push @name, << 'END';
17525
17526        # Name must end in 'code_point' for this to handle.
17527        return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
17528                   || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
17529
17530        my $base = $1;
17531        my $code_point = CORE::hex $2;
17532        my $names_ref;
17533
17534        if ($loose) {
17535            $names_ref = \%loose_names_ending_in_code_point;
17536        }
17537        else {
17538            return if $base !~ s/-$//;
17539            $names_ref = \%names_ending_in_code_point;
17540        }
17541
17542        # Name must be one of the ones which has the code point in it.
17543        return if ! $names_ref->{$base};
17544
17545        # Look through the list of ranges that apply to this name to see if
17546        # the code point is in one of them.
17547        for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
17548            return if $names_ref->{$base}{'low'}->[$i] > $code_point;
17549            next if $names_ref->{$base}{'high'}->[$i] < $code_point;
17550
17551            # Here, the code point is in the range.
17552            return $code_point;
17553        }
17554
17555        # Here, looked like the name had a code point number in it, but
17556        # did not match one of the valid ones.
17557        return;
17558    }
17559
17560    sub code_point_to_name_special {
17561        my $code_point = shift;
17562
17563        # Returns the name of a code point if algorithmically determinable;
17564        # undef if not
17565END
17566    if ($has_hangul_syllables) {
17567        push @name, << 'END';
17568
17569        # If in the Hangul range, calculate the name based on Unicode's
17570        # algorithm
17571        if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
17572            use integer;
17573            my $SIndex = $code_point - $SBase;
17574            my $L = $LBase + $SIndex / $NCount;
17575            my $V = $VBase + ($SIndex % $NCount) / $TCount;
17576            my $T = $TBase + $SIndex % $TCount;
17577            $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
17578            $name .= $Jamo{$T} if $T != $TBase;
17579            return $name;
17580        }
17581END
17582    }
17583    push @name, << 'END';
17584
17585        # Look through list of these code points for one in range.
17586        foreach my $hash (@code_points_ending_in_code_point) {
17587            return if $code_point < $hash->{'low'};
17588            if ($code_point <= $hash->{'high'}) {
17589                return sprintf("%s-%04X", $hash->{'name'}, $code_point);
17590            }
17591        }
17592        return;            # None found
17593    }
17594} # End closure
17595
175961;
17597END
17598
17599    main::write("Name.pm", 0, \@name);  # The 0 means no utf8.
17600    return;
17601}
17602
17603sub make_UCD () {
17604    # Create and write UCD.pl, which passes info about the tables to
17605    # Unicode::UCD
17606
17607    # Stringify structures for output
17608    my $loose_property_name_of
17609                           = simple_dumper(\%loose_property_name_of, ' ' x 4);
17610    chomp $loose_property_name_of;
17611
17612    my $strict_property_name_of
17613                           = simple_dumper(\%strict_property_name_of, ' ' x 4);
17614    chomp $strict_property_name_of;
17615
17616    my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
17617    chomp $stricter_to_file_of;
17618
17619    my $inline_definitions = simple_dumper(\@inline_definitions, " " x 4);
17620    chomp $inline_definitions;
17621
17622    my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
17623    chomp $loose_to_file_of;
17624
17625    my $nv_floating_to_rational
17626                           = simple_dumper(\%nv_floating_to_rational, ' ' x 4);
17627    chomp $nv_floating_to_rational;
17628
17629    my $why_deprecated = simple_dumper(\%Unicode::UCD::why_deprecated, ' ' x 4);
17630    chomp $why_deprecated;
17631
17632    # We set the key to the file when we associated files with tables, but we
17633    # couldn't do the same for the value then, as we might not have the file
17634    # for the alternate table figured out at that time.
17635    foreach my $cased (keys %caseless_equivalent_to) {
17636        my @path = $caseless_equivalent_to{$cased}->file_path;
17637        my $path;
17638        if ($path[0] eq "#") {  # Pseudo-directory '#'
17639            $path = join '/', @path;
17640        }
17641        else {  # Gets rid of lib/
17642            $path = join '/', @path[1, -1];
17643        }
17644        $caseless_equivalent_to{$cased} = $path;
17645    }
17646    my $caseless_equivalent_to
17647                           = simple_dumper(\%caseless_equivalent_to, ' ' x 4);
17648    chomp $caseless_equivalent_to;
17649
17650    my $loose_property_to_file_of
17651                        = simple_dumper(\%loose_property_to_file_of, ' ' x 4);
17652    chomp $loose_property_to_file_of;
17653
17654    my $strict_property_to_file_of
17655                        = simple_dumper(\%strict_property_to_file_of, ' ' x 4);
17656    chomp $strict_property_to_file_of;
17657
17658    my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
17659    chomp $file_to_swash_name;
17660
17661    # Create a mapping from each alias of Perl single-form extensions to all
17662    # its equivalent aliases, for quick look-up.
17663    my %perlprop_to_aliases;
17664    foreach my $table ($perl->tables) {
17665
17666        # First create the list of the aliases of each extension
17667        my @aliases_list;    # List of legal aliases for this extension
17668
17669        my $table_name = $table->name;
17670        my $standard_table_name = standardize($table_name);
17671        my $table_full_name = $table->full_name;
17672        my $standard_table_full_name = standardize($table_full_name);
17673
17674        # Make sure that the list has both the short and full names
17675        push @aliases_list, $table_name, $table_full_name;
17676
17677        my $found_ucd = 0;  # ? Did we actually get an alias that should be
17678                            # output for this table
17679
17680        # Go through all the aliases (including the two just added), and add
17681        # any new unique ones to the list
17682        foreach my $alias ($table->aliases) {
17683
17684            # Skip non-legal names
17685            next unless $alias->ok_as_filename;
17686            next unless $alias->ucd;
17687
17688            $found_ucd = 1;     # have at least one legal name
17689
17690            my $name = $alias->name;
17691            my $standard = standardize($name);
17692
17693            # Don't repeat a name that is equivalent to one already on the
17694            # list
17695            next if $standard eq $standard_table_name;
17696            next if $standard eq $standard_table_full_name;
17697
17698            push @aliases_list, $name;
17699        }
17700
17701        # If there were no legal names, don't output anything.
17702        next unless $found_ucd;
17703
17704        # To conserve memory in the program reading these in, omit full names
17705        # that are identical to the short name, when those are the only two
17706        # aliases for the property.
17707        if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) {
17708            pop @aliases_list;
17709        }
17710
17711        # Here, @aliases_list is the list of all the aliases that this
17712        # extension legally has.  Now can create a map to it from each legal
17713        # standardized alias
17714        foreach my $alias ($table->aliases) {
17715            next unless $alias->ucd;
17716            next unless $alias->ok_as_filename;
17717            push @{$perlprop_to_aliases{standardize($alias->name)}},
17718                 uniques @aliases_list;
17719        }
17720    }
17721
17722    # Make a list of all combinations of properties/values that are suppressed.
17723    my @suppressed;
17724    if (! $debug_skip) {    # This tends to fail in this debug mode
17725        foreach my $property_name (keys %why_suppressed) {
17726
17727            # Just the value
17728            my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
17729
17730            # The hash may contain properties not in this release of Unicode
17731            next unless defined (my $property = property_ref($property_name));
17732
17733            # Find all combinations
17734            foreach my $prop_alias ($property->aliases) {
17735                my $prop_alias_name = standardize($prop_alias->name);
17736
17737                # If no =value, there's just one combination possible for this
17738                if (! $value_name) {
17739
17740                    # The property may be suppressed, but there may be a proxy
17741                    # for it, so it shouldn't be listed as suppressed
17742                    next if $prop_alias->ucd;
17743                    push @suppressed, $prop_alias_name;
17744                }
17745                else {  # Otherwise
17746                    foreach my $value_alias
17747                                    ($property->table($value_name)->aliases)
17748                    {
17749                        next if $value_alias->ucd;
17750
17751                        push @suppressed, "$prop_alias_name="
17752                                        .  standardize($value_alias->name);
17753                    }
17754                }
17755            }
17756        }
17757    }
17758    @suppressed = sort @suppressed; # So doesn't change between runs of this
17759                                    # program
17760
17761    # Convert the structure below (designed for Name.pm) to a form that UCD
17762    # wants, so it doesn't have to modify it at all; i.e. so that it includes
17763    # an element for the Hangul syllables in the appropriate place, and
17764    # otherwise changes the name to include the "-<code point>" suffix.
17765    my @algorithm_names;
17766    my $done_hangul = $v_version lt v2.0.0;  # Hanguls as we know them came
17767                                             # along in this version
17768    # Copy it linearly.
17769    for my $i (0 .. @code_points_ending_in_code_point - 1) {
17770
17771        # Insert the hanguls in the correct place.
17772        if (! $done_hangul
17773            && $code_points_ending_in_code_point[$i]->{'low'} > $SBase)
17774        {
17775            $done_hangul = 1;
17776            push @algorithm_names, { low => $SBase,
17777                                     high => $SBase + $SCount - 1,
17778                                     name => '<hangul syllable>',
17779                                    };
17780        }
17781
17782        # Copy the current entry, modified.
17783        push @algorithm_names, {
17784            low => $code_points_ending_in_code_point[$i]->{'low'},
17785            high => $code_points_ending_in_code_point[$i]->{'high'},
17786            name =>
17787               "$code_points_ending_in_code_point[$i]->{'name'}-<code point>",
17788        };
17789    }
17790
17791    # Serialize these structures for output.
17792    my $loose_to_standard_value
17793                          = simple_dumper(\%loose_to_standard_value, ' ' x 4);
17794    chomp $loose_to_standard_value;
17795
17796    my $string_property_loose_to_name
17797                    = simple_dumper(\%string_property_loose_to_name, ' ' x 4);
17798    chomp $string_property_loose_to_name;
17799
17800    my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4);
17801    chomp $perlprop_to_aliases;
17802
17803    my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4);
17804    chomp $prop_aliases;
17805
17806    my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4);
17807    chomp $prop_value_aliases;
17808
17809    my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : "";
17810    chomp $suppressed;
17811
17812    my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4);
17813    chomp $algorithm_names;
17814
17815    my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4);
17816    chomp $ambiguous_names;
17817
17818    my $combination_property = simple_dumper(\%combination_property, ' ' x 4);
17819    chomp $combination_property;
17820
17821    my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4);
17822    chomp $loose_defaults;
17823
17824    my @ucd = <<END;
17825$HEADER
17826$INTERNAL_ONLY_HEADER
17827
17828# This file is for the use of Unicode::UCD
17829
17830# Highest legal Unicode code point
17831\$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING;
17832
17833# Hangul syllables
17834\$Unicode::UCD::HANGUL_BEGIN = $SBase_string;
17835\$Unicode::UCD::HANGUL_COUNT = $SCount;
17836
17837# Maps Unicode (not Perl single-form extensions) property names in loose
17838# standard form to their corresponding standard names
17839\%Unicode::UCD::loose_property_name_of = (
17840$loose_property_name_of
17841);
17842
17843# Same, but strict names
17844\%Unicode::UCD::strict_property_name_of = (
17845$strict_property_name_of
17846);
17847
17848# Gives the definitions (in the form of inversion lists) for those properties
17849# whose definitions aren't kept in files
17850\@Unicode::UCD::inline_definitions = (
17851$inline_definitions
17852);
17853
17854# Maps property, table to file for those using stricter matching.  For paths
17855# whose directory is '#', the file is in the form of a numeric index into
17856# \@inline_definitions
17857\%Unicode::UCD::stricter_to_file_of = (
17858$stricter_to_file_of
17859);
17860
17861# Maps property, table to file for those using loose matching.  For paths
17862# whose directory is '#', the file is in the form of a numeric index into
17863# \@inline_definitions
17864\%Unicode::UCD::loose_to_file_of = (
17865$loose_to_file_of
17866);
17867
17868# Maps floating point to fractional form
17869\%Unicode::UCD::nv_floating_to_rational = (
17870$nv_floating_to_rational
17871);
17872
17873# If a %e floating point number doesn't have this number of digits in it after
17874# the decimal point to get this close to a fraction, it isn't considered to be
17875# that fraction even if all the digits it does have match.
17876\$Unicode::UCD::e_precision = $E_FLOAT_PRECISION;
17877
17878# Deprecated tables to generate a warning for.  The key is the file containing
17879# the table, so as to avoid duplication, as many property names can map to the
17880# file, but we only need one entry for all of them.
17881\%Unicode::UCD::why_deprecated = (
17882$why_deprecated
17883);
17884
17885# A few properties have different behavior under /i matching.  This maps
17886# those to substitute files to use under /i.
17887\%Unicode::UCD::caseless_equivalent = (
17888$caseless_equivalent_to
17889);
17890
17891# Property names to mapping files
17892\%Unicode::UCD::loose_property_to_file_of = (
17893$loose_property_to_file_of
17894);
17895
17896# Property names to mapping files
17897\%Unicode::UCD::strict_property_to_file_of = (
17898$strict_property_to_file_of
17899);
17900
17901# Files to the swash names within them.
17902\%Unicode::UCD::file_to_swash_name = (
17903$file_to_swash_name
17904);
17905
17906# Keys are all the possible "prop=value" combinations, in loose form; values
17907# are the standard loose name for the 'value' part of the key
17908\%Unicode::UCD::loose_to_standard_value = (
17909$loose_to_standard_value
17910);
17911
17912# String property loose names to standard loose name
17913\%Unicode::UCD::string_property_loose_to_name = (
17914$string_property_loose_to_name
17915);
17916
17917# Keys are Perl extensions in loose form; values are each one's list of
17918# aliases
17919\%Unicode::UCD::loose_perlprop_to_name = (
17920$perlprop_to_aliases
17921);
17922
17923# Keys are standard property name; values are each one's aliases
17924\%Unicode::UCD::prop_aliases = (
17925$prop_aliases
17926);
17927
17928# Keys of top level are standard property name; values are keys to another
17929# hash,  Each one is one of the property's values, in standard form.  The
17930# values are that prop-val's aliases.  If only one specified, the short and
17931# long alias are identical.
17932\%Unicode::UCD::prop_value_aliases = (
17933$prop_value_aliases
17934);
17935
17936# Ordered (by code point ordinal) list of the ranges of code points whose
17937# names are algorithmically determined.  Each range entry is an anonymous hash
17938# of the start and end points and a template for the names within it.
17939\@Unicode::UCD::algorithmic_named_code_points = (
17940$algorithm_names
17941);
17942
17943# The properties that as-is have two meanings, and which must be disambiguated
17944\%Unicode::UCD::ambiguous_names = (
17945$ambiguous_names
17946);
17947
17948# Keys are the prop-val combinations which are the default values for the
17949# given property, expressed in standard loose form
17950\%Unicode::UCD::loose_defaults = (
17951$loose_defaults
17952);
17953
17954# The properties that are combinations, in that they have both a map table and
17955# a match table.  This is actually for UCD.t, so it knows how to test for
17956# these.
17957\%Unicode::UCD::combination_property = (
17958$combination_property
17959);
17960
17961# All combinations of names that are suppressed.
17962# This is actually for UCD.t, so it knows which properties shouldn't have
17963# entries.  If it got any bigger, would probably want to put it in its own
17964# file to use memory only when it was needed, in testing.
17965\@Unicode::UCD::suppressed_properties = (
17966$suppressed
17967);
17968
179691;
17970END
17971
17972    main::write("UCD.pl", 0, \@ucd);  # The 0 means no utf8.
17973    return;
17974}
17975
17976sub write_all_tables() {
17977    # Write out all the tables generated by this program to files, as well as
17978    # the supporting data structures, pod file, and .t file.
17979
17980    my @writables;              # List of tables that actually get written
17981    my %match_tables_to_write;  # Used to collapse identical match tables
17982                                # into one file.  Each key is a hash function
17983                                # result to partition tables into buckets.
17984                                # Each value is an array of the tables that
17985                                # fit in the bucket.
17986
17987    # For each property ...
17988    # (sort so that if there is an immutable file name, it has precedence, so
17989    # some other property can't come in and take over its file name.  (We
17990    # don't care if both defined, as they had better be different anyway.)
17991    # The property named 'Perl' needs to be first (it doesn't have any
17992    # immutable file name) because empty properties are defined in terms of
17993    # its table named 'All' under the -annotate option.)   We also sort by
17994    # the property's name.  This is just for repeatability of the outputs
17995    # between runs of this program, but does not affect correctness.
17996    PROPERTY:
17997    foreach my $property ($perl,
17998                          sort { return -1 if defined $a->file;
17999                                 return 1 if defined $b->file;
18000                                 return $a->name cmp $b->name;
18001                                } grep { $_ != $perl } property_ref('*'))
18002    {
18003        my $type = $property->type;
18004
18005        # And for each table for that property, starting with the mapping
18006        # table for it ...
18007        TABLE:
18008        foreach my $table($property,
18009
18010                        # and all the match tables for it (if any), sorted so
18011                        # the ones with the shortest associated file name come
18012                        # first.  The length sorting prevents problems of a
18013                        # longer file taking a name that might have to be used
18014                        # by a shorter one.  The alphabetic sorting prevents
18015                        # differences between releases
18016                        sort {  my $ext_a = $a->external_name;
18017                                return 1 if ! defined $ext_a;
18018                                my $ext_b = $b->external_name;
18019                                return -1 if ! defined $ext_b;
18020
18021                                # But return the non-complement table before
18022                                # the complement one, as the latter is defined
18023                                # in terms of the former, and needs to have
18024                                # the information for the former available.
18025                                return 1 if $a->complement != 0;
18026                                return -1 if $b->complement != 0;
18027
18028                                # Similarly, return a subservient table after
18029                                # a leader
18030                                return 1 if $a->leader != $a;
18031                                return -1 if $b->leader != $b;
18032
18033                                my $cmp = length $ext_a <=> length $ext_b;
18034
18035                                # Return result if lengths not equal
18036                                return $cmp if $cmp;
18037
18038                                # Alphabetic if lengths equal
18039                                return $ext_a cmp $ext_b
18040                        } $property->tables
18041                    )
18042        {
18043
18044            # Here we have a table associated with a property.  It could be
18045            # the map table (done first for each property), or one of the
18046            # other tables.  Determine which type.
18047            my $is_property = $table->isa('Property');
18048
18049            my $name = $table->name;
18050            my $complete_name = $table->complete_name;
18051
18052            # See if should suppress the table if is empty, but warn if it
18053            # contains something.
18054            my $suppress_if_empty_warn_if_not
18055                    = $why_suppress_if_empty_warn_if_not{$complete_name} || 0;
18056
18057            # Calculate if this table should have any code points associated
18058            # with it or not.
18059            my $expected_empty =
18060
18061                # $perl should be empty
18062                ($is_property && ($table == $perl))
18063
18064                # Match tables in properties we skipped populating should be
18065                # empty
18066                || (! $is_property && ! $property->to_create_match_tables)
18067
18068                # Tables and properties that are expected to have no code
18069                # points should be empty
18070                || $suppress_if_empty_warn_if_not
18071            ;
18072
18073            # Set a boolean if this table is the complement of an empty binary
18074            # table
18075            my $is_complement_of_empty_binary =
18076                $type == $BINARY &&
18077                (($table == $property->table('Y')
18078                    && $property->table('N')->is_empty)
18079                || ($table == $property->table('N')
18080                    && $property->table('Y')->is_empty));
18081
18082            if ($table->is_empty) {
18083
18084                if ($suppress_if_empty_warn_if_not) {
18085                    $table->set_fate($SUPPRESSED,
18086                                     $suppress_if_empty_warn_if_not);
18087                }
18088
18089                # Suppress (by skipping them) expected empty tables.
18090                next TABLE if $expected_empty;
18091
18092                # And setup to later output a warning for those that aren't
18093                # known to be allowed to be empty.  Don't do the warning if
18094                # this table is a child of another one to avoid duplicating
18095                # the warning that should come from the parent one.
18096                if (($table == $property || $table->parent == $table)
18097                    && $table->fate != $SUPPRESSED
18098                    && $table->fate != $MAP_PROXIED
18099                    && ! grep { $complete_name =~ /^$_$/ }
18100                                                    @tables_that_may_be_empty)
18101                {
18102                    push @unhandled_properties, "$table";
18103                }
18104
18105                # The old way of expressing an empty match list was to
18106                # complement the list that matches everything.  The new way is
18107                # to create an empty inversion list, but this doesn't work for
18108                # annotating, so use the old way then.
18109                $table->set_complement($All) if $annotate
18110                                                && $table != $property;
18111            }
18112            elsif ($expected_empty) {
18113                my $because = "";
18114                if ($suppress_if_empty_warn_if_not) {
18115                    $because = " because $suppress_if_empty_warn_if_not";
18116                }
18117
18118                Carp::my_carp("Not expecting property $table$because.  Generating file for it anyway.");
18119            }
18120
18121            # Some tables should match everything
18122            my $expected_full =
18123                ($table->fate == $SUPPRESSED)
18124                ? 0
18125                : ($is_property)
18126                  ? # All these types of map tables will be full because
18127                    # they will have been populated with defaults
18128                    ($type == $ENUM)
18129
18130                  : # A match table should match everything if its method
18131                    # shows it should
18132                    ($table->matches_all
18133
18134                    # The complement of an empty binary table will match
18135                    # everything
18136                    || $is_complement_of_empty_binary
18137                    )
18138            ;
18139
18140            my $count = $table->count;
18141            if ($expected_full) {
18142                if ($count != $MAX_WORKING_CODEPOINTS) {
18143                    Carp::my_carp("$table matches only "
18144                    . clarify_number($count)
18145                    . " Unicode code points but should match "
18146                    . clarify_number($MAX_WORKING_CODEPOINTS)
18147                    . " (off by "
18148                    .  clarify_number(abs($MAX_WORKING_CODEPOINTS - $count))
18149                    . ").  Proceeding anyway.");
18150                }
18151
18152                # Here is expected to be full.  If it is because it is the
18153                # complement of an (empty) binary table that is to be
18154                # suppressed, then suppress this one as well.
18155                if ($is_complement_of_empty_binary) {
18156                    my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
18157                    my $opposing = $property->table($opposing_name);
18158                    my $opposing_status = $opposing->status;
18159                    if ($opposing_status) {
18160                        $table->set_status($opposing_status,
18161                                           $opposing->status_info);
18162                    }
18163                }
18164            }
18165            elsif ($count == $MAX_UNICODE_CODEPOINTS
18166                   && $name ne "Any"
18167                   && ($table == $property || $table->leader == $table)
18168                   && $table->property->status ne $NORMAL)
18169            {
18170                    Carp::my_carp("$table unexpectedly matches all Unicode code points.  Proceeding anyway.");
18171            }
18172
18173            if ($table->fate >= $SUPPRESSED) {
18174                if (! $is_property) {
18175                    my @children = $table->children;
18176                    foreach my $child (@children) {
18177                        if ($child->fate < $SUPPRESSED) {
18178                            Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
18179                        }
18180                    }
18181                }
18182                next TABLE;
18183
18184            }
18185
18186            if (! $is_property) {
18187
18188                make_ucd_table_pod_entries($table) if $table->property == $perl;
18189
18190                # Several things need to be done just once for each related
18191                # group of match tables.  Do them on the parent.
18192                if ($table->parent == $table) {
18193
18194                    # Add an entry in the pod file for the table; it also does
18195                    # the children.
18196                    make_re_pod_entries($table) if defined $pod_directory;
18197
18198                    # See if the table matches identical code points with
18199                    # something that has already been processed and is ready
18200                    # for output.  In that case, no need to have two files
18201                    # with the same code points in them.  We use the table's
18202                    # hash() method to store these in buckets, so that it is
18203                    # quite likely that if two tables are in the same bucket
18204                    # they will be identical, so don't have to compare tables
18205                    # frequently.  The tables have to have the same status to
18206                    # share a file, so add this to the bucket hash.  (The
18207                    # reason for this latter is that UCD.pm associates a
18208                    # status with a file.) We don't check tables that are
18209                    # inverses of others, as it would lead to some coding
18210                    # complications, and checking all the regular ones should
18211                    # find everything.
18212                    if ($table->complement == 0) {
18213                        my $hash = $table->hash . ';' . $table->status;
18214
18215                        # Look at each table that is in the same bucket as
18216                        # this one would be.
18217                        foreach my $comparison
18218                                            (@{$match_tables_to_write{$hash}})
18219                        {
18220                            # If the table doesn't point back to this one, we
18221                            # see if it matches identically
18222                            if (   $comparison->leader != $table
18223                                && $table->matches_identically_to($comparison))
18224                            {
18225                                $table->set_equivalent_to($comparison,
18226                                                                Related => 0);
18227                                next TABLE;
18228                            }
18229                        }
18230
18231                        # Here, not equivalent, add this table to the bucket.
18232                        push @{$match_tables_to_write{$hash}}, $table;
18233                    }
18234                }
18235            }
18236            else {
18237
18238                # Here is the property itself.
18239                # Don't write out or make references to the $perl property
18240                next if $table == $perl;
18241
18242                make_ucd_table_pod_entries($table);
18243
18244                # There is a mapping stored of the various synonyms to the
18245                # standardized name of the property for Unicode::UCD.
18246                # Also, the pod file contains entries of the form:
18247                # \p{alias: *}         \p{full: *}
18248                # rather than show every possible combination of things.
18249
18250                my @property_aliases = $property->aliases;
18251
18252                my $full_property_name = $property->full_name;
18253                my $property_name = $property->name;
18254                my $standard_property_name = standardize($property_name);
18255                my $standard_property_full_name
18256                                        = standardize($full_property_name);
18257
18258                # We also create for Unicode::UCD a list of aliases for
18259                # the property.  The list starts with the property name;
18260                # then its full name.
18261                my @property_list;
18262                my @standard_list;
18263                if ( $property->fate <= $MAP_PROXIED) {
18264                    @property_list = ($property_name, $full_property_name);
18265                    @standard_list = ($standard_property_name,
18266                                        $standard_property_full_name);
18267                }
18268
18269                # For each synonym ...
18270                for my $i (0 .. @property_aliases - 1)  {
18271                    my $alias = $property_aliases[$i];
18272                    my $alias_name = $alias->name;
18273                    my $alias_standard = standardize($alias_name);
18274
18275
18276                    # Add other aliases to the list of property aliases
18277                    if ($property->fate <= $MAP_PROXIED
18278                        && ! grep { $alias_standard eq $_ } @standard_list)
18279                    {
18280                        push @property_list, $alias_name;
18281                        push @standard_list, $alias_standard;
18282                    }
18283
18284                    # For Unicode::UCD, set the mapping of the alias to the
18285                    # property
18286                    if ($type == $STRING) {
18287                        if ($property->fate <= $MAP_PROXIED) {
18288                            $string_property_loose_to_name{$alias_standard}
18289                                            = $standard_property_name;
18290                        }
18291                    }
18292                    else {
18293                        my $hash_ref = ($alias_standard =~ /^_/)
18294                                       ? \%strict_property_name_of
18295                                       : \%loose_property_name_of;
18296                        if (exists $hash_ref->{$alias_standard}) {
18297                            Carp::my_carp("There already is a property with the same standard name as $alias_name: $hash_ref->{$alias_standard}.  Old name is retained");
18298                        }
18299                        else {
18300                            $hash_ref->{$alias_standard}
18301                                                = $standard_property_name;
18302                        }
18303
18304                        # Now for the re pod entry for this alias.  Skip if not
18305                        # outputting a pod; skip the first one, which is the
18306                        # full name so won't have an entry like: '\p{full: *}
18307                        # \p{full: *}', and skip if don't want an entry for
18308                        # this one.
18309                        next if $i == 0
18310                                || ! defined $pod_directory
18311                                || ! $alias->make_re_pod_entry;
18312
18313                        my $rhs = "\\p{$full_property_name: *}";
18314                        if ($property != $perl && $table->perl_extension) {
18315                            $rhs .= ' (Perl extension)';
18316                        }
18317                        push @match_properties,
18318                            format_pod_line($indent_info_column,
18319                                        '\p{' . $alias->name . ': *}',
18320                                        $rhs,
18321                                        $alias->status);
18322                    }
18323                }
18324
18325                # The list of all possible names is attached to each alias, so
18326                # lookup is easy
18327                if (@property_list) {
18328                    push @{$prop_aliases{$standard_list[0]}}, @property_list;
18329                }
18330
18331                if ($property->fate <= $MAP_PROXIED) {
18332
18333                    # Similarly, we create for Unicode::UCD a list of
18334                    # property-value aliases.
18335
18336                    # Look at each table in the property...
18337                    foreach my $table ($property->tables) {
18338                        my @values_list;
18339                        my $table_full_name = $table->full_name;
18340                        my $standard_table_full_name
18341                                              = standardize($table_full_name);
18342                        my $table_name = $table->name;
18343                        my $standard_table_name = standardize($table_name);
18344
18345                        # The list starts with the table name and its full
18346                        # name.
18347                        push @values_list, $table_name, $table_full_name;
18348
18349                        # We add to the table each unique alias that isn't
18350                        # discouraged from use.
18351                        foreach my $alias ($table->aliases) {
18352                            next if $alias->status
18353                                 && $alias->status eq $DISCOURAGED;
18354                            my $name = $alias->name;
18355                            my $standard = standardize($name);
18356                            next if $standard eq $standard_table_name;
18357                            next if $standard eq $standard_table_full_name;
18358                            push @values_list, $name;
18359                        }
18360
18361                        # Here @values_list is a list of all the aliases for
18362                        # the table.  That is, all the property-values given
18363                        # by this table.  By agreement with Unicode::UCD,
18364                        # if the name and full name are identical, and there
18365                        # are no other names, drop the duplicate entry to save
18366                        # memory.
18367                        if (@values_list == 2
18368                            && $values_list[0] eq $values_list[1])
18369                        {
18370                            pop @values_list
18371                        }
18372
18373                        # To save memory, unlike the similar list for property
18374                        # aliases above, only the standard forms have the list.
18375                        # This forces an extra step of converting from input
18376                        # name to standard name, but the savings are
18377                        # considerable.  (There is only marginal savings if we
18378                        # did this with the property aliases.)
18379                        push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list;
18380                    }
18381                }
18382
18383                # Don't write out a mapping file if not desired.
18384                next if ! $property->to_output_map;
18385            }
18386
18387            # Here, we know we want to write out the table, but don't do it
18388            # yet because there may be other tables that come along and will
18389            # want to share the file, and the file's comments will change to
18390            # mention them.  So save for later.
18391            push @writables, $table;
18392
18393        } # End of looping through the property and all its tables.
18394    } # End of looping through all properties.
18395
18396    # Now have all the tables that will have files written for them.  Do it.
18397    foreach my $table (@writables) {
18398        my @directory;
18399        my $filename;
18400        my $property = $table->property;
18401        my $is_property = ($table == $property);
18402
18403        # For very short tables, instead of writing them out to actual files,
18404        # we in-line their inversion list definitions into UCD.pm.  The
18405        # definition replaces the file name, and the special pseudo-directory
18406        # '#' is used to signal this.  This significantly cuts down the number
18407        # of files written at little extra cost to the hashes in UCD.pm.
18408        # And it means, no run-time files to read to get the definitions.
18409        if (! $is_property
18410            && ! $annotate  # For annotation, we want to explicitly show
18411                            # everything, so keep in files
18412            && $table->ranges <= 3)
18413        {
18414            my @ranges = $table->ranges;
18415            my $count = @ranges;
18416            if ($count == 0) {  # 0th index reserved for 0-length lists
18417                $filename = 0;
18418            }
18419            elsif ($table->leader != $table) {
18420
18421                # Here, is a table that is equivalent to another; code
18422                # in register_file_for_name() causes its leader's definition
18423                # to be used
18424
18425                next;
18426            }
18427            else {  # No equivalent table so far.
18428
18429                # Build up its definition range-by-range.
18430                my $definition = "";
18431                while (defined (my $range = shift @ranges)) {
18432                    my $end = $range->end;
18433                    if ($end < $MAX_WORKING_CODEPOINT) {
18434                        $count++;
18435                        $end = "\n" . ($end + 1);
18436                    }
18437                    else {  # Extends to infinity, hence no 'end'
18438                        $end = "";
18439                    }
18440                    $definition .= "\n" . $range->start . $end;
18441                }
18442                $definition = "V$count" . $definition;
18443                $filename = @inline_definitions;
18444                push @inline_definitions, $definition;
18445            }
18446            @directory = "#";
18447            register_file_for_name($table, \@directory, $filename);
18448            next;
18449        }
18450
18451        if (! $is_property) {
18452            # Match tables for the property go in lib/$subdirectory, which is
18453            # the property's name.  Don't use the standard file name for this,
18454            # as may get an unfamiliar alias
18455            @directory = ($matches_directory, ($property->match_subdir)
18456                                              ? $property->match_subdir
18457                                              : $property->external_name);
18458        }
18459        else {
18460
18461            @directory = $table->directory;
18462            $filename = $table->file;
18463        }
18464
18465        # Use specified filename if available, or default to property's
18466        # shortest name.  We need an 8.3 safe filename (which means "an 8
18467        # safe" filename, since after the dot is only 'pl', which is < 3)
18468        # The 2nd parameter is if the filename shouldn't be changed, and
18469        # it shouldn't iff there is a hard-coded name for this table.
18470        $filename = construct_filename(
18471                                $filename || $table->external_name,
18472                                ! $filename,    # mutable if no filename
18473                                \@directory);
18474
18475        register_file_for_name($table, \@directory, $filename);
18476
18477        # Only need to write one file when shared by more than one
18478        # property
18479        next if ! $is_property
18480                && ($table->leader != $table || $table->complement != 0);
18481
18482        # Construct a nice comment to add to the file
18483        $table->set_final_comment;
18484
18485        $table->write;
18486    }
18487
18488
18489    # Write out the pod file
18490    make_pod;
18491
18492    # And Name.pm, UCD.pl
18493    make_Name_pm;
18494    make_UCD;
18495
18496    make_property_test_script() if $make_test_script;
18497    make_normalization_test_script() if $make_norm_test_script;
18498    return;
18499}
18500
18501my @white_space_separators = ( # This used only for making the test script.
18502                            "",
18503                            ' ',
18504                            "\t",
18505                            '   '
18506                        );
18507
18508sub generate_separator($lhs) {
18509    # This used only for making the test script.  It generates the colon or
18510    # equal separator between the property and property value, with random
18511    # white space surrounding the separator
18512
18513    return "" if $lhs eq "";  # No separator if there's only one (the r) side
18514
18515    # Choose space before and after randomly
18516    my $spaces_before =$white_space_separators[rand(@white_space_separators)];
18517    my $spaces_after = $white_space_separators[rand(@white_space_separators)];
18518
18519    # And return the whole complex, half the time using a colon, half the
18520    # equals
18521    return $spaces_before
18522            . (rand() < 0.5) ? '=' : ':'
18523            . $spaces_after;
18524}
18525
18526sub generate_tests($lhs, $rhs, $valid_code, $invalid_code, $warning) {
18527    # This used only for making the test script.  It generates test cases that
18528    # are expected to compile successfully in perl.  Note that the LHS and
18529    # RHS are assumed to already be as randomized as the caller wants.
18530
18531    # $lhs          # The property: what's to the left of the colon
18532                    #  or equals separator
18533    # $rhs          # The property value; what's to the right
18534    # $valid_code   # A code point that's known to be in the
18535                        # table given by LHS=RHS; undef if table is
18536                        # empty
18537    # $invalid_code # A code point known to not be in the table;
18538                    # undef if the table is all code points
18539    # $warning
18540
18541    # Get the colon or equal
18542    my $separator = generate_separator($lhs);
18543
18544    # The whole 'property=value'
18545    my $name = "$lhs$separator$rhs";
18546
18547    my @output;
18548    # Create a complete set of tests, with complements.
18549    if (defined $valid_code) {
18550        push @output, <<"EOC"
18551Expect(1, $valid_code, '\\p{$name}', $warning);
18552Expect(0, $valid_code, '\\p{^$name}', $warning);
18553Expect(0, $valid_code, '\\P{$name}', $warning);
18554Expect(1, $valid_code, '\\P{^$name}', $warning);
18555EOC
18556    }
18557    if (defined $invalid_code) {
18558        push @output, <<"EOC"
18559Expect(0, $invalid_code, '\\p{$name}', $warning);
18560Expect(1, $invalid_code, '\\p{^$name}', $warning);
18561Expect(1, $invalid_code, '\\P{$name}', $warning);
18562Expect(0, $invalid_code, '\\P{^$name}', $warning);
18563EOC
18564    }
18565    return @output;
18566}
18567
18568sub generate_wildcard_tests($lhs, $rhs, $valid_code, $invalid_code, $warning) {
18569    # This used only for making the test script.  It generates wildcardl
18570    # matching test cases that are expected to compile successfully in perl.
18571
18572    # $lhs           # The property: what's to the left of the
18573                     # or equals separator
18574    # $rhs           # The property value; what's to the right
18575    # $valid_code    # A code point that's known to be in the
18576                     # table given by LHS=RHS; undef if table is
18577                     # empty
18578    # $invalid_code  # A code point known to not be in the table;
18579                     # undef if the table is all code points
18580    # $warning
18581
18582    return if $lhs eq "";
18583    return if $lhs =~ / ^ Is_ /x;   # These are not currently supported
18584
18585    # Generate a standardized pattern, with colon being the delimitter
18586    my $wildcard = "$lhs=:\\A$rhs\\z:";
18587
18588    my @output;
18589    push @output, "Expect(1, $valid_code, '\\p{$wildcard}', $warning);"
18590                                                        if defined $valid_code;
18591    push @output, "Expect(0, $invalid_code, '\\p{$wildcard}', $warning);"
18592                                                      if defined $invalid_code;
18593    return @output;
18594}
18595
18596sub generate_error($lhs, $rhs, $already_in_error=0) {
18597    # This used only for making the test script.  It generates test cases that
18598    # are expected to not only not match, but to be syntax or similar errors
18599
18600    # $lhs                # The property: what's to the left of the
18601                          # colon or equals separator
18602    # $rhs                # The property value; what's to the right
18603    # $already_in_error   # Boolean; if true it's known that the
18604                          # unmodified LHS and RHS will cause an error.
18605                          # This routine should not force another one
18606    # Get the colon or equal
18607    my $separator = generate_separator($lhs);
18608
18609    # Since this is an error only, don't bother to randomly decide whether to
18610    # put the error on the left or right side; and assume that the RHS is
18611    # loosely matched, again for convenience rather than rigor.
18612    $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
18613
18614    my $property = $lhs . $separator . $rhs;
18615
18616    return <<"EOC";
18617Error('\\p{$property}');
18618Error('\\P{$property}');
18619EOC
18620}
18621
18622# These are used only for making the test script
18623# XXX Maybe should also have a bad strict seps, which includes underscore.
18624
18625my @good_loose_seps = (
18626            " ",
18627            "-",
18628            "\t",
18629            "",
18630            "_",
18631           );
18632my @bad_loose_seps = (
18633           "/a/",
18634           ':=',
18635          );
18636
18637sub randomize_stricter_name($name) {
18638    # This used only for making the test script.  Take the input name and
18639    # return a randomized, but valid version of it under the stricter matching
18640    # rules.
18641
18642    # If the name looks like a number (integer, floating, or rational), do
18643    # some extra work
18644    if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
18645        my $sign = $1;
18646        my $number = $2;
18647        my $separator = $3;
18648
18649        # If there isn't a sign, part of the time add a plus
18650        # Note: Not testing having any denominator having a minus sign
18651        if (! $sign) {
18652            $sign = '+' if rand() <= .3;
18653        }
18654
18655        # And add 0 or more leading zeros.
18656        $name = $sign . ('0' x int rand(10)) . $number;
18657
18658        if (defined $separator) {
18659            my $extra_zeros = '0' x int rand(10);
18660
18661            if ($separator eq '.') {
18662
18663                # Similarly, add 0 or more trailing zeros after a decimal
18664                # point
18665                $name .= $extra_zeros;
18666            }
18667            else {
18668
18669                # Or, leading zeros before the denominator
18670                $name =~ s,/,/$extra_zeros,;
18671            }
18672        }
18673    }
18674
18675    # For legibility of the test, only change the case of whole sections at a
18676    # time.  To do this, first split into sections.  The split returns the
18677    # delimiters
18678    my @sections;
18679    for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
18680        trace $section if main::DEBUG && $to_trace;
18681
18682        if (length $section > 1 && $section !~ /\D/) {
18683
18684            # If the section is a sequence of digits, about half the time
18685            # randomly add underscores between some of them.
18686            if (rand() > .5) {
18687
18688                # Figure out how many underscores to add.  max is 1 less than
18689                # the number of digits.  (But add 1 at the end to make sure
18690                # result isn't 0, and compensate earlier by subtracting 2
18691                # instead of 1)
18692                my $num_underscores = int rand(length($section) - 2) + 1;
18693
18694                # And add them evenly throughout, for convenience, not rigor
18695                use integer;
18696                my $spacing = (length($section) - 1)/ $num_underscores;
18697                my $temp = $section;
18698                $section = "";
18699                for my $i (1 .. $num_underscores) {
18700                    $section .= substr($temp, 0, $spacing, "") . '_';
18701                }
18702                $section .= $temp;
18703            }
18704            push @sections, $section;
18705        }
18706        else {
18707
18708            # Here not a sequence of digits.  Change the case of the section
18709            # randomly
18710            my $switch = int rand(4);
18711            if ($switch == 0) {
18712                push @sections, uc $section;
18713            }
18714            elsif ($switch == 1) {
18715                push @sections, lc $section;
18716            }
18717            elsif ($switch == 2) {
18718                push @sections, ucfirst $section;
18719            }
18720            else {
18721                push @sections, $section;
18722            }
18723        }
18724    }
18725    trace "returning", join "", @sections if main::DEBUG && $to_trace;
18726    return join "", @sections;
18727}
18728
18729sub randomize_loose_name($name, $want_error=0) {
18730    # This used only for making the test script
18731
18732    $name = randomize_stricter_name($name);
18733
18734    my @parts;
18735    push @parts, $good_loose_seps[rand(@good_loose_seps)];
18736
18737    # Preserve trailing ones for the sake of not stripping the underscore from
18738    # 'L_'
18739    for my $part (split /[-\s_]+ (?= . )/, $name) {
18740        if (@parts) {
18741            if ($want_error and rand() < 0.3) {
18742                push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
18743                $want_error = 0;
18744            }
18745            else {
18746                push @parts, $good_loose_seps[rand(@good_loose_seps)];
18747            }
18748        }
18749        push @parts, $part;
18750    }
18751    my $new = join("", @parts);
18752    trace "$name => $new" if main::DEBUG && $to_trace;
18753
18754    if ($want_error) {
18755        if (rand() >= 0.5) {
18756            $new .= $bad_loose_seps[rand(@bad_loose_seps)];
18757        }
18758        else {
18759            $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
18760        }
18761    }
18762    return $new;
18763}
18764
18765# Used to make sure don't generate duplicate test cases.
18766my %test_generated;
18767
18768sub make_property_test_script() {
18769    # This used only for making the test script
18770    # this written directly -- it's huge.
18771
18772    print "Making test script\n" if $verbosity >= $PROGRESS;
18773
18774    # This uses randomness to test different possibilities without testing all
18775    # possibilities.  To ensure repeatability, set the seed to 0.  But if
18776    # tests are added, it will perturb all later ones in the .t file
18777    srand 0;
18778
18779    $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
18780
18781    # Create a list of what the %f representation is for each rational number.
18782    # This will be used below.
18783    my @valid_base_floats = '0.0';
18784    foreach my $e_representation (keys %nv_floating_to_rational) {
18785        push @valid_base_floats,
18786                            eval $nv_floating_to_rational{$e_representation};
18787    }
18788
18789    # It doesn't matter whether the elements of this array contain single lines
18790    # or multiple lines. main::write doesn't count the lines.
18791    my @output;
18792
18793    push @output, <<'EOF_CODE';
18794Error('\p{Script=InGreek}');    # Bug #69018
18795Test_GCB("1100 $nobreak 1161");  # Bug #70940
18796Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
18797Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
18798Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726
18799Error('\p{InKana}');    # 'Kana' is not a block so InKana shouldn't compile
18800Expect(1, 0xB6, '\p{In=V1_1}', ""); # Didn't use to work
18801Expect(1, 0x3A2,'\p{In=NA}', "");   # Didn't use to work
18802
18803# Make sure this gets tested; it was not part of the official test suite at
18804# the time this was added.  Note that this is as it would appear in the
18805# official suite, and gets modified to check for the perl tailoring by
18806# Test_WB()
18807Test_WB("$breakable 0020 $breakable 0020 $breakable 0308 $breakable");
18808Test_LB("$nobreak 200B $nobreak 0020 $nobreak 0020 $breakable 2060 $breakable");
18809Expect(1, ord(" "), '\p{gc=:(?aa)s:}', "");     # /aa is valid
18810Expect(1, ord(" "), '\p{gc=:(?-s)s:}', "");     # /-s is valid
18811EOF_CODE
18812
18813    # Sort these so get results in same order on different runs of this
18814    # program
18815    foreach my $property (sort { $a->has_dependency <=> $b->has_dependency
18816                                    or
18817                                 lc $a->name cmp lc $b->name
18818                               } property_ref('*'))
18819    {
18820        # Non-binary properties should not match \p{};  Test all for that.
18821        if ($property->type != $BINARY && $property->type != $FORCED_BINARY) {
18822            my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS }
18823                                                            $property->aliases;
18824            foreach my $property_alias ($property->aliases) {
18825                my $name = standardize($property_alias->name);
18826
18827                # But some names are ambiguous, meaning a binary property with
18828                # the same name when used in \p{}, and a different
18829                # (non-binary) property in other contexts.
18830                next if grep { $name eq $_ } keys %ambiguous_names;
18831
18832                push @output, <<"EOF_CODE";
18833Error('\\p{$name}');
18834Error('\\P{$name}');
18835EOF_CODE
18836            }
18837        }
18838        foreach my $table (sort { $a->has_dependency <=> $b->has_dependency
18839                                    or
18840                                  lc $a->name cmp lc $b->name
18841                                } $property->tables)
18842        {
18843
18844            # Find code points that match, and don't match this table.
18845            my $valid = $table->get_valid_code_point;
18846            my $invalid = $table->get_invalid_code_point;
18847            my $warning = ($table->status eq $DEPRECATED)
18848                            ? "'deprecated'"
18849                            : '""';
18850
18851            # Test each possible combination of the property's aliases with
18852            # the table's.  If this gets to be too many, could do what is done
18853            # in the set_final_comment() for Tables
18854            my @table_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->aliases;
18855            next unless @table_aliases;
18856            my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->property->aliases;
18857            next unless @property_aliases;
18858
18859            # Every property can be optionally be prefixed by 'Is_', so test
18860            # that those work, by creating such a new alias for each
18861            # pre-existing one.
18862            push @property_aliases, map { Alias->new("Is_" . $_->name,
18863                                                    $_->loose_match,
18864                                                    $_->make_re_pod_entry,
18865                                                    $_->ok_as_filename,
18866                                                    $_->status,
18867                                                    $_->ucd,
18868                                                    )
18869                                         } @property_aliases;
18870            my $max = max(scalar @table_aliases, scalar @property_aliases);
18871            for my $j (0 .. $max - 1) {
18872
18873                # The current alias for property is the next one on the list,
18874                # or if beyond the end, start over.  Similarly for table
18875                my $property_name
18876                            = $property_aliases[$j % @property_aliases]->name;
18877
18878                $property_name = "" if $table->property == $perl;
18879                my $table_alias = $table_aliases[$j % @table_aliases];
18880                my $table_name = $table_alias->name;
18881                my $loose_match = $table_alias->loose_match;
18882
18883                # If the table doesn't have a file, any test for it is
18884                # already guaranteed to be in error
18885                my $already_error = ! $table->file_path;
18886
18887                # A table that begins with these could actually be a
18888                # user-defined property, so won't be compile time errors, as
18889                # the definitions of those can be deferred until runtime
18890                next if $already_error && $table_name =~ / ^ I[ns] /x;
18891
18892                # Generate error cases for this alias.
18893                push @output, generate_error($property_name,
18894                                             $table_name,
18895                                             $already_error);
18896
18897                # If the table is guaranteed to always generate an error,
18898                # quit now without generating success cases.
18899                next if $already_error;
18900
18901                # Now for the success cases.  First, wildcard matching, as it
18902                # shouldn't have any randomization.
18903                if ($table_alias->status eq $NORMAL) {
18904                    push @output, generate_wildcard_tests($property_name,
18905                                                          $table_name,
18906                                                          $valid,
18907                                                          $invalid,
18908                                                          $warning,
18909                                                         );
18910                }
18911                my $random;
18912                if ($loose_match) {
18913
18914                    # For loose matching, create an extra test case for the
18915                    # standard name.
18916                    my $standard = standardize($table_name);
18917
18918                    # $test_name should be a unique combination for each test
18919                    # case; used just to avoid duplicate tests
18920                    my $test_name = "$property_name=$standard";
18921
18922                    # Don't output duplicate test cases.
18923                    if (! exists $test_generated{$test_name}) {
18924                        $test_generated{$test_name} = 1;
18925                        push @output, generate_tests($property_name,
18926                                                     $standard,
18927                                                     $valid,
18928                                                     $invalid,
18929                                                     $warning,
18930                                                 );
18931                        if ($table_alias->status eq $NORMAL) {
18932                            push @output, generate_wildcard_tests(
18933                                                     $property_name,
18934                                                     $standard,
18935                                                     $valid,
18936                                                     $invalid,
18937                                                     $warning,
18938                                                 );
18939                        }
18940                    }
18941                    $random = randomize_loose_name($table_name)
18942                }
18943                else { # Stricter match
18944                    $random = randomize_stricter_name($table_name);
18945                }
18946
18947                # Now for the main test case for this alias.
18948                my $test_name = "$property_name=$random";
18949                if (! exists $test_generated{$test_name}) {
18950                    $test_generated{$test_name} = 1;
18951                    push @output, generate_tests($property_name,
18952                                                 $random,
18953                                                 $valid,
18954                                                 $invalid,
18955                                                 $warning,
18956                                             );
18957
18958                    if ($property->name eq 'nv') {
18959                        if ($table_name !~ qr{/}) {
18960                            push @output, generate_tests($property_name,
18961                                                sprintf("%.15e", $table_name),
18962                                                $valid,
18963                                                $invalid,
18964                                                $warning,
18965                                            );
18966                    }
18967                    else {
18968                        # If the name is a rational number, add tests for a
18969                        # non-reduced form, and for a floating point equivalent.
18970
18971                        # 60 is a number divisible by a bunch of things
18972                        my ($numerator, $denominator) = $table_name
18973                                                        =~ m! (.+) / (.+) !x;
18974                        $numerator *= 60;
18975                        $denominator *= 60;
18976                        push @output, generate_tests($property_name,
18977                                                    "$numerator/$denominator",
18978                                                    $valid,
18979                                                    $invalid,
18980                                                    $warning,
18981                                    );
18982
18983                        # Calculate the float, and the %e representation
18984                        my $float = eval $table_name;
18985                        my $e_representation = sprintf("%.*e",
18986                                                $E_FLOAT_PRECISION, $float);
18987                        # Parse that
18988                        my ($non_zeros, $zeros, $exponent_sign, $exponent)
18989                           = $e_representation
18990                               =~ / -? [1-9] \. (\d*?) (0*) e ([+-]) (\d+) /x;
18991                        my $min_e_precision;
18992                        my $min_f_precision;
18993
18994                        if ($exponent_sign eq '+' && $exponent != 0) {
18995                            Carp::my_carp_bug("Not yet equipped to handle"
18996                                            . " positive exponents");
18997                            return;
18998                        }
18999                        else {
19000                            # We're trying to find the minimum precision that
19001                            # is needed to indicate this particular rational
19002                            # for the given $E_FLOAT_PRECISION.  For %e, any
19003                            # trailing zeros, like 1.500e-02 aren't needed, so
19004                            # the correct value is how many non-trailing zeros
19005                            # there are after the decimal point.
19006                            $min_e_precision = length $non_zeros;
19007
19008                            # For %f, like .01500, we want at least
19009                            # $E_FLOAT_PRECISION digits, but any trailing
19010                            # zeros aren't needed, so we can subtract the
19011                            # length of those.  But we also need to include
19012                            # the zeros after the decimal point, but before
19013                            # the first significant digit.
19014                            $min_f_precision = $E_FLOAT_PRECISION
19015                                             + $exponent
19016                                             - length $zeros;
19017                        }
19018
19019                        # Make tests for each possible precision from 1 to
19020                        # just past the worst case.
19021                        my $upper_limit = ($min_e_precision > $min_f_precision)
19022                                           ? $min_e_precision
19023                                           : $min_f_precision;
19024
19025                        for my $i (1 .. $upper_limit + 1) {
19026                            for my $format ("e", "f") {
19027                                my $this_table
19028                                          = sprintf("%.*$format", $i, $float);
19029
19030                                # If we don't have enough precision digits,
19031                                # make a fail test; otherwise a pass test.
19032                                my $pass = ($format eq "e")
19033                                            ? $i >= $min_e_precision
19034                                            : $i >= $min_f_precision;
19035                                if ($pass) {
19036                                    push @output, generate_tests($property_name,
19037                                                                $this_table,
19038                                                                $valid,
19039                                                                $invalid,
19040                                                                $warning,
19041                                                );
19042                                }
19043                                elsif (   $format eq "e"
19044
19045                                          # Here we would fail, but in the %f
19046                                          # case, the representation at this
19047                                          # precision could actually be a
19048                                          # valid one for some other rational
19049                                       || ! grep { $this_table
19050                                                            =~ / ^ $_ 0* $ /x }
19051                                                            @valid_base_floats)
19052                                {
19053                                    push @output,
19054                                        generate_error($property_name,
19055                                                       $this_table,
19056                                                       1   # 1 => already an
19057                                                           # error
19058                                                );
19059                                }
19060                            }
19061                        }
19062                    }
19063                    }
19064                }
19065            }
19066            $table->DESTROY();
19067        }
19068        $property->DESTROY();
19069    }
19070
19071    # Make any test of the boundary (break) properties TODO if the code
19072    # doesn't match the version being compiled
19073    my $TODO_FAILING_BREAKS = ($version_of_mk_invlist_bounds ne $v_version)
19074                             ? "\nsub TODO_FAILING_BREAKS { 1 }\n"
19075                             : "\nsub TODO_FAILING_BREAKS { 0 }\n";
19076
19077    @output= map {
19078        map s/^/    /mgr,
19079        map "$_;\n",
19080        split /;\n/, $_
19081    } @output;
19082
19083    # Cause there to be 'if' statements to only execute a portion of this
19084    # long-running test each time, so that we can have a bunch of .t's running
19085    # in parallel
19086    my $chunks = 10     # Number of test files
19087               - 1      # For GCB & SB
19088               - 1      # For WB
19089               - 4;     # LB split into this many files
19090    my @output_chunked;
19091    my $chunk_count=0;
19092    my $chunk_size= int(@output / $chunks) + 1;
19093    while (@output) {
19094        $chunk_count++;
19095        my @chunk= splice @output, 0, $chunk_size;
19096        push @output_chunked,
19097            "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19098                @chunk,
19099            "}\n";
19100    }
19101
19102    $chunk_count++;
19103    push @output_chunked,
19104        "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19105            (map {"    Test_GCB('$_');\n"} @backslash_X_tests),
19106            (map {"    Test_SB('$_');\n"} @SB_tests),
19107        "}\n";
19108
19109
19110    $chunk_size= int(@LB_tests / 4) + 1;
19111    @LB_tests = map {"    Test_LB('$_');\n"} @LB_tests;
19112    while (@LB_tests) {
19113        $chunk_count++;
19114        my @chunk= splice @LB_tests, 0, $chunk_size;
19115        push @output_chunked,
19116            "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19117                @chunk,
19118            "}\n";
19119    }
19120
19121    $chunk_count++;
19122    push @output_chunked,
19123        "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19124            (map {"    Test_WB('$_');\n"} @WB_tests),
19125        "}\n";
19126
19127    &write($t_path,
19128           0,           # Not utf8;
19129           [$HEADER,
19130            $TODO_FAILING_BREAKS,
19131            <DATA>,
19132            @output_chunked,
19133            "Finished();\n",
19134           ]);
19135
19136    return;
19137}
19138
19139sub make_normalization_test_script() {
19140    print "Making normalization test script\n" if $verbosity >= $PROGRESS;
19141
19142    my $n_path = 'TestNorm.pl';
19143
19144    unshift @normalization_tests, <<'END';
19145use utf8;
19146use Test::More;
19147
19148sub ord_string {    # Convert packed ords to printable string
19149    use charnames ();
19150    return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' }
19151                                                unpack "U*", shift) .  "'";
19152    #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) .  "'";
19153}
19154
19155sub Test_N {
19156    my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_;
19157    my $display_source = ord_string($source);
19158    my $display_nfc = ord_string($nfc);
19159    my $display_nfd = ord_string($nfd);
19160    my $display_nfkc = ord_string($nfkc);
19161    my $display_nfkd = ord_string($nfkd);
19162
19163    use Unicode::Normalize;
19164    #    NFC
19165    #      nfc ==  toNFC(source) ==  toNFC(nfc) ==  toNFC(nfd)
19166    #      nfkc ==  toNFC(nfkc) ==  toNFC(nfkd)
19167    #
19168    #    NFD
19169    #      nfd ==  toNFD(source) ==  toNFD(nfc) ==  toNFD(nfd)
19170    #      nfkd ==  toNFD(nfkc) ==  toNFD(nfkd)
19171    #
19172    #    NFKC
19173    #      nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) ==
19174    #      toNFKC(nfkc) == toNFKC(nfkd)
19175    #
19176    #    NFKD
19177    #      nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) ==
19178    #      toNFKD(nfkc) == toNFKD(nfkd)
19179
19180    is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc");
19181    is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc");
19182    is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc");
19183    is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc");
19184    is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc");
19185
19186    is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd");
19187    is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd");
19188    is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd");
19189    is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd");
19190    is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd");
19191
19192    is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc");
19193    is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc");
19194    is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc");
19195    is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc");
19196    is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc");
19197
19198    is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd");
19199    is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd");
19200    is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd");
19201    is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd");
19202    is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd");
19203}
19204END
19205
19206    &write($n_path,
19207           1,           # Is utf8;
19208           [
19209            @normalization_tests,
19210            'done_testing();'
19211            ]);
19212    return;
19213}
19214
19215# Skip reasons, so will be exact same text and hence the files with each
19216# reason will get grouped together in perluniprops.
19217my $Documentation = "Documentation";
19218my $Indic_Skip
19219            = "Provisional; for the analysis and processing of Indic scripts";
19220my $Validation = "Validation Tests";
19221my $Validation_Documentation = "Documentation of validation Tests";
19222my $Unused_Skip = "Currently unused by Perl";
19223
19224# This is a list of the input files and how to handle them.  The files are
19225# processed in their order in this list.  Some reordering is possible if
19226# desired, but the PropertyAliases and PropValueAliases files should be first,
19227# and the extracted before the others (as data in an extracted file can be
19228# over-ridden by the non-extracted.  Some other files depend on data derived
19229# from an earlier file, like UnicodeData requires data from Jamo, and the case
19230# changing and folding requires data from Unicode.  Mostly, it is safest to
19231# order by first version releases in (except the Jamo).
19232#
19233# The version strings allow the program to know whether to expect a file or
19234# not, but if a file exists in the directory, it will be processed, even if it
19235# is in a version earlier than expected, so you can copy files from a later
19236# release into an earlier release's directory.
19237my @input_file_objects = (
19238    Input_file->new('PropertyAliases.txt', v3.2,
19239                    Handler => \&process_PropertyAliases,
19240                    Early => [ \&substitute_PropertyAliases ],
19241                    Required_Even_in_Debug_Skip => 1,
19242                   ),
19243    Input_file->new(undef, v0,  # No file associated with this
19244                    Progress_Message => 'Finishing property setup',
19245                    Handler => \&finish_property_setup,
19246                   ),
19247    Input_file->new('PropValueAliases.txt', v3.2,
19248                     Handler => \&process_PropValueAliases,
19249                     Early => [ \&substitute_PropValueAliases ],
19250                     Has_Missings_Defaults => $NOT_IGNORED,
19251                     Required_Even_in_Debug_Skip => 1,
19252                    ),
19253    Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
19254                    Property => 'General_Category',
19255                   ),
19256    Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
19257                    Property => 'Canonical_Combining_Class',
19258                    Has_Missings_Defaults => $NOT_IGNORED,
19259                   ),
19260    Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
19261                    Property => 'Numeric_Type',
19262                    Has_Missings_Defaults => $NOT_IGNORED,
19263                   ),
19264    Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
19265                    Property => 'East_Asian_Width',
19266                    Has_Missings_Defaults => $NOT_IGNORED,
19267                   ),
19268    Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
19269                    Property => 'Line_Break',
19270                    Has_Missings_Defaults => $NOT_IGNORED,
19271                   ),
19272    Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
19273                    Property => 'Bidi_Class',
19274                    Has_Missings_Defaults => $NOT_IGNORED,
19275                   ),
19276    Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
19277                    Property => 'Decomposition_Type',
19278                    Has_Missings_Defaults => $NOT_IGNORED,
19279                   ),
19280    Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
19281    Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
19282                    Property => 'Numeric_Value',
19283                    Each_Line_Handler => \&filter_numeric_value_line,
19284                    Has_Missings_Defaults => $NOT_IGNORED,
19285                   ),
19286    Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
19287                    Property => 'Joining_Group',
19288                    Has_Missings_Defaults => $NOT_IGNORED,
19289                   ),
19290
19291    Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
19292                    Property => 'Joining_Type',
19293                    Has_Missings_Defaults => $NOT_IGNORED,
19294                   ),
19295    Input_file->new("${EXTRACTED}DName.txt", v10.0.0,
19296                    Skip => 'This file adds no new information not already'
19297                          . ' present in other files',
19298                    # And it's unnecessary programmer work to handle this new
19299                    # format.  Previous Derived files actually had bug fixes
19300                    # in them that were useful, but that should not be the
19301                    # case here.
19302                   ),
19303    Input_file->new('Jamo.txt', v2.0.0,
19304                    Property => 'Jamo_Short_Name',
19305                    Each_Line_Handler => \&filter_jamo_line,
19306                   ),
19307    Input_file->new('UnicodeData.txt', v1.1.5,
19308                    Pre_Handler => \&setup_UnicodeData,
19309
19310                    # We clean up this file for some early versions.
19311                    Each_Line_Handler => [ (($v_version lt v2.0.0 )
19312                                            ? \&filter_v1_ucd
19313                                            : ($v_version eq v2.1.5)
19314                                                ? \&filter_v2_1_5_ucd
19315
19316                                                # And for 5.14 Perls with 6.0,
19317                                                # have to also make changes
19318                                                : ($v_version ge v6.0.0
19319                                                   && $^V lt v5.17.0)
19320                                                    ? \&filter_v6_ucd
19321                                                    : undef),
19322
19323                                            # Early versions did not have the
19324                                            # proper Unicode_1 names for the
19325                                            # controls
19326                                            (($v_version lt v3.0.0)
19327                                            ? \&filter_early_U1_names
19328                                            : undef),
19329
19330                                            # Early versions did not correctly
19331                                            # use the later method for giving
19332                                            # decimal digit values
19333                                            (($v_version le v3.2.0)
19334                                            ? \&filter_bad_Nd_ucd
19335                                            : undef),
19336
19337                                            # And the main filter
19338                                            \&filter_UnicodeData_line,
19339                                         ],
19340                    EOF_Handler => \&EOF_UnicodeData,
19341                   ),
19342    Input_file->new('CJKXREF.TXT', v1.1.5,
19343                    Withdrawn => v2.0.0,
19344                    Skip => 'Gives the mapping of CJK code points '
19345                          . 'between Unicode and various other standards',
19346                   ),
19347    Input_file->new('ArabicShaping.txt', v2.0.0,
19348                    Each_Line_Handler =>
19349                        ($v_version lt 4.1.0)
19350                                    ? \&filter_old_style_arabic_shaping
19351                                    : undef,
19352                    # The first field after the range is a "schematic name"
19353                    # not used by Perl
19354                    Properties => [ '<ignored>', 'Joining_Type', 'Joining_Group' ],
19355                    Has_Missings_Defaults => $NOT_IGNORED,
19356                   ),
19357    Input_file->new('Blocks.txt', v2.0.0,
19358                    Property => 'Block',
19359                    Has_Missings_Defaults => $NOT_IGNORED,
19360                    Each_Line_Handler => \&filter_blocks_lines
19361                   ),
19362    Input_file->new('Index.txt', v2.0.0,
19363                    Skip => 'Alphabetical index of Unicode characters',
19364                   ),
19365    Input_file->new('NamesList.txt', v2.0.0,
19366                    Skip => 'Annotated list of characters',
19367                   ),
19368    Input_file->new('PropList.txt', v2.0.0,
19369                    Each_Line_Handler => (($v_version lt v3.1.0)
19370                                            ? \&filter_old_style_proplist
19371                                            : undef),
19372                   ),
19373    Input_file->new('Props.txt', v2.0.0,
19374                    Withdrawn => v3.0.0,
19375                    Skip => 'A subset of F<PropList.txt> (which is used instead)',
19376                   ),
19377    Input_file->new('ReadMe.txt', v2.0.0,
19378                    Skip => $Documentation,
19379                   ),
19380    Input_file->new('Unihan.txt', v2.0.0,
19381                    Withdrawn => v5.2.0,
19382                    Construction_Time_Handler => \&construct_unihan,
19383                    Pre_Handler => \&setup_unihan,
19384                    Optional => [ "",
19385                                  'Unicode_Radical_Stroke'
19386                                ],
19387                    Each_Line_Handler => \&filter_unihan_line,
19388                   ),
19389    Input_file->new('SpecialCasing.txt', v2.1.8,
19390                    Each_Line_Handler => ($v_version eq 2.1.8)
19391                                         ? \&filter_2_1_8_special_casing_line
19392                                         : \&filter_special_casing_line,
19393                    Pre_Handler => \&setup_special_casing,
19394                    Has_Missings_Defaults => $IGNORED,
19395                   ),
19396    Input_file->new(
19397                    'LineBreak.txt', v3.0.0,
19398                    Has_Missings_Defaults => $NOT_IGNORED,
19399                    Property => 'Line_Break',
19400                    # Early versions had problematic syntax
19401                    Each_Line_Handler => ($v_version ge v3.1.0)
19402                                          ? undef
19403                                          : ($v_version lt v3.0.0)
19404                                            ? \&filter_substitute_lb
19405                                            : \&filter_early_ea_lb,
19406                    # Must use long names for property values see comments at
19407                    # sub filter_substitute_lb
19408                    Early => [ "LBsubst.txt", '_Perl_LB', 'Alphabetic',
19409                               'Alphabetic', # default to this because XX ->
19410                                             # AL
19411
19412                               # Don't use _Perl_LB as a synonym for
19413                               # Line_Break in later perls, as it is tailored
19414                               # and isn't the same as Line_Break
19415                               'ONLY_EARLY' ],
19416                   ),
19417    Input_file->new('EastAsianWidth.txt', v3.0.0,
19418                    Property => 'East_Asian_Width',
19419                    Has_Missings_Defaults => $NOT_IGNORED,
19420                    # Early versions had problematic syntax
19421                    Each_Line_Handler => (($v_version lt v3.1.0)
19422                                        ? \&filter_early_ea_lb
19423                                        : undef),
19424                   ),
19425    Input_file->new('CompositionExclusions.txt', v3.0.0,
19426                    Property => 'Composition_Exclusion',
19427                   ),
19428    Input_file->new('UnicodeData.html', v3.0.0,
19429                    Withdrawn => v4.0.1,
19430                    Skip => $Documentation,
19431                   ),
19432    Input_file->new('BidiMirroring.txt', v3.0.1,
19433                    Property => 'Bidi_Mirroring_Glyph',
19434                    Has_Missings_Defaults => ($v_version lt v6.2.0)
19435                                              ? $NO_DEFAULTS
19436                                              # Is <none> which doesn't mean
19437                                              # anything to us, we will use the
19438                                              # null string
19439                                              : $IGNORED,
19440                   ),
19441    Input_file->new('NamesList.html', v3.0.0,
19442                    Skip => 'Describes the format and contents of '
19443                          . 'F<NamesList.txt>',
19444                   ),
19445    Input_file->new('UnicodeCharacterDatabase.html', v3.0.0,
19446                    Withdrawn => v5.1,
19447                    Skip => $Documentation,
19448                   ),
19449    Input_file->new('CaseFolding.txt', v3.0.1,
19450                    Pre_Handler => \&setup_case_folding,
19451                    Each_Line_Handler =>
19452                        [ ($v_version lt v3.1.0)
19453                                 ? \&filter_old_style_case_folding
19454                                 : undef,
19455                           \&filter_case_folding_line
19456                        ],
19457                    Has_Missings_Defaults => $IGNORED,
19458                   ),
19459    Input_file->new("NormTest.txt", v3.0.1,
19460                     Handler => \&process_NormalizationsTest,
19461                     Skip => ($make_norm_test_script) ? 0 : $Validation,
19462                   ),
19463    Input_file->new('DCoreProperties.txt', v3.1.0,
19464                    # 5.2 changed this file
19465                    Has_Missings_Defaults => (($v_version ge v5.2.0)
19466                                            ? $NOT_IGNORED
19467                                            : $NO_DEFAULTS),
19468                   ),
19469    Input_file->new('DProperties.html', v3.1.0,
19470                    Withdrawn => v3.2.0,
19471                    Skip => $Documentation,
19472                   ),
19473    Input_file->new('PropList.html', v3.1.0,
19474                    Withdrawn => v5.1,
19475                    Skip => $Documentation,
19476                   ),
19477    Input_file->new('Scripts.txt', v3.1.0,
19478                    Property => 'Script',
19479                    Each_Line_Handler => (($v_version le v4.0.0)
19480                                          ? \&filter_all_caps_script_names
19481                                          : undef),
19482                    Has_Missings_Defaults => $NOT_IGNORED,
19483                   ),
19484    Input_file->new('DNormalizationProps.txt', v3.1.0,
19485                    Has_Missings_Defaults => $NOT_IGNORED,
19486                    Each_Line_Handler => (($v_version lt v4.0.1)
19487                                      ? \&filter_old_style_normalization_lines
19488                                      : undef),
19489                   ),
19490    Input_file->new('DerivedProperties.html', v3.1.1,
19491                    Withdrawn => v5.1,
19492                    Skip => $Documentation,
19493                   ),
19494    Input_file->new('DAge.txt', v3.2.0,
19495                    Has_Missings_Defaults => $NOT_IGNORED,
19496                    Property => 'Age'
19497                   ),
19498    Input_file->new('HangulSyllableType.txt', v4.0,
19499                    Has_Missings_Defaults => $NOT_IGNORED,
19500                    Early => [ \&generate_hst, 'Hangul_Syllable_Type' ],
19501                    Property => 'Hangul_Syllable_Type'
19502                   ),
19503    Input_file->new('NormalizationCorrections.txt', v3.2.0,
19504                     # This documents the cumulative fixes to erroneous
19505                     # normalizations in earlier Unicode versions.  Its main
19506                     # purpose is so that someone running on an earlier
19507                     # version can use this file to override what got
19508                     # published in that earlier release.  It would be easy
19509                     # for mktables to handle this file.  But all the
19510                     # corrections in it should already be in the other files
19511                     # for the release it is.  To get it to actually mean
19512                     # something useful, someone would have to be using an
19513                     # earlier Unicode release, and copy it into the directory
19514                     # for that release and recompile.  So far there has been
19515                     # no demand to do that, so this hasn't been implemented.
19516                    Skip => 'Documentation of corrections already '
19517                          . 'incorporated into the Unicode data base',
19518                   ),
19519    Input_file->new('StandardizedVariants.html', v3.2.0,
19520                    Skip => 'Obsoleted as of Unicode 9.0, but previously '
19521                          . 'provided a visual display of the standard '
19522                          . 'variant sequences derived from '
19523                          . 'F<StandardizedVariants.txt>.',
19524                        # I don't know why the html came earlier than the
19525                        # .txt, but both are skipped anyway, so it doesn't
19526                        # matter.
19527                   ),
19528    Input_file->new('StandardizedVariants.txt', v4.0.0,
19529                    Skip => 'Certain glyph variations for character display '
19530                          . 'are standardized.  This lists the non-Unihan '
19531                          . 'ones; the Unihan ones are also not used by '
19532                          . 'Perl, and are in a separate Unicode data base '
19533                          . 'L<http://www.unicode.org/ivd>',
19534                   ),
19535    Input_file->new('UCD.html', v4.0.0,
19536                    Withdrawn => v5.2,
19537                    Skip => $Documentation,
19538                   ),
19539    Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
19540                    Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter' ],
19541                    Property => 'Word_Break',
19542                    Has_Missings_Defaults => $NOT_IGNORED,
19543                   ),
19544    Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1,
19545                    Early => [ \&generate_GCB, '_Perl_GCB' ],
19546                    Property => 'Grapheme_Cluster_Break',
19547                    Has_Missings_Defaults => $NOT_IGNORED,
19548                   ),
19549    Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
19550                    Handler => \&process_GCB_test,
19551                    retain_trailing_comments => 1,
19552                   ),
19553    Input_file->new("$AUXILIARY/GraphemeBreakTest.html", v4.1.0,
19554                    Skip => $Validation_Documentation,
19555                   ),
19556    Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
19557                    Handler => \&process_SB_test,
19558                    retain_trailing_comments => 1,
19559                   ),
19560    Input_file->new("$AUXILIARY/SentenceBreakTest.html", v4.1.0,
19561                    Skip => $Validation_Documentation,
19562                   ),
19563    Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
19564                    Handler => \&process_WB_test,
19565                    retain_trailing_comments => 1,
19566                   ),
19567    Input_file->new("$AUXILIARY/WordBreakTest.html", v4.1.0,
19568                    Skip => $Validation_Documentation,
19569                   ),
19570    Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
19571                    Property => 'Sentence_Break',
19572                    Early => [ "SBsubst.txt", '_Perl_SB', 'OLetter' ],
19573                    Has_Missings_Defaults => $NOT_IGNORED,
19574                   ),
19575    Input_file->new('NamedSequences.txt', v4.1.0,
19576                    Handler => \&process_NamedSequences
19577                   ),
19578    Input_file->new('Unihan.html', v4.1.0,
19579                    Withdrawn => v5.2,
19580                    Skip => $Documentation,
19581                   ),
19582    Input_file->new('NameAliases.txt', v5.0,
19583                    Property => 'Name_Alias',
19584                    Each_Line_Handler => ($v_version le v6.0.0)
19585                                   ? \&filter_early_version_name_alias_line
19586                                   : \&filter_later_version_name_alias_line,
19587                   ),
19588        # NameAliases.txt came along in v5.0.  The above constructor handles
19589        # this.  But until 6.1, it was lacking some information needed by core
19590        # perl.  The constructor below handles that.  It is either a kludge or
19591        # clever, depending on your point of view.  The 'Withdrawn' parameter
19592        # indicates not to use it at all starting in 6.1 (so the above
19593        # constructor applies), and the 'v6.1' parameter indicates to use the
19594        # Early parameter before 6.1.  Therefore 'Early" is always used,
19595        # yielding the internal-only property '_Perl_Name_Alias', which it
19596        # gets from a NameAliases.txt from 6.1 or later stored in
19597        # N_Asubst.txt.  In combination with the above constructor,
19598        # 'Name_Alias' is publicly accessible starting with v5.0, and the
19599        # better 6.1 version is accessible to perl core in all releases.
19600    Input_file->new("NameAliases.txt", v6.1,
19601                    Withdrawn => v6.1,
19602                    Early => [ "N_Asubst.txt", '_Perl_Name_Alias', "" ],
19603                    Property => 'Name_Alias',
19604                    EOF_Handler => \&fixup_early_perl_name_alias,
19605                    Each_Line_Handler =>
19606                                       \&filter_later_version_name_alias_line,
19607                   ),
19608    Input_file->new('NamedSqProv.txt', v5.0.0,
19609                    Skip => 'Named sequences proposed for inclusion in a '
19610                          . 'later version of the Unicode Standard; if you '
19611                          . 'need them now, you can append this file to '
19612                          . 'F<NamedSequences.txt> and recompile perl',
19613                   ),
19614    Input_file->new("$AUXILIARY/LBTest.txt", v5.1.0,
19615                    Handler => \&process_LB_test,
19616                    retain_trailing_comments => 1,
19617                   ),
19618    Input_file->new("$AUXILIARY/LineBreakTest.html", v5.1.0,
19619                    Skip => $Validation_Documentation,
19620                   ),
19621    Input_file->new("BidiTest.txt", v5.2.0,
19622                    Skip => $Validation,
19623                   ),
19624    Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
19625                    Optional => "",
19626                    Each_Line_Handler => \&filter_unihan_line,
19627                   ),
19628    Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
19629                    Optional => "",
19630                    Each_Line_Handler => \&filter_unihan_line,
19631                   ),
19632    Input_file->new('UnihanIRGSources.txt', v5.2.0,
19633                    Optional => [ "",
19634                                  'kCompatibilityVariant',
19635                                  'kIICore',
19636                                  'kIRG_GSource',
19637                                  'kIRG_HSource',
19638                                  'kIRG_JSource',
19639                                  'kIRG_KPSource',
19640                                  'kIRG_MSource',
19641                                  'kIRG_KSource',
19642                                  'kIRG_SSource',
19643                                  'kIRG_TSource',
19644                                  'kIRG_USource',
19645                                  'kIRG_UKSource',
19646                                  'kIRG_VSource',
19647                               ],
19648                    Pre_Handler => \&setup_unihan,
19649                    Each_Line_Handler => \&filter_unihan_line,
19650                   ),
19651    Input_file->new('UnihanNumericValues.txt', v5.2.0,
19652                    Optional => [ "",
19653                                  'kAccountingNumeric',
19654                                  'kOtherNumeric',
19655                                  'kPrimaryNumeric',
19656                                ],
19657                    Each_Line_Handler => \&filter_unihan_line,
19658                   ),
19659    Input_file->new('UnihanOtherMappings.txt', v5.2.0,
19660                    Optional => "",
19661                    Each_Line_Handler => \&filter_unihan_line,
19662                   ),
19663    Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
19664                    Optional => [ "",
19665                                  'Unicode_Radical_Stroke'
19666                                ],
19667                    Each_Line_Handler => \&filter_unihan_line,
19668                   ),
19669    Input_file->new('UnihanReadings.txt', v5.2.0,
19670                    Optional => "",
19671                    Each_Line_Handler => \&filter_unihan_line,
19672                   ),
19673    Input_file->new('UnihanVariants.txt', v5.2.0,
19674                    Optional => "",
19675                    Each_Line_Handler => \&filter_unihan_line,
19676                   ),
19677    Input_file->new('CJKRadicals.txt', v5.2.0,
19678                    Skip => 'Maps the kRSUnicode property values to '
19679                          . 'corresponding code points',
19680                   ),
19681    Input_file->new('EmojiSources.txt', v6.0.0,
19682                    Skip => 'Maps certain Unicode code points to their '
19683                          . 'legacy Japanese cell-phone values',
19684                   ),
19685    # This file is actually not usable as-is until 6.1.0, because the property
19686    # is provisional, so its name is missing from PropertyAliases.txt until
19687    # that release, so that further work would have to be done to get it to
19688    # work properly
19689    Input_file->new('ScriptExtensions.txt', v6.0.0,
19690                    Property => 'Script_Extensions',
19691                    Early => [ sub {} ], # Doesn't do anything but ensures
19692                                         # that this isn't skipped for early
19693                                         # versions
19694                    Pre_Handler => \&setup_script_extensions,
19695                    Each_Line_Handler => \&filter_script_extensions_line,
19696                    Has_Missings_Defaults => (($v_version le v6.0.0)
19697                                            ? $NO_DEFAULTS
19698                                            : $IGNORED),
19699                   ),
19700    # These two Indic files are actually not usable as-is until 6.1.0,
19701    # because they are provisional, so their property values are missing from
19702    # PropValueAliases.txt until that release, so that further work would have
19703    # to be done to get them to work properly.
19704    Input_file->new('IndicMatraCategory.txt', v6.0.0,
19705                    Withdrawn => v8.0.0,
19706                    Property => 'Indic_Matra_Category',
19707                    Has_Missings_Defaults => $NOT_IGNORED,
19708                    Skip => $Indic_Skip,
19709                   ),
19710    Input_file->new('IndicSyllabicCategory.txt', v6.0.0,
19711                    Property => 'Indic_Syllabic_Category',
19712                    Has_Missings_Defaults => $NOT_IGNORED,
19713                    Skip => (($v_version lt v8.0.0)
19714                              ? $Indic_Skip
19715                              : 0),
19716                   ),
19717    Input_file->new('USourceData.txt', v6.2.0,
19718                    Skip => 'Documentation of status and cross reference of '
19719                          . 'proposals for encoding by Unicode of Unihan '
19720                          . 'characters',
19721                   ),
19722    Input_file->new('USourceGlyphs.pdf', v6.2.0,
19723                    Skip => 'Pictures of the characters in F<USourceData.txt>',
19724                   ),
19725    Input_file->new('BidiBrackets.txt', v6.3.0,
19726                    Properties => [ 'Bidi_Paired_Bracket',
19727                                    'Bidi_Paired_Bracket_Type'
19728                                  ],
19729                    Has_Missings_Defaults => $NO_DEFAULTS,
19730                   ),
19731    Input_file->new("BidiCharacterTest.txt", v6.3.0,
19732                    Skip => $Validation,
19733                   ),
19734    Input_file->new('IndicPositionalCategory.txt', v8.0.0,
19735                    Property => 'Indic_Positional_Category',
19736                    Has_Missings_Defaults => $NOT_IGNORED,
19737                   ),
19738    Input_file->new('TangutSources.txt', v9.0.0,
19739                    Skip => 'Specifies source mappings for Tangut ideographs'
19740                          . ' and components. This data file also includes'
19741                          . ' informative radical-stroke values that are used'
19742                          . ' internally by Unicode',
19743                   ),
19744    Input_file->new('VerticalOrientation.txt', v10.0.0,
19745                    Property => 'Vertical_Orientation',
19746                    Has_Missings_Defaults => $NOT_IGNORED,
19747                   ),
19748    Input_file->new('NushuSources.txt', v10.0.0,
19749                    Skip => 'Specifies source material for Nushu characters',
19750                   ),
19751    Input_file->new('EquivalentUnifiedIdeograph.txt', v11.0.0,
19752                    Property => 'Equivalent_Unified_Ideograph',
19753                    Has_Missings_Defaults => $NOT_IGNORED,
19754                   ),
19755    Input_file->new('EmojiData.txt', v11.0.0,
19756                    # Is in UAX #51 and not the UCD, so must be updated
19757                    # separately, and the first line edited to indicate the
19758                    # UCD release we're pretending it to be in.  The UTC says
19759                    # this is a transitional state, and in fact was moved to
19760                    # the UCD in 13.0
19761                    Withdrawn => v13.0.0,
19762                    Pre_Handler => \&setup_emojidata,
19763                    Has_Missings_Defaults => $NOT_IGNORED,
19764                    Each_Line_Handler => \&filter_emojidata_line,
19765                    UCD => 0,
19766                   ),
19767    Input_file->new("$EMOJI/emoji.txt", v13.0.0,
19768                    Has_Missings_Defaults => $NOT_IGNORED,
19769                    UCD => 0,
19770                   ),
19771    Input_file->new("$EMOJI/ReadMe.txt", v13.0.0,
19772                    Skip => $Documentation,
19773                    UCD => 0,
19774                   ),
19775    Input_file->new('IdStatus.txt', v13.0.0,
19776                    Pre_Handler => \&setup_IdStatus,
19777                    Property => 'Identifier_Status',
19778                    UCD => 0,
19779                   ),
19780    Input_file->new('IdType.txt', v13.0.0,
19781                    Pre_Handler => \&setup_IdType,
19782                    Each_Line_Handler => \&filter_IdType_line,
19783                    Property => 'Identifier_Type',
19784                    UCD => 0,
19785                   ),
19786    Input_file->new('confusables.txt', v15.0.0,
19787                    Skip => $Unused_Skip,
19788                    UCD => 0,
19789                   ),
19790    Input_file->new('confusablesSummary.txt', v15.0.0,
19791                    Skip => $Unused_Skip,
19792                    UCD => 0,
19793                   ),
19794    Input_file->new('intentional.txt', v15.0.0,
19795                    Skip => $Unused_Skip,
19796                    UCD => 0,
19797                   ),
19798);
19799
19800# End of all the preliminaries.
19801# Do it...
19802
19803if (@missing_early_files) {
19804    print simple_fold(join_lines(<<END
19805
19806The compilation cannot be completed because one or more required input files,
19807listed below, are missing.  This is because you are compiling Unicode version
19808$unicode_version, which predates the existence of these file(s).  To fully
19809function, perl needs the data that these files would have contained if they
19810had been in this release.  To work around this, create copies of later
19811versions of the missing files in the directory containing '$0'.  (Perl will
19812make the necessary adjustments to the data to compensate for it not being the
19813same version as is being compiled.)  The files are available from unicode.org,
19814via either ftp or http.  If using http, they will be under
19815www.unicode.org/versions/.  Below are listed the source file name of each
19816missing file, the Unicode version to copy it from, and the name to store it
19817as.  (Note that the listed source file name may not be exactly the one that
19818Unicode calls it.  If you don't find it, you can look it up in 'README.perl'
19819to get the correct name.)
19820END
19821    ));
19822    print simple_fold(join_lines("\n$_")) for @missing_early_files;
19823    exit 2;
19824}
19825
19826if ($compare_versions) {
19827    Carp::my_carp(<<END
19828Warning.  \$compare_versions is set.  Output is not suitable for production
19829END
19830    );
19831}
19832
19833# Put into %potential_files a list of all the files in the directory structure
19834# that could be inputs to this program
19835File::Find::find({
19836    wanted=>sub {
19837        return unless / \. ( txt | htm l? ) $ /xi;  # Some platforms change the
19838                                                    # name's case
19839        my $full = lc(File::Spec->rel2abs($_));
19840        $potential_files{$full} = 1;
19841        return;
19842    }
19843}, File::Spec->curdir());
19844
19845my @mktables_list_output_files;
19846my $old_start_time = 0;
19847my $old_options = "";
19848
19849if (! -e $file_list) {
19850    print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
19851    $write_unchanged_files = 1;
19852} elsif ($write_unchanged_files) {
19853    print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
19854}
19855else {
19856    print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
19857    my $file_handle;
19858    if (! open $file_handle, "<", $file_list) {
19859        Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
19860        $glob_list = 1;
19861    }
19862    else {
19863        my @input;
19864
19865        # Read and parse mktables.lst, placing the results from the first part
19866        # into @input, and the second part into @mktables_list_output_files
19867        for my $list ( \@input, \@mktables_list_output_files ) {
19868            while (<$file_handle>) {
19869                s/^ \s+ | \s+ $//xg;
19870                if (/^ \s* \# \s* Autogenerated\ starting\ on\ (\d+)/x) {
19871                    $old_start_time = $1;
19872                    next;
19873                }
19874                if (/^ \s* \# \s* From\ options\ (.+) /x) {
19875                    $old_options = $1;
19876                    next;
19877                }
19878                next if /^ \s* (?: \# .* )? $/x;
19879                last if /^ =+ $/x;
19880                my ( $file ) = split /\t/;
19881                push @$list, $file;
19882            }
19883            @$list = uniques(@$list);
19884            next;
19885        }
19886
19887        # Look through all the input files
19888        foreach my $input (@input) {
19889            next if $input eq 'version'; # Already have checked this.
19890
19891            # Ignore if doesn't exist.  The checking about whether we care or
19892            # not is done via the Input_file object.
19893            next if ! file_exists($input);
19894
19895            # The paths are stored with relative names, and with '/' as the
19896            # delimiter; convert to absolute on this machine
19897            my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
19898            $potential_files{lc $full} = 1;
19899        }
19900    }
19901
19902    close $file_handle;
19903}
19904
19905if ($glob_list) {
19906
19907    # Here wants to process all .txt files in the directory structure.
19908    # Convert them to full path names.  They are stored in the platform's
19909    # relative style
19910    my @known_files;
19911    foreach my $object (@input_file_objects) {
19912        my $file = $object->file;
19913        next unless defined $file;
19914        push @known_files, File::Spec->rel2abs($file);
19915    }
19916
19917    my @unknown_input_files;
19918    foreach my $file (keys %potential_files) {  # The keys are stored in lc
19919        next if grep { $file eq lc($_) } @known_files;
19920
19921        # Here, the file is unknown to us.  Get relative path name
19922        $file = File::Spec->abs2rel($file);
19923        push @unknown_input_files, $file;
19924
19925        # What will happen is we create a data structure for it, and add it to
19926        # the list of input files to process.  First get the subdirectories
19927        # into an array
19928        my (undef, $directories, undef) = File::Spec->splitpath($file);
19929        $directories =~ s;/$;;;     # Can have extraneous trailing '/'
19930        my @directories = File::Spec->splitdir($directories);
19931
19932        # If the file isn't extracted (meaning none of the directories is the
19933        # extracted one), just add it to the end of the list of inputs.
19934        if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
19935            push @input_file_objects, Input_file->new($file, v0);
19936        }
19937        else {
19938
19939            # Here, the file is extracted.  It needs to go ahead of most other
19940            # processing.  Search for the first input file that isn't a
19941            # special required property (that is, find one whose first_release
19942            # is non-0), and isn't extracted.  Also, the Age property file is
19943            # processed before the extracted ones, just in case
19944            # $compare_versions is set.
19945            for (my $i = 0; $i < @input_file_objects; $i++) {
19946                if ($input_file_objects[$i]->first_released ne v0
19947                    && lc($input_file_objects[$i]->file) ne 'dage.txt'
19948                    && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
19949                {
19950                    splice @input_file_objects, $i, 0,
19951                                                Input_file->new($file, v0);
19952                    last;
19953                }
19954            }
19955
19956        }
19957    }
19958    if (@unknown_input_files) {
19959        print STDERR simple_fold(join_lines(<<END
19960
19961The following files are unknown as to how to handle.  Assuming they are
19962typical property files.  You'll know by later error messages if it worked or
19963not:
19964END
19965        ) . " " . join(", ", @unknown_input_files) . "\n\n");
19966    }
19967} # End of looking through directory structure for more .txt files.
19968
19969# Create the list of input files from the objects we have defined, plus
19970# version
19971my @input_files = qw(version Makefile);
19972foreach my $object (@input_file_objects) {
19973    my $file = $object->file;
19974    next if ! defined $file;    # Not all objects have files
19975    next if defined $object->skip;;
19976    push @input_files,  $file;
19977}
19978
19979if ( $verbosity >= $VERBOSE ) {
19980    print "Expecting ".scalar( @input_files )." input files. ",
19981         "Checking ".scalar( @mktables_list_output_files )." output files.\n";
19982}
19983
19984# We set $most_recent to be the most recently changed input file, including
19985# this program itself (done much earlier in this file)
19986foreach my $in (@input_files) {
19987    next unless -e $in;        # Keep going even if missing a file
19988    my $mod_time = (stat $in)[9];
19989    $most_recent = $mod_time if $mod_time > $most_recent;
19990
19991    # See that the input files have distinct names, to warn someone if they
19992    # are adding a new one
19993    if ($make_list) {
19994        my ($volume, $directories, $file ) = File::Spec->splitpath($in);
19995        $directories =~ s;/$;;;     # Can have extraneous trailing '/'
19996        my @directories = File::Spec->splitdir($directories);
19997        construct_filename($file, 'mutable', \@directories);
19998    }
19999}
20000
20001# We use 'Makefile' just to see if it has changed since the last time we
20002# rebuilt.  Now discard it.
20003@input_files = grep { $_ ne 'Makefile' } @input_files;
20004
20005my $rebuild = $write_unchanged_files    # Rebuild: if unconditional rebuild
20006              || ! scalar @mktables_list_output_files  # or if no outputs known
20007              || $old_start_time < $most_recent        # or out-of-date
20008              || $old_options ne $command_line_arguments; # or with different
20009                                                          # options
20010
20011# Now we check to see if any output files are older than youngest, if
20012# they are, we need to continue on, otherwise we can presumably bail.
20013if (! $rebuild) {
20014    foreach my $out (@mktables_list_output_files) {
20015        if ( ! file_exists($out)) {
20016            print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
20017            $rebuild = 1;
20018            last;
20019         }
20020        #local $to_trace = 1 if main::DEBUG;
20021        trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
20022        if ( (stat $out)[9] <= $most_recent ) {
20023            #trace "$out:  most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
20024            print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
20025            $rebuild = 1;
20026            last;
20027        }
20028    }
20029}
20030if (! $rebuild) {
20031    print "$0: Files seem to be ok, not bothering to rebuild.  Add '-w' option to force build\n";
20032    exit(0);
20033}
20034print "$0: Must rebuild tables.\n" if $verbosity >= $VERBOSE;
20035
20036# Ready to do the major processing.  First create the perl pseudo-property.
20037$perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
20038
20039# Process each input file
20040foreach my $file (@input_file_objects) {
20041    $file->run;
20042}
20043
20044# Finish the table generation.
20045
20046print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
20047finish_Unicode();
20048
20049# For the very specialized case of comparing two Unicode versions...
20050if (DEBUG && $compare_versions) {
20051    handle_compare_versions();
20052}
20053
20054print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
20055compile_perl();
20056
20057print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
20058add_perl_synonyms();
20059
20060print "Writing tables\n" if $verbosity >= $PROGRESS;
20061write_all_tables();
20062
20063# Write mktables.lst
20064if ( $file_list and $make_list ) {
20065
20066    print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
20067    foreach my $file (@input_files, @files_actually_output) {
20068        my (undef, $directories, $basefile) = File::Spec->splitpath($file);
20069        my @directories = grep length, File::Spec->splitdir($directories);
20070        $file = join '/', @directories, $basefile;
20071    }
20072
20073    my $ofh;
20074    if (! open $ofh,">",$file_list) {
20075        Carp::my_carp("Can't write to '$file_list'.  Skipping: $!");
20076        return
20077    }
20078    else {
20079        my $localtime = localtime $start_time;
20080        print $ofh <<"END";
20081#
20082# $file_list -- File list for $0.
20083#
20084#   Autogenerated starting on $start_time ($localtime)
20085#   From options $command_line_arguments
20086#
20087# - First section is input files
20088#   ($0 itself is not listed but is automatically considered an input)
20089# - Section separator is /^=+\$/
20090# - Second section is a list of output files.
20091# - Lines matching /^\\s*#/ are treated as comments
20092#   which along with blank lines are ignored.
20093#
20094
20095# Input files:
20096
20097END
20098        print $ofh "$_\n" for sort(@input_files);
20099        print $ofh "\n=================================\n# Output files:\n\n";
20100        print $ofh "$_\n" for sort @files_actually_output;
20101        print $ofh "\n# ",scalar(@input_files)," input files\n",
20102                "# ",scalar(@files_actually_output)+1," output files\n\n",
20103                "# End list\n";
20104        close $ofh
20105            or Carp::my_carp("Failed to close $ofh: $!");
20106
20107        print "Filelist has ",scalar(@input_files)," input files and ",
20108            scalar(@files_actually_output)+1," output files\n"
20109            if $verbosity >= $VERBOSE;
20110    }
20111}
20112
20113# Output these warnings unless -q explicitly specified.
20114if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
20115    if (@unhandled_properties) {
20116        print "\nProperties and tables that unexpectedly have no code points\n";
20117        foreach my $property (sort @unhandled_properties) {
20118            print $property, "\n";
20119        }
20120    }
20121
20122    if (%potential_files) {
20123        print "\nInput files that are not considered:\n";
20124        foreach my $file (sort keys %potential_files) {
20125            print File::Spec->abs2rel($file), "\n";
20126        }
20127    }
20128    print "\nAll done\n" if $verbosity >= $VERBOSE;
20129}
20130
20131if ($version_of_mk_invlist_bounds lt $v_version) {
20132    Carp::my_carp("WARNING: \\b{} algorithms (regen/mk_invlist.pl) need"
20133                . " to be checked and possibly updated to Unicode"
20134                . " $string_version.  Failing tests will be marked TODO");
20135}
20136
20137exit(0);
20138
20139# TRAILING CODE IS USED BY make_property_test_script()
20140__DATA__
20141
20142use strict;
20143use warnings;
20144
20145use feature 'signatures';
20146
20147no warnings 'experimental::uniprop_wildcards';
20148
20149# Test qr/\X/ and the \p{} regular expression constructs.  This file is
20150# constructed by mktables from the tables it generates, so if mktables is
20151# buggy, this won't necessarily catch those bugs.  Tests are generated for all
20152# feasible properties; a few aren't currently feasible; see
20153# is_code_point_usable() in mktables for details.
20154
20155# Standard test packages are not used because this manipulates SIG_WARN.  It
20156# exits 0 if every non-skipped test succeeded; -1 if any failed.
20157
20158my $Tests = 0;
20159my $Fails = 0;
20160
20161# loc_tools.pl requires this function to be defined
20162sub ok($pass, @msg) {
20163    print "not " unless $pass;
20164    print "ok ";
20165    print ++$Tests;
20166    print " - ", join "", @msg if @msg;
20167    print "\n";
20168}
20169
20170sub Expect($expected, $ord, $regex, $warning_type='') {
20171    my $line   = (caller)[2];
20172
20173    # Convert the code point to hex form
20174    my $string = sprintf "\"\\x{%04X}\"", $ord;
20175
20176    my @tests = "";
20177
20178    # The first time through, use all warnings.  If the input should generate
20179    # a warning, add another time through with them turned off
20180    push @tests, "no warnings '$warning_type';" if $warning_type;
20181
20182    foreach my $no_warnings (@tests) {
20183
20184        # Store any warning messages instead of outputting them
20185        local $SIG{__WARN__} = $SIG{__WARN__};
20186        my $warning_message;
20187        $SIG{__WARN__} = sub { $warning_message = $_[0] };
20188
20189        $Tests++;
20190
20191        # A string eval is needed because of the 'no warnings'.
20192        # Assumes no parentheses in the regular expression
20193        my $result = eval "$no_warnings
20194                            my \$RegObj = qr($regex);
20195                            $string =~ \$RegObj ? 1 : 0";
20196        if (not defined $result) {
20197            print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
20198            $Fails++;
20199        }
20200        elsif ($result ^ $expected) {
20201            print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
20202            $Fails++;
20203        }
20204        elsif ($warning_message) {
20205            if (! $warning_type || ($warning_type && $no_warnings)) {
20206                print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
20207                $Fails++;
20208            }
20209            else {
20210                print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
20211            }
20212        }
20213        elsif ($warning_type && ! $no_warnings) {
20214            print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
20215            $Fails++;
20216        }
20217        else {
20218            print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
20219        }
20220    }
20221    return;
20222}
20223
20224sub Error($regex) {
20225    $Tests++;
20226    if (eval { 'x' =~ qr/$regex/; 1 }) {
20227        $Fails++;
20228        my $line = (caller)[2];
20229        print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
20230    }
20231    else {
20232        my $line = (caller)[2];
20233        print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
20234    }
20235    return;
20236}
20237
20238# Break test files (e.g. GCBTest.txt) character that break allowed here
20239my $breakable_utf8 = my $breakable = chr(utf8::unicode_to_native(0xF7));
20240utf8::upgrade($breakable_utf8);
20241
20242# Break test files (e.g. GCBTest.txt) character that indicates can't break
20243# here
20244my $nobreak_utf8 = my $nobreak = chr(utf8::unicode_to_native(0xD7));
20245utf8::upgrade($nobreak_utf8);
20246
20247my $are_ctype_locales_available;
20248my $utf8_locale;
20249chdir 't' if -d 't';
20250eval { require "./loc_tools.pl" };
20251if (defined &locales_enabled) {
20252    $are_ctype_locales_available = locales_enabled('LC_CTYPE');
20253    if ($are_ctype_locales_available) {
20254        $utf8_locale = &find_utf8_ctype_locale;
20255    }
20256}
20257
20258# Eval'd so can run on versions earlier than the property is available in
20259my $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}\p{WB=ZWJ}]/';
20260if (! defined $WB_Extend_or_Format_re) {
20261    $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}]/';
20262}
20263
20264sub _test_break($template, $break_type) {
20265    # Test various break property matches.  The 2nd parameter gives the
20266    # property name.  The input is a line from auxiliary/*Test.txt for the
20267    # given property.  Each such line is a sequence of Unicode (not native)
20268    # code points given by their hex numbers, separated by the two characters
20269    # defined just before this subroutine that indicate that either there can
20270    # or cannot be a break between the adjacent code points.  All these are
20271    # tested.
20272    #
20273    # For the gcb property extra tests are made.  if there isn't a break, that
20274    # means the sequence forms an extended grapheme cluster, which means that
20275    # \X should match the whole thing.  If there is a break, \X should stop
20276    # there.  This is all converted by this routine into a match: $string =~
20277    # /(\X)/, Each \X should match the next cluster; and that is what is
20278    # checked.
20279
20280    my $line   = (caller 1)[2];   # Line number
20281    my $comment = "";
20282
20283    if ($template =~ / ( .*? ) \s* \# (.*) /x) {
20284        $template = $1;
20285        $comment = $2;
20286
20287        # Replace leading spaces with a single one.
20288        $comment =~ s/ ^ \s* / # /x;
20289    }
20290
20291    # The line contains characters above the ASCII range, but in Latin1.  It
20292    # may or may not be in utf8, and if it is, it may or may not know it.  So,
20293    # convert these characters to 8 bits.  If knows is in utf8, simply
20294    # downgrade.
20295    if (utf8::is_utf8($template)) {
20296        utf8::downgrade($template);
20297    } else {
20298
20299        # Otherwise, if it is in utf8, but doesn't know it, the next lines
20300        # convert the two problematic characters to their 8-bit equivalents.
20301        # If it isn't in utf8, they don't harm anything.
20302        use bytes;
20303        $template =~ s/$nobreak_utf8/$nobreak/g;
20304        $template =~ s/$breakable_utf8/$breakable/g;
20305    }
20306
20307    # Perl customizes wb.  So change the official tests accordingly
20308    if ($break_type eq 'wb' && $WB_Extend_or_Format_re) {
20309
20310        # Split into elements that alternate between code point and
20311        # break/no-break
20312        my @line = split / +/, $template;
20313
20314        # Look at each code point and its following one
20315        for (my $i = 1; $i <  @line - 1 - 1; $i+=2) {
20316
20317            # The customization only involves changing some breaks to
20318            # non-breaks.
20319            next if $line[$i+1] =~ /$nobreak/;
20320
20321            my $lhs = chr utf8::unicode_to_native(hex $line[$i]);
20322            my $rhs = chr utf8::unicode_to_native(hex $line[$i+2]);
20323
20324            # And it only affects adjacent space characters.
20325            next if $lhs !~ /\s/u;
20326
20327            # But, we want to make sure to test spaces followed by a Extend
20328            # or Format.
20329            next if $rhs !~ /\s|$WB_Extend_or_Format_re/;
20330
20331            # To test the customization, add some white-space before this to
20332            # create a span.  The $lhs white space may or may not be bound to
20333            # that span, and also with the $rhs.  If the $rhs is a binding
20334            # character, the $lhs is bound to it and not to the span, unless
20335            # $lhs is vertical space.  In all other cases, the $lhs is bound
20336            # to the span.  If the $rhs is white space, it is bound to the
20337            # $lhs
20338            my $bound;
20339            my $span;
20340            if ($rhs =~ /$WB_Extend_or_Format_re/) {
20341                if ($lhs =~ /\v/) {
20342                    $bound = $breakable;
20343                    $span = $nobreak;
20344                }
20345                else {
20346                    $bound = $nobreak;
20347                    $span = $breakable;
20348                }
20349            }
20350            else {
20351                $span = $nobreak;
20352                $bound = $nobreak;
20353            }
20354
20355            splice @line, $i, 0, ( '0020', $nobreak, '0020', $span);
20356            $i += 4;
20357            $line[$i+1] = $bound;
20358        }
20359        $template = join " ", @line;
20360    }
20361
20362    # The input is just the break/no-break symbols and sequences of Unicode
20363    # code points as hex digits separated by spaces for legibility. e.g.:
20364    # �� 0020 �� 0308 �� 0020 ��
20365    # Convert to native \x format
20366    $template =~ s/ \s* ( [[:xdigit:]]+ ) \s* /sprintf("\\x{%02X}", utf8::unicode_to_native(hex $1))/gex;
20367    $template =~ s/ \s* //gx;   # Probably the line above removed all spaces;
20368                                # but be sure
20369
20370    # Make a copy of the input with the symbols replaced by \b{} and \B{} as
20371    # appropriate
20372    my $break_pattern = $template =~ s/ $breakable /\\b{$break_type}/grx;
20373    $break_pattern =~ s/ $nobreak /\\B{$break_type}/gx;
20374
20375    my $display_string = $template =~ s/[$breakable$nobreak]//gr;
20376    my $string = eval "\"$display_string\"";
20377
20378    # The remaining massaging of the input is for the \X tests.  Get rid of
20379    # the leading and trailing breakables
20380    $template =~ s/^ \s* $breakable \s* //x;
20381    $template =~ s/ \s* $breakable \s* $ //x;
20382
20383    # Delete no-breaks
20384    $template =~ s/ \s* $nobreak \s* //xg;
20385
20386    # Split the input into segments that are breakable between them.
20387    my @should_display = split /\s*$breakable\s*/, $template;
20388    my @should_match = map { eval "\"$_\"" } @should_display;
20389
20390    # If a string can be represented in both non-ut8 and utf8, test both cases
20391    my $display_upgrade = "";
20392    UPGRADE:
20393    for my $to_upgrade (0 .. 1) {
20394
20395        if ($to_upgrade) {
20396
20397            # If already in utf8, would just be a repeat
20398            next UPGRADE if utf8::is_utf8($string);
20399
20400            utf8::upgrade($string);
20401            $display_upgrade = " (utf8-upgraded)";
20402        }
20403
20404        my @modifiers = qw(a aa d u i);
20405        if ($are_ctype_locales_available) {
20406            push @modifiers, "l$utf8_locale" if defined $utf8_locale;
20407
20408            # The /l modifier has C after it to indicate the locale to try
20409            push @modifiers, "lC";
20410        }
20411
20412        # Test for each of the regex modifiers.
20413        for my $modifier (@modifiers) {
20414            my $display_locale = "";
20415
20416            # For /l, set the locale to what it says to.
20417            if ($modifier =~ / ^ l (.*) /x) {
20418                my $locale = $1;
20419                $display_locale = "(locale = $locale)";
20420                POSIX::setlocale(POSIX::LC_CTYPE(), $locale);
20421                $modifier = 'l';
20422            }
20423
20424            no warnings qw(locale regexp surrogate);
20425            my $pattern = "(?$modifier:$break_pattern)";
20426
20427            # Actually do the test
20428            my $matched_text;
20429            my $matched = $string =~ qr/$pattern/;
20430            if ($matched) {
20431                $matched_text = "matched";
20432            }
20433            else {
20434                $matched_text = "failed to match";
20435                print "not ";
20436
20437                if (TODO_FAILING_BREAKS) {
20438                    $comment = " # $comment" unless $comment =~ / ^ \s* \# /x;
20439                    $comment =~ s/#/# TODO/;
20440                }
20441            }
20442            print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$pattern/$display_upgrade; line $line $display_locale$comment\n";
20443
20444            # Only print the comment on the first use of this line
20445            $comment = "";
20446
20447            # Repeat with the first \B{} in the pattern.  This makes sure the
20448            # code in regexec.c:find_byclass() for \B gets executed
20449            if ($pattern =~ / ( .*? : ) .* ( \\B\{ .* ) /x) {
20450                my $B_pattern = "$1$2";
20451                $matched = $string =~ qr/$B_pattern/;
20452                print "not " unless $matched;
20453                $matched_text = ($matched) ? "matched" : "failed to match";
20454                print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$B_pattern/$display_upgrade; line $line $display_locale";
20455                print " # TODO" if TODO_FAILING_BREAKS && ! $matched;
20456                print "\n";
20457            }
20458        }
20459
20460        next if $break_type ne 'gcb';
20461
20462        # Finally, do the \X match.
20463        my @matches = $string =~ /(\X)/g;
20464
20465        # Look through each matched cluster to verify that it matches what we
20466        # expect.
20467        my $min = (@matches < @should_match) ? @matches : @should_match;
20468        for my $i (0 .. $min - 1) {
20469            $Tests++;
20470            if ($matches[$i] eq $should_match[$i]) {
20471                print "ok $Tests - ";
20472                if ($i == 0) {
20473                    print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
20474                } else {
20475                    print "And \\X #", $i + 1,
20476                }
20477                print " correctly matched $should_display[$i]; line $line\n";
20478            } else {
20479                $matches[$i] = join("", map { sprintf "\\x{%04X}", ord $_ }
20480                                                    split "", $matches[$i]);
20481                print "not ok $Tests -";
20482                print " # TODO" if TODO_FAILING_BREAKS;
20483                print " In \"$display_string\" =~ /(\\X)/g, \\X #",
20484                    $i + 1,
20485                    " should have matched $should_display[$i]",
20486                    " but instead matched $matches[$i]",
20487                    ".  Abandoning rest of line $line\n";
20488                next UPGRADE;
20489            }
20490        }
20491
20492        # And the number of matches should equal the number of expected matches.
20493        $Tests++;
20494        if (@matches == @should_match) {
20495            print "ok $Tests - Nothing was left over; line $line\n";
20496        } else {
20497            print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line";
20498            print " # TODO" if TODO_FAILING_BREAKS;
20499            print "\n";
20500        }
20501    }
20502
20503    return;
20504}
20505
20506sub Test_GCB($t) {
20507    _test_break($t, 'gcb');
20508}
20509
20510sub Test_LB($t) {
20511    _test_break($t, 'lb');
20512}
20513
20514sub Test_SB($t) {
20515    _test_break($t, 'sb');
20516}
20517
20518sub Test_WB($t) {
20519    _test_break($t, 'wb');
20520}
20521
20522sub Finished() {
20523    print "1..$Tests\n";
20524    exit($Fails ? -1 : 0);
20525}
20526
20527