xref: /openbsd-src/gnu/usr.bin/perl/lib/unicore/mktables (revision 3d61058aa5c692477b6d18acfbbdb653a9930ff9)
1#!/usr/bin/perl -w
2
3# !!!!!!!!!!!!!!       IF YOU MODIFY THIS FILE       !!!!!!!!!!!!!!!!!!!!!!!!!
4# Any files created or read by this program should be listed in 'mktables.lst'
5# Use -makelist to regenerate it.
6
7# There was an attempt when this was first rewritten to make it 5.8
8# compatible, but that has now been abandoned, and newer constructs are used
9# as convenient.
10
11# NOTE: this script can run quite slowly in older/slower systems.
12# It can also consume a lot of memory (128 MB or more), you may need
13# to raise your process resource limits (e.g. in bash, "ulimit -a"
14# to inspect, and "ulimit -d ..." or "ulimit -m ..." to set)
15
16my $start_time;
17BEGIN { # Get the time the script started running; do it at compilation to
18        # get it as close as possible
19    $start_time= time;
20}
21
22require 5.010_001;
23use strict;
24use warnings;
25use builtin qw(refaddr);
26use Carp;
27use Config;
28use File::Find;
29use File::Path;
30use File::Spec;
31use Text::Tabs;
32use re "/aa";
33
34use feature 'state';
35use feature 'signatures';
36no warnings qw( experimental::builtin );
37
38sub DEBUG () { 0 }  # Set to 0 for production; 1 for development
39$| = 1 if DEBUG;
40my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
41
42sub NON_ASCII_PLATFORM { ord("A") != 65 }
43
44# When a new version of Unicode is published, unfortunately the algorithms for
45# dealing with various bounds, like \b{gcb}, \b{lb} may have to be updated
46# manually.  The changes may or may not be backward compatible with older
47# releases.  The code is in regen/mk_invlist.pl and regexec.c.  Make the
48# changes, then come back here and set the variable below to what version the
49# code is expecting.  If a newer version of Unicode is being compiled than
50# expected, a warning will be generated.  If an older version is being
51# compiled, any bounds tests that fail in the generated test file (-maketest
52# option) will be marked as TODO.
53my $version_of_mk_invlist_bounds = v15.0.0;
54
55##########################################################################
56#
57# mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
58# from the Unicode database files (lib/unicore/.../*.txt),  It also generates
59# a pod file and .t files, depending on option parameters.
60#
61# The structure of this file is:
62#   First these introductory comments; then
63#   code needed for everywhere, such as debugging stuff; then
64#   code to handle input parameters; then
65#   data structures likely to be of external interest (some of which depend on
66#       the input parameters, so follows them; then
67#   more data structures and subroutine and package (class) definitions; then
68#   the small actual loop to process the input files and finish up; then
69#   a __DATA__ section, for the .t tests
70#
71# This program works on all releases of Unicode so far.  The outputs have been
72# scrutinized most intently for release 5.1.  The others have been checked for
73# somewhat more than just sanity.  It can handle all non-provisional Unicode
74# character properties in those releases.
75#
76# This program is mostly about Unicode character (or code point) properties.
77# A property describes some attribute or quality of a code point, like if it
78# is lowercase or not, its name, what version of Unicode it was first defined
79# in, or what its uppercase equivalent is.  Unicode deals with these disparate
80# possibilities by making all properties into mappings from each code point
81# into some corresponding value.  In the case of it being lowercase or not,
82# the mapping is either to 'Y' or 'N' (or various synonyms thereof).  Each
83# property maps each Unicode code point to a single value, called a "property
84# value".  (Some more recently defined properties, map a code point to a set
85# of values.)
86#
87# When using a property in a regular expression, what is desired isn't the
88# mapping of the code point to its property's value, but the reverse (or the
89# mathematical "inverse relation"): starting with the property value, "Does a
90# code point map to it?"  These are written in a "compound" form:
91# \p{property=value}, e.g., \p{category=punctuation}.  This program generates
92# files containing the lists of code points that map to each such regular
93# expression property value, one file per list
94#
95# There is also a single form shortcut that Perl adds for many of the commonly
96# used properties.  This happens for all binary properties, plus script,
97# general_category, and block properties.
98#
99# Thus the outputs of this program are files.  There are map files, mostly in
100# the 'To' directory; and there are list files for use in regular expression
101# matching, all in subdirectories of the 'lib' directory, with each
102# subdirectory being named for the property that the lists in it are for.
103# Bookkeeping, test, and documentation files are also generated.
104
105my $matches_directory = 'lib';   # Where match (\p{}) files go.
106my $map_directory = 'To';        # Where map files go.
107
108# DATA STRUCTURES
109#
110# The major data structures of this program are Property, of course, but also
111# Table.  There are two kinds of tables, very similar to each other.
112# "Match_Table" is the data structure giving the list of code points that have
113# a particular property value, mentioned above.  There is also a "Map_Table"
114# data structure which gives the property's mapping from code point to value.
115# There are two structures because the match tables need to be combined in
116# various ways, such as constructing unions, intersections, complements, etc.,
117# and the map ones don't.  And there would be problems, perhaps subtle, if
118# a map table were inadvertently operated on in some of those ways.
119# The use of separate classes with operations defined on one but not the other
120# prevents accidentally confusing the two.
121#
122# At the heart of each table's data structure is a "Range_List", which is just
123# an ordered list of "Ranges", plus ancillary information, and methods to
124# operate on them.  A Range is a compact way to store property information.
125# Each range has a starting code point, an ending code point, and a value that
126# is meant to apply to all the code points between the two end points,
127# inclusive.  For a map table, this value is the property value for those
128# code points.  Two such ranges could be written like this:
129#   0x41 .. 0x5A, 'Upper',
130#   0x61 .. 0x7A, 'Lower'
131#
132# Each range also has a type used as a convenience to classify the values.
133# Most ranges in this program will be Type 0, or normal, but there are some
134# ranges that have a non-zero type.  These are used only in map tables, and
135# are for mappings that don't fit into the normal scheme of things.  Mappings
136# that require a hash entry to communicate with utf8.c are one example;
137# another example is mappings for charnames.pm to use which indicate a name
138# that is algorithmically determinable from its code point (and the reverse).
139# These are used to significantly compact these tables, instead of listing
140# each one of the tens of thousands individually.
141#
142# In a match table, the value of a range is irrelevant (and hence the type as
143# well, which will always be 0), and arbitrarily set to the empty string.
144# Using the example above, there would be two match tables for those two
145# entries, one named Upper would contain the 0x41..0x5A range, and the other
146# named Lower would contain 0x61..0x7A.
147#
148# Actually, there are two types of range lists, "Range_Map" is the one
149# associated with map tables, and "Range_List" with match tables.
150# Again, this is so that methods can be defined on one and not the others so
151# as to prevent operating on them in incorrect ways.
152#
153# Eventually, most tables are written out to files to be read by Unicode::UCD.
154# All tables could in theory be written, but some are suppressed because there
155# is no current practical use for them.  It is easy to change which get
156# written by changing various lists that are near the top of the actual code
157# in this file.  The table data structures contain enough ancillary
158# information to allow them to be treated as separate entities for writing,
159# such as the path to each one's file.  There is a heading in each map table
160# that gives the format of its entries, and what the map is for all the code
161# points missing from it.  (This allows tables to be more compact.)
162#
163# The Property data structure contains one or more tables.  All properties
164# contain a map table (except the $perl property which is a
165# pseudo-property containing only match tables), and any properties that
166# are usable in regular expression matches also contain various matching
167# tables, one for each value the property can have.  A binary property can
168# have two values, True and False (or Y and N, which are preferred by Unicode
169# terminology).  Thus each of these properties will have a map table that
170# takes every code point and maps it to Y or N (but having ranges cuts the
171# number of entries in that table way down), and two match tables, one
172# which has a list of all the code points that map to Y, and one for all the
173# code points that map to N.  (For each binary property, a third table is also
174# generated for the pseudo Perl property.  It contains the identical code
175# points as the Y table, but can be written in regular expressions, not in the
176# compound form, but in a "single" form like \p{IsUppercase}.)  Many
177# properties are binary, but some properties have several possible values,
178# some have many, and properties like Name have a different value for every
179# named code point.  Those will not, unless the controlling lists are changed,
180# have their match tables written out.  But all the ones which can be used in
181# regular expression \p{} and \P{} constructs will.  Prior to 5.14, generally
182# a property would have either its map table or its match tables written but
183# not both.  Again, what gets written is controlled by lists which can easily
184# be changed.  Starting in 5.14, advantage was taken of this, and all the map
185# tables needed to reconstruct the Unicode db are now written out, while
186# suppressing the Unicode .txt files that contain the data.  Our tables are
187# much more compact than the .txt files, so a significant space savings was
188# achieved.  Also, tables are not written out that are trivially derivable
189# from tables that do get written.  So, there typically is no file containing
190# the code points not matched by a binary property (the table for \P{} versus
191# lowercase \p{}), since you just need to invert the True table to get the
192# False table.
193
194# Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on
195# how many match tables there are and the content of the maps.  This 'Type' is
196# different than a range 'Type', so don't get confused by the two concepts
197# having the same name.
198#
199# For information about the Unicode properties, see Unicode's UAX44 document:
200
201my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
202
203# As stated earlier, this program will work on any release of Unicode so far.
204# Most obvious problems in earlier data have NOT been corrected except when
205# necessary to make Perl or this program work reasonably, and to keep out
206# potential security issues.  For example, no folding information was given in
207# early releases, so this program substitutes lower case instead, just so that
208# a regular expression with the /i option will do something that actually
209# gives the right results in many cases.  There are also a couple other
210# corrections for version 1.1.5, commented at the point they are made.  As an
211# example of corrections that weren't made (but could be) is this statement
212# from DerivedAge.txt: "The supplementary private use code points and the
213# non-character code points were assigned in version 2.0, but not specifically
214# listed in the UCD until versions 3.0 and 3.1 respectively."  (To be precise
215# it was 3.0.1 not 3.0.0)  More information on Unicode version glitches is
216# further down in these introductory comments.
217#
218# This program works on all non-provisional properties as of the current
219# Unicode release, though the files for some are suppressed for various
220# reasons.  You can change which are output by changing lists in this program.
221#
222# The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
223# loose matchings rules (from Unicode TR18):
224#
225#    The recommended names for UCD properties and property values are in
226#    PropertyAliases.txt [Prop] and PropertyValueAliases.txt
227#    [PropValue]. There are both abbreviated names and longer, more
228#    descriptive names. It is strongly recommended that both names be
229#    recognized, and that loose matching of property names be used,
230#    whereby the case distinctions, whitespace, hyphens, and underbar
231#    are ignored.
232#
233# The program still allows Fuzzy to override its determination of if loose
234# matching should be used, but it isn't currently used, as it is no longer
235# needed; the calculations it makes are good enough.
236#
237# SUMMARY OF HOW IT WORKS:
238#
239#   Process arguments
240#
241#   A list is constructed containing each input file that is to be processed
242#
243#   Each file on the list is processed in a loop, using the associated handler
244#   code for each:
245#        The PropertyAliases.txt and PropValueAliases.txt files are processed
246#            first.  These files name the properties and property values.
247#            Objects are created of all the property and property value names
248#            that the rest of the input should expect, including all synonyms.
249#        The other input files give mappings from properties to property
250#           values.  That is, they list code points and say what the mapping
251#           is under the given property.  Some files give the mappings for
252#           just one property; and some for many.  This program goes through
253#           each file and populates the properties and their map tables from
254#           them.  Some properties are listed in more than one file, and
255#           Unicode has set up a precedence as to which has priority if there
256#           is a conflict.  Thus the order of processing matters, and this
257#           program handles the conflict possibility by processing the
258#           overriding input files last, so that if necessary they replace
259#           earlier values.
260#        After this is all done, the program creates the property mappings not
261#            furnished by Unicode, but derivable from what it does give.
262#        The tables of code points that match each property value in each
263#            property that is accessible by regular expressions are created.
264#        The Perl-defined properties are created and populated.  Many of these
265#            require data determined from the earlier steps
266#        Any Perl-defined synonyms are created, and name clashes between Perl
267#            and Unicode are reconciled and warned about.
268#        All the properties are written to files
269#        Any other files are written, and final warnings issued.
270#
271# For clarity, a number of operators have been overloaded to work on tables:
272#   ~ means invert (take all characters not in the set).  The more
273#       conventional '!' is not used because of the possibility of confusing
274#       it with the actual boolean operation.
275#   + means union
276#   - means subtraction
277#   & means intersection
278# The precedence of these is the order listed.  Parentheses should be
279# copiously used.  These are not a general scheme.  The operations aren't
280# defined for a number of things, deliberately, to avoid getting into trouble.
281# Operations are done on references and affect the underlying structures, so
282# that the copy constructors for them have been overloaded to not return a new
283# clone, but the input object itself.
284#
285# The bool operator is deliberately not overloaded to avoid confusion with
286# "should it mean if the object merely exists, or also is non-empty?".
287#
288# WHY CERTAIN DESIGN DECISIONS WERE MADE
289#
290# This program needs to be able to run under miniperl.  Therefore, it uses a
291# minimum of other modules, and hence implements some things itself that could
292# be gotten from CPAN
293#
294# This program uses inputs published by the Unicode Consortium.  These can
295# change incompatibly between releases without the Perl maintainers realizing
296# it.  Therefore this program is now designed to try to flag these.  It looks
297# at the directories where the inputs are, and flags any unrecognized files.
298# It keeps track of all the properties in the files it handles, and flags any
299# that it doesn't know how to handle.  It also flags any input lines that
300# don't match the expected syntax, among other checks.
301#
302# It is also designed so if a new input file matches one of the known
303# templates, one hopefully just needs to add it to a list to have it
304# processed.
305#
306# As mentioned earlier, some properties are given in more than one file.  In
307# particular, the files in the extracted directory are supposedly just
308# reformattings of the others.  But they contain information not easily
309# derivable from the other files, including results for Unihan (which isn't
310# usually available to this program) and for unassigned code points.  They
311# also have historically had errors or been incomplete.  In an attempt to
312# create the best possible data, this program thus processes them first to
313# glean information missing from the other files; then processes those other
314# files to override any errors in the extracted ones.  Much of the design was
315# driven by this need to store things and then possibly override them.
316#
317# It tries to keep fatal errors to a minimum, to generate something usable for
318# testing purposes.  It always looks for files that could be inputs, and will
319# warn about any that it doesn't know how to handle (the -q option suppresses
320# the warning).
321#
322# Why is there more than one type of range?
323#   This simplified things.  There are some very specialized code points that
324#   have to be handled specially for output, such as Hangul syllable names.
325#   By creating a range type (done late in the development process), it
326#   allowed this to be stored with the range, and overridden by other input.
327#   Originally these were stored in another data structure, and it became a
328#   mess trying to decide if a second file that was for the same property was
329#   overriding the earlier one or not.
330#
331# Why are there two kinds of tables, match and map?
332#   (And there is a base class shared by the two as well.)  As stated above,
333#   they actually are for different things.  Development proceeded much more
334#   smoothly when I (khw) realized the distinction.  Map tables are used to
335#   give the property value for every code point (actually every code point
336#   that doesn't map to a default value).  Match tables are used for regular
337#   expression matches, and are essentially the inverse mapping.  Separating
338#   the two allows more specialized methods, and error checks so that one
339#   can't just take the intersection of two map tables, for example, as that
340#   is nonsensical.
341#
342# What about 'fate' and 'status'.  The concept of a table's fate was created
343#   late when it became clear that something more was needed.  The difference
344#   between this and 'status' is unclean, and could be improved if someone
345#   wanted to spend the effort.
346#
347# DEBUGGING
348#
349# This program is written so it will run under miniperl.  Occasionally changes
350# will cause an error where the backtrace doesn't work well under miniperl.
351# To diagnose the problem, you can instead run it under regular perl, if you
352# have one compiled.
353#
354# There is a good trace facility.  To enable it, first sub DEBUG must be set
355# to return true.  Then a line like
356#
357# local $to_trace = 1 if main::DEBUG;
358#
359# can be added to enable tracing in its lexical scope (plus dynamic) or until
360# you insert another line:
361#
362# local $to_trace = 0 if main::DEBUG;
363#
364# To actually trace, use a line like "trace $a, @b, %c, ...;
365#
366# Some of the more complex subroutines already have trace statements in them.
367# Permanent trace statements should be like:
368#
369# trace ... if main::DEBUG && $to_trace;
370#
371# main::stack_trace() will display what its name implies
372#
373# If there is just one or a few files that you're debugging, you can easily
374# cause most everything else to be skipped.  Change the line
375#
376# my $debug_skip = 0;
377#
378# to 1, and every file whose object is in @input_file_objects and doesn't have
379# a, 'non_skip => 1,' in its constructor will be skipped.  However, skipping
380# Jamo.txt or UnicodeData.txt will likely cause fatal errors.
381#
382# To compare the output tables, it may be useful to specify the -annotate
383# flag.  (As of this writing, this can't be done on a clean workspace, due to
384# requirements in Text::Tabs used in this option; so first run mktables
385# without this option.)  This option adds comment lines to each table, one for
386# each non-algorithmically named character giving, currently its code point,
387# name, and graphic representation if printable (and you have a font that
388# knows about it).  This makes it easier to see what the particular code
389# points are in each output table.  Non-named code points are annotated with a
390# description of their status, and contiguous ones with the same description
391# will be output as a range rather than individually.  Algorithmically named
392# characters are also output as ranges, except when there are just a few
393# contiguous ones.
394#
395# FUTURE ISSUES
396#
397# The program would break if Unicode were to change its names so that
398# interior white space, underscores, or dashes differences were significant
399# within property and property value names.
400#
401# It might be easier to use the xml versions of the UCD if this program ever
402# would need heavy revision, and the ability to handle old versions was not
403# required.  Also, it turns out to be risky to rely on this, as in early 2024,
404# Unicode decided to drop the xml version.  It was news to many that this was
405# not considered to be an official product that needs to be maintained going
406# forward.  Someone acceptable to the Unicode management volunteered to take
407# over from the retiring volunteer, and so it continues, but beware.
408#
409# There is the potential for name collisions, in that Perl has chosen names
410# that Unicode could decide it also likes.  There have been such collisions in
411# the past, with mostly Perl deciding to adopt the Unicode definition of the
412# name.  However in the 5.2 Unicode beta testing, there were a number of such
413# collisions, which were withdrawn before the final release, because of Perl's
414# and other's protests.  These all involved new properties which began with
415# 'Is'.  Based on the protests, Unicode is unlikely to try that again.  Also,
416# many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
417# Unicode document, so they are unlikely to be used by Unicode for another
418# purpose.  However, they might try something beginning with 'In', or use any
419# of the other Perl-defined properties.  This program will warn you of name
420# collisions, and refuse to generate tables with them, but manual intervention
421# will be required in this event.  One scheme that could be implemented, if
422# necessary, would be to have this program generate another file, or add a
423# field to mktables.lst that gives the date of first definition of a property.
424# Each new release of Unicode would use that file as a basis for the next
425# iteration.  And the Perl synonym addition code could sort based on the age
426# of the property, so older properties get priority, and newer ones that clash
427# would be refused; hence existing code would not be impacted, and some other
428# synonym would have to be used for the new property.  This is ugly, and
429# manual intervention would certainly be easier to do in the short run; lets
430# hope it never comes to this.
431#
432# A NOTE ON UNIHAN
433#
434# This program can generate tables from the Unihan database.  But that DB
435# isn't normally available, so it is marked as optional.  Prior to version
436# 5.2, this database was in a single file, Unihan.txt.  In 5.2 the database
437# was split into 8 different files, all beginning with the letters 'Unihan'.
438# If you plunk those files down into the directory mktables ($0) is in, this
439# program will read them and automatically create tables for the properties
440# from it that are listed in PropertyAliases.txt and PropValueAliases.txt,
441# plus any you add to the @cjk_properties array and the @cjk_property_values
442# array, being sure to add necessary '# @missings' lines to the latter.  For
443# Unicode versions earlier than 5.2, most of the Unihan properties are not
444# listed at all in PropertyAliases nor PropValueAliases.  This program assumes
445# for these early releases that you want the properties that are specified in
446# the 5.2 release.
447#
448# You may need to adjust the entries to suit your purposes.  setup_unihan(),
449# and filter_unihan_line() are the functions where this is done.  This program
450# already does some adjusting to make the lines look more like the rest of the
451# Unicode DB;  You can see what that is in filter_unihan_line()
452#
453# There is a bug in the 3.2 data file in which some values for the
454# kPrimaryNumeric property have commas and an unexpected comment.  A filter
455# could be added to correct these; or for a particular installation, the
456# Unihan.txt file could be edited to fix them.
457#
458# HOW TO ADD A FILE TO BE PROCESSED
459#
460# A new file from Unicode needs to have an object constructed for it in
461# @input_file_objects, probably at the end or at the end of the extracted
462# ones.  The program should warn you if its name will clash with others on
463# restrictive file systems, like DOS.  If so, figure out a better name, and
464# add lines to the README.perl file giving that.  If the file is a character
465# property, it should be in the format that Unicode has implicitly
466# standardized for such files for the more recently introduced ones.
467# If so, the Input_file constructor for @input_file_objects can just be the
468# file name and release it first appeared in.  If not, then it should be
469# possible to construct an each_line_handler() to massage the line into the
470# standardized form.
471#
472# For non-character properties, more code will be needed.  You can look at
473# the existing entries for clues.
474#
475# UNICODE VERSIONS NOTES
476#
477# The Unicode UCD has had a number of errors in it over the versions.  And
478# these remain, by policy, in the standard for that version.  Therefore it is
479# risky to correct them, because code may be expecting the error.  So this
480# program doesn't generally make changes, unless the error breaks the Perl
481# core.  As an example, some versions of 2.1.x Jamo.txt have the wrong value
482# for U+1105, which causes real problems for the algorithms for Jamo
483# calculations, so it is changed here.
484#
485# But it isn't so clear cut as to what to do about concepts that are
486# introduced in a later release; should they extend back to earlier releases
487# where the concept just didn't exist?  It was easier to do this than to not,
488# so that's what was done.  For example, the default value for code points not
489# in the files for various properties was probably undefined until changed by
490# some version.  No_Block for blocks is such an example.  This program will
491# assign No_Block even in Unicode versions that didn't have it.  This has the
492# benefit that code being written doesn't have to special case earlier
493# versions; and the detriment that it doesn't match the Standard precisely for
494# the affected versions.
495#
496# Here are some observations about some of the issues in early versions:
497#
498# Prior to version 3.0, there were 3 character decompositions.  These are not
499# handled by Unicode::Normalize, nor will it compile when presented a version
500# that has them.  However, you can trivially get it to compile by simply
501# ignoring those decompositions, by changing the croak to a carp.  At the time
502# of this writing, the line (in dist/Unicode-Normalize/Normalize.pm or
503# dist/Unicode-Normalize/mkheader) reads
504#
505#   croak("Weird Canonical Decomposition of U+$h");
506#
507# Simply comment it out.  It will compile, but will not know about any three
508# character decompositions.
509
510# The number of code points in \p{alpha=True} halved in 2.1.9.  It turns out
511# that the reason is that the CJK block starting at 4E00 was removed from
512# PropList, and was not put back in until 3.1.0.  The Perl extension (the
513# single property name \p{alpha}) has the correct values.  But the compound
514# form is simply not generated until 3.1, as it can be argued that prior to
515# this release, this was not an official property.  The comments for
516# filter_old_style_proplist() give more details.
517#
518# Unicode introduced the synonym Space for White_Space in 4.1.  Perl has
519# always had a \p{Space}.  In release 3.2 only, they are not synonymous.  The
520# reason is that 3.2 introduced U+205F=medium math space, which was not
521# classed as white space, but Perl figured out that it should have been. 4.0
522# reclassified it correctly.
523#
524# Another change between 3.2 and 4.0 is the CCC property value ATBL.  In 3.2
525# this was erroneously a synonym for 202 (it should be 200).  In 4.0, ATB
526# became 202, and ATBL was left with no code points, as all the ones that
527# mapped to 202 stayed mapped to 202.  Thus if your program used the numeric
528# name for the class, it would not have been affected, but if it used the
529# mnemonic, it would have been.
530#
531# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1.  Before that, code
532# points which eventually came to have this script property value, instead
533# mapped to "Unknown".  But in the next release all these code points were
534# moved to \p{sc=common} instead.
535
536# The tests furnished  by Unicode for testing WordBreak and SentenceBreak
537# generate errors in 5.0 and earlier.
538#
539# The default for missing code points for BidiClass is complicated.  Starting
540# in 3.1.1, the derived file DBidiClass.txt handles this, but this program
541# tries to do the best it can for earlier releases.  It is done in
542# process_PropertyAliases()
543#
544# In version 2.1.2, the entry in UnicodeData.txt:
545#   0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F;
546# should instead be
547#   0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F
548# Without this change, there are casing problems for this character.
549#
550# Search for $string_compare_versions to see how to compare changes to
551# properties between Unicode versions
552#
553##############################################################################
554
555my $UNDEF = ':UNDEF:';  # String to print out for undefined values in tracing
556                        # and errors
557my $MAX_LINE_WIDTH = 78;
558
559# Debugging aid to skip most files so as to not be distracted by them when
560# concentrating on the ones being debugged.  Add
561# non_skip => 1,
562# to the constructor for those files you want processed when you set this.
563# Files with a first version number of 0 are special: they are always
564# processed regardless of the state of this flag.  Generally, Jamo.txt and
565# UnicodeData.txt must not be skipped if you want this program to not die
566# before normal completion.
567my $debug_skip = 0;
568
569
570# Normally these are suppressed.
571my $write_Unicode_deprecated_tables = 0;
572
573# Set to 1 to enable tracing.
574our $to_trace = 0;
575
576{ # Closure for trace: debugging aid
577    my $print_caller = 1;        # ? Include calling subroutine name
578    my $main_with_colon = 'main::';
579    my $main_colon_length = length($main_with_colon);
580
581    sub trace {
582        return unless $to_trace;        # Do nothing if global flag not set
583
584        my @input = @_;
585
586        local $DB::trace = 0;
587        $DB::trace = 0;          # Quiet 'used only once' message
588
589        my $line_number;
590
591        # Loop looking up the stack to get the first non-trace caller
592        my $caller_line;
593        my $caller_name;
594        my $i = 0;
595        do {
596            $line_number = $caller_line;
597            (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
598            $caller = $main_with_colon unless defined $caller;
599
600            $caller_name = $caller;
601
602            # get rid of pkg
603            $caller_name =~ s/.*:://;
604            if (substr($caller_name, 0, $main_colon_length)
605                eq $main_with_colon)
606            {
607                $caller_name = substr($caller_name, $main_colon_length);
608            }
609
610        } until ($caller_name ne 'trace');
611
612        # If the stack was empty, we were called from the top level
613        $caller_name = 'main' if ($caller_name eq ""
614                                    || $caller_name eq 'trace');
615
616        my $output = "";
617        #print STDERR __LINE__, ": ", join ", ", @input, "\n";
618        foreach my $string (@input) {
619            if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
620                $output .= simple_dumper($string);
621            }
622            else {
623                $string = "$string" if ref $string;
624                $string = $UNDEF unless defined $string;
625                chomp $string;
626                $string = '""' if $string eq "";
627                $output .= " " if $output ne ""
628                                && $string ne ""
629                                && substr($output, -1, 1) ne " "
630                                && substr($string, 0, 1) ne " ";
631                $output .= $string;
632            }
633        }
634
635        print STDERR sprintf "%4d: ", $line_number if defined $line_number;
636        print STDERR "$caller_name: " if $print_caller;
637        print STDERR $output, "\n";
638        return;
639    }
640}
641
642sub stack_trace() {
643    local $to_trace = 1 if main::DEBUG;
644    my $line = (caller(0))[2];
645    my $i = 1;
646
647    # Accumulate the stack trace
648    while (1) {
649        my ($pkg, $file, $caller_line, $caller) = caller $i++;
650
651        last unless defined $caller;
652
653        trace "called from $caller() at line $line";
654        $line = $caller_line;
655    }
656}
657
658# This is for a rarely used development feature that allows you to compare two
659# versions of the Unicode standard without having to deal with changes caused
660# by the code points introduced in the later version.  You probably also want
661# to use the -annotate option when using this.  Run this program on a unicore
662# containing the starting release you want to compare.  Save that output
663# structure.  Then, switching to a unicore with the ending release, change the
664# "" in the $string_compare_versions definition just below to a string
665# containing a SINGLE dotted Unicode release number (e.g. "2.1") corresponding
666# to the starting release.  This program will then compile, but throw away all
667# code points introduced after the starting release.  Finally use a diff tool
668# to compare the two directory structures.  They include only the code points
669# common to both releases, and you can see the changes caused just by the
670# underlying release semantic changes.  For versions earlier than 3.2, you
671# must copy a version of DAge.txt into the directory.
672my $string_compare_versions = DEBUG && "";
673my $compare_versions = DEBUG
674                       && $string_compare_versions
675                       && pack "C*", split /\./, $string_compare_versions;
676
677sub uniques {
678    # Returns non-duplicated input values.  From "Perl Best Practices:
679    # Encapsulated Cleverness".  p. 455 in first edition.
680
681    my %seen;
682    # Arguably this breaks encapsulation, if the goal is to permit multiple
683    # distinct objects to stringify to the same value, and be interchangeable.
684    # However, for this program, no two objects stringify identically, and all
685    # lists passed to this function are either objects or strings. So this
686    # doesn't affect correctness, but it does give a couple of percent speedup.
687    no overloading;
688    return grep { ! $seen{$_}++ } @_;
689}
690
691$0 = File::Spec->canonpath($0);
692
693my $make_test_script = 0;      # ? Should we output a test script
694my $make_norm_test_script = 0; # ? Should we output a normalization test script
695my $write_unchanged_files = 0; # ? Should we update the output files even if
696                               #    we don't think they have changed
697my $use_directory = "";        # ? Should we chdir somewhere.
698my $pod_directory;             # input directory to store the pod file.
699my $pod_file = 'perluniprops';
700my $t_path;                     # Path to the .t test file
701my $file_list = 'mktables.lst'; # File to store input and output file names.
702                               # This is used to speed up the build, by not
703                               # executing the main body of the program if
704                               # nothing on the list has changed since the
705                               # previous build
706my $make_list = 1;             # ? Should we write $file_list.  Set to always
707                               # make a list so that when the release manager
708                               # is preparing a release, they won't have to do
709                               # special things
710my $glob_list = 0;             # ? Should we try to include unknown .txt files
711                               # in the input.
712my $output_range_counts = $debugging_build;   # ? Should we include the number
713                                              # of code points in ranges in
714                                              # the output
715my $annotate = 0;              # ? Should character names be in the output
716
717# Verbosity levels; 0 is quiet
718my $NORMAL_VERBOSITY = 1;
719my $PROGRESS = 2;
720my $VERBOSE = 3;
721
722my $verbosity = $NORMAL_VERBOSITY;
723
724# Stored in mktables.lst so that if this program is called with different
725# options, will regenerate even if the files otherwise look like they're
726# up-to-date.
727my $command_line_arguments = join " ", @ARGV;
728
729# Process arguments
730while (@ARGV) {
731    my $arg = shift @ARGV;
732    if ($arg eq '-v') {
733        $verbosity = $VERBOSE;
734    }
735    elsif ($arg eq '-p') {
736        $verbosity = $PROGRESS;
737        $| = 1;     # Flush buffers as we go.
738    }
739    elsif ($arg eq '-q') {
740        $verbosity = 0;
741    }
742    elsif ($arg eq '-w') {
743        # update the files even if they haven't changed
744        $write_unchanged_files = 1;
745    }
746    elsif ($arg eq '-check') {
747        my $this = shift @ARGV;
748        my $ok = shift @ARGV;
749        if ($this ne $ok) {
750            print "Skipping as check params are not the same.\n";
751            exit(0);
752        }
753    }
754    elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
755        -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
756    }
757    elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
758    {
759        $make_test_script = 1;
760    }
761    elsif ($arg eq '-makenormtest')
762    {
763        $make_norm_test_script = 1;
764    }
765    elsif ($arg eq '-makelist') {
766        $make_list = 1;
767    }
768    elsif ($arg eq '-C' && defined ($use_directory = shift)) {
769        -d $use_directory or croak "Unknown directory '$use_directory'";
770    }
771    elsif ($arg eq '-L') {
772
773        # Existence not tested until have chdir'd
774        $file_list = shift;
775    }
776    elsif ($arg eq '-globlist') {
777        $glob_list = 1;
778    }
779    elsif ($arg eq '-c') {
780        $output_range_counts = ! $output_range_counts
781    }
782    elsif ($arg eq '-annotate') {
783        $annotate = 1;
784        $debugging_build = 1;
785        $output_range_counts = 1;
786    }
787    else {
788        my $with_c = 'with';
789        $with_c .= 'out' if $output_range_counts;   # Complements the state
790        croak <<END;
791usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
792          [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
793          [-check A B ]
794  -c          : Output comments $with_c number of code points in ranges
795  -q          : Quiet Mode: Only output serious warnings.
796  -p          : Set verbosity level to normal plus show progress.
797  -v          : Set Verbosity level high:  Show progress and non-serious
798                warnings
799  -w          : Write files regardless
800  -C dir      : Change to this directory before proceeding. All relative paths
801                except those specified by the -P and -T options will be done
802                with respect to this directory.
803  -P dir      : Output $pod_file file to directory 'dir'.
804  -T path     : Create a test script as 'path'; overrides -maketest
805  -L filelist : Use alternate 'filelist' instead of standard one
806  -globlist   : Take as input all non-Test *.txt files in current and sub
807                directories
808  -maketest   : Make test script 'TestProp.pl' in current (or -C directory),
809                overrides -T
810  -makelist   : Rewrite the file list $file_list based on current setup
811  -annotate   : Output an annotation for each character in the table files;
812                useful for debugging mktables, looking at diffs; but is slow
813                and memory intensive
814  -check A B  : Executes $0 only if A and B are the same
815END
816    }
817}
818
819# Stores the most-recently changed file.  If none have changed, can skip the
820# build
821my $most_recent = (stat $0)[9];   # Do this before the chdir!
822
823# Change directories now, because need to read 'version' early.
824if ($use_directory) {
825    if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
826        $pod_directory = File::Spec->rel2abs($pod_directory);
827    }
828    if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
829        $t_path = File::Spec->rel2abs($t_path);
830    }
831    chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
832    if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
833        $pod_directory = File::Spec->abs2rel($pod_directory);
834    }
835    if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
836        $t_path = File::Spec->abs2rel($t_path);
837    }
838}
839
840# Get Unicode version into regular and v-string.  This is done now because
841# various tables below get populated based on it.  These tables are populated
842# here to be near the top of the file, and so easily seeable by those needing
843# to modify things.
844open my $VERSION, "<", "version"
845                    or croak "$0: can't open required file 'version': $!\n";
846my $string_version = <$VERSION>;
847close $VERSION;
848chomp $string_version;
849my $v_version = pack "C*", split /\./, $string_version;        # v string
850
851my $unicode_version = ($compare_versions)
852                      ? (  "$string_compare_versions (using "
853                         . "$string_version rules)")
854                      : $string_version;
855
856# The following are the complete names of properties with property values that
857# are known to not match any code points in some versions of Unicode, but that
858# may change in the future so they should be matchable, hence an empty file is
859# generated for them.
860my @tables_that_may_be_empty;
861push @tables_that_may_be_empty, 'Joining_Type=Left_Joining'
862                                                    if $v_version lt v6.3.0;
863push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
864push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
865push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
866                                                    if $v_version ge v4.1.0;
867push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
868                                                    if $v_version ge v6.0.0;
869push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
870                                                    if $v_version ge v6.1.0;
871push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133'
872                                                    if $v_version ge v6.2.0;
873
874# The lists below are hashes, so the key is the item in the list, and the
875# value is the reason why it is in the list.  This makes generation of
876# documentation easier.
877
878my %why_suppressed;  # No file generated for these.
879
880# Files aren't generated for empty extraneous properties.  This is arguable.
881# Extraneous properties generally come about because a property is no longer
882# used in a newer version of Unicode.  If we generated a file without code
883# points, programs that used to work on that property will still execute
884# without errors.  It just won't ever match (or will always match, with \P{}).
885# This means that the logic is now likely wrong.  I (khw) think its better to
886# find this out by getting an error message.  Just move them to the table
887# above to change this behavior
888my %why_suppress_if_empty_warn_if_not = (
889
890   # It is the only property that has ever officially been removed from the
891   # Standard.  The database never contained any code points for it.
892   'Special_Case_Condition' => 'Obsolete',
893
894   # Apparently never official, but there were code points in some versions of
895   # old-style PropList.txt
896   'Non_Break' => 'Obsolete',
897);
898
899# These would normally go in the warn table just above, but they were changed
900# a long time before this program was written, so warnings about them are
901# moot.
902if ($v_version gt v3.2.0) {
903    push @tables_that_may_be_empty,
904                                'Canonical_Combining_Class=Attached_Below_Left'
905}
906
907# Obsoleted
908if ($v_version ge v11.0.0) {
909    push @tables_that_may_be_empty, qw(
910                                       Grapheme_Cluster_Break=E_Base
911                                       Grapheme_Cluster_Break=E_Base_GAZ
912                                       Grapheme_Cluster_Break=E_Modifier
913                                       Grapheme_Cluster_Break=Glue_After_Zwj
914                                       Word_Break=E_Base
915                                       Word_Break=E_Base_GAZ
916                                       Word_Break=E_Modifier
917                                       Word_Break=Glue_After_Zwj);
918}
919
920# Enum values for to_output_map() method in the Map_Table package. (0 is don't
921# output)
922my $EXTERNAL_MAP = 1;
923my $INTERNAL_MAP = 2;
924my $OUTPUT_ADJUSTED = 3;
925
926# To override computed values for writing the map tables for these properties.
927# The default for enum map tables is to write them out, so that the Unicode
928# .txt files can be removed, but all the data to compute any property value
929# for any code point is available in a more compact form.
930my %global_to_output_map = (
931    # Needed by UCD.pm, but don't want to publicize that it exists, so won't
932    # get stuck supporting it if things change.  Since it is a STRING
933    # property, it normally would be listed in the pod, but INTERNAL_MAP
934    # suppresses that.
935    Unicode_1_Name => $INTERNAL_MAP,
936
937    Present_In => 0,                # Suppress, as easily computed from Age
938    Block => (NON_ASCII_PLATFORM) ? 1 : 0,  # Suppress, as Blocks.txt is
939                                            # retained, but needed for
940                                            # non-ASCII
941
942    # Suppress, as mapping can be found instead from the
943    # Perl_Decomposition_Mapping file
944    Decomposition_Type => 0,
945);
946
947# There are several types of obsolete properties defined by Unicode.  These
948# must be hand-edited for every new Unicode release.
949my %why_deprecated;  # Generates a deprecated warning message if used.
950my %why_stabilized;  # Documentation only
951my %why_obsolete;    # Documentation only
952
953{   # Closure
954    my $simple = 'Perl uses the more complete version';
955    my $unihan = 'Unihan properties are by default not enabled in the Perl core.';
956
957    my $other_properties = 'other properties';
958    my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
959    my $why_no_expand  = "Deprecated by Unicode.  These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used.  For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
960
961    %why_deprecated = (
962        'Grapheme_Link' => 'Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
963        'Jamo_Short_Name' => $contributory,
964        'Line_Break=Surrogate' => 'Surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
965        'Other_Alphabetic' => $contributory,
966        'Other_Default_Ignorable_Code_Point' => $contributory,
967        'Other_Grapheme_Extend' => $contributory,
968        'Other_ID_Continue' => $contributory,
969        'Other_ID_Start' => $contributory,
970        'Other_Lowercase' => $contributory,
971        'Other_Math' => $contributory,
972        'Other_Uppercase' => $contributory,
973        'Expands_On_NFC' => $why_no_expand,
974        'Expands_On_NFD' => $why_no_expand,
975        'Expands_On_NFKC' => $why_no_expand,
976        'Expands_On_NFKD' => $why_no_expand,
977    );
978
979    %why_suppressed = (
980        # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
981        # contains the same information, but without the algorithmically
982        # determinable Hangul syllables'.  This file is not published, so it's
983        # existence is not noted in the comment.
984        'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::',
985
986        # Don't suppress ISO_Comment, as otherwise special handling is needed
987        # to differentiate between it and gc=c, which can be written as 'isc',
988        # which is the same characters as ISO_Comment's short name.
989
990        'Name' => "Accessible via \\N{...} or 'use charnames;' or charprop() or prop_invmap() in Unicode::UCD::",
991
992        'Simple_Case_Folding' => "$simple.  Can access this through casefold(), charprop(), or prop_invmap() in Unicode::UCD",
993        'Simple_Lowercase_Mapping' => "$simple.  Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
994        'Simple_Titlecase_Mapping' => "$simple.  Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
995        'Simple_Uppercase_Mapping' => "$simple.  Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
996
997        FC_NFKC_Closure => 'Deprecated by Unicode, and supplanted in usage by NFKC_Casefold; otherwise not useful',
998    );
999
1000    foreach my $property (
1001
1002            # The following are suppressed because they were made contributory
1003            # or deprecated by Unicode before Perl ever thought about
1004            # supporting them.
1005            'Jamo_Short_Name',
1006            'Grapheme_Link',
1007            'Expands_On_NFC',
1008            'Expands_On_NFD',
1009            'Expands_On_NFKC',
1010            'Expands_On_NFKD',
1011
1012            # The following are suppressed because they have been marked
1013            # as deprecated for a sufficient amount of time
1014            'Other_Alphabetic',
1015            'Other_Default_Ignorable_Code_Point',
1016            'Other_Grapheme_Extend',
1017            'Other_ID_Continue',
1018            'Other_ID_Start',
1019            'Other_Lowercase',
1020            'Other_Math',
1021            'Other_Uppercase',
1022    ) {
1023        $why_suppressed{$property} = $why_deprecated{$property};
1024    }
1025
1026    # Customize the message for all the 'Other_' properties
1027    foreach my $property (keys %why_deprecated) {
1028        next if (my $main_property = $property) !~ s/^Other_//;
1029        $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
1030    }
1031}
1032
1033if ($write_Unicode_deprecated_tables) {
1034    foreach my $property (keys %why_suppressed) {
1035        delete $why_suppressed{$property} if $property =~
1036                                                    / ^ Other | Grapheme /x;
1037    }
1038}
1039
1040if ($v_version ge 4.0.0) {
1041    $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
1042    if ($v_version ge 6.0.0) {
1043        $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
1044    }
1045}
1046if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
1047    $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
1048    if ($v_version ge 6.0.0) {
1049        $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed';
1050    }
1051}
1052
1053# Probably obsolete forever
1054if ($v_version ge v4.1.0) {
1055    $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete.  All code points previously matched by this have been moved to "Script=Common".';
1056}
1057if ($v_version ge v6.0.0) {
1058    $why_suppressed{'Script=Katakana_Or_Hiragana'} .= '  Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana" (or both)';
1059    $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
1060}
1061
1062# This program can create files for enumerated-like properties, such as
1063# 'Numeric_Type'.  This file would be the same format as for a string
1064# property, with a mapping from code point to its value, so you could look up,
1065# for example, the script a code point is in.  But no one so far wants this
1066# mapping, or they have found another way to get it since this is a new
1067# feature.  So no file is generated except if it is in this list.
1068my @output_mapped_properties = split "\n", <<END;
1069END
1070
1071# If you want more Unihan properties than the default, you need to add them to
1072# these arrays.  Depending on the property type, @missing lines might have to
1073# be added to the second array.  A sample entry would be (including the '#'):
1074# @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
1075my @cjk_properties = split "\n", <<'END';
1076END
1077my @cjk_property_values = split "\n", <<'END';
1078END
1079
1080# The input files don't list every code point.  Those not listed are to be
1081# defaulted to some value.  Below are hard-coded what those values are for
1082# non-binary properties as of 5.1.  Starting in 5.0, there are
1083# machine-parsable comment lines in the files that give the defaults; so this
1084# list shouldn't have to be extended.  The claim is that all missing entries
1085# for binary properties will default to 'N'.  Unicode tried to change that in
1086# 5.2, but the beta period produced enough protest that they backed off.
1087#
1088# The defaults for the fields that appear in UnicodeData.txt in this hash must
1089# be in the form that it expects.  The others may be synonyms.
1090my $CODE_POINT = '<code point>';
1091my %default_mapping = (
1092    Age => "Unassigned",
1093    # Bidi_Class => Complicated; set in code
1094    Bidi_Mirroring_Glyph => "",
1095    Block => 'No_Block',
1096    Canonical_Combining_Class => 0,
1097    Case_Folding => $CODE_POINT,
1098    Decomposition_Mapping => $CODE_POINT,
1099    Decomposition_Type => 'None',
1100    East_Asian_Width => "Neutral",
1101    FC_NFKC_Closure => $CODE_POINT,
1102    General_Category => ($v_version le 6.3.0) ? 'Cn' : 'Unassigned',
1103    Grapheme_Cluster_Break => 'Other',
1104    Hangul_Syllable_Type => 'NA',
1105    ISO_Comment => "",
1106    Jamo_Short_Name => "",
1107    Joining_Group => "No_Joining_Group",
1108    # Joining_Type => Complicated; set in code
1109    kIICore => 'N',   #                       Is converted to binary
1110    #Line_Break => Complicated; set in code
1111    Lowercase_Mapping => $CODE_POINT,
1112    Name => "",
1113    Name_Alias => "",
1114    NFC_QC => 'Yes',
1115    NFD_QC => 'Yes',
1116    NFKC_QC => 'Yes',
1117    NFKD_QC => 'Yes',
1118    Numeric_Type => 'None',
1119    Numeric_Value => 'NaN',
1120    Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1121    Sentence_Break => 'Other',
1122    Simple_Case_Folding => $CODE_POINT,
1123    Simple_Lowercase_Mapping => $CODE_POINT,
1124    Simple_Titlecase_Mapping => $CODE_POINT,
1125    Simple_Uppercase_Mapping => $CODE_POINT,
1126    Titlecase_Mapping => $CODE_POINT,
1127    Unicode_1_Name => "",
1128    Unicode_Radical_Stroke => "",
1129    Uppercase_Mapping => $CODE_POINT,
1130    Word_Break => 'Other',
1131);
1132
1133### End of externally interesting definitions, except for @input_file_objects
1134
1135my $HEADER=<<"EOF";
1136# !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
1137# This file is machine-generated by $0 from the Unicode
1138# database, Version $unicode_version.  Any changes made here will be lost!
1139EOF
1140
1141my $INTERNAL_ONLY_HEADER = <<"EOF";
1142
1143# !!!!!!!   INTERNAL PERL USE ONLY   !!!!!!!
1144# This file is for internal use by core Perl only.  The format and even the
1145# name or existence of this file are subject to change without notice.  Don't
1146# use it directly.  Use Unicode::UCD to access the Unicode character data
1147# base.
1148EOF
1149
1150my $DEVELOPMENT_ONLY=<<"EOF";
1151# !!!!!!!   DEVELOPMENT USE ONLY   !!!!!!!
1152# This file contains information artificially constrained to code points
1153# present in Unicode release $string_compare_versions.
1154# IT CANNOT BE RELIED ON.  It is for use during development only and should
1155# not be used for production.
1156
1157EOF
1158
1159my $MAX_UNICODE_CODEPOINT_STRING = ($v_version ge v2.0.0)
1160                                   ? "10FFFF"
1161                                   : "FFFF";
1162my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
1163my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
1164
1165# We work with above-Unicode code points, up to IV_MAX, but we may want to use
1166# sentinels above that number.  Therefore for internal use, we use a much
1167# smaller number, translating it to IV_MAX only for output.  The exact number
1168# is immaterial (all above-Unicode code points are treated exactly the same),
1169# but the algorithm requires it to be at least
1170# 2 * $MAX_UNICODE_CODEPOINTS + 1
1171my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8;
1172my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1;
1173my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT);
1174
1175my $MAX_PLATFORM_CODEPOINT = ~0 >> 1;
1176
1177# Matches legal code point.  4-6 hex numbers, If there are 6, the first
1178# two must be 10; if there are 5, the first must not be a 0.  Written this way
1179# to decrease backtracking.  The first regex allows the code point to be at
1180# the end of a word, but to work properly, the word shouldn't end with a valid
1181# hex character.  The second one won't match a code point at the end of a
1182# word, and doesn't have the run-on issue
1183my $run_on_code_point_re =
1184            qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1185my $code_point_re = qr/\b$run_on_code_point_re/;
1186
1187# This matches the beginning of the line in the Unicode DB files that give the
1188# defaults for code points not listed (i.e., missing) in the file.  The code
1189# depends on this ending with a semi-colon, so it can assume it is a valid
1190# field when the line is split() by semi-colons
1191my $missing_defaults_prefix = qr/ ^ \# \s+ \@missing: \s+
1192                                           ($code_point_re)
1193                                           \.\.
1194                                           ($code_point_re)
1195                                       \s* ;
1196                                /x;
1197
1198# Property types.  Unicode has more types, but these are sufficient for our
1199# purposes.
1200my $UNKNOWN = -1;   # initialized to illegal value
1201my $NON_STRING = 1; # Either binary or enum
1202my $BINARY = 2;
1203my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal
1204                       # tables, additional true and false tables are
1205                       # generated so that false is anything matching the
1206                       # default value, and true is everything else.
1207my $ENUM = 4;       # Include catalog
1208my $STRING = 5;     # Anything else: string or misc
1209
1210# Some input files have lines that give default values for code points not
1211# contained in the file.  Sometimes these should be ignored.
1212my $NO_DEFAULTS = 0;        # Must evaluate to false
1213my $NOT_IGNORED = 1;
1214my $IGNORED = 2;
1215
1216# Range types.  Each range has a type.  Most ranges are type 0, for normal,
1217# and will appear in the main body of the tables in the output files, but
1218# there are other types of ranges as well, listed below, that are specially
1219# handled.   There are pseudo-types as well that will never be stored as a
1220# type, but will affect the calculation of the type.
1221
1222# 0 is for normal, non-specials
1223my $MULTI_CP = 1;           # Sequence of more than code point
1224my $HANGUL_SYLLABLE = 2;
1225my $CP_IN_NAME = 3;         # The NAME contains the code point appended to it.
1226my $NULL = 4;               # The map is to the null string; utf8.c can't
1227                            # handle these, nor is there an accepted syntax
1228                            # for them in \p{} constructs
1229my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
1230                             # otherwise be $MULTI_CP type are instead type 0
1231
1232# process_generic_property_file() can accept certain overrides in its input.
1233# Each of these must begin AND end with $CMD_DELIM.
1234my $CMD_DELIM = "\a";
1235my $REPLACE_CMD = 'replace';    # Override the Replace
1236my $MAP_TYPE_CMD = 'map_type';  # Override the Type
1237
1238my $NO = 0;
1239my $YES = 1;
1240
1241# Values for the Replace argument to add_range.
1242# $NO                      # Don't replace; add only the code points not
1243                           # already present.
1244my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1245                           # the comments at the subroutine definition.
1246my $UNCONDITIONALLY = 2;   # Replace without conditions.
1247my $MULTIPLE_BEFORE = 4;   # Don't replace, but add a duplicate record if
1248                           # already there
1249my $MULTIPLE_AFTER = 5;    # Don't replace, but add a duplicate record if
1250                           # already there
1251my $CROAK = 6;             # Die with an error if is already there
1252
1253# Flags to give property statuses.  The phrases are to remind maintainers that
1254# if the flag is changed, the indefinite article referring to it in the
1255# documentation may need to be as well.
1256my $NORMAL = "";
1257my $DEPRECATED = 'D';
1258my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1259my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1260my $DISCOURAGED = 'X';
1261my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1262my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1263my $STRICTER = 'T';
1264my $a_bold_stricter = "a 'B<$STRICTER>'";
1265my $A_bold_stricter = "A 'B<$STRICTER>'";
1266my $STABILIZED = 'S';
1267my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1268my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1269my $OBSOLETE = 'O';
1270my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1271my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1272
1273# Aliases can also have an extra status:
1274my $INTERNAL_ALIAS = 'P';
1275
1276my %status_past_participles = (
1277    $DISCOURAGED => 'discouraged',
1278    $STABILIZED => 'stabilized',
1279    $OBSOLETE => 'obsolete',
1280    $DEPRECATED => 'deprecated',
1281    $INTERNAL_ALIAS => 'reserved for Perl core internal use only',
1282);
1283
1284# Table fates.  These are somewhat ordered, so that fates < $MAP_PROXIED should be
1285# externally documented.
1286my $ORDINARY = 0;       # The normal fate.
1287my $MAP_PROXIED = 1;    # The map table for the property isn't written out,
1288                        # but there is a file written that can be used to
1289                        # reconstruct this table
1290my $INTERNAL_ONLY = 2;  # The file for this table is written out, but it is
1291                        # for Perl's internal use only
1292my $SUPPRESSED = 3;     # The file for this table is not written out, and as a
1293                        # result, we don't bother to do many computations on
1294                        # it.
1295my $PLACEHOLDER = 4;    # Like $SUPPRESSED, but we go through all the
1296                        # computations anyway, as the values are needed for
1297                        # things to work.  This happens when we have Perl
1298                        # extensions that depend on Unicode tables that
1299                        # wouldn't normally be in a given Unicode version.
1300
1301# The format of the values of the tables:
1302my $EMPTY_FORMAT = "";
1303my $BINARY_FORMAT = 'b';
1304my $DECIMAL_FORMAT = 'd';
1305my $FLOAT_FORMAT = 'f';
1306my $INTEGER_FORMAT = 'i';
1307my $HEX_FORMAT = 'x';
1308my $RATIONAL_FORMAT = 'r';
1309my $STRING_FORMAT = 's';
1310my $ADJUST_FORMAT = 'a';
1311my $HEX_ADJUST_FORMAT = 'ax';
1312my $DECOMP_STRING_FORMAT = 'c';
1313my $STRING_WHITE_SPACE_LIST = 'sw';
1314
1315my %map_table_formats = (
1316    $BINARY_FORMAT => 'binary',
1317    $DECIMAL_FORMAT => 'single decimal digit',
1318    $FLOAT_FORMAT => 'floating point number',
1319    $INTEGER_FORMAT => 'integer',
1320    $HEX_FORMAT => 'non-negative hex whole number; a code point',
1321    $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1322    $STRING_FORMAT => 'string',
1323    $ADJUST_FORMAT => 'some entries need adjustment',
1324    $HEX_ADJUST_FORMAT => 'mapped value in hex; some entries need adjustment',
1325    $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
1326    $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
1327);
1328
1329# Unicode didn't put such derived files in a separate directory at first.
1330my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1331my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1332my $AUXILIARY = 'auxiliary';
1333my $EMOJI = 'emoji';
1334
1335# Hashes and arrays that will eventually go into UCD.pl for the use of UCD.pm
1336my %loose_to_file_of;       # loosely maps table names to their respective
1337                            # files
1338my %stricter_to_file_of;    # same; but for stricter mapping.
1339my %loose_property_to_file_of; # Maps a loose property name to its map file
1340my %strict_property_to_file_of; # Same, but strict
1341my @inline_definitions = "V0"; # Each element gives a definition of a unique
1342                            # inversion list.  When a definition is inlined,
1343                            # its value in the hash it's in (one of the two
1344                            # defined just above) will include an index into
1345                            # this array.  The 0th element is initialized to
1346                            # the definition for a zero length inversion list
1347my %file_to_swash_name;     # Maps the file name to its corresponding key name
1348                            # in the hash %Unicode::UCD::SwashInfo
1349my %nv_floating_to_rational; # maps numeric values floating point numbers to
1350                             # their rational equivalent
1351my %loose_property_name_of; # Loosely maps (non_string) property names to
1352                            # standard form
1353my %strict_property_name_of; # Strictly maps (non_string) property names to
1354                            # standard form
1355my %string_property_loose_to_name; # Same, for string properties.
1356my %loose_defaults;         # keys are of form "prop=value", where 'prop' is
1357                            # the property name in standard loose form, and
1358                            # 'value' is the default value for that property,
1359                            # also in standard loose form.
1360my %loose_to_standard_value; # loosely maps table names to the canonical
1361                            # alias for them
1362my %ambiguous_names;        # keys are alias names (in standard form) that
1363                            # have more than one possible meaning.
1364my %combination_property;   # keys are alias names (in standard form) that
1365                            # have both a map table, and a binary one that
1366                            # yields true for all non-null maps.
1367my %prop_aliases;           # Keys are standard property name; values are each
1368                            # one's aliases
1369my %prop_value_aliases;     # Keys of top level are standard property name;
1370                            # values are keys to another hash,  Each one is
1371                            # one of the property's values, in standard form.
1372                            # The values are that prop-val's aliases.
1373my %skipped_files;          # List of files that we skip
1374my %ucd_pod;    # Holds entries that will go into the UCD section of the pod
1375
1376# Most properties are immune to caseless matching, otherwise you would get
1377# nonsensical results, as properties are a function of a code point, not
1378# everything that is caselessly equivalent to that code point.  For example,
1379# Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1380# be true because 's' and 'S' are equivalent caselessly.  However,
1381# traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1382# extend that concept to those very few properties that are like this.  Each
1383# such property will match the full range caselessly.  They are hard-coded in
1384# the program; it's not worth trying to make it general as it's extremely
1385# unlikely that they will ever change.
1386my %caseless_equivalent_to;
1387
1388# This is the range of characters that were in Release 1 of Unicode, and
1389# removed in Release 2 (replaced with the current Hangul syllables starting at
1390# U+AC00).  The range was reused starting in Release 3 for other purposes.
1391my $FIRST_REMOVED_HANGUL_SYLLABLE = 0x3400;
1392my $FINAL_REMOVED_HANGUL_SYLLABLE = 0x4DFF;
1393
1394# These constants names and values were taken from the Unicode standard,
1395# version 5.1, section 3.12.  They are used in conjunction with Hangul
1396# syllables.  The '_string' versions are so generated tables can retain the
1397# hex format, which is the more familiar value
1398my $SBase_string = "0xAC00";
1399my $SBase = CORE::hex $SBase_string;
1400my $LBase_string = "0x1100";
1401my $LBase = CORE::hex $LBase_string;
1402my $VBase_string = "0x1161";
1403my $VBase = CORE::hex $VBase_string;
1404my $TBase_string = "0x11A7";
1405my $TBase = CORE::hex $TBase_string;
1406my $SCount = 11172;
1407my $LCount = 19;
1408my $VCount = 21;
1409my $TCount = 28;
1410my $NCount = $VCount * $TCount;
1411
1412# For Hangul syllables;  These store the numbers from Jamo.txt in conjunction
1413# with the above published constants.
1414my %Jamo;
1415my %Jamo_L;     # Leading consonants
1416my %Jamo_V;     # Vowels
1417my %Jamo_T;     # Trailing consonants
1418
1419# For code points whose name contains its ordinal as a '-ABCD' suffix.
1420# The key is the base name of the code point, and the value is an
1421# array giving all the ranges that use this base name.  Each range
1422# is actually a hash giving the 'low' and 'high' values of it.
1423my %names_ending_in_code_point;
1424my %loose_names_ending_in_code_point;   # Same as above, but has blanks, dashes
1425                                        # removed from the names
1426# Inverse mapping.  The list of ranges that have these kinds of
1427# names.  Each element contains the low, high, and base names in an
1428# anonymous hash.
1429my @code_points_ending_in_code_point;
1430
1431# To hold Unicode's normalization test suite
1432my @normalization_tests;
1433
1434# Boolean: does this Unicode version have the hangul syllables, and are we
1435# writing out a table for them?
1436my $has_hangul_syllables = 0;
1437
1438# Does this Unicode version have code points whose names end in their
1439# respective code points, and are we writing out a table for them?  0 for no;
1440# otherwise points to first property that a table is needed for them, so that
1441# if multiple tables are needed, we don't create duplicates
1442my $needing_code_points_ending_in_code_point = 0;
1443
1444my @backslash_X_tests;     # List of tests read in for testing \X
1445my @LB_tests;              # List of tests read in for testing \b{lb}
1446my @SB_tests;              # List of tests read in for testing \b{sb}
1447my @WB_tests;              # List of tests read in for testing \b{wb}
1448my @unhandled_properties;  # Will contain a list of properties found in
1449                           # the input that we didn't process.
1450my @match_properties;      # Properties that have match tables, to be
1451                           # listed in the pod
1452my @map_properties;        # Properties that get map files written
1453my @named_sequences;       # NamedSequences.txt contents.
1454my %potential_files;       # Generated list of all .txt files in the directory
1455                           # structure so we can warn if something is being
1456                           # ignored.
1457my @missing_early_files;   # Generated list of absent files that we need to
1458                           # proceed in compiling this early Unicode version
1459my @files_actually_output; # List of files we generated.
1460my @more_Names;            # Some code point names are compound; this is used
1461                           # to store the extra components of them.
1462my $E_FLOAT_PRECISION = 3; # The minimum number of digits after the decimal
1463                           # point of a normalized floating point number
1464                           # needed to match before we consider it equivalent
1465                           # to a candidate rational
1466
1467# These store references to certain commonly used property objects
1468my $age;
1469my $ccc;
1470my $gc;
1471my $perl;
1472my $block;
1473my $perl_charname;
1474my $print;
1475my $All;
1476my $Assigned;   # All assigned characters in this Unicode release
1477my $DI;         # Default_Ignorable_Code_Point property
1478my $NChar;      # Noncharacter_Code_Point property
1479my $script;
1480my $scx;        # Script_Extensions property
1481my $idt;        # Identifier_Type property
1482
1483# Are there conflicting names because of beginning with 'In_', or 'Is_'
1484my $has_In_conflicts = 0;
1485my $has_Is_conflicts = 0;
1486
1487sub internal_file_to_platform ($file=undef) {
1488    # Convert our file paths which have '/' separators to those of the
1489    # platform.
1490
1491    return undef unless defined $file;
1492
1493    return File::Spec->join(split '/', $file);
1494}
1495
1496sub file_exists ($file=undef) {   # platform independent '-e'.  This program internally
1497                        # uses slash as a path separator.
1498    return 0 unless defined $file;
1499    return -e internal_file_to_platform($file);
1500}
1501
1502sub objaddr($addr) {
1503    # Returns the address of the blessed input object.
1504    # It doesn't check for blessedness because that would do a string eval
1505    # every call, and the program is structured so that this is never called
1506    # for a non-blessed object.
1507
1508    return pack 'J', refaddr $addr;
1509}
1510
1511# These are used only if $annotate is true.
1512# The entire range of Unicode characters is examined to populate these
1513# after all the input has been processed.  But most can be skipped, as they
1514# have the same descriptive phrases, such as being unassigned
1515my @viacode;            # Contains the 1 million character names
1516my @age;                # And their ages ("" if none)
1517my @printable;          # boolean: And are those characters printable?
1518my @annotate_char_type; # Contains a type of those characters, specifically
1519                        # for the purposes of annotation.
1520my $annotate_ranges;    # A map of ranges of code points that have the same
1521                        # name for the purposes of annotation.  They map to the
1522                        # upper edge of the range, so that the end point can
1523                        # be immediately found.  This is used to skip ahead to
1524                        # the end of a range, and avoid processing each
1525                        # individual code point in it.
1526my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1527                                   # characters, but excluding those which are
1528                                   # also noncharacter code points
1529
1530# The annotation types are an extension of the regular range types, though
1531# some of the latter are folded into one.  Make the new types negative to
1532# avoid conflicting with the regular types
1533my $SURROGATE_TYPE = -1;
1534my $UNASSIGNED_TYPE = -2;
1535my $PRIVATE_USE_TYPE = -3;
1536my $NONCHARACTER_TYPE = -4;
1537my $CONTROL_TYPE = -5;
1538my $ABOVE_UNICODE_TYPE = -6;
1539my $UNKNOWN_TYPE = -7;  # Used only if there is a bug in this program
1540
1541sub populate_char_info ($i) {
1542    # Used only with the $annotate option.  Populates the arrays with the
1543    # input code point's info that are needed for outputting more detailed
1544    # comments.  If calling context wants a return, it is the end point of
1545    # any contiguous range of characters that share essentially the same info
1546
1547    $viacode[$i] = $perl_charname->value_of($i) || "";
1548    $age[$i] = (defined $age)
1549               ? (($age->value_of($i) =~ / ^ \d+ \. \d+ $ /x)
1550                  ? $age->value_of($i)
1551                  : "")
1552               : "";
1553
1554    # A character is generally printable if Unicode says it is,
1555    # but below we make sure that most Unicode general category 'C' types
1556    # aren't.
1557    $printable[$i] = $print->contains($i);
1558
1559    # But the characters in this range were removed in v2.0 and replaced by
1560    # different ones later.  Modern fonts will be for the replacement
1561    # characters, so suppress printing them.
1562    if (($v_version lt v2.0
1563         || ($compare_versions && $compare_versions lt v2.0))
1564        && (   $i >= $FIRST_REMOVED_HANGUL_SYLLABLE
1565            && $i <= $FINAL_REMOVED_HANGUL_SYLLABLE))
1566    {
1567        $printable[$i] = 0;
1568    }
1569
1570    $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1571
1572    # Only these two regular types are treated specially for annotations
1573    # purposes
1574    $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1575                                && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1576
1577    # Give a generic name to all code points that don't have a real name.
1578    # We output ranges, if applicable, for these.  Also calculate the end
1579    # point of the range.
1580    my $end;
1581    if (! $viacode[$i]) {
1582        if ($i > $MAX_UNICODE_CODEPOINT) {
1583            $viacode[$i] = 'Above-Unicode';
1584            $annotate_char_type[$i] = $ABOVE_UNICODE_TYPE;
1585            $printable[$i] = 0;
1586            $end = $MAX_WORKING_CODEPOINT;
1587        }
1588        elsif ($gc-> table('Private_use')->contains($i)) {
1589            $viacode[$i] = 'Private Use';
1590            $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1591            $printable[$i] = 0;
1592            $end = $gc->table('Private_Use')->containing_range($i)->end;
1593        }
1594        elsif ($NChar->contains($i)) {
1595            $viacode[$i] = 'Noncharacter';
1596            $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1597            $printable[$i] = 0;
1598            $end = $NChar->containing_range($i)->end;
1599        }
1600        elsif ($gc-> table('Control')->contains($i)) {
1601            my $name_ref = property_ref('Name_Alias');
1602            $name_ref = property_ref('Unicode_1_Name') if ! defined $name_ref;
1603            $viacode[$i] = (defined $name_ref)
1604                           ? $name_ref->value_of($i)
1605                           : 'Control';
1606            $annotate_char_type[$i] = $CONTROL_TYPE;
1607            $printable[$i] = 0;
1608        }
1609        elsif ($gc-> table('Unassigned')->contains($i)) {
1610            $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1611            $printable[$i] = 0;
1612            $viacode[$i] = 'Unassigned';
1613
1614            if (defined $block) { # No blocks in earliest releases
1615                $viacode[$i] .= ', block=' . $block-> value_of($i);
1616                $end = $gc-> table('Unassigned')->containing_range($i)->end;
1617
1618                # Because we name the unassigned by the blocks they are in, it
1619                # can't go past the end of that block, and it also can't go
1620                # past the unassigned range it is in.  The special table makes
1621                # sure that the non-characters, which are unassigned, are
1622                # separated out.
1623                $end = min($block->containing_range($i)->end,
1624                           $unassigned_sans_noncharacters->
1625                                                    containing_range($i)->end);
1626            }
1627            else {
1628                $end = $i + 1;
1629                while ($unassigned_sans_noncharacters->contains($end)) {
1630                    $end++;
1631                }
1632                $end--;
1633            }
1634        }
1635        elsif ($perl->table('_Perl_Surrogate')->contains($i)) {
1636            $viacode[$i] = 'Surrogate';
1637            $annotate_char_type[$i] = $SURROGATE_TYPE;
1638            $printable[$i] = 0;
1639            $end = $gc->table('Surrogate')->containing_range($i)->end;
1640        }
1641        else {
1642            Carp::my_carp_bug("Can't figure out how to annotate "
1643                              . sprintf("U+%04X", $i)
1644                              . ".  Proceeding anyway.");
1645            $viacode[$i] = 'UNKNOWN';
1646            $annotate_char_type[$i] = $UNKNOWN_TYPE;
1647            $printable[$i] = 0;
1648        }
1649    }
1650
1651    # Here, has a name, but if it's one in which the code point number is
1652    # appended to the name, do that.
1653    elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1654        $viacode[$i] .= sprintf("-%04X", $i);
1655
1656        my $limit = $perl_charname->containing_range($i)->end;
1657        if (defined $age) {
1658            # Do all these as groups of the same age, instead of individually,
1659            # because their names are so meaningless, and there are typically
1660            # large quantities of them.
1661            $end = $i + 1;
1662            while ($end <= $limit && $age->value_of($end) == $age[$i]) {
1663                $end++;
1664            }
1665            $end--;
1666        }
1667        else {
1668            $end = $limit;
1669        }
1670    }
1671
1672    # And here, has a name, but if it's a hangul syllable one, replace it with
1673    # the correct name from the Unicode algorithm
1674    elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1675        use integer;
1676        my $SIndex = $i - $SBase;
1677        my $L = $LBase + $SIndex / $NCount;
1678        my $V = $VBase + ($SIndex % $NCount) / $TCount;
1679        my $T = $TBase + $SIndex % $TCount;
1680        $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1681        $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1682        $end = $perl_charname->containing_range($i)->end;
1683    }
1684
1685    return if ! defined wantarray;
1686    return $i if ! defined $end;    # If not a range, return the input
1687
1688    # Save this whole range so can find the end point quickly
1689    $annotate_ranges->add_map($i, $end, $end);
1690
1691    return $end;
1692}
1693
1694sub max($a, $b) {
1695    return $a >= $b ? $a : $b;
1696}
1697
1698sub min($a, $b) {
1699    return $a <= $b ? $a : $b;
1700}
1701
1702sub clarify_number ($number) {
1703    # This returns the input number with underscores inserted every 3 digits
1704    # in large (5 digits or more) numbers.  Input must be entirely digits, not
1705    # checked.
1706
1707    my $pos = length($number) - 3;
1708    return $number if $pos <= 1;
1709    while ($pos > 0) {
1710        substr($number, $pos, 0) = '_';
1711        $pos -= 3;
1712    }
1713    return $number;
1714}
1715
1716sub clarify_code_point_count ($number) {
1717    # This is like clarify_number(), but the input is assumed to be a count of
1718    # code points, rather than a generic number.
1719
1720    my $append = "";
1721
1722    if ($number > $MAX_UNICODE_CODEPOINTS) {
1723        $number -= ($MAX_WORKING_CODEPOINTS - $MAX_UNICODE_CODEPOINTS);
1724        return "All above-Unicode code points" if $number == 0;
1725        $append = " + all above-Unicode code points";
1726    }
1727    return clarify_number($number) . $append;
1728}
1729
1730package Carp;
1731
1732# These routines give a uniform treatment of messages in this program.  They
1733# are placed in the Carp package to cause the stack trace to not include them,
1734# although an alternative would be to use another package and set @CARP_NOT
1735# for it.
1736
1737our $Verbose = 1 if main::DEBUG;  # Useful info when debugging
1738
1739# This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1740# and overload trying to load Scalar:Util under miniperl.  See
1741# http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1742undef $overload::VERSION;
1743
1744sub my_carp($message="", $nofold=0) {
1745
1746    if ($message) {
1747        $message = main::join_lines($message);
1748        $message =~ s/^$0: *//;     # Remove initial program name
1749        $message =~ s/[.;,]+$//;    # Remove certain ending punctuation
1750        $message = "\n$0: $message;";
1751
1752        # Fold the message with program name, semi-colon end punctuation
1753        # (which looks good with the message that carp appends to it), and a
1754        # hanging indent for continuation lines.
1755        $message = main::simple_fold($message, "", 4) unless $nofold;
1756        $message =~ s/\n$//;        # Remove the trailing nl so what carp
1757                                    # appends is to the same line
1758    }
1759
1760    return $message if defined wantarray;   # If a caller just wants the msg
1761
1762    carp $message;
1763    return;
1764}
1765
1766sub my_carp_bug($message="") {
1767    # This is called when it is clear that the problem is caused by a bug in
1768    # this program.
1769    $message =~ s/^$0: *//;
1770    $message = my_carp("Bug in $0.  Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1771    carp $message;
1772    return;
1773}
1774
1775sub carp_too_few_args($args_ref, $count) {
1776    my_carp_bug("Need at least $count arguments to "
1777        . (caller 1)[3]
1778        . ".  Instead got: '"
1779        . join ', ', @$args_ref
1780        . "'.  No action taken.");
1781    return;
1782}
1783
1784sub carp_extra_args($args_ref) {
1785    unless (ref $args_ref) {
1786        my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref.  Not checking arguments.");
1787        return;
1788    }
1789    my ($package, $file, $line) = caller;
1790    my $subroutine = (caller 1)[3];
1791
1792    my $list;
1793    if (ref $args_ref eq 'HASH') {
1794        foreach my $key (keys %$args_ref) {
1795            $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
1796        }
1797        $list = join ', ', each %{$args_ref};
1798    }
1799    elsif (ref $args_ref eq 'ARRAY') {
1800        foreach my $arg (@$args_ref) {
1801            $arg = $UNDEF unless defined $arg;
1802        }
1803        $list = join ', ', @$args_ref;
1804    }
1805    else {
1806        my_carp_bug("Can't cope with ref "
1807                . ref($args_ref)
1808                . " . argument to 'carp_extra_args'.  Not checking arguments.");
1809        return;
1810    }
1811
1812    my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine.  Skipped.");
1813    return;
1814}
1815
1816package main;
1817
1818{ # Closure
1819
1820    # This program uses the inside-out method for objects, as recommended in
1821    # "Perl Best Practices".  (This is the best solution still, since this has
1822    # to run under miniperl.)  This closure aids in generating those.  There
1823    # are two routines.  setup_package() is called once per package to set
1824    # things up, and then set_access() is called for each hash representing a
1825    # field in the object.  These routines arrange for the object to be
1826    # properly destroyed when no longer used, and for standard accessor
1827    # functions to be generated.  If you need more complex accessors, just
1828    # write your own and leave those accesses out of the call to set_access().
1829    # More details below.
1830
1831    my %constructor_fields; # fields that are to be used in constructors; see
1832                            # below
1833
1834    # The values of this hash will be the package names as keys to other
1835    # hashes containing the name of each field in the package as keys, and
1836    # references to their respective hashes as values.
1837    my %package_fields;
1838
1839    sub setup_package {
1840        # Sets up the package, creating standard DESTROY and dump methods
1841        # (unless already defined).  The dump method is used in debugging by
1842        # simple_dumper().
1843        # The optional parameters are:
1844        #   a)  a reference to a hash, that gets populated by later
1845        #       set_access() calls with one of the accesses being
1846        #       'constructor'.  The caller can then refer to this, but it is
1847        #       not otherwise used by these two routines.
1848        #   b)  a reference to a callback routine to call during destruction
1849        #       of the object, before any fields are actually destroyed
1850
1851        my %args = @_;
1852        my $constructor_ref = delete $args{'Constructor_Fields'};
1853        my $destroy_callback = delete $args{'Destroy_Callback'};
1854        Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1855
1856        my %fields;
1857        my $package = (caller)[0];
1858
1859        $package_fields{$package} = \%fields;
1860        $constructor_fields{$package} = $constructor_ref;
1861
1862        unless ($package->can('DESTROY')) {
1863            my $destroy_name = "${package}::DESTROY";
1864            no strict "refs";
1865
1866            # Use typeglob to give the anonymous subroutine the name we want
1867            *$destroy_name = sub ($self) {
1868                my $addr = pack 'J', refaddr $self;
1869
1870                $self->$destroy_callback if $destroy_callback;
1871                foreach my $field (keys %{$package_fields{$package}}) {
1872                    #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1873                    delete $package_fields{$package}{$field}{$addr};
1874                }
1875                return;
1876            }
1877        }
1878
1879        unless ($package->can('dump')) {
1880            my $dump_name = "${package}::dump";
1881            no strict "refs";
1882            *$dump_name = sub ($self, @_args) {
1883                return dump_inside_out($self, $package_fields{$package}, @_args);
1884            }
1885        }
1886        return;
1887    }
1888
1889    sub set_access($name, $field, @accessors) {
1890        # Arrange for the input field to be garbage collected when no longer
1891        # needed.  Also, creates standard accessor functions for the field
1892        # based on the optional parameters-- none if none of these parameters:
1893        #   'addable'    creates an 'add_NAME()' accessor function.
1894        #   'readable' or 'readable_array'   creates a 'NAME()' accessor
1895        #                function.
1896        #   'settable'   creates a 'set_NAME()' accessor function.
1897        #   'constructor' doesn't create an accessor function, but adds the
1898        #                field to the hash that was previously passed to
1899        #                setup_package();
1900        # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1901        # 'add' etc. all mean 'addable'.
1902        # The read accessor function will work on both array and scalar
1903        # values.  If another accessor in the parameter list is 'a', the read
1904        # access assumes an array.  You can also force it to be array access
1905        # by specifying 'readable_array' instead of 'readable'
1906        #
1907        # A sort-of 'protected' access can be set-up by preceding the addable,
1908        # readable or settable with some initial portion of 'protected_' (but,
1909        # the underscore is required), like 'p_a', 'pro_set', etc.  The
1910        # "protection" is only by convention.  All that happens is that the
1911        # accessor functions' names begin with an underscore.  So instead of
1912        # calling set_foo, the call is _set_foo.  (Real protection could be
1913        # accomplished by having a new subroutine, end_package, called at the
1914        # end of each package, and then storing the __LINE__ ranges and
1915        # checking them on every accessor.  But that is way overkill.)
1916
1917        # We create anonymous subroutines as the accessors and then use
1918        # typeglobs to assign them to the proper package and name
1919
1920        # $name 	Name of the field
1921        # $field 	Reference to the inside-out hash containing the
1922		# 			field
1923
1924        my $package = (caller)[0];
1925
1926        if (! exists $package_fields{$package}) {
1927            croak "$0: Must call 'setup_package' before 'set_access'";
1928        }
1929
1930        # Stash the field so DESTROY can get it.
1931        $package_fields{$package}{$name} = $field;
1932
1933        # Remaining arguments are the accessors.  For each...
1934        foreach my $access (@accessors) {
1935            my $access = lc $access;
1936
1937            my $protected = "";
1938
1939            # Match the input as far as it goes.
1940            if ($access =~ /^(p[^_]*)_/) {
1941                $protected = $1;
1942                if (substr('protected_', 0, length $protected)
1943                    eq $protected)
1944                {
1945
1946                    # Add 1 for the underscore not included in $protected
1947                    $access = substr($access, length($protected) + 1);
1948                    $protected = '_';
1949                }
1950                else {
1951                    $protected = "";
1952                }
1953            }
1954
1955            if (substr('addable', 0, length $access) eq $access) {
1956                my $subname = "${package}::${protected}add_$name";
1957                no strict "refs";
1958
1959                # add_ accessor.  Don't add if already there, which we
1960                # determine using 'eq' for scalars and '==' otherwise.
1961                *$subname = sub ($self, $value) {
1962                    use strict "refs";
1963                    my $addr = pack 'J', refaddr $self;
1964                    if (ref $value) {
1965                        return if grep { $value == $_ } @{$field->{$addr}};
1966                    }
1967                    else {
1968                        return if grep { $value eq $_ } @{$field->{$addr}};
1969                    }
1970                    push @{$field->{$addr}}, $value;
1971                    return;
1972                }
1973            }
1974            elsif (substr('constructor', 0, length $access) eq $access) {
1975                if ($protected) {
1976                    Carp::my_carp_bug("Can't set-up 'protected' constructors")
1977                }
1978                else {
1979                    $constructor_fields{$package}{$name} = $field;
1980                }
1981            }
1982            elsif (substr('readable_array', 0, length $access) eq $access) {
1983
1984                # Here has read access.  If one of the other parameters for
1985                # access is array, or this one specifies array (by being more
1986                # than just 'readable_'), then create a subroutine that
1987                # assumes the data is an array.  Otherwise just a scalar
1988                my $subname = "${package}::${protected}$name";
1989                if (grep { /^a/i } (@accessors)
1990                    or length($access) > length('readable_'))
1991                {
1992                    no strict "refs";
1993                    *$subname = sub ($_addr) {
1994                        use strict "refs";
1995                        my $addr = pack 'J', refaddr $_addr;
1996                        if (ref $field->{$addr} ne 'ARRAY') {
1997                            my $type = ref $field->{$addr};
1998                            $type = 'scalar' unless $type;
1999                            Carp::my_carp_bug("Trying to read $name as an array when it is a $type.  Big problems.");
2000                            return;
2001                        }
2002                        return scalar @{$field->{$addr}} unless wantarray;
2003
2004                        # Make a copy; had problems with caller modifying the
2005                        # original otherwise
2006                        my @return = @{$field->{$addr}};
2007                        return @return;
2008                    }
2009                }
2010                else {
2011
2012                    # Here not an array value, a simpler function.
2013                    no strict "refs";
2014                    *$subname = sub ($addr) {
2015                        use strict "refs";
2016                        return $field->{pack 'J', refaddr $addr};
2017                    }
2018                }
2019            }
2020            elsif (substr('settable', 0, length $access) eq $access) {
2021                my $subname = "${package}::${protected}set_$name";
2022                no strict "refs";
2023                *$subname = sub ($self, $value) {
2024                    use strict "refs";
2025                    # $self is $_[0]; $value is $_[1]
2026                    $field->{pack 'J', refaddr $self} = $value;
2027                    return;
2028                }
2029            }
2030            else {
2031                Carp::my_carp_bug("Unknown accessor type $access.  No accessor set.");
2032            }
2033        }
2034        return;
2035    }
2036}
2037
2038package Input_file;
2039
2040# All input files use this object, which stores various attributes about them,
2041# and provides for convenient, uniform handling.  The run method wraps the
2042# processing.  It handles all the bookkeeping of opening, reading, and closing
2043# the file, returning only significant input lines.
2044#
2045# Each object gets a handler which processes the body of the file, and is
2046# called by run().  All character property files must use the generic,
2047# default handler, which has code scrubbed to handle things you might not
2048# expect, including automatic EBCDIC handling.  For files that don't deal with
2049# mapping code points to a property value, such as test files,
2050# PropertyAliases, PropValueAliases, and named sequences, you can override the
2051# handler to be a custom one.  Such a handler should basically be a
2052# while(next_line()) {...} loop.
2053#
2054# You can also set up handlers to
2055#   0) call during object construction time, after everything else is done
2056#   1) call before the first line is read, for pre processing
2057#   2) call to adjust each line of the input before the main handler gets
2058#      them.  This can be automatically generated, if appropriately simple
2059#      enough, by specifying a Properties parameter in the constructor.
2060#   3) call upon EOF before the main handler exits its loop
2061#   4) call at the end, for post processing
2062#
2063# $_ is used to store the input line, and is to be filtered by the
2064# each_line_handler()s.  So, if the format of the line is not in the desired
2065# format for the main handler, these are used to do that adjusting.  They can
2066# be stacked (by enclosing them in an [ anonymous array ] in the constructor,
2067# so the $_ output of one is used as the input to the next.  The EOF handler
2068# is also stackable, but none of the others are, but could easily be changed
2069# to be so.
2070#
2071# Some properties are used by the Perl core but aren't defined until later
2072# Unicode releases.  The perl interpreter would have problems working when
2073# compiled with an earlier Unicode version that doesn't have them, so we need
2074# to define them somehow for those releases.  The 'Early' constructor
2075# parameter can be used to automatically handle this.  It is essentially
2076# ignored if the Unicode version being compiled has a data file for this
2077# property.  Either code to execute or a file to read can be specified.
2078# Details are at the %early definition.
2079#
2080# Most of the handlers can call insert_lines() or insert_adjusted_lines()
2081# which insert the parameters as lines to be processed before the next input
2082# file line is read.  This allows the EOF handler(s) to flush buffers, for
2083# example.  The difference between the two routines is that the lines inserted
2084# by insert_lines() are subjected to the each_line_handler()s.  (So if you
2085# called it from such a handler, you would get infinite recursion without some
2086# mechanism to prevent that.)  Lines inserted by insert_adjusted_lines() go
2087# directly to the main handler without any adjustments.  If the
2088# post-processing handler calls any of these, there will be no effect.  Some
2089# error checking for these conditions could be added, but it hasn't been done.
2090#
2091# carp_bad_line() should be called to warn of bad input lines, which clears $_
2092# to prevent further processing of the line.  This routine will output the
2093# message as a warning once, and then keep a count of the lines that have the
2094# same message, and output that count at the end of the file's processing.
2095# This keeps the number of messages down to a manageable amount.
2096#
2097# get_missings() should be called to retrieve any @missing input lines.
2098# Messages will be raised if this isn't done if the options aren't to ignore
2099# missings.
2100
2101sub trace { return main::trace(@_); }
2102
2103{ # Closure
2104    # Keep track of fields that are to be put into the constructor.
2105    my %constructor_fields;
2106
2107    main::setup_package(Constructor_Fields => \%constructor_fields);
2108
2109    my %file; # Input file name, required
2110    main::set_access('file', \%file, qw{ c r });
2111
2112    my %first_released; # Unicode version file was first released in, required
2113    main::set_access('first_released', \%first_released, qw{ c r });
2114
2115    my %handler;    # Subroutine to process the input file, defaults to
2116                    # 'process_generic_property_file'
2117    main::set_access('handler', \%handler, qw{ c });
2118
2119    my %property;
2120    # name of property this file is for.  defaults to none, meaning not
2121    # applicable, or is otherwise determinable, for example, from each line.
2122    main::set_access('property', \%property, qw{ c r });
2123
2124    my %optional;
2125    # This is either an unsigned number, or a list of property names.  In the
2126    # former case, if it is non-zero, it means the file is optional, so if the
2127    # file is absent, no warning about that is output.  In the latter case, it
2128    # is a list of properties that the file (exclusively) defines.  If the
2129    # file is present, tables for those properties will be produced; if
2130    # absent, none will, even if they are listed elsewhere (namely
2131    # PropertyAliases.txt and PropValueAliases.txt) as being in this release,
2132    # and no warnings will be raised about them not being available.  (And no
2133    # warning about the file itself will be raised.)
2134    main::set_access('optional', \%optional, qw{ c readable_array } );
2135
2136    my %non_skip;
2137    # This is used for debugging, to skip processing of all but a few input
2138    # files.  Add 'non_skip => 1' to the constructor for those files you want
2139    # processed when you set the $debug_skip global.
2140    main::set_access('non_skip', \%non_skip, 'c');
2141
2142    my %skip;
2143    # This is used to skip processing of this input file (semi-) permanently.
2144    # The value should be the reason the file is being skipped.  It is used
2145    # for files that we aren't planning to process anytime soon, but want to
2146    # allow to be in the directory and be checked for their names not
2147    # conflicting with any other files on a DOS 8.3 name filesystem, but to
2148    # not otherwise be processed, and to not raise a warning about not being
2149    # handled.  In the constructor call, any value that evaluates to a numeric
2150    # 0 or undef means don't skip.  Any other value is a string giving the
2151    # reason it is being skipped, and this will appear in generated pod.
2152    # However, an empty string reason will suppress the pod entry.
2153    # Internally, calls that evaluate to numeric 0 are changed into undef to
2154    # distinguish them from an empty string call.
2155    main::set_access('skip', \%skip, 'c', 'r');
2156
2157    my %each_line_handler;
2158    # list of subroutines to look at and filter each non-comment line in the
2159    # file.  defaults to none.  The subroutines are called in order, each is
2160    # to adjust $_ for the next one, and the final one adjusts it for
2161    # 'handler'
2162    main::set_access('each_line_handler', \%each_line_handler, 'c');
2163
2164    my %retain_trailing_comments;
2165    # This is used to not discard the comments that end data lines.  This
2166    # would be used only for files with non-typical syntax, and most code here
2167    # assumes that comments have been stripped, so special handlers would have
2168    # to be written.  It is assumed that the code will use these in
2169    # single-quoted contexts, and so any "'" marks in the comment will be
2170    # prefixed by a backslash.
2171    main::set_access('retain_trailing_comments', \%retain_trailing_comments, 'c');
2172
2173    my %properties; # Optional ordered list of the properties that occur in each
2174    # meaningful line of the input file.  If present, an appropriate
2175    # each_line_handler() is automatically generated and pushed onto the stack
2176    # of such handlers.  This is useful when a file contains multiple
2177    # properties per line, but no other special considerations are necessary.
2178    # The special value "<ignored>" means to discard the corresponding input
2179    # field.
2180    # Any @missing lines in the file should also match this syntax; no such
2181    # files exist as of 6.3.  But if it happens in a future release, the code
2182    # could be expanded to properly parse them.
2183    main::set_access('properties', \%properties, qw{ c r });
2184
2185    my %has_missings_defaults;
2186    # ? Are there lines in the file giving default values for code points
2187    # missing from it?.  Defaults to NO_DEFAULTS.  Otherwise NOT_IGNORED is
2188    # the norm, but IGNORED means it has such lines, but the handler doesn't
2189    # use them.  Having these three states allows us to catch changes to the
2190    # UCD that this program should track.  XXX This could be expanded to
2191    # specify the syntax for such lines, like %properties above.
2192    main::set_access('has_missings_defaults',
2193                                        \%has_missings_defaults, qw{ c r });
2194
2195    my %construction_time_handler;
2196    # Subroutine to call at the end of the new method.  If undef, no such
2197    # handler is called.
2198    main::set_access('construction_time_handler',
2199                                        \%construction_time_handler, qw{ c });
2200
2201    my %pre_handler;
2202    # Subroutine to call before doing anything else in the file.  If undef, no
2203    # such handler is called.
2204    main::set_access('pre_handler', \%pre_handler, qw{ c });
2205
2206    my %eof_handler;
2207    # Subroutines to call upon getting an EOF on the input file, but before
2208    # that is returned to the main handler.  This is to allow buffers to be
2209    # flushed.  The handler is expected to call insert_lines() or
2210    # insert_adjusted() with the buffered material
2211    main::set_access('eof_handler', \%eof_handler, qw{ c });
2212
2213    my %post_handler;
2214    # Subroutine to call after all the lines of the file are read in and
2215    # processed.  If undef, no such handler is called.  Note that this cannot
2216    # add lines to be processed; instead use eof_handler
2217    main::set_access('post_handler', \%post_handler, qw{ c });
2218
2219    my %progress_message;
2220    # Message to print to display progress in lieu of the standard one
2221    main::set_access('progress_message', \%progress_message, qw{ c });
2222
2223    my %handle;
2224    # cache open file handle, internal.  Is undef if file hasn't been
2225    # processed at all, empty if has;
2226    main::set_access('handle', \%handle);
2227
2228    my %added_lines;
2229    # cache of lines added virtually to the file, internal
2230    main::set_access('added_lines', \%added_lines);
2231
2232    my %remapped_lines;
2233    # cache of lines added virtually to the file, internal
2234    main::set_access('remapped_lines', \%remapped_lines);
2235
2236    my %errors;
2237    # cache of errors found, internal
2238    main::set_access('errors', \%errors);
2239
2240    my %missings;
2241    # storage of '@missing' defaults lines
2242    main::set_access('missings', \%missings);
2243
2244    my %early;
2245    # Used for properties that must be defined (for Perl's purposes) on
2246    # versions of Unicode earlier than Unicode itself defines them.  The
2247    # parameter is an array (it would be better to be a hash, but not worth
2248    # bothering about due to its rare use).
2249    #
2250    # The first element is either a code reference to call when in a release
2251    # earlier than the Unicode file is available in, or it is an alternate
2252    # file to use instead of the non-existent one.  This file must have been
2253    # plunked down in the same directory as mktables.  Should you be compiling
2254    # on a release that needs such a file, mktables will abort the
2255    # compilation, and tell you where to get the necessary file(s), and what
2256    # name(s) to use to store them as.
2257    # In the case of specifying an alternate file, the array must contain two
2258    # further elements:
2259    #
2260    # [1] is the name of the property that will be generated by this file.
2261    # The class automatically takes the input file and excludes any code
2262    # points in it that were not assigned in the Unicode version being
2263    # compiled.  It then uses this result to define the property in the given
2264    # version.  Since the property doesn't actually exist in the Unicode
2265    # version being compiled, this should be a name accessible only by core
2266    # perl.  If it is the same name as the regular property, the constructor
2267    # will mark the output table as a $PLACEHOLDER so that it doesn't actually
2268    # get output, and so will be unusable by non-core code.  Otherwise it gets
2269    # marked as $INTERNAL_ONLY.
2270    #
2271    # [2] is a property value to assign (only when compiling Unicode 1.1.5) to
2272    # the Hangul syllables in that release (which were ripped out in version
2273    # 2) for the given property .  (Hence it is ignored except when compiling
2274    # version 1.  You only get one value that applies to all of them, which
2275    # may not be the actual reality, but probably nobody cares anyway for
2276    # these obsolete characters.)
2277    #
2278    # [3] if present is the default value for the property to assign for code
2279    # points not given in the input.  If not present, the default from the
2280    # normal property is used
2281    #
2282    # [-1] If there is an extra final element that is the string 'ONLY_EARLY'.
2283    # it means to not add the name in [1] as an alias to the property name
2284    # used for these.  Normally, when compiling Unicode versions that don't
2285    # invoke the early handling, the name is added as a synonym.
2286    #
2287    # Not all files can be handled in the above way, and so the code ref
2288    # alternative is available.  It can do whatever it needs to.  The other
2289    # array elements are optional in this case, and the code is free to use or
2290    # ignore them if they are present.
2291    #
2292    # Internally, the constructor unshifts a 0 or 1 onto this array to
2293    # indicate if an early alternative is actually being used or not.  This
2294    # makes for easier testing later on.
2295    main::set_access('early', \%early, 'c');
2296
2297    my %only_early;
2298    main::set_access('only_early', \%only_early, 'c');
2299
2300    my %required_even_in_debug_skip;
2301    # debug_skip is used to speed up compilation during debugging by skipping
2302    # processing files that are not needed for the task at hand.  However,
2303    # some files pretty much can never be skipped, and this is used to specify
2304    # that this is one of them.  In order to skip this file, the call to the
2305    # constructor must be edited to comment out this parameter.
2306    main::set_access('required_even_in_debug_skip',
2307                     \%required_even_in_debug_skip, 'c');
2308
2309    my %withdrawn;
2310    # Some files get removed from the Unicode DB.  This is a version object
2311    # giving the first release without this file.
2312    main::set_access('withdrawn', \%withdrawn, 'c');
2313
2314    my %ucd;
2315    # Some files are not actually part of the Unicode Character Database.
2316    # These typically have a different way of indicating their version
2317    main::set_access('ucd', \%ucd, 'c');
2318
2319    my %in_this_release;
2320    # Calculated value from %first_released and %withdrawn.  Are we compiling
2321    # a Unicode release which includes this file?
2322    main::set_access('in_this_release', \%in_this_release);
2323
2324    sub _next_line;
2325    sub _next_line_with_remapped_range;
2326
2327    sub new {
2328        my $class = shift;
2329
2330        my $self = bless \do{ my $anonymous_scalar }, $class;
2331        my $addr = pack 'J', refaddr $self;
2332
2333        # Set defaults
2334        $handler{$addr} = \&main::process_generic_property_file;
2335        $retain_trailing_comments{$addr} = 0;
2336        $non_skip{$addr} = 0;
2337        $skip{$addr} = undef;
2338        $has_missings_defaults{$addr} = $NO_DEFAULTS;
2339        $handle{$addr} = undef;
2340        $added_lines{$addr} = [ ];
2341        $remapped_lines{$addr} = [ ];
2342        $each_line_handler{$addr} = [ ];
2343        $eof_handler{$addr} = [ ];
2344        $errors{$addr} = { };
2345        $missings{$addr} = [ ];
2346        $early{$addr} = [ ];
2347        $optional{$addr} = [ ];
2348        $ucd{$addr} = 1;
2349
2350        # Two positional parameters.
2351        return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
2352        $file{$addr} = main::internal_file_to_platform(shift);
2353        $first_released{$addr} = shift;
2354
2355        # The rest of the arguments are key => value pairs
2356        # %constructor_fields has been set up earlier to list all possible
2357        # ones.  Either set or push, depending on how the default has been set
2358        # up just above.
2359        my %args = @_;
2360        foreach my $key (keys %args) {
2361            my $argument = $args{$key};
2362
2363            # Note that the fields are the lower case of the constructor keys
2364            my $hash = $constructor_fields{lc $key};
2365            if (! defined $hash) {
2366                Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self.  Skipped");
2367                next;
2368            }
2369            if (ref $hash->{$addr} eq 'ARRAY') {
2370                if (ref $argument eq 'ARRAY') {
2371                    foreach my $argument (@{$argument}) {
2372                        next if ! defined $argument;
2373                        push @{$hash->{$addr}}, $argument;
2374                    }
2375                }
2376                else {
2377                    push @{$hash->{$addr}}, $argument if defined $argument;
2378                }
2379            }
2380            else {
2381                $hash->{$addr} = $argument;
2382            }
2383            delete $args{$key};
2384        };
2385
2386        $non_skip{$addr} = 1 if $required_even_in_debug_skip{$addr};
2387
2388        # Convert 0 (meaning don't skip) to undef
2389        undef $skip{$addr} unless $skip{$addr};
2390
2391        # Handle the case where this file is optional
2392        my $pod_message_for_non_existent_optional = "";
2393        if ($optional{$addr}->@*) {
2394
2395            # First element is the pod message
2396            $pod_message_for_non_existent_optional
2397                                                = shift $optional{$addr}->@*;
2398            # Convert a 0 'Optional' argument to an empty list to make later
2399            # code more concise.
2400            if (   $optional{$addr}->@*
2401                && $optional{$addr}->@* == 1
2402                && $optional{$addr}[0] ne ""
2403                && $optional{$addr}[0] !~ /\D/
2404                && $optional{$addr}[0] == 0)
2405            {
2406                $optional{$addr} = [ ];
2407            }
2408            else {  # But if the only element doesn't evaluate to 0, make sure
2409                    # that this file is indeed considered optional below.
2410                unshift $optional{$addr}->@*, 1;
2411            }
2412        }
2413
2414        my $progress;
2415        my $function_instead_of_file = 0;
2416
2417        if ($early{$addr}->@* && $early{$addr}[-1] eq 'ONLY_EARLY') {
2418            $only_early{$addr} = 1;
2419            pop $early{$addr}->@*;
2420        }
2421
2422        # If we are compiling a Unicode release earlier than the file became
2423        # available, the constructor may have supplied a substitute
2424        if ($first_released{$addr} gt $v_version && $early{$addr}->@*) {
2425
2426            # Yes, we have a substitute, that we will use; mark it so
2427            unshift $early{$addr}->@*, 1;
2428
2429            # See the definition of %early for what the array elements mean.
2430            # Note that we have just unshifted onto the array, so the numbers
2431            # below are +1 of those in the %early description.
2432            # If we have a property this defines, create a table and default
2433            # map for it now (at essentially compile time), so that it will be
2434            # available for the whole of run time.  (We will want to add this
2435            # name as an alias when we are using the official property name;
2436            # but this must be deferred until run(), because at construction
2437            # time the official names have yet to be defined.)
2438            if ($early{$addr}[2]) {
2439                my $fate = ($property{$addr}
2440                            && $property{$addr} eq $early{$addr}[2])
2441                          ? $PLACEHOLDER
2442                          : $INTERNAL_ONLY;
2443                my $prop_object = Property->new($early{$addr}[2],
2444                                                Fate => $fate,
2445                                                Perl_Extension => 1,
2446                                                );
2447
2448                # If not specified by the constructor, use the default mapping
2449                # for the regular property for this substitute one.
2450                if ($early{$addr}[4]) {
2451                    $prop_object->set_default_map($early{$addr}[4]);
2452                }
2453                elsif (    defined $property{$addr}
2454                       &&  defined $default_mapping{$property{$addr}})
2455                {
2456                    $prop_object
2457                        ->set_default_map($default_mapping{$property{$addr}});
2458                }
2459            }
2460
2461            if (ref $early{$addr}[1] eq 'CODE') {
2462                $function_instead_of_file = 1;
2463
2464                # If the first element of the array is a code ref, the others
2465                # are optional.
2466                $handler{$addr} = $early{$addr}[1];
2467                $property{$addr} = $early{$addr}[2]
2468                                                if defined $early{$addr}[2];
2469                $progress = "substitute $file{$addr}";
2470
2471                undef $file{$addr};
2472            }
2473            else {  # Specifying a substitute file
2474
2475                if (! main::file_exists($early{$addr}[1])) {
2476
2477                    # If we don't see the substitute file, generate an error
2478                    # message giving the needed things, and add it to the list
2479                    # of such to output before actual processing happens
2480                    # (hence the user finds out all of them in one run).
2481                    # Instead of creating a general method for NameAliases,
2482                    # hard-code it here, as there is unlikely to ever be a
2483                    # second one which needs special handling.
2484                    my $string_version = ($file{$addr} eq "NameAliases.txt")
2485                                    ? 'at least 6.1 (the later, the better)'
2486                                    : sprintf "%vd", $first_released{$addr};
2487                    push @missing_early_files, <<END;
2488'$file{$addr}' version $string_version should be copied to '$early{$addr}[1]'.
2489END
2490                    ;
2491                    return;
2492                }
2493                $progress = $early{$addr}[1];
2494                $progress .= ", substituting for $file{$addr}" if $file{$addr};
2495                $file{$addr} = $early{$addr}[1];
2496                $property{$addr} = $early{$addr}[2];
2497
2498                # Ignore code points not in the version being compiled
2499                push $each_line_handler{$addr}->@*, \&_exclude_unassigned;
2500
2501                if (   $v_version lt v2.0        # Hanguls in this release ...
2502                    && defined $early{$addr}[3]) # ... need special treatment
2503                {
2504                    push $eof_handler{$addr}->@*, \&_fixup_obsolete_hanguls;
2505                }
2506            }
2507
2508            # And this substitute is valid for all releases.
2509            $first_released{$addr} = v0;
2510        }
2511        else {  # Normal behavior
2512            $progress = $file{$addr};
2513            unshift $early{$addr}->@*, 0; # No substitute
2514        }
2515
2516        my $file = $file{$addr};
2517        $progress_message{$addr} = "Processing $progress"
2518                                            unless $progress_message{$addr};
2519
2520        # A file should be there if it is within the window of versions for
2521        # which Unicode supplies it
2522        if ($withdrawn{$addr} && $withdrawn{$addr} le $v_version) {
2523            $in_this_release{$addr} = 0;
2524            $skip{$addr} = "";
2525        }
2526        else {
2527            $in_this_release{$addr} = $first_released{$addr} le $v_version;
2528
2529            # Check that the file for this object (possibly using a substitute
2530            # for early releases) exists or we have a function alternative
2531            if (   ! $function_instead_of_file
2532                && ! main::file_exists($file))
2533            {
2534                # Here there is nothing available for this release.  This is
2535                # fine if we aren't expecting anything in this release.
2536                if (! $in_this_release{$addr}) {
2537                    $skip{$addr} = "";  # Don't remark since we expected
2538                                        # nothing and got nothing
2539                }
2540                elsif ($optional{$addr}->@*) {
2541
2542                    # Here the file is optional in this release; Use the
2543                    # passed in text to document this case in the pod.
2544                    $skip{$addr} = $pod_message_for_non_existent_optional;
2545                }
2546                elsif (   $in_this_release{$addr}
2547                       && ! defined $skip{$addr}
2548                       && defined $file)
2549                { # Doesn't exist but should.
2550                    $skip{$addr} = "'$file' not found.  Possibly Big problems";
2551                    Carp::my_carp($skip{$addr});
2552                }
2553            }
2554            elsif ($debug_skip && ! defined $skip{$addr} && ! $non_skip{$addr})
2555            {
2556
2557                # The file exists; if not skipped for another reason, and we are
2558                # skipping most everything during debugging builds, use that as
2559                # the skip reason.
2560                $skip{$addr} = '$debug_skip is on'
2561            }
2562        }
2563
2564        if (   ! $debug_skip
2565            && $non_skip{$addr}
2566            && ! $required_even_in_debug_skip{$addr}
2567            && $verbosity)
2568        {
2569            print "Warning: " . __PACKAGE__ . " constructor for $file has useless 'non_skip' in it\n";
2570        }
2571
2572        # Here, we have figured out if we will be skipping this file or not.
2573        # If so, we add any single property it defines to any passed in
2574        # optional property list.  These will be dealt with at run time.
2575        if (defined $skip{$addr}) {
2576            if ($property{$addr}) {
2577                push $optional{$addr}->@*, $property{$addr};
2578            }
2579        } # Otherwise, are going to process the file.
2580        elsif ($property{$addr}) {
2581
2582            # If the file has a property defined in the constructor for it, it
2583            # means that the property is not listed in the file's entries.  So
2584            # add a handler (to the list of line handlers) to insert the
2585            # property name into the lines, to provide a uniform interface to
2586            # the final processing subroutine.
2587            push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2588        }
2589        elsif ($properties{$addr}) {
2590
2591            # Similarly, there may be more than one property represented on
2592            # each line, with no clue but the constructor input what those
2593            # might be.  Add a handler for each line in the input so that it
2594            # creates a separate input line for each property in those input
2595            # lines, thus making them suitable to handle generically.
2596
2597            push @{$each_line_handler{$addr}},
2598                 sub {
2599                    my $file = shift;
2600                    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2601                    my @fields = split /\s*;\s*/, $_, -1;
2602
2603                    if (@fields - 1 > @{$properties{$addr}}) {
2604                        $file->carp_bad_line('Extra fields');
2605                        $_ = "";
2606                        return;
2607                    }
2608                    my $range = shift @fields;  # 0th element is always the
2609                                                # range
2610
2611                    # The next fields in the input line correspond
2612                    # respectively to the stored properties.
2613                    for my $i (0 ..  @{$properties{$addr}} - 1) {
2614                        my $property_name = $properties{$addr}[$i];
2615                        next if $property_name eq '<ignored>';
2616                        $file->insert_adjusted_lines(
2617                              "$range; $property_name; $fields[$i]");
2618                    }
2619                    $_ = "";
2620
2621                    return;
2622                };
2623        }
2624
2625        {   # On non-ascii platforms, we use a special pre-handler
2626            no strict;
2627            no warnings 'once';
2628            *next_line = (main::NON_ASCII_PLATFORM)
2629                         ? *_next_line_with_remapped_range
2630                         : *_next_line;
2631        }
2632
2633        &{$construction_time_handler{$addr}}($self)
2634                                        if $construction_time_handler{$addr};
2635
2636        return $self;
2637    }
2638
2639
2640    use overload
2641        fallback => 0,
2642        qw("") => "_operator_stringify",
2643        "." => \&main::_operator_dot,
2644        ".=" => \&main::_operator_dot_equal,
2645    ;
2646
2647    sub _operator_stringify($self, $other="", $reversed=0) {
2648        return __PACKAGE__ . " object for " . $self->file;
2649    }
2650
2651    sub run($self) {
2652        # Process the input object $self.  This opens and closes the file and
2653        # calls all the handlers for it.  Currently,  this can only be called
2654        # once per file, as it destroy's the EOF handlers
2655
2656        # flag to make sure extracted files are processed early
2657        state $seen_non_extracted = 0;
2658
2659        my $addr = pack 'J', refaddr $self;
2660
2661        my $file = $file{$addr};
2662
2663        if (! $file) {
2664            $handle{$addr} = 'pretend_is_open';
2665        }
2666        else {
2667            if ($seen_non_extracted) {
2668                if ($file =~ /$EXTRACTED/i) # Some platforms may change the
2669                                            # case of the file's name
2670                {
2671                    Carp::my_carp_bug(main::join_lines(<<END
2672$file should be processed just after the 'Prop...Alias' files, and before
2673anything not in the $EXTRACTED_DIR directory.  Proceeding, but the results may
2674have subtle problems
2675END
2676                    ));
2677                }
2678            }
2679            elsif ($EXTRACTED_DIR
2680
2681                    # We only do this check for generic property files
2682                    && $handler{$addr} == \&main::process_generic_property_file
2683
2684                    && $file !~ /$EXTRACTED/i)
2685            {
2686                # We don't set this (by the 'if' above) if we have no
2687                # extracted directory, so if running on an early version,
2688                # this test won't work.  Not worth worrying about.
2689                $seen_non_extracted = 1;
2690            }
2691
2692            # Mark the file as having being processed, and warn if it
2693            # isn't a file we are expecting.  As we process the files,
2694            # they are deleted from the hash, so any that remain at the
2695            # end of the program are files that we didn't process.
2696            my $fkey = File::Spec->rel2abs($file);
2697            my $exists = delete $potential_files{lc($fkey)};
2698
2699            Carp::my_carp("Was not expecting '$file'.")
2700                                    if $exists && ! $in_this_release{$addr};
2701
2702            # If there is special handling for compiling Unicode releases
2703            # earlier than the first one in which Unicode defines this
2704            # property ...
2705            if ($early{$addr}->@* > 1) {
2706
2707                # Mark as processed any substitute file that would be used in
2708                # such a release
2709                $fkey = File::Spec->rel2abs($early{$addr}[1]);
2710                delete $potential_files{lc($fkey)};
2711
2712                # As commented in the constructor code, when using the
2713                # official property, we still have to allow the publicly
2714                # inaccessible early name so that the core code which uses it
2715                # will work regardless.
2716                if (   ! $only_early{$addr}
2717                    && ! $early{$addr}[0]
2718                    && $early{$addr}->@* > 2)
2719                {
2720                    my $early_property_name = $early{$addr}[2];
2721                    if ($property{$addr} ne $early_property_name) {
2722                        main::property_ref($property{$addr})
2723                                            ->add_alias($early_property_name);
2724                    }
2725                }
2726            }
2727
2728            # We may be skipping this file ...
2729            if (defined $skip{$addr}) {
2730
2731                # If the file isn't supposed to be in this release, there is
2732                # nothing to do
2733                if ($in_this_release{$addr}) {
2734
2735                    # But otherwise, we may print a message
2736                    if ($debug_skip) {
2737                        print STDERR "Skipping input file '$file'",
2738                                     " because '$skip{$addr}'\n";
2739                    }
2740
2741                    # And add it to the list of skipped files, which is later
2742                    # used to make the pod
2743                    $skipped_files{$file} = $skip{$addr};
2744
2745                    # The 'optional' list contains properties that are also to
2746                    # be skipped along with the file.  (There may also be
2747                    # digits which are just placeholders to make sure it isn't
2748                    # an empty list
2749                    foreach my $property ($optional{$addr}->@*) {
2750                        next unless $property =~ /\D/;
2751                        my $prop_object = main::property_ref($property);
2752                        next unless defined $prop_object;
2753                        $prop_object->set_fate($SUPPRESSED, $skip{$addr});
2754                    }
2755                }
2756
2757                return;
2758            }
2759
2760            # Here, we are going to process the file.  Open it, converting the
2761            # slashes used in this program into the proper form for the OS
2762            my $file_handle;
2763            if (not open $file_handle, "<", $file) {
2764                Carp::my_carp("Can't open $file.  Skipping: $!");
2765                return;
2766            }
2767            $handle{$addr} = $file_handle; # Cache the open file handle
2768
2769            # If possible, make sure that the file is the correct version.
2770            # (This data isn't available on early Unicode releases or in
2771            # UnicodeData.txt.)  We don't do this check if we are using a
2772            # substitute file instead of the official one (though the code
2773            # could be extended to do so).
2774            if ($in_this_release{$addr}
2775                && ! $early{$addr}[0]
2776                && lc($file) ne 'unicodedata.txt')
2777            {
2778                my $this_version;
2779
2780                if ($file !~ /^Unihan/i) {
2781
2782                    # The non-Unihan files started getting version numbers in
2783                    # 3.2, but some files in 4.0 are unchanged from 3.2, and
2784                    # marked as 3.2.  4.0.1 is the first version where there
2785                    # are no files marked as being from less than 4.0, though
2786                    # some are marked as 4.0.  In versions after that, the
2787                    # numbers are correct.
2788                    if ($v_version ge v4.0.1) {
2789                        $_ = <$file_handle>;    # The version number is in the
2790                                                # very first line if it is a
2791                                                # UCD file; otherwise, it
2792                                                # might be
2793                        goto valid_version if $_ =~ / - $string_version \. /x;
2794                        chomp;
2795                        if ($ucd{$addr}) {
2796                            $_ =~ s/^#\s*//;
2797
2798                            # 4.0.1 had some valid files that weren't updated.
2799                            goto valid_version
2800                                    if $v_version eq v4.0.1 && $_ =~ /4\.0\.0/;
2801                            $this_version = $_;
2802                            goto wrong_version;
2803                        }
2804                        else {
2805                            my $BOM = "\x{FEFF}";
2806                            utf8::encode($BOM);
2807                            my $BOM_re = qr/ ^ (?:$BOM)? /x;
2808
2809                            do {
2810                                chomp;
2811
2812                                # BOM; seems to be on many lines in some
2813                                # files!!
2814                                $_ =~ s/$BOM_re//;
2815
2816                                if (/./) {
2817
2818                                    # Only look for the version if in the
2819                                    # first comment block.
2820                                    goto no_version unless $_ =~ /^#/;
2821
2822                                    if ($_ =~ /Version:? (\S*)/) {
2823                                        $this_version = $1;
2824                                        goto valid_version
2825                                          if  $this_version eq $string_version;
2826                                        goto valid_version
2827                                            if  "$this_version.0"
2828                                                            eq $string_version;
2829                                    }
2830                                }
2831                            } while (<$file_handle>);
2832
2833                            goto no_version;
2834                        }
2835                    }
2836                }
2837                elsif ($v_version ge v6.0.0) { # Unihan
2838
2839                    # Unihan files didn't get accurate version numbers until
2840                    # 6.0.  The version is somewhere in the first comment
2841                    # block
2842                    while (<$file_handle>) {
2843                        goto no_version if $_ !~ /^#/;
2844                        chomp;
2845                        $_ =~ s/^#\s*//;
2846                        next if $_ !~ / version: /x;
2847                        goto valid_version if $_ =~ /$string_version/;
2848                        goto wrong_version;
2849                    }
2850                    goto no_version;
2851                }
2852                else {  # Old Unihan; have to assume is valid
2853                    goto valid_version;
2854                }
2855
2856              wrong_version:
2857                die Carp::my_carp("File '$file' is version "
2858                                . "'$this_version'.  It should be "
2859                                . "version $string_version");
2860              no_version:
2861                Carp::my_carp_bug("Could not find the expected "
2862                                . "version info in file '$file'");
2863            }
2864        }
2865
2866      valid_version:
2867        print "$progress_message{$addr}\n" if $verbosity >= $PROGRESS;
2868
2869        # Call any special handler for before the file.
2870        &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2871
2872        # Then the main handler
2873        &{$handler{$addr}}($self);
2874
2875        # Then any special post-file handler.
2876        &{$post_handler{$addr}}($self) if $post_handler{$addr};
2877
2878        # If any errors have been accumulated, output the counts (as the first
2879        # error message in each class was output when it was encountered).
2880        if ($errors{$addr}) {
2881            my $total = 0;
2882            my $types = 0;
2883            foreach my $error (keys %{$errors{$addr}}) {
2884                $total += $errors{$addr}->{$error};
2885                delete $errors{$addr}->{$error};
2886                $types++;
2887            }
2888            if ($total > 1) {
2889                my $message
2890                        = "A total of $total lines had errors in $file.  ";
2891
2892                $message .= ($types == 1)
2893                            ? '(Only the first one was displayed.)'
2894                            : '(Only the first of each type was displayed.)';
2895                Carp::my_carp($message);
2896            }
2897        }
2898
2899        if (@{$missings{$addr}}) {
2900            Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines.  Generated tables likely are wrong");
2901        }
2902
2903        # If a real file handle, close it.
2904        close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2905                                                        ref $handle{$addr};
2906        $handle{$addr} = "";   # Uses empty to indicate that has already seen
2907                               # the file, as opposed to undef
2908        return;
2909    }
2910
2911    sub _next_line($self) {
2912        # Sets $_ to be the next logical input line, if any.  Returns non-zero
2913        # if such a line exists.  'logical' means that any lines that have
2914        # been added via insert_lines() will be returned in $_ before the file
2915        # is read again.
2916
2917        my $addr = pack 'J', refaddr $self;
2918
2919        # Here the file is open (or if the handle is not a ref, is an open
2920        # 'virtual' file).  Get the next line; any inserted lines get priority
2921        # over the file itself.
2922        my $adjusted;
2923
2924        LINE:
2925        while (1) { # Loop until find non-comment, non-empty line
2926            #local $to_trace = 1 if main::DEBUG;
2927            my $inserted_ref = shift @{$added_lines{$addr}};
2928            if (defined $inserted_ref) {
2929                ($adjusted, $_) = @{$inserted_ref};
2930                trace $adjusted, $_ if main::DEBUG && $to_trace;
2931                return 1 if $adjusted;
2932            }
2933            else {
2934                last if ! ref $handle{$addr}; # Don't read unless is real file
2935                last if ! defined ($_ = readline $handle{$addr});
2936            }
2937            chomp;
2938            trace $_ if main::DEBUG && $to_trace;
2939
2940            # See if this line is the comment line that defines what property
2941            # value that code points that are not listed in the file should
2942            # have.  The format or existence of these lines is not guaranteed
2943            # by Unicode since they are comments, but the documentation says
2944            # that this was added for machine-readability, so probably won't
2945            # change.  This works starting in Unicode Version 5.0.  They look
2946            # like:
2947            #
2948            # @missing: 0000..10FFFF; Not_Reordered
2949            # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2950            # @missing: 0000..10FFFF; ; NaN
2951            #
2952            # Save the line for a later get_missings() call.
2953            if (/$missing_defaults_prefix/) {
2954                if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2955                    $self->carp_bad_line("Unexpected \@missing line.  Assuming no missing entries");
2956                }
2957                elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2958                    my $start = $1;     # The pattern saves the beginning and
2959                    my $end = $2;       # end points of the range the default
2960                                        # is for
2961                    my @defaults = split /\s* ; \s*/x, $_;
2962
2963                    # The first field is the @missing, which ends in a
2964                    # semi-colon, so can safely shift.
2965                    shift @defaults;
2966
2967                    # Some of these lines may have empty field placeholders
2968                    # which get in the way.  An example is:
2969                    # @missing: 0000..10FFFF; ; NaN
2970                    # Remove them.  Process starting from the top so the
2971                    # splice doesn't affect things still to be looked at.
2972                    for (my $i = @defaults - 1; $i >= 0; $i--) {
2973                        next if $defaults[$i] ne "";
2974                        splice @defaults, $i, 1;
2975                    }
2976
2977                    # What's left should be just the property (maybe) and the
2978                    # default.  Having only one element means it doesn't have
2979                    # the property.
2980                    my $default;
2981                    my $property;
2982                    if (@defaults >= 1) {
2983                        if (@defaults == 1) {
2984                            $default = $defaults[0];
2985                        }
2986                        else {
2987                            $property = $defaults[0];
2988                            $default = $defaults[1];
2989                        }
2990                    }
2991
2992                    if (@defaults < 1
2993                        || @defaults > 2
2994                        || ($default =~ /^</
2995                            && $default !~ /^<code *point>$/i
2996                            && $default !~ /^<none>$/i
2997                            && $default !~ /^<script>$/i))
2998                    {
2999                        $self->carp_bad_line("Unrecognized \@missing line: $_.  Assuming no missing entries");
3000                    }
3001                    else {
3002
3003                        # If the property is missing from the line, it should
3004                        # be the one for the whole file
3005                        $property = $property{$addr} if ! defined $property;
3006
3007                        # Change <none> to the null string, which is what it
3008                        # really means.  If the default is the code point
3009                        # itself, set it to <code point>, which is what
3010                        # Unicode uses (but sometimes they've forgotten the
3011                        # space)
3012                        if ($default =~ /^<none>$/i) {
3013                            $default = "";
3014                        }
3015                        elsif ($default =~ /^<code *point>$/i) {
3016                            $default = $CODE_POINT;
3017                        }
3018                        elsif ($default =~ /^<script>$/i) {
3019
3020                            # Special case this one.  Currently is from
3021                            # ScriptExtensions.txt, and means for all unlisted
3022                            # code points, use their Script property values.
3023                            # For the code points not listed in that file, the
3024                            # default value is 'Unknown'.
3025                            $default = "Unknown";
3026                        }
3027
3028                        # Store them as a sub-hash as part of an array, with
3029                        # both components.
3030                        push @{$missings{$addr}}, { start    => hex $start,
3031                                                    end      => hex $end,
3032                                                    default  => $default,
3033                                                    property => $property
3034                                                  };
3035                    }
3036                }
3037
3038                # There is nothing for the caller to process on this comment
3039                # line.
3040                next;
3041            }
3042
3043            # Unless to keep, remove comments.  If to keep, ignore
3044            # comment-only lines
3045            if ($retain_trailing_comments{$addr}) {
3046                next if / ^ \s* \# /x;
3047
3048                # But escape any single quotes (done in both the comment and
3049                # non-comment portion; this could be a bug someday, but not
3050                # likely)
3051                s/'/\\'/g;
3052            }
3053            else {
3054                s/#.*//;
3055            }
3056
3057            # Remove trailing space, and skip this line if the result is empty
3058            s/\s+$//;
3059            next if /^$/;
3060
3061            # Call any handlers for this line, and skip further processing of
3062            # the line if the handler sets the line to null.
3063            foreach my $sub_ref (@{$each_line_handler{$addr}}) {
3064                &{$sub_ref}($self);
3065                next LINE if /^$/;
3066            }
3067
3068            # Here the line is ok.  return success.
3069            return 1;
3070        } # End of looping through lines.
3071
3072        # If there are EOF handlers, call each (only once) and if it generates
3073        # more lines to process go back in the loop to handle them.
3074        while ($eof_handler{$addr}->@*) {
3075            &{$eof_handler{$addr}[0]}($self);
3076            shift $eof_handler{$addr}->@*;   # Currently only get one shot at it.
3077            goto LINE if $added_lines{$addr};
3078        }
3079
3080        # Return failure -- no more lines.
3081        return 0;
3082
3083    }
3084
3085    sub _next_line_with_remapped_range($self) {
3086        # like _next_line(), but for use on non-ASCII platforms.  It sets $_
3087        # to be the next logical input line, if any.  Returns non-zero if such
3088        # a line exists.  'logical' means that any lines that have been added
3089        # via insert_lines() will be returned in $_ before the file is read
3090        # again.
3091        #
3092        # The difference from _next_line() is that this remaps the Unicode
3093        # code points in the input to those of the native platform.  Each
3094        # input line contains a single code point, or a single contiguous
3095        # range of them  This routine splits each range into its individual
3096        # code points and caches them.  It returns the cached values,
3097        # translated into their native equivalents, one at a time, for each
3098        # call, before reading the next line.  Since native values can only be
3099        # a single byte wide, no translation is needed for code points above
3100        # 0xFF, and ranges that are entirely above that number are not split.
3101        # If an input line contains the range 254-1000, it would be split into
3102        # three elements: 254, 255, and 256-1000.  (The downstream table
3103        # insertion code will sort and coalesce the individual code points
3104        # into appropriate ranges.)
3105
3106        my $addr = pack 'J', refaddr $self;
3107
3108        while (1) {
3109
3110            # Look in cache before reading the next line.  Return any cached
3111            # value, translated
3112            my $inserted = shift @{$remapped_lines{$addr}};
3113            if (defined $inserted) {
3114                trace $inserted if main::DEBUG && $to_trace;
3115                $_ = $inserted =~ s/^ ( \d+ ) /sprintf("%04X", utf8::unicode_to_native($1))/xer;
3116                trace $_ if main::DEBUG && $to_trace;
3117                return 1;
3118            }
3119
3120            # Get the next line.
3121            return 0 unless _next_line($self);
3122
3123            # If there is a special handler for it, return the line,
3124            # untranslated.  This should happen only for files that are
3125            # special, not being code-point related, such as property names.
3126            return 1 if $handler{$addr}
3127                                    != \&main::process_generic_property_file;
3128
3129            my ($range, $property_name, $map, @remainder)
3130                = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
3131
3132            if (@remainder
3133                || ! defined $property_name
3134                || $range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
3135            {
3136                Carp::my_carp_bug("Unrecognized input line '$_'.  Ignored");
3137            }
3138
3139            my $low = hex $1;
3140            my $high = (defined $2) ? hex $2 : $low;
3141
3142            # If the input maps the range to another code point, remap the
3143            # target if it is between 0 and 255.
3144            my $tail;
3145            if (defined $map) {
3146                $map =~ s/\b 00 ( [0-9A-F]{2} ) \b/sprintf("%04X", utf8::unicode_to_native(hex $1))/gxe;
3147                $tail = "$property_name; $map";
3148                $_ = "$range; $tail";
3149            }
3150            else {
3151                $tail = $property_name;
3152            }
3153
3154            # If entire range is above 255, just return it, unchanged (except
3155            # any mapped-to code point, already changed above)
3156            return 1 if $low > 255;
3157
3158            # Cache an entry for every code point < 255.  For those in the
3159            # range above 255, return a dummy entry for just that portion of
3160            # the range.  Note that this will be out-of-order, but that is not
3161            # a problem.
3162            foreach my $code_point ($low .. $high) {
3163                if ($code_point > 255) {
3164                    $_ = sprintf "%04X..%04X; $tail", $code_point, $high;
3165                    return 1;
3166                }
3167                push @{$remapped_lines{$addr}}, "$code_point; $tail";
3168            }
3169        } # End of looping through lines.
3170
3171        # NOTREACHED
3172    }
3173
3174#   Not currently used, not fully tested.
3175#    sub peek {
3176#        # Non-destructive lookahead one non-adjusted, non-comment, non-blank
3177#        # record.  Not callable from an each_line_handler(), nor does it call
3178#        # an each_line_handler() on the line.
3179#
3180#        my $self = shift;
3181#        my $addr = pack 'J', refaddr $self;
3182#
3183#        foreach my $inserted_ref (@{$added_lines{$addr}}) {
3184#            my ($adjusted, $line) = @{$inserted_ref};
3185#            next if $adjusted;
3186#
3187#            # Remove comments and trailing space, and return a non-empty
3188#            # resulting line
3189#            $line =~ s/#.*//;
3190#            $line =~ s/\s+$//;
3191#            return $line if $line ne "";
3192#        }
3193#
3194#        return if ! ref $handle{$addr}; # Don't read unless is real file
3195#        while (1) { # Loop until find non-comment, non-empty line
3196#            local $to_trace = 1 if main::DEBUG;
3197#            trace $_ if main::DEBUG && $to_trace;
3198#            return if ! defined (my $line = readline $handle{$addr});
3199#            chomp $line;
3200#            push @{$added_lines{$addr}}, [ 0, $line ];
3201#
3202#            $line =~ s/#.*//;
3203#            $line =~ s/\s+$//;
3204#            return $line if $line ne "";
3205#        }
3206#
3207#        return;
3208#    }
3209
3210
3211    sub insert_lines($self, @lines) {
3212        # Lines can be inserted so that it looks like they were in the input
3213        # file at the place it was when this routine is called.  See also
3214        # insert_adjusted_lines().  Lines inserted via this routine go through
3215        # any each_line_handler()
3216
3217        # Each inserted line is an array, with the first element being 0 to
3218        # indicate that this line hasn't been adjusted, and needs to be
3219        # processed.
3220        push @{$added_lines{pack 'J', refaddr $self}}, map { [ 0, $_ ] } @lines;
3221        return;
3222    }
3223
3224    sub insert_adjusted_lines($self, @lines) {
3225        # Lines can be inserted so that it looks like they were in the input
3226        # file at the place it was when this routine is called.  See also
3227        # insert_lines().  Lines inserted via this routine are already fully
3228        # adjusted, ready to be processed; each_line_handler()s handlers will
3229        # not be called.  This means this is not a completely general
3230        # facility, as only the last each_line_handler on the stack should
3231        # call this.  It could be made more general, by passing to each of the
3232        # line_handlers their position on the stack, which they would pass on
3233        # to this routine, and that would replace the boolean first element in
3234        # the anonymous array pushed here, so that the next_line routine could
3235        # use that to call only those handlers whose index is after it on the
3236        # stack.  But this is overkill for what is needed now.
3237
3238        trace $self if main::DEBUG && $to_trace;
3239
3240        # Each inserted line is an array, with the first element being 1 to
3241        # indicate that this line has been adjusted
3242        push @{$added_lines{pack 'J', refaddr $self}}, map { [ 1, $_ ] } @lines;
3243        return;
3244    }
3245
3246    sub get_missings($self) {
3247        # Returns the stored up @missings lines' values, and clears the list.
3248        # The values are in a hash, consisting of 'default' and 'property'.
3249        # However, since these lines can be stacked up, the return is an array
3250        # of all these hashes.
3251
3252        my $addr = pack 'J', refaddr $self;
3253
3254        # If not accepting a list return, just return the first one.
3255        return shift @{$missings{$addr}} unless wantarray;
3256
3257        my @return = @{$missings{$addr}};
3258        undef @{$missings{$addr}};
3259        return @return;
3260    }
3261
3262    sub _exclude_unassigned($self) {
3263
3264        # Takes the range in $_ and excludes code points that aren't assigned
3265        # in this release
3266
3267        state $skip_inserted_count = 0;
3268
3269        # Ignore recursive calls.
3270        if ($skip_inserted_count) {
3271            $skip_inserted_count--;
3272            return;
3273        }
3274
3275        # Find what code points are assigned in this release
3276        main::calculate_Assigned() if ! defined $Assigned;
3277
3278        my ($range, @remainder)
3279            = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
3280
3281        # Examine the range.
3282        if ($range =~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
3283        {
3284            my $low = hex $1;
3285            my $high = (defined $2) ? hex $2 : $low;
3286
3287            # Split the range into subranges of just those code points in it
3288            # that are assigned.
3289            my @ranges = (Range_List->new(Initialize
3290                              => Range->new($low, $high)) & $Assigned)->ranges;
3291
3292            # Do nothing if nothing in the original range is assigned in this
3293            # release; handle normally if everything is in this release.
3294            if (! @ranges) {
3295                $_ = "";
3296            }
3297            elsif (@ranges != 1) {
3298
3299                # Here, some code points in the original range aren't in this
3300                # release; @ranges gives the ones that are.  Create fake input
3301                # lines for each of the ranges, and set things up so that when
3302                # this routine is called on that fake input, it will do
3303                # nothing.
3304                $skip_inserted_count = @ranges;
3305                my $remainder = join ";", @remainder;
3306                for my $range (@ranges) {
3307                    $self->insert_lines(sprintf("%04X..%04X;%s",
3308                                    $range->start, $range->end, $remainder));
3309                }
3310                $_ = "";    # The original range is now defunct.
3311            }
3312        }
3313
3314        return;
3315    }
3316
3317    sub _fixup_obsolete_hanguls($self) {
3318
3319        # This is called only when compiling Unicode version 1.  All Unicode
3320        # data for subsequent releases assumes that the code points that were
3321        # Hangul syllables in this release only are something else, so if
3322        # using such data, we have to override it
3323
3324        my $addr = pack 'J', refaddr $self;
3325
3326        my $object = main::property_ref($property{$addr});
3327        $object->add_map($FIRST_REMOVED_HANGUL_SYLLABLE,
3328                         $FINAL_REMOVED_HANGUL_SYLLABLE,
3329                         $early{$addr}[3],  # Passed-in value for these
3330                         Replace => $UNCONDITIONALLY);
3331    }
3332
3333    sub _insert_property_into_line($self) {
3334        # Add a property field to $_, if this file requires it.
3335
3336        my $property = $property{pack 'J', refaddr $self};
3337        $_ =~ s/(;|$)/; $property$1/;
3338        return;
3339    }
3340
3341    sub carp_bad_line($self, $message="") {
3342        # Output consistent error messages, using either a generic one, or the
3343        # one given by the optional parameter.  To avoid gazillions of the
3344        # same message in case the syntax of a  file is way off, this routine
3345        # only outputs the first instance of each message, incrementing a
3346        # count so the totals can be output at the end of the file.
3347
3348        my $addr = pack 'J', refaddr $self;
3349
3350        $message = 'Unexpected line' unless $message;
3351
3352        # No trailing punctuation so as to fit with our addenda.
3353        $message =~ s/[.:;,]$//;
3354
3355        # If haven't seen this exact message before, output it now.  Otherwise
3356        # increment the count of how many times it has occurred
3357        unless ($errors{$addr}->{$message}) {
3358            Carp::my_carp("$message in '$_' in "
3359                            . $file{$addr}
3360                            . " at line $..  Skipping this line;");
3361            $errors{$addr}->{$message} = 1;
3362        }
3363        else {
3364            $errors{$addr}->{$message}++;
3365        }
3366
3367        # Clear the line to prevent any further (meaningful) processing of it.
3368        $_ = "";
3369
3370        return;
3371    }
3372} # End closure
3373
3374package Multi_Default;
3375
3376sub trace { return main::trace(@_); }
3377
3378# Certain properties in early versions of Unicode had more than one possible
3379# default for code points missing from the files.  In these cases, one
3380# default applies to everything left over after all the others are applied,
3381# and for each of the others, there is a description of which class of code
3382# points applies to it.  This object helps implement this by storing the
3383# defaults, and for all but that final default, an eval string that generates
3384# the class that it applies to.  That class must be a Range_List, or contains
3385# a Range_List that the overloaded operators recognize as to be operated on.
3386# A string is used because this is called early when we know symbolically what
3387# needs to be done, but typically before any data is gathered.  Thus the
3388# evaluation gets delayed until we have at hand all the needed information.
3389
3390{   # Closure
3391
3392    main::setup_package();
3393
3394    my %class_defaults;
3395    # The defaults structure for the classes
3396    main::set_access('class_defaults', \%class_defaults, 'readable_array');
3397
3398    my %other_default;
3399    # The default that applies to everything left over.
3400    main::set_access('other_default', \%other_default, 'r');
3401
3402    my %iterator;
3403
3404    sub new {
3405        # The constructor is called with default => eval pairs, terminated by
3406        # the left-over default. e.g.
3407        # Multi_Default->new(
3408        #        'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
3409        #               -  0x200D',
3410        #        'R' => 'some other expression that evaluates to code points',
3411        #        .
3412        #        .
3413        #        .
3414        #        'U'));
3415        # It is best to leave the final value be the one that matches the
3416        # above-Unicode code points.
3417
3418        my $class = shift;
3419
3420        my $self = bless \do{my $anonymous_scalar}, $class;
3421        my $addr = pack 'J', refaddr $self;
3422        $iterator{$addr} = 0;
3423
3424        return $self unless @_;
3425
3426        while (@_ > 1) {
3427            $self->append_default(shift, shift);
3428        }
3429
3430        $self->set_final_default(shift);
3431
3432        return $self;
3433    }
3434
3435    sub append_default($self, $new_default, $eval) {
3436        my $addr = pack 'J', refaddr $self;
3437
3438        # Pushes a default setting to the current list
3439        push $class_defaults{$addr}->@*, [ $new_default, $eval ];
3440    }
3441
3442    sub set_final_default($self, $new_default) {
3443        my $addr = pack 'J', refaddr $self;
3444        $other_default{$addr} = $new_default;
3445    }
3446
3447    sub get_next_defaults($self) {
3448        # Iterates and returns the next class of defaults.
3449
3450        my $addr = pack 'J', refaddr $self;
3451        if ($iterator{$addr}++ < $class_defaults{$addr}->@*) {
3452            return $class_defaults{$addr}->[$iterator{$addr}-1]->@*;
3453        }
3454
3455        $iterator{$addr} = 0;
3456        return undef;
3457    }
3458}
3459
3460package Alias;
3461
3462# An alias is one of the names that a table goes by.  This class defines them
3463# including some attributes.  Everything is currently setup in the
3464# constructor.
3465
3466
3467{   # Closure
3468
3469    main::setup_package();
3470
3471    my %name;
3472    main::set_access('name', \%name, 'r');
3473
3474    my %loose_match;
3475    # Should this name match loosely or not.
3476    main::set_access('loose_match', \%loose_match, 'r');
3477
3478    my %make_re_pod_entry;
3479    # Some aliases should not get their own entries in the re section of the
3480    # pod, because they are covered by a wild-card, and some we want to
3481    # discourage use of.  Binary
3482    main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's');
3483
3484    my %ucd;
3485    # Is this documented to be accessible via Unicode::UCD
3486    main::set_access('ucd', \%ucd, 'r', 's');
3487
3488    my %status;
3489    # Aliases have a status, like deprecated, or even suppressed (which means
3490    # they don't appear in documentation).  Enum
3491    main::set_access('status', \%status, 'r');
3492
3493    my %ok_as_filename;
3494    # Similarly, some aliases should not be considered as usable ones for
3495    # external use, such as file names, or we don't want documentation to
3496    # recommend them.  Boolean
3497    main::set_access('ok_as_filename', \%ok_as_filename, 'r');
3498
3499    sub new {
3500        my $class = shift;
3501
3502        my $self = bless \do { my $anonymous_scalar }, $class;
3503        my $addr = pack 'J', refaddr $self;
3504
3505        $name{$addr} = shift;
3506        $loose_match{$addr} = shift;
3507        $make_re_pod_entry{$addr} = shift;
3508        $ok_as_filename{$addr} = shift;
3509        $status{$addr} = shift;
3510        $ucd{$addr} = shift;
3511
3512        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3513
3514        # Null names are never ok externally
3515        $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
3516
3517        return $self;
3518    }
3519}
3520
3521package Range;
3522
3523# A range is the basic unit for storing code points, and is described in the
3524# comments at the beginning of the program.  Each range has a starting code
3525# point; an ending code point (not less than the starting one); a value
3526# that applies to every code point in between the two end-points, inclusive;
3527# and an enum type that applies to the value.  The type is for the user's
3528# convenience, and has no meaning here, except that a non-zero type is
3529# considered to not obey the normal Unicode rules for having standard forms.
3530#
3531# The same structure is used for both map and match tables, even though in the
3532# latter, the value (and hence type) is irrelevant and could be used as a
3533# comment.  In map tables, the value is what all the code points in the range
3534# map to.  Type 0 values have the standardized version of the value stored as
3535# well, so as to not have to recalculate it a lot.
3536
3537sub trace { return main::trace(@_); }
3538
3539{   # Closure
3540
3541    main::setup_package();
3542
3543    my %start;
3544    main::set_access('start', \%start, 'r', 's');
3545
3546    my %end;
3547    main::set_access('end', \%end, 'r', 's');
3548
3549    my %value;
3550    main::set_access('value', \%value, 'r', 's');
3551
3552    my %type;
3553    main::set_access('type', \%type, 'r');
3554
3555    my %standard_form;
3556    # The value in internal standard form.  Defined only if the type is 0.
3557    main::set_access('standard_form', \%standard_form);
3558
3559    # Note that if these fields change, the dump() method should as well
3560
3561    sub new($class, $_addr, $_end, @_args) {
3562        my $self = bless \do { my $anonymous_scalar }, $class;
3563        my $addr = pack 'J', refaddr $self;
3564
3565        $start{$addr} = $_addr;
3566        $end{$addr}   = $_end;
3567
3568        my %args = @_args;
3569
3570        my $value = delete $args{'Value'};  # Can be 0
3571        $value = "" unless defined $value;
3572        $value{$addr} = $value;
3573
3574        $type{$addr} = delete $args{'Type'} || 0;
3575
3576        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3577
3578        return $self;
3579    }
3580
3581    use overload
3582        fallback => 0,
3583        qw("") => "_operator_stringify",
3584        "." => \&main::_operator_dot,
3585        ".=" => \&main::_operator_dot_equal,
3586    ;
3587
3588    sub _operator_stringify($self, $other="", $reversed=0) {
3589        my $addr = pack 'J', refaddr $self;
3590
3591        # Output it like '0041..0065 (value)'
3592        my $return = sprintf("%04X", $start{$addr})
3593                        .  '..'
3594                        . sprintf("%04X", $end{$addr});
3595        my $value = $value{$addr};
3596        my $type = $type{$addr};
3597        $return .= ' (';
3598        $return .= "$value";
3599        $return .= ", Type=$type" if $type != 0;
3600        $return .= ')';
3601
3602        return $return;
3603    }
3604
3605    sub standard_form($self) {
3606        # Calculate the standard form only if needed, and cache the result.
3607        # The standard form is the value itself if the type is special.
3608        # This represents a considerable CPU and memory saving - at the time
3609        # of writing there are 368676 non-special objects, but the standard
3610        # form is only requested for 22047 of them - ie about 6%.
3611
3612        my $addr = pack 'J', refaddr $self;
3613
3614        return $standard_form{$addr} if defined $standard_form{$addr};
3615
3616        my $value = $value{$addr};
3617        return $value if $type{$addr};
3618        return $standard_form{$addr} = main::standardize($value);
3619    }
3620
3621    sub dump($self, $indent) {
3622        # Human, not machine readable.  For machine readable, comment out this
3623        # entire routine and let the standard one take effect.
3624        my $addr = pack 'J', refaddr $self;
3625
3626        my $return = $indent
3627                    . sprintf("%04X", $start{$addr})
3628                    . '..'
3629                    . sprintf("%04X", $end{$addr})
3630                    . " '$value{$addr}';";
3631        if (! defined $standard_form{$addr}) {
3632            $return .= "(type=$type{$addr})";
3633        }
3634        elsif ($standard_form{$addr} ne $value{$addr}) {
3635            $return .= "(standard '$standard_form{$addr}')";
3636        }
3637        return $return;
3638    }
3639} # End closure
3640
3641package _Range_List_Base;
3642
3643# Base class for range lists.  A range list is simply an ordered list of
3644# ranges, so that the ranges with the lowest starting numbers are first in it.
3645#
3646# When a new range is added that is adjacent to an existing range that has the
3647# same value and type, it merges with it to form a larger range.
3648#
3649# Ranges generally do not overlap, except that there can be multiple entries
3650# of single code point ranges.  This is because of NameAliases.txt.
3651#
3652# In this program, there is a standard value such that if two different
3653# values, have the same standard value, they are considered equivalent.  This
3654# value was chosen so that it gives correct results on Unicode data
3655
3656# There are a number of methods to manipulate range lists, and some operators
3657# are overloaded to handle them.
3658
3659sub trace { return main::trace(@_); }
3660
3661{ # Closure
3662
3663    our $addr;
3664
3665    # Max is initialized to a negative value that isn't adjacent to 0, for
3666    # simpler tests
3667    my $max_init = -2;
3668
3669    main::setup_package();
3670
3671    my %ranges;
3672    # The list of ranges
3673    main::set_access('ranges', \%ranges, 'readable_array');
3674
3675    my %max;
3676    # The highest code point in the list.  This was originally a method, but
3677    # actual measurements said it was used a lot.
3678    main::set_access('max', \%max, 'r');
3679
3680    my %each_range_iterator;
3681    # Iterator position for each_range()
3682    main::set_access('each_range_iterator', \%each_range_iterator);
3683
3684    my %owner_name_of;
3685    # Name of parent this is attached to, if any.  Solely for better error
3686    # messages.
3687    main::set_access('owner_name_of', \%owner_name_of, 'p_r');
3688
3689    my %_search_ranges_cache;
3690    # A cache of the previous result from _search_ranges(), for better
3691    # performance
3692    main::set_access('_search_ranges_cache', \%_search_ranges_cache);
3693
3694    sub new {
3695        my $class = shift;
3696        my %args = @_;
3697
3698        # Optional initialization data for the range list.  NOTE: For large
3699        # ranges, it is better to use Range object rather than
3700        #   [ low .. high ]
3701        # as it iterates through each one individually in the latter case.
3702        my $initialize = delete $args{'Initialize'};
3703
3704        my $self;
3705
3706        # Use _union() to initialize.  _union() returns an object of this
3707        # class, which means that it will call this constructor recursively.
3708        # But it won't have this $initialize parameter so that it won't
3709        # infinitely loop on this.
3710        return _union($class, $initialize, %args) if defined $initialize;
3711
3712        $self = bless \do { my $anonymous_scalar }, $class;
3713        my $addr = pack 'J', refaddr $self;
3714
3715        # Optional parent object, only for debug info.
3716        $owner_name_of{$addr} = delete $args{'Owner'};
3717        $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
3718
3719        # Stringify, in case it is an object.
3720        $owner_name_of{$addr} = "$owner_name_of{$addr}";
3721
3722        # This is used only for error messages, and so a colon is added
3723        $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
3724
3725        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3726
3727        $max{$addr} = $max_init;
3728
3729        $_search_ranges_cache{$addr} = 0;
3730        $ranges{$addr} = [];
3731
3732        return $self;
3733    }
3734
3735    use overload
3736        fallback => 0,
3737        qw("") => "_operator_stringify",
3738        "." => \&main::_operator_dot,
3739        ".=" => \&main::_operator_dot_equal,
3740    ;
3741
3742    sub _operator_stringify($self, $other="", $reversed=0) {
3743        my $addr = pack 'J', refaddr $self;
3744
3745        return "Range_List attached to '$owner_name_of{$addr}'"
3746                                                if $owner_name_of{$addr};
3747        return "anonymous Range_List " . \$self;
3748    }
3749
3750    sub _union {
3751        # Returns the union of the input code points.  It can be called as
3752        # either a constructor or a method.  If called as a method, the result
3753        # will be a new() instance of the calling object, containing the union
3754        # of that object with the other parameter's code points;  if called as
3755        # a constructor, the first parameter gives the class that the new object
3756        # should be, and the second parameter gives the code points to go into
3757        # it.
3758        # In either case, there are two parameters looked at by this routine;
3759        # any additional parameters are passed to the new() constructor.
3760        #
3761        # The code points can come in the form of some object that contains
3762        # ranges, and has a conventionally named method to access them; or
3763        # they can be an array of individual code points (as integers); or
3764        # just a single code point.
3765        #
3766        # If they are ranges, this routine doesn't make any effort to preserve
3767        # the range values and types of one input over the other.  Therefore
3768        # this base class should not allow _union to be called from other than
3769        # initialization code, so as to prevent two tables from being added
3770        # together where the range values matter.  The general form of this
3771        # routine therefore belongs in a derived class, but it was moved here
3772        # to avoid duplication of code.  The failure to overload this in this
3773        # class keeps it safe.
3774        #
3775        # It does make the effort during initialization to accept tables with
3776        # multiple values for the same code point, and to preserve the order
3777        # of these.  If there is only one input range or range set, it doesn't
3778        # sort (as it should already be sorted to the desired order), and will
3779        # accept multiple values per code point.  Otherwise it will merge
3780        # multiple values into a single one.
3781
3782        my $self;
3783        my @args;   # Arguments to pass to the constructor
3784
3785        my $class = shift;
3786
3787        # If a method call, will start the union with the object itself, and
3788        # the class of the new object will be the same as self.
3789        if (ref $class) {
3790            $self = $class;
3791            $class = ref $self;
3792            push @args, $self;
3793        }
3794
3795        # Add the other required parameter.
3796        push @args, shift;
3797        # Rest of parameters are passed on to the constructor
3798
3799        # Accumulate all records from both lists.
3800        my @records;
3801        my $input_count = 0;
3802        for my $arg (@args) {
3803            #local $to_trace = 0 if main::DEBUG;
3804            trace "argument = $arg" if main::DEBUG && $to_trace;
3805            if (! defined $arg) {
3806                my $message = "";
3807                if (defined $self) {
3808                    $message .= $owner_name_of{pack 'J', refaddr $self};
3809                }
3810                Carp::my_carp_bug($message . "Undefined argument to _union.  No union done.");
3811                return;
3812            }
3813
3814            $arg = [ $arg ] if ! ref $arg;
3815            my $type = ref $arg;
3816            if ($type eq 'ARRAY') {
3817                foreach my $element (@$arg) {
3818                    push @records, Range->new($element, $element);
3819                    $input_count++;
3820                }
3821            }
3822            elsif ($arg->isa('Range')) {
3823                push @records, $arg;
3824                $input_count++;
3825            }
3826            elsif ($arg->can('ranges')) {
3827                push @records, $arg->ranges;
3828                $input_count++;
3829            }
3830            else {
3831                my $message = "";
3832                if (defined $self) {
3833                    $message .= $owner_name_of{pack 'J', refaddr $self};
3834                }
3835                Carp::my_carp_bug($message . "Cannot take the union of a $type.  No union done.");
3836                return;
3837            }
3838        }
3839
3840        # Sort with the range containing the lowest ordinal first, but if
3841        # two ranges start at the same code point, sort with the bigger range
3842        # of the two first, because it takes fewer cycles.
3843        if ($input_count > 1) {
3844            @records = sort { ($a->start <=> $b->start)
3845                                      or
3846                                    # if b is shorter than a, b->end will be
3847                                    # less than a->end, and we want to select
3848                                    # a, so want to return -1
3849                                    ($b->end <=> $a->end)
3850                                   } @records;
3851        }
3852
3853        my $new = $class->new(@_);
3854
3855        # Fold in records so long as they add new information.
3856        for my $set (@records) {
3857            my $start = $set->start;
3858            my $end   = $set->end;
3859            my $value = $set->value;
3860            my $type  = $set->type;
3861            if ($start > $new->max) {
3862                $new->_add_delete('+', $start, $end, $value, Type => $type);
3863            }
3864            elsif ($end > $new->max) {
3865                $new->_add_delete('+', $new->max +1, $end, $value,
3866                                                                Type => $type);
3867            }
3868            elsif ($input_count == 1) {
3869                # Here, overlaps existing range, but is from a single input,
3870                # so preserve the multiple values from that input.
3871                $new->_add_delete('+', $start, $end, $value, Type => $type,
3872                                                Replace => $MULTIPLE_AFTER);
3873            }
3874        }
3875
3876        return $new;
3877    }
3878
3879    sub range_count($self) {        # Return the number of ranges in the range list
3880        return scalar @{$ranges{pack 'J', refaddr $self}};
3881    }
3882
3883    sub min($self) {
3884        # Returns the minimum code point currently in the range list, or if
3885        # the range list is empty, 2 beyond the max possible.  This is a
3886        # method because used so rarely, that not worth saving between calls,
3887        # and having to worry about changing it as ranges are added and
3888        # deleted.
3889
3890        my $addr = pack 'J', refaddr $self;
3891
3892        # If the range list is empty, return a large value that isn't adjacent
3893        # to any that could be in the range list, for simpler tests
3894        return $MAX_WORKING_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
3895        return $ranges{$addr}->[0]->start;
3896    }
3897
3898    sub contains($self, $codepoint) {
3899        # Boolean: Is argument in the range list?  If so returns $i such that:
3900        #   range[$i]->end < $codepoint <= range[$i+1]->end
3901        # which is one beyond what you want; this is so that the 0th range
3902        # doesn't return false
3903
3904        my $i = $self->_search_ranges($codepoint);
3905        return 0 unless defined $i;
3906
3907        # The search returns $i, such that
3908        #   range[$i-1]->end < $codepoint <= range[$i]->end
3909        # So is in the table if and only iff it is at least the start position
3910        # of range $i.
3911        return 0 if $ranges{pack 'J', refaddr $self}->[$i]->start > $codepoint;
3912        return $i + 1;
3913    }
3914
3915    sub containing_range($self, $codepoint) {
3916        # Returns the range object that contains the code point, undef if none
3917        my $i = $self->contains($codepoint);
3918        return unless $i;
3919
3920        # contains() returns 1 beyond where we should look
3921        return $ranges{pack 'J', refaddr $self}->[$i-1];
3922    }
3923
3924    sub value_of($self, $codepoint) {
3925        # Returns the value associated with the code point, undef if none
3926        my $range = $self->containing_range($codepoint);
3927        return unless defined $range;
3928
3929        return $range->value;
3930    }
3931
3932    sub type_of($self, $codepoint) {
3933        # Returns the type of the range containing the code point, undef if
3934        # the code point is not in the table
3935        my $range = $self->containing_range($codepoint);
3936        return unless defined $range;
3937
3938        return $range->type;
3939    }
3940
3941    sub _search_ranges($self, $code_point) {
3942        # Find the range in the list which contains a code point, or where it
3943        # should go if were to add it.  That is, it returns $i, such that:
3944        #   range[$i-1]->end < $codepoint <= range[$i]->end
3945        # Returns undef if no such $i is possible (e.g. at end of table), or
3946        # if there is an error.
3947        my $addr = pack 'J', refaddr $self;
3948
3949        return if $code_point > $max{$addr};
3950        my $r = $ranges{$addr};                # The current list of ranges
3951        my $range_list_size = scalar @$r;
3952        my $i;
3953
3954        use integer;        # want integer division
3955
3956        # Use the cached result as the starting guess for this one, because,
3957        # an experiment on 5.1 showed that 90% of the time the cache was the
3958        # same as the result on the next call (and 7% it was one less).
3959        $i = $_search_ranges_cache{$addr};
3960        $i = 0 if $i >= $range_list_size;   # Reset if no longer valid (prob.
3961                                            # from an intervening deletion
3962        #local $to_trace = 1 if main::DEBUG;
3963        trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
3964        return $i if $code_point <= $r->[$i]->end
3965                     && ($i == 0 || $r->[$i-1]->end < $code_point);
3966
3967        # Here the cache doesn't yield the correct $i.  Try adding 1.
3968        if ($i < $range_list_size - 1
3969            && $r->[$i]->end < $code_point &&
3970            $code_point <= $r->[$i+1]->end)
3971        {
3972            $i++;
3973            trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
3974            $_search_ranges_cache{$addr} = $i;
3975            return $i;
3976        }
3977
3978        # Here, adding 1 also didn't work.  We do a binary search to
3979        # find the correct position, starting with current $i
3980        my $lower = 0;
3981        my $upper = $range_list_size - 1;
3982        while (1) {
3983            trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
3984
3985            if ($code_point <= $r->[$i]->end) {
3986
3987                # Here we have met the upper constraint.  We can quit if we
3988                # also meet the lower one.
3989                last if $i == 0 || $r->[$i-1]->end < $code_point;
3990
3991                $upper = $i;        # Still too high.
3992
3993            }
3994            else {
3995
3996                # Here, $r[$i]->end < $code_point, so look higher up.
3997                $lower = $i;
3998            }
3999
4000            # Split search domain in half to try again.
4001            my $temp = ($upper + $lower) / 2;
4002
4003            # No point in continuing unless $i changes for next time
4004            # in the loop.
4005            if ($temp == $i) {
4006
4007                # We can't reach the highest element because of the averaging.
4008                # So if one below the upper edge, force it there and try one
4009                # more time.
4010                if ($i == $range_list_size - 2) {
4011
4012                    trace "Forcing to upper edge" if main::DEBUG && $to_trace;
4013                    $i = $range_list_size - 1;
4014
4015                    # Change $lower as well so if fails next time through,
4016                    # taking the average will yield the same $i, and we will
4017                    # quit with the error message just below.
4018                    $lower = $i;
4019                    next;
4020                }
4021                Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go.  No action taken.");
4022                return;
4023            }
4024            $i = $temp;
4025        } # End of while loop
4026
4027        if (main::DEBUG && $to_trace) {
4028            trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
4029            trace "i=  [ $i ]", $r->[$i];
4030            trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
4031        }
4032
4033        # Here we have found the offset.  Cache it as a starting point for the
4034        # next call.
4035        $_search_ranges_cache{$addr} = $i;
4036        return $i;
4037    }
4038
4039    sub _add_delete {
4040        # Add, replace or delete ranges to or from a list.  The $type
4041        # parameter gives which:
4042        #   '+' => insert or replace a range, returning a list of any changed
4043        #          ranges.
4044        #   '-' => delete a range, returning a list of any deleted ranges.
4045        #
4046        # The next three parameters give respectively the start, end, and
4047        # value associated with the range.  'value' should be null unless the
4048        # operation is '+';
4049        #
4050        # The range list is kept sorted so that the range with the lowest
4051        # starting position is first in the list, and generally, adjacent
4052        # ranges with the same values are merged into a single larger one (see
4053        # exceptions below).
4054        #
4055        # There are more parameters; all are key => value pairs:
4056        #   Type    gives the type of the value.  It is only valid for '+'.
4057        #           All ranges have types; if this parameter is omitted, 0 is
4058        #           assumed.  Ranges with type 0 are assumed to obey the
4059        #           Unicode rules for casing, etc; ranges with other types are
4060        #           not.  Otherwise, the type is arbitrary, for the caller's
4061        #           convenience, and looked at only by this routine to keep
4062        #           adjacent ranges of different types from being merged into
4063        #           a single larger range, and when Replace =>
4064        #           $IF_NOT_EQUIVALENT is specified (see just below).
4065        #   Replace  determines what to do if the range list already contains
4066        #            ranges which coincide with all or portions of the input
4067        #            range.  It is only valid for '+':
4068        #       => $NO            means that the new value is not to replace
4069        #                         any existing ones, but any empty gaps of the
4070        #                         range list coinciding with the input range
4071        #                         will be filled in with the new value.
4072        #       => $UNCONDITIONALLY  means to replace the existing values with
4073        #                         this one unconditionally.  However, if the
4074        #                         new and old values are identical, the
4075        #                         replacement is skipped to save cycles
4076        #       => $IF_NOT_EQUIVALENT means to replace the existing values
4077        #          (the default)  with this one if they are not equivalent.
4078        #                         Ranges are equivalent if their types are the
4079        #                         same, and they are the same string; or if
4080        #                         both are type 0 ranges, if their Unicode
4081        #                         standard forms are identical.  In this last
4082        #                         case, the routine chooses the more "modern"
4083        #                         one to use.  This is because some of the
4084        #                         older files are formatted with values that
4085        #                         are, for example, ALL CAPs, whereas the
4086        #                         derived files have a more modern style,
4087        #                         which looks better.  By looking for this
4088        #                         style when the pre-existing and replacement
4089        #                         standard forms are the same, we can move to
4090        #                         the modern style
4091        #       => $MULTIPLE_BEFORE means that if this range duplicates an
4092        #                         existing one, but has a different value,
4093        #                         don't replace the existing one, but insert
4094        #                         this one so that the same range can occur
4095        #                         multiple times.  They are stored LIFO, so
4096        #                         that the final one inserted is the first one
4097        #                         returned in an ordered search of the table.
4098        #                         If this is an exact duplicate, including the
4099        #                         value, the original will be moved to be
4100        #                         first, before any other duplicate ranges
4101        #                         with different values.
4102        #       => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
4103        #                         FIFO, so that this one is inserted after all
4104        #                         others that currently exist.  If this is an
4105        #                         exact duplicate, including value, of an
4106        #                         existing range, this one is discarded
4107        #                         (leaving the existing one in its original,
4108        #                         higher priority position
4109        #       => $CROAK         Die with an error if is already there
4110        #       => anything else  is the same as => $IF_NOT_EQUIVALENT
4111        #
4112        # "same value" means identical for non-type-0 ranges, and it means
4113        # having the same standard forms for type-0 ranges.
4114
4115        return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
4116
4117        my $self = shift;
4118        my $operation = shift;   # '+' for add/replace; '-' for delete;
4119        my $start = shift;
4120        my $end   = shift;
4121        my $value = shift;
4122
4123        my %args = @_;
4124
4125        $value = "" if not defined $value;        # warning: $value can be "0"
4126
4127        my $replace = delete $args{'Replace'};
4128        $replace = $IF_NOT_EQUIVALENT unless defined $replace;
4129
4130        my $type = delete $args{'Type'};
4131        $type = 0 unless defined $type;
4132
4133        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4134
4135        my $addr = pack 'J', refaddr $self;
4136
4137        if ($operation ne '+' && $operation ne '-') {
4138            Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'.  No action taken.");
4139            return;
4140        }
4141        unless (defined $start && defined $end) {
4142            Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete.  No action taken.");
4143            return;
4144        }
4145        unless ($end >= $start) {
4146            Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . ").  No action taken.");
4147            return;
4148        }
4149        #local $to_trace = 1 if main::DEBUG;
4150
4151        if ($operation eq '-') {
4152            if ($replace != $IF_NOT_EQUIVALENT) {
4153                Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list.  Assuming Replace => \$IF_NOT_EQUIVALENT.");
4154                $replace = $IF_NOT_EQUIVALENT;
4155            }
4156            if ($type) {
4157                Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list.  Assuming Type => 0.");
4158                $type = 0;
4159            }
4160            if ($value ne "") {
4161                Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list.  Assuming Value => \"\".");
4162                $value = "";
4163            }
4164        }
4165
4166        my $r = $ranges{$addr};               # The current list of ranges
4167        my $range_list_size = scalar @$r;     # And its size
4168        my $max = $max{$addr};                # The current high code point in
4169                                              # the list of ranges
4170
4171        # Do a special case requiring fewer machine cycles when the new range
4172        # starts after the current highest point.  The Unicode input data is
4173        # structured so this is common.
4174        if ($start > $max) {
4175
4176            trace "$owner_name_of{$addr} $operation", sprintf("%04X..%04X (%s) type=%d; prev max=%04X", $start, $end, $value, $type, $max) if main::DEBUG && $to_trace;
4177            return if $operation eq '-'; # Deleting a non-existing range is a
4178                                         # no-op
4179
4180            # If the new range doesn't logically extend the current final one
4181            # in the range list, create a new range at the end of the range
4182            # list.  (max cleverly is initialized to a negative number not
4183            # adjacent to 0 if the range list is empty, so even adding a range
4184            # to an empty range list starting at 0 will have this 'if'
4185            # succeed.)
4186            if ($start > $max + 1        # non-adjacent means can't extend.
4187                || @{$r}[-1]->value ne $value # values differ, can't extend.
4188                || @{$r}[-1]->type != $type # types differ, can't extend.
4189            ) {
4190                push @$r, Range->new($start, $end,
4191                                     Value => $value,
4192                                     Type => $type);
4193            }
4194            else {
4195
4196                # Here, the new range starts just after the current highest in
4197                # the range list, and they have the same type and value.
4198                # Extend the existing range to incorporate the new one.
4199                @{$r}[-1]->set_end($end);
4200            }
4201
4202            # This becomes the new maximum.
4203            $max{$addr} = $end;
4204
4205            return;
4206        }
4207        #local $to_trace = 0 if main::DEBUG;
4208
4209        trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
4210
4211        # Here, the input range isn't after the whole rest of the range list.
4212        # Most likely 'splice' will be needed.  The rest of the routine finds
4213        # the needed splice parameters, and if necessary, does the splice.
4214        # First, find the offset parameter needed by the splice function for
4215        # the input range.  Note that the input range may span multiple
4216        # existing ones, but we'll worry about that later.  For now, just find
4217        # the beginning.  If the input range is to be inserted starting in a
4218        # position not currently in the range list, it must (obviously) come
4219        # just after the range below it, and just before the range above it.
4220        # Slightly less obviously, it will occupy the position currently
4221        # occupied by the range that is to come after it.  More formally, we
4222        # are looking for the position, $i, in the array of ranges, such that:
4223        #
4224        # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
4225        #
4226        # (The ordered relationships within existing ranges are also shown in
4227        # the equation above).  However, if the start of the input range is
4228        # within an existing range, the splice offset should point to that
4229        # existing range's position in the list; that is $i satisfies a
4230        # somewhat different equation, namely:
4231        #
4232        #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
4233        #
4234        # More briefly, $start can come before or after r[$i]->start, and at
4235        # this point, we don't know which it will be.  However, these
4236        # two equations share these constraints:
4237        #
4238        #   r[$i-1]->end < $start <= r[$i]->end
4239        #
4240        # And that is good enough to find $i.
4241
4242        my $i = $self->_search_ranges($start);
4243        if (! defined $i) {
4244            Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined.  Operation '$operation' not performed");
4245            return;
4246        }
4247
4248        # The search function returns $i such that:
4249        #
4250        # r[$i-1]->end < $start <= r[$i]->end
4251        #
4252        # That means that $i points to the first range in the range list
4253        # that could possibly be affected by this operation.  We still don't
4254        # know if the start of the input range is within r[$i], or if it
4255        # points to empty space between r[$i-1] and r[$i].
4256        trace "[$i] is the beginning splice point.  Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
4257
4258        # Special case the insertion of data that is not to replace any
4259        # existing data.
4260        if ($replace == $NO) {  # If $NO, has to be operation '+'
4261            #local $to_trace = 1 if main::DEBUG;
4262            trace "Doesn't replace" if main::DEBUG && $to_trace;
4263
4264            # Here, the new range is to take effect only on those code points
4265            # that aren't already in an existing range.  This can be done by
4266            # looking through the existing range list and finding the gaps in
4267            # the ranges that this new range affects, and then calling this
4268            # function recursively on each of those gaps, leaving untouched
4269            # anything already in the list.  Gather up a list of the changed
4270            # gaps first so that changes to the internal state as new ranges
4271            # are added won't be a problem.
4272            my @gap_list;
4273
4274            # First, if the starting point of the input range is outside an
4275            # existing one, there is a gap from there to the beginning of the
4276            # existing range -- add a span to fill the part that this new
4277            # range occupies
4278            if ($start < $r->[$i]->start) {
4279                push @gap_list, Range->new($start,
4280                                           main::min($end,
4281                                                     $r->[$i]->start - 1),
4282                                           Type => $type);
4283                trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
4284            }
4285
4286            # Then look through the range list for other gaps until we reach
4287            # the highest range affected by the input one.
4288            my $j;
4289            for ($j = $i+1; $j < $range_list_size; $j++) {
4290                trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
4291                last if $end < $r->[$j]->start;
4292
4293                # If there is a gap between when this range starts and the
4294                # previous one ends, add a span to fill it.  Note that just
4295                # because there are two ranges doesn't mean there is a
4296                # non-zero gap between them.  It could be that they have
4297                # different values or types
4298                if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
4299                    push @gap_list,
4300                        Range->new($r->[$j-1]->end + 1,
4301                                   $r->[$j]->start - 1,
4302                                   Type => $type);
4303                    trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
4304                }
4305            }
4306
4307            # Here, we have either found an existing range in the range list,
4308            # beyond the area affected by the input one, or we fell off the
4309            # end of the loop because the input range affects the whole rest
4310            # of the range list.  In either case, $j is 1 higher than the
4311            # highest affected range.  If $j == $i, it means that there are no
4312            # affected ranges, that the entire insertion is in the gap between
4313            # r[$i-1], and r[$i], which we already have taken care of before
4314            # the loop.
4315            # On the other hand, if there are affected ranges, it might be
4316            # that there is a gap that needs filling after the final such
4317            # range to the end of the input range
4318            if ($r->[$j-1]->end < $end) {
4319                    push @gap_list, Range->new(main::max($start,
4320                                                         $r->[$j-1]->end + 1),
4321                                               $end,
4322                                               Type => $type);
4323                    trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
4324            }
4325
4326            # Call recursively to fill in all the gaps.
4327            foreach my $gap (@gap_list) {
4328                $self->_add_delete($operation,
4329                                   $gap->start,
4330                                   $gap->end,
4331                                   $value,
4332                                   Type => $type);
4333            }
4334
4335            return;
4336        }
4337
4338        # Here, we have taken care of the case where $replace is $NO.
4339        # Remember that here, r[$i-1]->end < $start <= r[$i]->end
4340        # If inserting a multiple record, this is where it goes, before the
4341        # first (if any) existing one if inserting LIFO.  (If this is to go
4342        # afterwards, FIFO, we below move the pointer to there.)  These imply
4343        # an insertion, and no change to any existing ranges.  Note that $i
4344        # can be -1 if this new range doesn't actually duplicate any existing,
4345        # and comes at the beginning of the list.
4346        if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) {
4347
4348            if ($start != $end) {
4349                Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point.  No action taken.");
4350                return;
4351            }
4352
4353            # If the new code point is within a current range ...
4354            if ($end >= $r->[$i]->start) {
4355
4356                # Don't add an exact duplicate, as it isn't really a multiple
4357                my $existing_value = $r->[$i]->value;
4358                my $existing_type = $r->[$i]->type;
4359                return if $value eq $existing_value && $type eq $existing_type;
4360
4361                # If the multiple value is part of an existing range, we want
4362                # to split up that range, so that only the single code point
4363                # is affected.  To do this, we first call ourselves
4364                # recursively to delete that code point from the table, having
4365                # preserved its current data above.  Then we call ourselves
4366                # recursively again to add the new multiple, which we know by
4367                # the test just above is different than the current code
4368                # point's value, so it will become a range containing a single
4369                # code point: just itself.  Finally, we add back in the
4370                # pre-existing code point, which will again be a single code
4371                # point range.  Because 'i' likely will have changed as a
4372                # result of these operations, we can't just continue on, but
4373                # do this operation recursively as well.  If we are inserting
4374                # LIFO, the pre-existing code point needs to go after the new
4375                # one, so use MULTIPLE_AFTER; and vice versa.
4376                if ($r->[$i]->start != $r->[$i]->end) {
4377                    $self->_add_delete('-', $start, $end, "");
4378                    $self->_add_delete('+', $start, $end, $value, Type => $type);
4379                    return $self->_add_delete('+',
4380                            $start, $end,
4381                            $existing_value,
4382                            Type => $existing_type,
4383                            Replace => ($replace == $MULTIPLE_BEFORE)
4384                                       ? $MULTIPLE_AFTER
4385                                       : $MULTIPLE_BEFORE);
4386                }
4387            }
4388
4389            # If to place this new record after, move to beyond all existing
4390            # ones; but don't add this one if identical to any of them, as it
4391            # isn't really a multiple.  This leaves the original order, so
4392            # that the current request is ignored.  The reasoning is that the
4393            # previous request that wanted this record to have high priority
4394            # should have precedence.
4395            if ($replace == $MULTIPLE_AFTER) {
4396                while ($i < @$r && $r->[$i]->start == $start) {
4397                    return if $value eq $r->[$i]->value
4398                              && $type eq $r->[$i]->type;
4399                    $i++;
4400                }
4401            }
4402            else {
4403                # If instead we are to place this new record before any
4404                # existing ones, remove any identical ones that come after it.
4405                # This changes the existing order so that the new one is
4406                # first, as is being requested.
4407                for (my $j = $i + 1;
4408                     $j < @$r && $r->[$j]->start == $start;
4409                     $j++)
4410                {
4411                    if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) {
4412                        splice @$r, $j, 1;
4413                        last;   # There should only be one instance, so no
4414                                # need to keep looking
4415                    }
4416                }
4417            }
4418
4419            trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
4420            my @return = splice @$r,
4421                                $i,
4422                                0,
4423                                Range->new($start,
4424                                           $end,
4425                                           Value => $value,
4426                                           Type => $type);
4427            if (main::DEBUG && $to_trace) {
4428                trace "After splice:";
4429                trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4430                trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4431                trace "i  =[", $i, "]", $r->[$i] if $i >= 0;
4432                trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4433                trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4434                trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
4435            }
4436            return @return;
4437        }
4438
4439        # Here, we have taken care of $NO and $MULTIPLE_foo replaces.  This
4440        # leaves delete, insert, and replace either unconditionally or if not
4441        # equivalent.  $i still points to the first potential affected range.
4442        # Now find the highest range affected, which will determine the length
4443        # parameter to splice.  (The input range can span multiple existing
4444        # ones.)  If this isn't a deletion, while we are looking through the
4445        # range list, see also if this is a replacement rather than a clean
4446        # insertion; that is if it will change the values of at least one
4447        # existing range.  Start off assuming it is an insert, until find it
4448        # isn't.
4449        my $clean_insert = $operation eq '+';
4450        my $j;        # This will point to the highest affected range
4451
4452        # For non-zero types, the standard form is the value itself;
4453        my $standard_form = ($type) ? $value : main::standardize($value);
4454
4455        for ($j = $i; $j < $range_list_size; $j++) {
4456            trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
4457
4458            # If find a range that it doesn't overlap into, we can stop
4459            # searching
4460            last if $end < $r->[$j]->start;
4461
4462            # Here, overlaps the range at $j.  If the values don't match,
4463            # and so far we think this is a clean insertion, it becomes a
4464            # non-clean insertion, i.e., a 'change' or 'replace' instead.
4465            if ($clean_insert) {
4466                if ($r->[$j]->standard_form ne $standard_form) {
4467                    $clean_insert = 0;
4468                    if ($replace == $CROAK) {
4469                        main::croak("The range to add "
4470                        . sprintf("%04X", $start)
4471                        . '-'
4472                        . sprintf("%04X", $end)
4473                        . " with value '$value' overlaps an existing range $r->[$j]");
4474                    }
4475                }
4476                else {
4477
4478                    # Here, the two values are essentially the same.  If the
4479                    # two are actually identical, replacing wouldn't change
4480                    # anything so skip it.
4481                    my $pre_existing = $r->[$j]->value;
4482                    if ($pre_existing ne $value) {
4483
4484                        # Here the new and old standardized values are the
4485                        # same, but the non-standardized values aren't.  If
4486                        # replacing unconditionally, then replace
4487                        if( $replace == $UNCONDITIONALLY) {
4488                            $clean_insert = 0;
4489                        }
4490                        else {
4491
4492                            # Here, are replacing conditionally.  Decide to
4493                            # replace or not based on which appears to look
4494                            # the "nicest".  If one is mixed case and the
4495                            # other isn't, choose the mixed case one.
4496                            my $new_mixed = $value =~ /[A-Z]/
4497                                            && $value =~ /[a-z]/;
4498                            my $old_mixed = $pre_existing =~ /[A-Z]/
4499                                            && $pre_existing =~ /[a-z]/;
4500
4501                            if ($old_mixed != $new_mixed) {
4502                                $clean_insert = 0 if $new_mixed;
4503                                if (main::DEBUG && $to_trace) {
4504                                    if ($clean_insert) {
4505                                        trace "Retaining $pre_existing over $value";
4506                                    }
4507                                    else {
4508                                        trace "Replacing $pre_existing with $value";
4509                                    }
4510                                }
4511                            }
4512                            else {
4513
4514                                # Here casing wasn't different between the two.
4515                                # If one has hyphens or underscores and the
4516                                # other doesn't, choose the one with the
4517                                # punctuation.
4518                                my $new_punct = $value =~ /[-_]/;
4519                                my $old_punct = $pre_existing =~ /[-_]/;
4520
4521                                if ($old_punct != $new_punct) {
4522                                    $clean_insert = 0 if $new_punct;
4523                                    if (main::DEBUG && $to_trace) {
4524                                        if ($clean_insert) {
4525                                            trace "Retaining $pre_existing over $value";
4526                                        }
4527                                        else {
4528                                            trace "Replacing $pre_existing with $value";
4529                                        }
4530                                    }
4531                                }   # else existing one is just as "good";
4532                                    # retain it to save cycles.
4533                            }
4534                        }
4535                    }
4536                }
4537            }
4538        } # End of loop looking for highest affected range.
4539
4540        # Here, $j points to one beyond the highest range that this insertion
4541        # affects (hence to beyond the range list if that range is the final
4542        # one in the range list).
4543
4544        # The splice length is all the affected ranges.  Get it before
4545        # subtracting, for efficiency, so we don't have to later add 1.
4546        my $length = $j - $i;
4547
4548        $j--;        # $j now points to the highest affected range.
4549        trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
4550
4551        # Here, have taken care of $NO and $MULTIPLE_foo replaces.
4552        # $j points to the highest affected range.  But it can be < $i or even
4553        # -1.  These happen only if the insertion is entirely in the gap
4554        # between r[$i-1] and r[$i].  Here's why: j < i means that the j loop
4555        # above exited first time through with $end < $r->[$i]->start.  (And
4556        # then we subtracted one from j)  This implies also that $start <
4557        # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
4558        # $start, so the entire input range is in the gap.
4559        if ($j < $i) {
4560
4561            # Here the entire input range is in the gap before $i.
4562
4563            if (main::DEBUG && $to_trace) {
4564                if ($i) {
4565                    trace "Entire range is between $r->[$i-1] and $r->[$i]";
4566                }
4567                else {
4568                    trace "Entire range is before $r->[$i]";
4569                }
4570            }
4571            return if $operation ne '+'; # Deletion of a non-existent range is
4572                                         # a no-op
4573        }
4574        else {
4575
4576            # Here part of the input range is not in the gap before $i.  Thus,
4577            # there is at least one affected one, and $j points to the highest
4578            # such one.
4579
4580            # At this point, here is the situation:
4581            # This is not an insertion of a multiple, nor of tentative ($NO)
4582            # data.
4583            #   $i  points to the first element in the current range list that
4584            #            may be affected by this operation.  In fact, we know
4585            #            that the range at $i is affected because we are in
4586            #            the else branch of this 'if'
4587            #   $j  points to the highest affected range.
4588            # In other words,
4589            #   r[$i-1]->end < $start <= r[$i]->end
4590            # And:
4591            #   r[$i-1]->end < $start <= $end < r[$j+1]->start
4592            #
4593            # Also:
4594            #   $clean_insert is a boolean which is set true if and only if
4595            #        this is a "clean insertion", i.e., not a change nor a
4596            #        deletion (multiple was handled above).
4597
4598            # We now have enough information to decide if this call is a no-op
4599            # or not.  It is a no-op if this is an insertion of already
4600            # existing data.  To be so, it must be contained entirely in one
4601            # range.
4602
4603            if (main::DEBUG && $to_trace && $clean_insert
4604                                         && $start >= $r->[$i]->start
4605                                         && $end   <= $r->[$i]->end)
4606            {
4607                    trace "no-op";
4608            }
4609            return if $clean_insert
4610                      && $start >= $r->[$i]->start
4611                      && $end   <= $r->[$i]->end;
4612        }
4613
4614        # Here, we know that some action will have to be taken.  We have
4615        # calculated the offset and length (though adjustments may be needed)
4616        # for the splice.  Now start constructing the replacement list.
4617        my @replacement;
4618        my $splice_start = $i;
4619
4620        my $extends_below;
4621        my $extends_above;
4622
4623        # See if should extend any adjacent ranges.
4624        if ($operation eq '-') { # Don't extend deletions
4625            $extends_below = $extends_above = 0;
4626        }
4627        else {  # Here, should extend any adjacent ranges.  See if there are
4628                # any.
4629            $extends_below = ($i > 0
4630                            # can't extend unless adjacent
4631                            && $r->[$i-1]->end == $start -1
4632                            # can't extend unless are same standard value
4633                            && $r->[$i-1]->standard_form eq $standard_form
4634                            # can't extend unless share type
4635                            && $r->[$i-1]->type == $type);
4636            $extends_above = ($j+1 < $range_list_size
4637                            && $r->[$j+1]->start == $end +1
4638                            && $r->[$j+1]->standard_form eq $standard_form
4639                            && $r->[$j+1]->type == $type);
4640        }
4641        if ($extends_below && $extends_above) { # Adds to both
4642            $splice_start--;     # start replace at element below
4643            $length += 2;        # will replace on both sides
4644            trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
4645
4646            # The result will fill in any gap, replacing both sides, and
4647            # create one large range.
4648            @replacement = Range->new($r->[$i-1]->start,
4649                                      $r->[$j+1]->end,
4650                                      Value => $value,
4651                                      Type => $type);
4652        }
4653        else {
4654
4655            # Here we know that the result won't just be the conglomeration of
4656            # a new range with both its adjacent neighbors.  But it could
4657            # extend one of them.
4658
4659            if ($extends_below) {
4660
4661                # Here the new element adds to the one below, but not to the
4662                # one above.  If inserting, and only to that one range,  can
4663                # just change its ending to include the new one.
4664                if ($length == 0 && $clean_insert) {
4665                    $r->[$i-1]->set_end($end);
4666                    trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
4667                    return;
4668                }
4669                else {
4670                    trace "Changing inserted range to start at ", sprintf("%04X",  $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
4671                    $splice_start--;        # start replace at element below
4672                    $length++;              # will replace the element below
4673                    $start = $r->[$i-1]->start;
4674                }
4675            }
4676            elsif ($extends_above) {
4677
4678                # Here the new element adds to the one above, but not below.
4679                # Mirror the code above
4680                if ($length == 0 && $clean_insert) {
4681                    $r->[$j+1]->set_start($start);
4682                    trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
4683                    return;
4684                }
4685                else {
4686                    trace "Changing inserted range to end at ", sprintf("%04X",  $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
4687                    $length++;        # will replace the element above
4688                    $end = $r->[$j+1]->end;
4689                }
4690            }
4691
4692            trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
4693
4694            # Finally, here we know there will have to be a splice.
4695            # If the change or delete affects only the highest portion of the
4696            # first affected range, the range will have to be split.  The
4697            # splice will remove the whole range, but will replace it by a new
4698            # range containing just the unaffected part.  So, in this case,
4699            # add to the replacement list just this unaffected portion.
4700            if (! $extends_below
4701                && $start > $r->[$i]->start && $start <= $r->[$i]->end)
4702            {
4703                push @replacement,
4704                    Range->new($r->[$i]->start,
4705                               $start - 1,
4706                               Value => $r->[$i]->value,
4707                               Type => $r->[$i]->type);
4708            }
4709
4710            # In the case of an insert or change, but not a delete, we have to
4711            # put in the new stuff;  this comes next.
4712            if ($operation eq '+') {
4713                push @replacement, Range->new($start,
4714                                              $end,
4715                                              Value => $value,
4716                                              Type => $type);
4717            }
4718
4719            trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
4720            #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
4721
4722            # And finally, if we're changing or deleting only a portion of the
4723            # highest affected range, it must be split, as the lowest one was.
4724            if (! $extends_above
4725                && $j >= 0  # Remember that j can be -1 if before first
4726                            # current element
4727                && $end >= $r->[$j]->start
4728                && $end < $r->[$j]->end)
4729            {
4730                push @replacement,
4731                    Range->new($end + 1,
4732                               $r->[$j]->end,
4733                               Value => $r->[$j]->value,
4734                               Type => $r->[$j]->type);
4735            }
4736        }
4737
4738        # And do the splice, as calculated above
4739        if (main::DEBUG && $to_trace) {
4740            trace "replacing $length element(s) at $i with ";
4741            foreach my $replacement (@replacement) {
4742                trace "    $replacement";
4743            }
4744            trace "Before splice:";
4745            trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4746            trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4747            trace "i  =[", $i, "]", $r->[$i];
4748            trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4749            trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4750        }
4751
4752        my @return = splice @$r, $splice_start, $length, @replacement;
4753
4754        if (main::DEBUG && $to_trace) {
4755            trace "After splice:";
4756            trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4757            trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4758            trace "i  =[", $i, "]", $r->[$i];
4759            trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4760            trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4761            trace "removed ", @return if @return;
4762        }
4763
4764        # An actual deletion could have changed the maximum in the list.
4765        # There was no deletion if the splice didn't return something, but
4766        # otherwise recalculate it.  This is done too rarely to worry about
4767        # performance.
4768        if ($operation eq '-' && @return) {
4769            if (@$r) {
4770                $max{$addr} = $r->[-1]->end;
4771            }
4772            else {  # Now empty
4773                $max{$addr} = $max_init;
4774            }
4775        }
4776        return @return;
4777    }
4778
4779    sub reset_each_range($self) {  # reset the iterator for each_range();
4780        undef $each_range_iterator{pack 'J', refaddr $self};
4781        return;
4782    }
4783
4784    sub each_range($self) {
4785        # Iterate over each range in a range list.  Results are undefined if
4786        # the range list is changed during the iteration.
4787        my $addr = pack 'J', refaddr $self;
4788
4789        return if $self->is_empty;
4790
4791        $each_range_iterator{$addr} = -1
4792                                if ! defined $each_range_iterator{$addr};
4793        $each_range_iterator{$addr}++;
4794        return $ranges{$addr}->[$each_range_iterator{$addr}]
4795                        if $each_range_iterator{$addr} < @{$ranges{$addr}};
4796        undef $each_range_iterator{$addr};
4797        return;
4798    }
4799
4800    sub count($self) {        # Returns count of code points in range list
4801        my $addr = pack 'J', refaddr $self;
4802
4803        my $count = 0;
4804        foreach my $range (@{$ranges{$addr}}) {
4805            $count += $range->end - $range->start + 1;
4806        }
4807        return $count;
4808    }
4809
4810    sub delete_range($self, $start, $end) {    # Delete a range
4811        return $self->_add_delete('-', $start, $end, "");
4812    }
4813
4814    sub is_empty($self) { # Returns boolean as to if a range list is empty
4815        return scalar @{$ranges{pack 'J', refaddr $self}} == 0;
4816    }
4817
4818    sub hash($self) {
4819        # Quickly returns a scalar suitable for separating tables into
4820        # buckets, i.e. it is a hash function of the contents of a table, so
4821        # there are relatively few conflicts.
4822        my $addr = pack 'J', refaddr $self;
4823
4824        # These are quickly computable.  Return looks like 'min..max;count'
4825        return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
4826    }
4827} # End closure for _Range_List_Base
4828
4829package Range_List;
4830use parent '-norequire', '_Range_List_Base';
4831
4832# A Range_List is a range list for match tables; i.e. the range values are
4833# not significant.  Thus a number of operations can be safely added to it,
4834# such as inversion, intersection.  Note that union is also an unsafe
4835# operation when range values are cared about, and that method is in the base
4836# class, not here.  But things are set up so that that method is callable only
4837# during initialization.  Only in this derived class, is there an operation
4838# that combines two tables.  A Range_Map can thus be used to initialize a
4839# Range_List, and its mappings will be in the list, but are not significant to
4840# this class.
4841
4842sub trace { return main::trace(@_); }
4843
4844{ # Closure
4845
4846    use overload
4847        fallback => 0,
4848        '+' => sub { my $self = shift;
4849                    my $other = shift;
4850
4851                    return $self->_union($other)
4852                },
4853        '+=' => sub { my $self = shift;
4854                    my $other = shift;
4855                    my $reversed = shift;
4856
4857                    if ($reversed) {
4858                        Carp::my_carp_bug("Bad news.  Can't cope with '"
4859                        . ref($other)
4860                        . ' += '
4861                        . ref($self)
4862                        . "'.  undef returned.");
4863                        return;
4864                    }
4865
4866                    return $self->_union($other)
4867                },
4868        '&' => sub { my $self = shift;
4869                    my $other = shift;
4870
4871                    return $self->_intersect($other, 0);
4872                },
4873        '&=' => sub { my $self = shift;
4874                    my $other = shift;
4875                    my $reversed = shift;
4876
4877                    if ($reversed) {
4878                        Carp::my_carp_bug("Bad news.  Can't cope with '"
4879                        . ref($other)
4880                        . ' &= '
4881                        . ref($self)
4882                        . "'.  undef returned.");
4883                        return;
4884                    }
4885
4886                    return $self->_intersect($other, 0);
4887                },
4888        '~' => "_invert",
4889        '-' => "_subtract",
4890    ;
4891
4892    sub _invert($self, @) {
4893        # Returns a new Range_List that gives all code points not in $self.
4894        my $new = Range_List->new;
4895
4896        # Go through each range in the table, finding the gaps between them
4897        my $max = -1;   # Set so no gap before range beginning at 0
4898        for my $range ($self->ranges) {
4899            my $start = $range->start;
4900            my $end   = $range->end;
4901
4902            # If there is a gap before this range, the inverse will contain
4903            # that gap.
4904            if ($start > $max + 1) {
4905                $new->add_range($max + 1, $start - 1);
4906            }
4907            $max = $end;
4908        }
4909
4910        # And finally, add the gap from the end of the table to the max
4911        # possible code point
4912        if ($max < $MAX_WORKING_CODEPOINT) {
4913            $new->add_range($max + 1, $MAX_WORKING_CODEPOINT);
4914        }
4915        return $new;
4916    }
4917
4918    sub _subtract($self, $other, $reversed=0) {
4919        # Returns a new Range_List with the argument deleted from it.  The
4920        # argument can be a single code point, a range, or something that has
4921        # a range, with the _range_list() method on it returning them
4922
4923        if ($reversed) {
4924            Carp::my_carp_bug("Bad news.  Can't cope with '"
4925            . ref($other)
4926            . ' - '
4927            . ref($self)
4928            . "'.  undef returned.");
4929            return;
4930        }
4931
4932        my $new = Range_List->new(Initialize => $self);
4933
4934        if (! ref $other) { # Single code point
4935            $new->delete_range($other, $other);
4936        }
4937        elsif ($other->isa('Range')) {
4938            $new->delete_range($other->start, $other->end);
4939        }
4940        elsif ($other->can('_range_list')) {
4941            foreach my $range ($other->_range_list->ranges) {
4942                $new->delete_range($range->start, $range->end);
4943            }
4944        }
4945        else {
4946            Carp::my_carp_bug("Can't cope with a "
4947                        . ref($other)
4948                        . " argument to '-'.  Subtraction ignored."
4949                        );
4950            return $self;
4951        }
4952
4953        return $new;
4954    }
4955
4956    sub _intersect($a_object, $b_object, $check_if_overlapping=0) {
4957        # Returns either a boolean giving whether the two inputs' range lists
4958        # intersect (overlap), or a new Range_List containing the intersection
4959        # of the two lists.  The optional final parameter being true indicates
4960        # to do the check instead of the intersection.
4961
4962        if (! defined $b_object) {
4963            my $message = "";
4964            $message .= $a_object->_owner_name_of if defined $a_object;
4965            Carp::my_carp_bug($message .= "Called with undefined value.  Intersection not done.");
4966            return;
4967        }
4968
4969        # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
4970        # Thus the intersection could be much more simply be written:
4971        #   return ~(~$a_object + ~$b_object);
4972        # But, this is slower, and when taking the inverse of a large
4973        # range_size_1 table, back when such tables were always stored that
4974        # way, it became prohibitively slow, hence the code was changed to the
4975        # below
4976
4977        if ($b_object->isa('Range')) {
4978            $b_object = Range_List->new(Initialize => $b_object,
4979                                        Owner => $a_object->_owner_name_of);
4980        }
4981        $b_object = $b_object->_range_list if $b_object->can('_range_list');
4982
4983        my @a_ranges = $a_object->ranges;
4984        my @b_ranges = $b_object->ranges;
4985
4986        #local $to_trace = 1 if main::DEBUG;
4987        trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
4988
4989        # Start with the first range in each list
4990        my $a_i = 0;
4991        my $range_a = $a_ranges[$a_i];
4992        my $b_i = 0;
4993        my $range_b = $b_ranges[$b_i];
4994
4995        my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
4996                                                if ! $check_if_overlapping;
4997
4998        # If either list is empty, there is no intersection and no overlap
4999        if (! defined $range_a || ! defined $range_b) {
5000            return $check_if_overlapping ? 0 : $new;
5001        }
5002        trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
5003
5004        # Otherwise, must calculate the intersection/overlap.  Start with the
5005        # very first code point in each list
5006        my $a = $range_a->start;
5007        my $b = $range_b->start;
5008
5009        # Loop through all the ranges of each list; in each iteration, $a and
5010        # $b are the current code points in their respective lists
5011        while (1) {
5012
5013            # If $a and $b are the same code point, ...
5014            if ($a == $b) {
5015
5016                # it means the lists overlap.  If just checking for overlap
5017                # know the answer now,
5018                return 1 if $check_if_overlapping;
5019
5020                # The intersection includes this code point plus anything else
5021                # common to both current ranges.
5022                my $start = $a;
5023                my $end = main::min($range_a->end, $range_b->end);
5024                if (! $check_if_overlapping) {
5025                    trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
5026                    $new->add_range($start, $end);
5027                }
5028
5029                # Skip ahead to the end of the current intersect
5030                $a = $b = $end;
5031
5032                # If the current intersect ends at the end of either range (as
5033                # it must for at least one of them), the next possible one
5034                # will be the beginning code point in it's list's next range.
5035                if ($a == $range_a->end) {
5036                    $range_a = $a_ranges[++$a_i];
5037                    last unless defined $range_a;
5038                    $a = $range_a->start;
5039                }
5040                if ($b == $range_b->end) {
5041                    $range_b = $b_ranges[++$b_i];
5042                    last unless defined $range_b;
5043                    $b = $range_b->start;
5044                }
5045
5046                trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
5047            }
5048            elsif ($a < $b) {
5049
5050                # Not equal, but if the range containing $a encompasses $b,
5051                # change $a to be the middle of the range where it does equal
5052                # $b, so the next iteration will get the intersection
5053                if ($range_a->end >= $b) {
5054                    $a = $b;
5055                }
5056                else {
5057
5058                    # Here, the current range containing $a is entirely below
5059                    # $b.  Go try to find a range that could contain $b.
5060                    $a_i = $a_object->_search_ranges($b);
5061
5062                    # If no range found, quit.
5063                    last unless defined $a_i;
5064
5065                    # The search returns $a_i, such that
5066                    #   range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
5067                    # Set $a to the beginning of this new range, and repeat.
5068                    $range_a = $a_ranges[$a_i];
5069                    $a = $range_a->start;
5070                }
5071            }
5072            else { # Here, $b < $a.
5073
5074                # Mirror image code to the leg just above
5075                if ($range_b->end >= $a) {
5076                    $b = $a;
5077                }
5078                else {
5079                    $b_i = $b_object->_search_ranges($a);
5080                    last unless defined $b_i;
5081                    $range_b = $b_ranges[$b_i];
5082                    $b = $range_b->start;
5083                }
5084            }
5085        } # End of looping through ranges.
5086
5087        # Intersection fully computed, or now know that there is no overlap
5088        return $check_if_overlapping ? 0 : $new;
5089    }
5090
5091    sub overlaps($self, $other) {
5092        # Returns boolean giving whether the two arguments overlap somewhere
5093        return $self->_intersect($other, 1);
5094    }
5095
5096    sub add_range($self, $start, $end) {
5097        # Add a range to the list.
5098        return $self->_add_delete('+', $start, $end, "");
5099    }
5100
5101    sub matches_identically_to($self, $other) {
5102        # Return a boolean as to whether or not two Range_Lists match identical
5103        # sets of code points.
5104        # These are ordered in increasing real time to figure out (at least
5105        # until a patch changes that and doesn't change this)
5106        return 0 if $self->max != $other->max;
5107        return 0 if $self->min != $other->min;
5108        return 0 if $self->range_count != $other->range_count;
5109        return 0 if $self->count != $other->count;
5110
5111        # Here they could be identical because all the tests above passed.
5112        # The loop below is somewhat simpler since we know they have the same
5113        # number of elements.  Compare range by range, until reach the end or
5114        # find something that differs.
5115        my @a_ranges = $self->ranges;
5116        my @b_ranges = $other->ranges;
5117        for my $i (0 .. @a_ranges - 1) {
5118            my $a = $a_ranges[$i];
5119            my $b = $b_ranges[$i];
5120            trace "self $a; other $b" if main::DEBUG && $to_trace;
5121            return 0 if ! defined $b
5122                        || $a->start != $b->start
5123                        || $a->end != $b->end;
5124        }
5125        return 1;
5126    }
5127
5128    sub is_code_point_usable($code, $try_hard) {
5129        # This used only for making the test script.  See if the input
5130        # proposed trial code point is one that Perl will handle.  If second
5131        # parameter is 0, it won't select some code points for various
5132        # reasons, noted below.
5133        return 0 if $code < 0;                # Never use a negative
5134
5135        # shun null.  I'm (khw) not sure why this was done, but NULL would be
5136        # the character very frequently used.
5137        return $try_hard if $code == 0x0000;
5138
5139        # shun non-character code points.
5140        return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
5141        return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
5142
5143        return $try_hard if $code > $MAX_UNICODE_CODEPOINT;   # keep in range
5144        return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
5145
5146        return 1;
5147    }
5148
5149    sub get_valid_code_point($self) {
5150        # Return a code point that's part of the range list.  Returns nothing
5151        # if the table is empty or we can't find a suitable code point.  This
5152        # used only for making the test script.
5153
5154        # On first pass, don't choose less desirable code points; if no good
5155        # one is found, repeat, allowing a less desirable one to be selected.
5156        for my $try_hard (0, 1) {
5157
5158            # Look through all the ranges for a usable code point.
5159            for my $set (reverse $self->ranges) {
5160
5161                # Try the edge cases first, starting with the end point of the
5162                # range.
5163                my $end = $set->end;
5164                return $end if is_code_point_usable($end, $try_hard);
5165                $end = $MAX_UNICODE_CODEPOINT + 1 if $end > $MAX_UNICODE_CODEPOINT;
5166
5167                # End point didn't, work.  Start at the beginning and try
5168                # every one until find one that does work.
5169                for my $trial ($set->start .. $end - 1) {
5170                    return $trial if is_code_point_usable($trial, $try_hard);
5171                }
5172            }
5173        }
5174        return ();  # If none found, give up.
5175    }
5176
5177    sub get_invalid_code_point($self) {
5178        # Return a code point that's not part of the table.  Returns nothing
5179        # if the table covers all code points or a suitable code point can't
5180        # be found.  This used only for making the test script.
5181
5182        # Just find a valid code point of the inverse, if any.
5183        return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
5184    }
5185} # end closure for Range_List
5186
5187package Range_Map;
5188use parent '-norequire', '_Range_List_Base';
5189
5190# A Range_Map is a range list in which the range values (called maps) are
5191# significant, and hence shouldn't be manipulated by our other code, which
5192# could be ambiguous or lose things.  For example, in taking the union of two
5193# lists, which share code points, but which have differing values, which one
5194# has precedence in the union?
5195# It turns out that these operations aren't really necessary for map tables,
5196# and so this class was created to make sure they aren't accidentally
5197# applied to them.
5198
5199{ # Closure
5200
5201    sub add_map($self, @add) {
5202        # Add a range containing a mapping value to the list
5203        return $self->_add_delete('+', @add);
5204    }
5205
5206    sub replace_map($self, @list) {
5207        # Replace a range
5208        return $self->_add_delete('+', @list, Replace => $UNCONDITIONALLY);
5209    }
5210
5211    sub add_duplicate {
5212        # Adds entry to a range list which can duplicate an existing entry
5213
5214        my $self = shift;
5215        my $code_point = shift;
5216        my $value = shift;
5217        my %args = @_;
5218        my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE;
5219        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5220
5221        return $self->add_map($code_point, $code_point,
5222                                $value, Replace => $replace);
5223    }
5224} # End of closure for package Range_Map
5225
5226package _Base_Table;
5227
5228# A table is the basic data structure that gets written out into a file for
5229# use by the Perl core.  This is the abstract base class implementing the
5230# common elements from the derived ones.  A list of the methods to be
5231# furnished by an implementing class is just after the constructor.
5232
5233sub standardize { return main::standardize($_[0]); }
5234sub trace { return main::trace(@_); }
5235
5236{ # Closure
5237
5238    main::setup_package();
5239
5240    my %range_list;
5241    # Object containing the ranges of the table.
5242    main::set_access('range_list', \%range_list, 'p_r', 'p_s');
5243
5244    my %full_name;
5245    # The full table name.
5246    main::set_access('full_name', \%full_name, 'r');
5247
5248    my %name;
5249    # The table name, almost always shorter
5250    main::set_access('name', \%name, 'r');
5251
5252    my %short_name;
5253    # The shortest of all the aliases for this table, with underscores removed
5254    main::set_access('short_name', \%short_name);
5255
5256    my %nominal_short_name_length;
5257    # The length of short_name before removing underscores
5258    main::set_access('nominal_short_name_length',
5259                    \%nominal_short_name_length);
5260
5261    my %complete_name;
5262    # The complete name, including property.
5263    main::set_access('complete_name', \%complete_name, 'r');
5264
5265    my %property;
5266    # Parent property this table is attached to.
5267    main::set_access('property', \%property, 'r');
5268
5269    my %aliases;
5270    # Ordered list of alias objects of the table's name.  The first ones in
5271    # the list are output first in comments
5272    main::set_access('aliases', \%aliases, 'readable_array');
5273
5274    my %comment;
5275    # A comment associated with the table for human readers of the files
5276    main::set_access('comment', \%comment, 's');
5277
5278    my %description;
5279    # A comment giving a short description of the table's meaning for human
5280    # readers of the files.
5281    main::set_access('description', \%description, 'readable_array');
5282
5283    my %note;
5284    # A comment giving a short note about the table for human readers of the
5285    # files.
5286    main::set_access('note', \%note, 'readable_array');
5287
5288    my %fate;
5289    # Enum; there are a number of possibilities for what happens to this
5290    # table: it could be normal, or suppressed, or not for external use.  See
5291    # values at definition for $SUPPRESSED.
5292    main::set_access('fate', \%fate, 'r');
5293
5294    my %find_table_from_alias;
5295    # The parent property passes this pointer to a hash which this class adds
5296    # all its aliases to, so that the parent can quickly take an alias and
5297    # find this table.
5298    main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
5299
5300    my %locked;
5301    # After this table is made equivalent to another one; we shouldn't go
5302    # changing the contents because that could mean it's no longer equivalent
5303    main::set_access('locked', \%locked, 'r');
5304
5305    my %file_path;
5306    # This gives the final path to the file containing the table.  Each
5307    # directory in the path is an element in the array
5308    main::set_access('file_path', \%file_path, 'readable_array');
5309
5310    my %status;
5311    # What is the table's status, normal, $OBSOLETE, etc.  Enum
5312    main::set_access('status', \%status, 'r');
5313
5314    my %status_info;
5315    # A comment about its being obsolete, or whatever non normal status it has
5316    main::set_access('status_info', \%status_info, 'r');
5317
5318    my %caseless_equivalent;
5319    # The table this is equivalent to under /i matching, if any.
5320    main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
5321
5322    my %range_size_1;
5323    # Is the table to be output with each range only a single code point?
5324    # This is done to avoid breaking existing code that may have come to rely
5325    # on this behavior in previous versions of this program.)
5326    main::set_access('range_size_1', \%range_size_1, 'r', 's');
5327
5328    my %perl_extension;
5329    # A boolean set iff this table is a Perl extension to the Unicode
5330    # standard.
5331    main::set_access('perl_extension', \%perl_extension, 'r');
5332
5333    my %output_range_counts;
5334    # A boolean set iff this table is to have comments written in the
5335    # output file that contain the number of code points in the range.
5336    # The constructor can override the global flag of the same name.
5337    main::set_access('output_range_counts', \%output_range_counts, 'r');
5338
5339    my %write_as_invlist;
5340    # A boolean set iff the output file for this table is to be in the form of
5341    # an inversion list/map.
5342    main::set_access('write_as_invlist', \%write_as_invlist, 'r');
5343
5344    my %format;
5345    # The format of the entries of the table.  This is calculated from the
5346    # data in the table (or passed in the constructor).  This is an enum e.g.,
5347    # $STRING_FORMAT.  It is marked protected as it should not be generally
5348    # used to override calculations.
5349    main::set_access('format', \%format, 'r', 'p_s');
5350
5351    my %has_dependency;
5352    # A boolean that gives whether some other table in this property is
5353    # defined as the complement of this table.  This is a crude, but currently
5354    # sufficient, mechanism to make this table not get destroyed before what
5355    # is dependent on it is.  Other dependencies could be added, so the name
5356    # was chosen to reflect a more general situation than actually is
5357    # currently the case.
5358    main::set_access('has_dependency', \%has_dependency, 'r', 's');
5359
5360    sub new {
5361        # All arguments are key => value pairs, which you can see below, most
5362        # of which match fields documented above.  Otherwise: Re_Pod_Entry,
5363        # OK_as_Filename, and Fuzzy apply to the names of the table, and are
5364        # documented in the Alias package
5365
5366        return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
5367
5368        my $class = shift;
5369
5370        my $self = bless \do { my $anonymous_scalar }, $class;
5371        my $addr = pack 'J', refaddr $self;
5372
5373        my %args = @_;
5374
5375        $name{$addr} = delete $args{'Name'};
5376        $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
5377        $full_name{$addr} = delete $args{'Full_Name'};
5378        my $complete_name = $complete_name{$addr}
5379                          = delete $args{'Complete_Name'};
5380        $format{$addr} = delete $args{'Format'};
5381        $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
5382        $property{$addr} = delete $args{'_Property'};
5383        $range_list{$addr} = delete $args{'_Range_List'};
5384        $status{$addr} = delete $args{'Status'} || $NORMAL;
5385        $status_info{$addr} = delete $args{'_Status_Info'} || "";
5386        $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
5387        $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
5388        $fate{$addr} = delete $args{'Fate'} || $ORDINARY;
5389        $write_as_invlist{$addr} = delete $args{'Write_As_Invlist'};# No default
5390        my $ucd = delete $args{'UCD'};
5391
5392        my $description = delete $args{'Description'};
5393        my $ok_as_filename = delete $args{'OK_as_Filename'};
5394        my $loose_match = delete $args{'Fuzzy'};
5395        my $note = delete $args{'Note'};
5396        my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
5397        my $perl_extension = delete $args{'Perl_Extension'};
5398        my $suppression_reason = delete $args{'Suppression_Reason'};
5399
5400        # Shouldn't have any left over
5401        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5402
5403        # Can't use || above because conceivably the name could be 0, and
5404        # can't use // operator in case this program gets used in Perl 5.8
5405        $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
5406        $output_range_counts{$addr} = $output_range_counts if
5407                                        ! defined $output_range_counts{$addr};
5408
5409        $aliases{$addr} = [ ];
5410        $comment{$addr} = [ ];
5411        $description{$addr} = [ ];
5412        $note{$addr} = [ ];
5413        $file_path{$addr} = [ ];
5414        $locked{$addr} = "";
5415        $has_dependency{$addr} = 0;
5416
5417        push @{$description{$addr}}, $description if $description;
5418        push @{$note{$addr}}, $note if $note;
5419
5420        if ($fate{$addr} == $PLACEHOLDER) {
5421
5422            # A placeholder table doesn't get documented, is a perl extension,
5423            # and quite likely will be empty
5424            $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
5425            $perl_extension = 1 if ! defined $perl_extension;
5426            $ucd = 0 if ! defined $ucd;
5427            push @tables_that_may_be_empty, $complete_name{$addr};
5428            $self->add_comment(<<END);
5429This is a placeholder because it is not in Version $string_version of Unicode,
5430but is needed by the Perl core to work gracefully.  Because it is not in this
5431version of Unicode, it will not be listed in $pod_file.pod
5432END
5433        }
5434        elsif (exists $why_suppressed{$complete_name}
5435                # Don't suppress if overridden
5436                && ! grep { $_ eq $complete_name{$addr} }
5437                                                    @output_mapped_properties)
5438        {
5439            $fate{$addr} = $SUPPRESSED;
5440        }
5441        elsif ($fate{$addr} == $SUPPRESSED) {
5442            Carp::my_carp_bug("Need reason for suppressing") unless $suppression_reason;
5443            # Though currently unused
5444        }
5445        elsif ($suppression_reason) {
5446            Carp::my_carp_bug("A reason was given for suppressing, but not suppressed");
5447        }
5448
5449        # If hasn't set its status already, see if it is on one of the
5450        # lists of properties or tables that have particular statuses; if
5451        # not, is normal.  The lists are prioritized so the most serious
5452        # ones are checked first
5453        if (! $status{$addr}) {
5454            if (exists $why_deprecated{$complete_name}) {
5455                $status{$addr} = $DEPRECATED;
5456            }
5457            elsif (exists $why_stabilized{$complete_name}) {
5458                $status{$addr} = $STABILIZED;
5459            }
5460            elsif (exists $why_obsolete{$complete_name}) {
5461                $status{$addr} = $OBSOLETE;
5462            }
5463
5464            # Existence above doesn't necessarily mean there is a message
5465            # associated with it.  Use the most serious message.
5466            if ($status{$addr}) {
5467                if ($why_deprecated{$complete_name}) {
5468                    $status_info{$addr}
5469                                = $why_deprecated{$complete_name};
5470                }
5471                elsif ($why_stabilized{$complete_name}) {
5472                    $status_info{$addr}
5473                                = $why_stabilized{$complete_name};
5474                }
5475                elsif ($why_obsolete{$complete_name}) {
5476                    $status_info{$addr}
5477                                = $why_obsolete{$complete_name};
5478                }
5479            }
5480        }
5481
5482        $perl_extension{$addr} = $perl_extension || 0;
5483
5484        # Don't list a property by default that is internal only
5485        if ($fate{$addr} > $MAP_PROXIED) {
5486            $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
5487            $ucd = 0 if ! defined $ucd;
5488        }
5489        else {
5490            $ucd = 1 if ! defined $ucd;
5491        }
5492
5493        # By convention what typically gets printed only or first is what's
5494        # first in the list, so put the full name there for good output
5495        # clarity.  Other routines rely on the full name being first on the
5496        # list
5497        $self->add_alias($full_name{$addr},
5498                            OK_as_Filename => $ok_as_filename,
5499                            Fuzzy => $loose_match,
5500                            Re_Pod_Entry => $make_re_pod_entry,
5501                            Status => $status{$addr},
5502                            UCD => $ucd,
5503                            );
5504
5505        # Then comes the other name, if meaningfully different.
5506        if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
5507            $self->add_alias($name{$addr},
5508                            OK_as_Filename => $ok_as_filename,
5509                            Fuzzy => $loose_match,
5510                            Re_Pod_Entry => $make_re_pod_entry,
5511                            Status => $status{$addr},
5512                            UCD => $ucd,
5513                            );
5514        }
5515
5516        return $self;
5517    }
5518
5519    # Here are the methods that are required to be defined by any derived
5520    # class
5521    for my $sub (qw(
5522                    handle_special_range
5523                    append_to_body
5524                    pre_body
5525                ))
5526                # write() knows how to write out normal ranges, but it calls
5527                # handle_special_range() when it encounters a non-normal one.
5528                # append_to_body() is called by it after it has handled all
5529                # ranges to add anything after the main portion of the table.
5530                # And finally, pre_body() is called after all this to build up
5531                # anything that should appear before the main portion of the
5532                # table.  Doing it this way allows things in the middle to
5533                # affect what should appear before the main portion of the
5534                # table.
5535    {
5536        no strict "refs";
5537        *$sub = sub {
5538            Carp::my_carp_bug( __LINE__
5539                              . ": Must create method '$sub()' for "
5540                              . ref shift);
5541            return;
5542        }
5543    }
5544
5545    use overload
5546        fallback => 0,
5547        "." => \&main::_operator_dot,
5548        ".=" => \&main::_operator_dot_equal,
5549        '!=' => \&main::_operator_not_equal,
5550        '==' => \&main::_operator_equal,
5551    ;
5552
5553    sub ranges {
5554        # Returns the array of ranges associated with this table.
5555
5556        return $range_list{pack 'J', refaddr shift}->ranges;
5557    }
5558
5559    sub add_alias {
5560        # Add a synonym for this table.
5561
5562        return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
5563
5564        my $self = shift;
5565        my $name = shift;       # The name to add.
5566        my $pointer = shift;    # What the alias hash should point to.  For
5567                                # map tables, this is the parent property;
5568                                # for match tables, it is the table itself.
5569
5570        my %args = @_;
5571        my $loose_match = delete $args{'Fuzzy'};
5572
5573        my $ok_as_filename = delete $args{'OK_as_Filename'};
5574        $ok_as_filename = 1 unless defined $ok_as_filename;
5575
5576        # An internal name does not get documented, unless overridden by the
5577        # input; same for making tests for it.
5578        my $status = delete $args{'Status'} || (($name =~ /^_/)
5579                                                ? $INTERNAL_ALIAS
5580                                                : $NORMAL);
5581        my $make_re_pod_entry = delete $args{'Re_Pod_Entry'}
5582                                            // (($status ne $INTERNAL_ALIAS)
5583                                               ? (($name =~ /^_/) ? $NO : $YES)
5584                                               : $NO);
5585        my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1);
5586
5587        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5588
5589        # Capitalize the first letter of the alias unless it is one of the CJK
5590        # ones which specifically begins with a lower 'k'.  Do this because
5591        # Unicode has varied whether they capitalize first letters or not, and
5592        # have later changed their minds and capitalized them, but not the
5593        # other way around.  So do it always and avoid changes from release to
5594        # release
5595        $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
5596
5597        my $addr = pack 'J', refaddr $self;
5598
5599        # Figure out if should be loosely matched if not already specified.
5600        if (! defined $loose_match) {
5601
5602            # Is a loose_match if isn't null, and doesn't begin with an
5603            # underscore and isn't just a number
5604            if ($name ne ""
5605                && substr($name, 0, 1) ne '_'
5606                && $name !~ qr{^[0-9_.+-/]+$})
5607            {
5608                $loose_match = 1;
5609            }
5610            else {
5611                $loose_match = 0;
5612            }
5613        }
5614
5615        # If this alias has already been defined, do nothing.
5616        return if defined $find_table_from_alias{$addr}->{$name};
5617
5618        # That includes if it is standardly equivalent to an existing alias,
5619        # in which case, add this name to the list, so won't have to search
5620        # for it again.
5621        my $standard_name = main::standardize($name);
5622        if (defined $find_table_from_alias{$addr}->{$standard_name}) {
5623            $find_table_from_alias{$addr}->{$name}
5624                        = $find_table_from_alias{$addr}->{$standard_name};
5625            return;
5626        }
5627
5628        # Set the index hash for this alias for future quick reference.
5629        $find_table_from_alias{$addr}->{$name} = $pointer;
5630        $find_table_from_alias{$addr}->{$standard_name} = $pointer;
5631        local $to_trace = 0 if main::DEBUG;
5632        trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
5633        trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
5634
5635
5636        # Put the new alias at the end of the list of aliases unless the final
5637        # element begins with an underscore (meaning it is for internal perl
5638        # use) or is all numeric, in which case, put the new one before that
5639        # one.  This floats any all-numeric or underscore-beginning aliases to
5640        # the end.  This is done so that they are listed last in output lists,
5641        # to encourage the user to use a better name (either more descriptive
5642        # or not an internal-only one) instead.  This ordering is relied on
5643        # implicitly elsewhere in this program, like in short_name()
5644        my $list = $aliases{$addr};
5645        my $insert_position = (@$list == 0
5646                                || (substr($list->[-1]->name, 0, 1) ne '_'
5647                                    && $list->[-1]->name =~ /\D/))
5648                            ? @$list
5649                            : @$list - 1;
5650        splice @$list,
5651                $insert_position,
5652                0,
5653                Alias->new($name, $loose_match, $make_re_pod_entry,
5654                           $ok_as_filename, $status, $ucd);
5655
5656        # This name may be shorter than any existing ones, so clear the cache
5657        # of the shortest, so will have to be recalculated.
5658        undef $short_name{pack 'J', refaddr $self};
5659        return;
5660    }
5661
5662    sub short_name($self, $nominal_length_ptr=undef) {
5663        # Returns a name suitable for use as the base part of a file name.
5664        # That is, shorter wins.  It can return undef if there is no suitable
5665        # name.  The name has all non-essential underscores removed.
5666
5667        # The optional second parameter is a reference to a scalar in which
5668        # this routine will store the length the returned name had before the
5669        # underscores were removed, or undef if the return is undef.
5670
5671        # The shortest name can change if new aliases are added.  So using
5672        # this should be deferred until after all these are added.  The code
5673        # that does that should clear this one's cache.
5674        # Any name with alphabetics is preferred over an all numeric one, even
5675        # if longer.
5676
5677        my $addr = pack 'J', refaddr $self;
5678
5679        # For efficiency, don't recalculate, but this means that adding new
5680        # aliases could change what the shortest is, so the code that does
5681        # that needs to undef this.
5682        if (defined $short_name{$addr}) {
5683            if ($nominal_length_ptr) {
5684                $$nominal_length_ptr = $nominal_short_name_length{$addr};
5685            }
5686            return $short_name{$addr};
5687        }
5688
5689        # Look at each alias
5690        my $is_last_resort = 0;
5691        my $deprecated_or_discouraged
5692                                = qr/ ^ (?: $DEPRECATED | $DISCOURAGED ) $/x;
5693        foreach my $alias ($self->aliases()) {
5694
5695            # Don't use an alias that isn't ok to use for an external name.
5696            next if ! $alias->ok_as_filename;
5697
5698            my $name = main::Standardize($alias->name);
5699            trace $self, $name if main::DEBUG && $to_trace;
5700
5701            # Take the first one, or any non-deprecated non-discouraged one
5702            # over one that is, or a shorter one that isn't numeric.  This
5703            # relies on numeric aliases always being last in the array
5704            # returned by aliases().  Any alpha one will have precedence.
5705            if (   ! defined $short_name{$addr}
5706                || (   $is_last_resort
5707                    && $alias->status !~ $deprecated_or_discouraged)
5708                || ($name =~ /\D/
5709                    && length($name) < length($short_name{$addr})))
5710            {
5711                # Remove interior underscores.
5712                ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
5713
5714                $nominal_short_name_length{$addr} = length $name;
5715                $is_last_resort = $alias->status =~ $deprecated_or_discouraged;
5716            }
5717        }
5718
5719        # If the short name isn't a nice one, perhaps an equivalent table has
5720        # a better one.
5721        if (   $self->can('children')
5722            && (   ! defined $short_name{$addr}
5723                || $short_name{$addr} eq ""
5724                || $short_name{$addr} eq "_"))
5725        {
5726            my $return;
5727            foreach my $follower ($self->children) {    # All equivalents
5728                my $follower_name = $follower->short_name;
5729                next unless defined $follower_name;
5730
5731                # Anything (except undefined) is better than underscore or
5732                # empty
5733                if (! defined $return || $return eq "_") {
5734                    $return = $follower_name;
5735                    next;
5736                }
5737
5738                # If the new follower name isn't "_" and is shorter than the
5739                # current best one, prefer the new one.
5740                next if $follower_name eq "_";
5741                next if length $follower_name > length $return;
5742                $return = $follower_name;
5743            }
5744            $short_name{$addr} = $return if defined $return;
5745        }
5746
5747        # If no suitable external name return undef
5748        if (! defined $short_name{$addr}) {
5749            $$nominal_length_ptr = undef if $nominal_length_ptr;
5750            return;
5751        }
5752
5753        # Don't allow a null short name.
5754        if ($short_name{$addr} eq "") {
5755            $short_name{$addr} = '_';
5756            $nominal_short_name_length{$addr} = 1;
5757        }
5758
5759        trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
5760
5761        if ($nominal_length_ptr) {
5762            $$nominal_length_ptr = $nominal_short_name_length{$addr};
5763        }
5764        return $short_name{$addr};
5765    }
5766
5767    sub external_name($self) {
5768        # Returns the external name that this table should be known by.  This
5769        # is usually the short_name, but not if the short_name is undefined,
5770        # in which case the external_name is arbitrarily set to the
5771        # underscore.
5772
5773        my $short = $self->short_name;
5774        return $short if defined $short;
5775
5776        return '_';
5777    }
5778
5779    sub add_description($self, $description) { # Adds the parameter as a short description.
5780        push @{$description{pack 'J', refaddr $self}}, $description;
5781
5782        return;
5783    }
5784
5785    sub add_note($self, $note) { # Adds the parameter as a short note.
5786        push @{$note{pack 'J', refaddr $self}}, $note;
5787
5788        return;
5789    }
5790
5791    sub add_comment($self, $comment) { # Adds the parameter as a comment.
5792
5793        return unless $debugging_build;
5794
5795        chomp $comment;
5796
5797        push @{$comment{pack 'J', refaddr $self}}, $comment;
5798
5799        return;
5800    }
5801
5802    sub comment($self) {
5803        # Return the current comment for this table.  If called in list
5804        # context, returns the array of comments.  In scalar, returns a string
5805        # of each element joined together with a period ending each.
5806
5807        my $addr = pack 'J', refaddr $self;
5808        my @list = @{$comment{$addr}};
5809        return @list if wantarray;
5810        my $return = "";
5811        foreach my $sentence (@list) {
5812            $return .= '.  ' if $return;
5813            $return .= $sentence;
5814            $return =~ s/\.$//;
5815        }
5816        $return .= '.' if $return;
5817        return $return;
5818    }
5819
5820    sub initialize($self, $initialization) {
5821        # Initialize the table with the argument which is any valid
5822        # initialization for range lists.
5823
5824        my $addr = pack 'J', refaddr $self;
5825
5826        # Replace the current range list with a new one of the same exact
5827        # type.
5828        my $class = ref $range_list{$addr};
5829        $range_list{$addr} = $class->new(Owner => $self,
5830                                        Initialize => $initialization);
5831        return;
5832
5833    }
5834
5835    sub header($self) {
5836        # The header that is output for the table in the file it is written
5837        # in.
5838        my $return = "";
5839        $return .= $DEVELOPMENT_ONLY if $compare_versions;
5840        $return .= $HEADER;
5841        return $return;
5842    }
5843
5844    sub merge_single_annotation_line ($output, $annotation, $annotation_column) {
5845
5846        # This appends an annotation comment, $annotation, to $output,
5847        # starting in or after column $annotation_column, removing any
5848        # pre-existing comment from $output.
5849
5850        $annotation =~ s/^ \s* \# \  //x;
5851        $output =~ s/ \s* ( \# \N* )? \n //x;
5852        $output = Text::Tabs::expand($output);
5853
5854        my $spaces = $annotation_column - length $output;
5855        $spaces = 2 if $spaces < 0;  # Have 2 blanks before the comment
5856
5857        $output = sprintf "%s%*s# %s",
5858                            $output,
5859                            $spaces,
5860                            " ",
5861                            $annotation;
5862        return Text::Tabs::unexpand $output;
5863    }
5864
5865    sub write($self, $use_adjustments=0, $suppress_value=0) {
5866        # Write a representation of the table to its file.  It calls several
5867        # functions furnished by sub-classes of this abstract base class to
5868        # handle non-normal ranges, to add stuff before the table, and at its
5869        # end.  If the table is to be written so that adjustments are
5870        # required, this does that conversion.
5871
5872
5873        # $use_adjustments ? output in adjusted format or not
5874        # $suppress_value Optional, if the value associated with
5875        # a range equals this one, don't write
5876        # the range
5877
5878        my $addr = pack 'J', refaddr $self;
5879        my $write_as_invlist = $write_as_invlist{$addr};
5880
5881        # Start with the header
5882        my @HEADER = $self->header;
5883
5884        # Then the comments
5885        push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
5886                                                        if $comment{$addr};
5887
5888        # Things discovered processing the main body of the document may
5889        # affect what gets output before it, therefore pre_body() isn't called
5890        # until after all other processing of the table is done.
5891
5892        # The main body looks like a 'here' document.  If there are comments,
5893        # get rid of them when processing it.
5894        my @OUT;
5895        if ($annotate || $output_range_counts) {
5896            # Use the line below in Perls that don't have /r
5897            #push @OUT, 'return join "\n",  map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
5898            push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
5899        } else {
5900            push @OUT, "return <<'END';\n";
5901        }
5902
5903        if ($range_list{$addr}->is_empty) {
5904
5905            # This is a kludge for empty tables to silence a warning in
5906            # utf8.c, which can't really deal with empty tables, but it can
5907            # deal with a table that matches nothing, as the inverse of 'All'
5908            # does.
5909            push @OUT, "!Unicode::UCD::All\n";
5910        }
5911        elsif ($self->name eq 'N'
5912
5913               # To save disk space and table cache space, avoid putting out
5914               # binary N tables, but instead create a file which just inverts
5915               # the Y table.  Since the file will still exist and occupy a
5916               # certain number of blocks, might as well output the whole
5917               # thing if it all will fit in one block.   The number of
5918               # ranges below is an approximate number for that.
5919               && ($self->property->type == $BINARY
5920                   || $self->property->type == $FORCED_BINARY)
5921               # && $self->property->tables == 2  Can't do this because the
5922               #        non-binary properties, like NFDQC aren't specifiable
5923               #        by the notation
5924               && $range_list{$addr}->ranges > 15
5925               && ! $annotate)  # Under --annotate, want to see everything
5926        {
5927            push @OUT, "!Unicode::UCD::" . $self->property->name . "\n";
5928        }
5929        else {
5930            my $range_size_1 = $range_size_1{$addr};
5931
5932            # To make it more readable, use a minimum indentation
5933            my $comment_indent;
5934
5935            # These are used only in $annotate option
5936            my $format;         # e.g. $HEX_ADJUST_FORMAT
5937            my $include_name;   # ? Include the character's name in the
5938                                # annotation?
5939            my $include_cp;     # ? Include its code point
5940
5941            if (! $annotate) {
5942                $comment_indent = ($self->isa('Map_Table'))
5943                                  ? 24
5944                                  : ($write_as_invlist)
5945                                    ? 8
5946                                    : 16;
5947            }
5948            else {
5949                $format = $self->format;
5950
5951                # The name of the character is output only for tables that
5952                # don't already include the name in the output.
5953                my $property = $self->property;
5954                $include_name =
5955                    !  ($property == $perl_charname
5956                        || $property == main::property_ref('Unicode_1_Name')
5957                        || $property == main::property_ref('Name')
5958                        || $property == main::property_ref('Name_Alias')
5959                       );
5960
5961                # Don't include the code point in the annotation where all
5962                # lines are a single code point, so it can be easily found in
5963                # the first column
5964                $include_cp = ! $range_size_1;
5965
5966                if (! $self->isa('Map_Table')) {
5967                    $comment_indent = ($write_as_invlist) ? 8 : 16;
5968                }
5969                else {
5970                    $comment_indent = 16;
5971
5972                    # There are just a few short ranges in this table, so no
5973                    # need to include the code point in the annotation.
5974                    $include_cp = 0 if $format eq $DECOMP_STRING_FORMAT;
5975
5976                    # We're trying to get this to look good, as the whole
5977                    # point is to make human-readable tables.  It is easier to
5978                    # read if almost all the annotation comments begin in the
5979                    # same column.  Map tables have varying width maps, so can
5980                    # create a jagged comment appearance.  This code does a
5981                    # preliminary pass through these tables looking for the
5982                    # maximum width map in each, and causing the comments to
5983                    # begin just to the right of that.  However, if the
5984                    # comments begin too far to the right of most lines, it's
5985                    # hard to line them up horizontally with their real data.
5986                    # Therefore we ignore the longest outliers
5987                    my $ignore_longest_X_percent = 2;  # Discard longest X%
5988
5989                    # Each key in this hash is a width of at least one of the
5990                    # maps in the table.  Its value is how many lines have
5991                    # that width.
5992                    my %widths;
5993
5994                    # We won't space things further left than one tab stop
5995                    # after the rest of the line; initializing it to that
5996                    # number saves some work.
5997                    my $max_map_width = 8;
5998
5999                    # Fill in the %widths hash
6000                    my $total = 0;
6001                    for my $set ($range_list{$addr}->ranges) {
6002                        my $value = $set->value;
6003
6004                        # These range types don't appear in the main table
6005                        next if $set->type == 0
6006                                && defined $suppress_value
6007                                && $value eq $suppress_value;
6008                        next if $set->type == $MULTI_CP
6009                                || $set->type == $NULL;
6010
6011                        # Include 2 spaces before the beginning of the
6012                        # comment
6013                        my $this_width = length($value) + 2;
6014
6015                        # Ranges of the remaining non-zero types usually
6016                        # occupy just one line (maybe occasionally two, but
6017                        # this doesn't have to be dead accurate).  This is
6018                        # because these ranges are like "unassigned code
6019                        # points"
6020                        my $count = ($set->type != 0)
6021                                    ? 1
6022                                    : $set->end - $set->start + 1;
6023                        $widths{$this_width} += $count;
6024                        $total += $count;
6025                        $max_map_width = $this_width
6026                                            if $max_map_width < $this_width;
6027                    }
6028
6029                    # If the widest map gives us less than two tab stops
6030                    # worth, just take it as-is.
6031                    if ($max_map_width > 16) {
6032
6033                        # Otherwise go through %widths until we have included
6034                        # the desired percentage of lines in the whole table.
6035                        my $running_total = 0;
6036                        foreach my $width (sort { $a <=> $b } keys %widths)
6037                        {
6038                            $running_total += $widths{$width};
6039                            use integer;
6040                            if ($running_total * 100 / $total
6041                                            >= 100 - $ignore_longest_X_percent)
6042                            {
6043                                $max_map_width = $width;
6044                                last;
6045                            }
6046                        }
6047                    }
6048                    $comment_indent += $max_map_width;
6049                }
6050            }
6051
6052            # Values for previous time through the loop.  Initialize to
6053            # something that won't be adjacent to the first iteration;
6054            # only $previous_end matters for that.
6055            my $previous_start;
6056            my $previous_end = -2;
6057            my $previous_value;
6058
6059            # Values for next time through the portion of the loop that splits
6060            # the range.  0 in $next_start means there is no remaining portion
6061            # to deal with.
6062            my $next_start = 0;
6063            my $next_end;
6064            my $next_value;
6065            my $offset = 0;
6066            my $invlist_count = 0;
6067
6068            my $output_value_in_hex = $self->isa('Map_Table')
6069                                && ($self->format eq $HEX_ADJUST_FORMAT
6070                                    || $self->to_output_map == $EXTERNAL_MAP);
6071            # Use leading zeroes just for files whose format should not be
6072            # changed from what it has been.  Otherwise, they just take up
6073            # space and time to process.
6074            my $hex_format = ($self->isa('Map_Table')
6075                              && $self->to_output_map == $EXTERNAL_MAP)
6076                             ? "%04X"
6077                             : "%X";
6078
6079            # The values for some of these tables are stored in mktables as
6080            # hex strings.  Normally, these are just output as strings without
6081            # change, but when we are doing adjustments, we have to operate on
6082            # these numerically, so we convert those to decimal to do that,
6083            # and back to hex for output
6084            my $convert_map_to_from_hex = 0;
6085            my $output_map_in_hex = 0;
6086            if ($self->isa('Map_Table')) {
6087                $convert_map_to_from_hex
6088                   = ($use_adjustments && $self->format eq $HEX_ADJUST_FORMAT)
6089                      || ($annotate && $self->format eq $HEX_FORMAT);
6090                $output_map_in_hex = $convert_map_to_from_hex
6091                                 || $self->format eq $HEX_FORMAT;
6092            }
6093
6094            # To store any annotations about the characters.
6095            my @annotation;
6096
6097            # Output each range as part of the here document.
6098            RANGE:
6099            for my $set ($range_list{$addr}->ranges) {
6100                if ($set->type != 0) {
6101                    $self->handle_special_range($set);
6102                    next RANGE;
6103                }
6104                my $start = $set->start;
6105                my $end   = $set->end;
6106                my $value  = $set->value;
6107
6108                # Don't output ranges whose value is the one to suppress
6109                next RANGE if defined $suppress_value
6110                              && $value eq $suppress_value;
6111
6112                $value = CORE::hex $value if $convert_map_to_from_hex;
6113
6114
6115                {   # This bare block encloses the scope where we may need to
6116                    # 'redo' to.  Consider a table that is to be written out
6117                    # using single item ranges.  This is given in the
6118                    # $range_size_1 boolean.  To accomplish this, we split the
6119                    # range each time through the loop into two portions, the
6120                    # first item, and the rest.  We handle that first item
6121                    # this time in the loop, and 'redo' to repeat the process
6122                    # for the rest of the range.
6123                    #
6124                    # We may also have to do it, with other special handling,
6125                    # if the table has adjustments.  Consider the table that
6126                    # contains the lowercasing maps.  mktables stores the
6127                    # ASCII range ones as 26 ranges:
6128                    #       ord('A') => ord('a'), .. ord('Z') => ord('z')
6129                    # For compactness, the table that gets written has this as
6130                    # just one range
6131                    #       ( ord('A') .. ord('Z') ) => ord('a')
6132                    # and the software that reads the tables is smart enough
6133                    # to "connect the dots".  This change is accomplished in
6134                    # this loop by looking to see if the current iteration
6135                    # fits the paradigm of the previous iteration, and if so,
6136                    # we merge them by replacing the final output item with
6137                    # the merged data.  Repeated 25 times, this gets A-Z.  But
6138                    # we also have to make sure we don't screw up cases where
6139                    # we have internally stored
6140                    #       ( 0x1C4 .. 0x1C6 ) => 0x1C5
6141                    # This single internal range has to be output as 3 ranges,
6142                    # which is done by splitting, like we do for $range_size_1
6143                    # tables.  (There are very few of such ranges that need to
6144                    # be split, so the gain of doing the combining of other
6145                    # ranges far outweighs the splitting of these.)  The
6146                    # values to use for the redo at the end of this block are
6147                    # set up just below in the scalars whose names begin with
6148                    # '$next_'.
6149
6150                    if (($use_adjustments || $range_size_1) && $end != $start)
6151                    {
6152                        $next_start = $start + 1;
6153                        $next_end = $end;
6154                        $next_value = $value;
6155                        $end = $start;
6156                    }
6157
6158                    if ($use_adjustments && ! $range_size_1) {
6159
6160                        # If this range is adjacent to the previous one, and
6161                        # the values in each are integers that are also
6162                        # adjacent (differ by 1), then this range really
6163                        # extends the previous one that is already in element
6164                        # $OUT[-1].  So we pop that element, and pretend that
6165                        # the range starts with whatever it started with.
6166                        # $offset is incremented by 1 each time so that it
6167                        # gives the current offset from the first element in
6168                        # the accumulating range, and we keep in $value the
6169                        # value of that first element.
6170                        if ($start == $previous_end + 1
6171                            && $value =~ /^ -? \d+ $/xa
6172                            && $previous_value =~ /^ -? \d+ $/xa
6173                            && ($value == ($previous_value + ++$offset)))
6174                        {
6175                            pop @OUT;
6176                            $start = $previous_start;
6177                            $value = $previous_value;
6178                        }
6179                        else {
6180                            $offset = 0;
6181                            if (@annotation == 1) {
6182                                $OUT[-1] = merge_single_annotation_line(
6183                                    $OUT[-1], $annotation[0], $comment_indent);
6184                            }
6185                            else {
6186                                push @OUT, @annotation;
6187                            }
6188                        }
6189                        undef @annotation;
6190
6191                        # Save the current values for the next time through
6192                        # the loop.
6193                        $previous_start = $start;
6194                        $previous_end = $end;
6195                        $previous_value = $value;
6196                    }
6197
6198                    if ($write_as_invlist) {
6199                        if (   $previous_end > 0
6200                            && $output_range_counts{$addr})
6201                        {
6202                            my $complement_count = $start - $previous_end - 1;
6203                            if ($complement_count > 1) {
6204                                $OUT[-1] = merge_single_annotation_line(
6205                                    $OUT[-1],
6206                                       "#"
6207                                     . (" " x 17)
6208                                     . "["
6209                                     .  main::clarify_code_point_count(
6210                                                            $complement_count)
6211                                      . "] in complement\n",
6212                                    $comment_indent);
6213                            }
6214                        }
6215
6216                        # Inversion list format has a single number per line,
6217                        # the starting code point of a range that matches the
6218                        # property
6219                        push @OUT, $start, "\n";
6220                        $invlist_count++;
6221
6222                        # Add a comment with the size of the range, if
6223                        # requested.
6224                        if ($output_range_counts{$addr}) {
6225                            $OUT[-1] = merge_single_annotation_line(
6226                                    $OUT[-1],
6227                                    "# ["
6228                                      . main::clarify_code_point_count($end - $start + 1)
6229                                      . "]\n",
6230                                    $comment_indent);
6231                        }
6232                    }
6233                    elsif ($start != $end) { # If there is a range
6234                        if ($end == $MAX_WORKING_CODEPOINT) {
6235                            push @OUT, sprintf "$hex_format\t$hex_format",
6236                                                $start,
6237                                                $MAX_PLATFORM_CODEPOINT;
6238                        }
6239                        else {
6240                            push @OUT, sprintf "$hex_format\t$hex_format",
6241                                                $start,       $end;
6242                        }
6243                        if (length $value) {
6244                            if ($convert_map_to_from_hex) {
6245                                $OUT[-1] .= sprintf "\t$hex_format\n", $value;
6246                            }
6247                            else {
6248                                $OUT[-1] .= "\t$value\n";
6249                            }
6250                        }
6251
6252                        # Add a comment with the size of the range, if
6253                        # requested.
6254                        if ($output_range_counts{$addr}) {
6255                            $OUT[-1] = merge_single_annotation_line(
6256                                    $OUT[-1],
6257                                    "# ["
6258                                      . main::clarify_code_point_count($end - $start + 1)
6259                                      . "]\n",
6260                                    $comment_indent);
6261                        }
6262                    }
6263                    else { # Here to output a single code point per line.
6264
6265                        # Use any passed in subroutine to output.
6266                        if (ref $range_size_1 eq 'CODE') {
6267                            for my $i ($start .. $end) {
6268                                push @OUT, &{$range_size_1}($i, $value);
6269                            }
6270                        }
6271                        else {
6272
6273                            # Here, caller is ok with default output.
6274                            for (my $i = $start; $i <= $end; $i++) {
6275                                if ($convert_map_to_from_hex) {
6276                                    push @OUT,
6277                                        sprintf "$hex_format\t\t$hex_format\n",
6278                                                 $i,            $value;
6279                                }
6280                                else {
6281                                    push @OUT, sprintf $hex_format, $i;
6282                                    $OUT[-1] .= "\t\t$value" if $value ne "";
6283                                    $OUT[-1] .= "\n";
6284                                }
6285                            }
6286                        }
6287                    }
6288
6289                    if ($annotate) {
6290                        for (my $i = $start; $i <= $end; $i++) {
6291                            my $annotation = "";
6292
6293                            # Get character information if don't have it already
6294                            main::populate_char_info($i)
6295                                                     if ! defined $viacode[$i];
6296                            my $type = $annotate_char_type[$i];
6297
6298                            # Figure out if should output the next code points
6299                            # as part of a range or not.  If this is not in an
6300                            # annotation range, then won't output as a range,
6301                            # so returns $i.  Otherwise use the end of the
6302                            # annotation range, but no further than the
6303                            # maximum possible end point of the loop.
6304                            my $range_end =
6305                                        $range_size_1
6306                                        ? $start
6307                                        : main::min(
6308                                          $annotate_ranges->value_of($i) || $i,
6309                                          $end);
6310
6311                            # Use a range if it is a range, and either is one
6312                            # of the special annotation ranges, or the range
6313                            # is at most 3 long.  This last case causes the
6314                            # algorithmically named code points to be output
6315                            # individually in spans of at most 3, as they are
6316                            # the ones whose $type is > 0.
6317                            if ($range_end != $i
6318                                && ( $type < 0 || $range_end - $i > 2))
6319                            {
6320                                # Here is to output a range.  We don't allow a
6321                                # caller-specified output format--just use the
6322                                # standard one.
6323                                my $range_name = $viacode[$i];
6324
6325                                # For the code points which end in their hex
6326                                # value, we eliminate that from the output
6327                                # annotation, and capitalize only the first
6328                                # letter of each word.
6329                                if ($type == $CP_IN_NAME) {
6330                                    my $hex = sprintf $hex_format, $i;
6331                                    $range_name =~ s/-$hex$//;
6332                                    my @words = split " ", $range_name;
6333                                    for my $word (@words) {
6334                                        $word =
6335                                          ucfirst(lc($word)) if $word ne 'CJK';
6336                                    }
6337                                    $range_name = join " ", @words;
6338                                }
6339                                elsif ($type == $HANGUL_SYLLABLE) {
6340                                    $range_name = "Hangul Syllable";
6341                                }
6342
6343                                # If the annotation would just repeat what's
6344                                # already being output as the range, skip it.
6345                                # (When an inversion list is being written, it
6346                                # isn't a repeat, as that always is in
6347                                # decimal)
6348                                if (   $write_as_invlist
6349                                    || $i != $start
6350                                    || $range_end < $end)
6351                                {
6352                                    if ($range_end < $MAX_WORKING_CODEPOINT)
6353                                    {
6354                                        $annotation = sprintf "%04X..%04X",
6355                                                              $i,   $range_end;
6356                                    }
6357                                    else {
6358                                        $annotation = sprintf "%04X..INFINITY",
6359                                                               $i;
6360                                    }
6361                                }
6362                                else { # Indent if not displaying code points
6363                                    $annotation = " " x 4;
6364                                }
6365
6366                                if ($range_name) {
6367                                    $annotation .= " $age[$i]" if $age[$i];
6368                                    $annotation .= " $range_name";
6369                                }
6370
6371                                # Include the number of code points in the
6372                                # range
6373                                my $count =
6374                                    main::clarify_code_point_count($range_end - $i + 1);
6375                                $annotation .= " [$count]\n";
6376
6377                                # Skip to the end of the range
6378                                $i = $range_end;
6379                            }
6380                            else { # Not in a range.
6381                                my $comment = "";
6382
6383                                # When outputting the names of each character,
6384                                # use the character itself if printable
6385                                $comment .= "'" . main::display_chr($i) . "' "
6386                                                            if $printable[$i];
6387
6388                                my $output_value = $value;
6389
6390                                # Determine the annotation
6391                                if ($format eq $DECOMP_STRING_FORMAT) {
6392
6393                                    # This is very specialized, with the type
6394                                    # of decomposition beginning the line
6395                                    # enclosed in <...>, and the code points
6396                                    # that the code point decomposes to
6397                                    # separated by blanks.  Create two
6398                                    # strings, one of the printable
6399                                    # characters, and one of their official
6400                                    # names.
6401                                    (my $map = $output_value)
6402                                                    =~ s/ \ * < .*? > \ +//x;
6403                                    my $tostr = "";
6404                                    my $to_name = "";
6405                                    my $to_chr = "";
6406                                    foreach my $to (split " ", $map) {
6407                                        $to = CORE::hex $to;
6408                                        $to_name .= " + " if $to_name;
6409                                        $to_chr .= main::display_chr($to);
6410                                        main::populate_char_info($to)
6411                                                    if ! defined $viacode[$to];
6412                                        $to_name .=  $viacode[$to];
6413                                    }
6414
6415                                    $comment .=
6416                                    "=> '$to_chr'; $viacode[$i] => $to_name";
6417                                }
6418                                else {
6419                                    $output_value += $i - $start
6420                                                   if $use_adjustments
6421                                                      # Don't try to adjust a
6422                                                      # non-integer
6423                                                   && $output_value !~ /[-\D]/;
6424
6425                                    if ($output_map_in_hex) {
6426                                        main::populate_char_info($output_value)
6427                                          if ! defined $viacode[$output_value];
6428                                        $comment .= " => '"
6429                                        . main::display_chr($output_value)
6430                                        . "'; " if $printable[$output_value];
6431                                    }
6432                                    if ($include_name && $viacode[$i]) {
6433                                        $comment .= " " if $comment;
6434                                        $comment .= $viacode[$i];
6435                                    }
6436                                    if ($output_map_in_hex) {
6437                                        $comment .=
6438                                                " => $viacode[$output_value]"
6439                                                    if $viacode[$output_value];
6440                                        $output_value = sprintf($hex_format,
6441                                                                $output_value);
6442                                    }
6443                                }
6444
6445                                if ($include_cp) {
6446                                    $annotation = sprintf "%04X %s", $i, $age[$i];
6447                                    if ($use_adjustments) {
6448                                        $annotation .= " => $output_value";
6449                                    }
6450                                }
6451
6452                                if ($comment ne "") {
6453                                    $annotation .= " " if $annotation ne "";
6454                                    $annotation .= $comment;
6455                                }
6456                                $annotation .= "\n" if $annotation ne "";
6457                            }
6458
6459                            if ($annotation ne "") {
6460                                push @annotation, (" " x $comment_indent)
6461                                                  .  "# $annotation";
6462                            }
6463                        }
6464
6465                        # If not adjusting, we don't have to go through the
6466                        # loop again to know that the annotation comes next
6467                        # in the output.
6468                        if (! $use_adjustments) {
6469                            if (@annotation == 1) {
6470                                $OUT[-1] = merge_single_annotation_line(
6471                                    $OUT[-1], $annotation[0], $comment_indent);
6472                            }
6473                            else {
6474                                push @OUT, map { Text::Tabs::unexpand $_ }
6475                                               @annotation;
6476                            }
6477                            undef @annotation;
6478                        }
6479                    }
6480
6481                    # Add the beginning of the range that doesn't match the
6482                    # property, except if the just added match range extends
6483                    # to infinity.  We do this after any annotations for the
6484                    # match range.
6485                    if ($write_as_invlist && $end < $MAX_WORKING_CODEPOINT) {
6486                        push @OUT, $end + 1, "\n";
6487                        $invlist_count++;
6488                    }
6489
6490                    # If we split the range, set up so the next time through
6491                    # we get the remainder, and redo.
6492                    if ($next_start) {
6493                        $start = $next_start;
6494                        $end = $next_end;
6495                        $value = $next_value;
6496                        $next_start = 0;
6497                        redo;
6498                    }
6499                } # End of redo block
6500            } # End of loop through all the table's ranges
6501
6502            push @OUT, @annotation; # Add orphaned annotation, if any
6503
6504            splice @OUT, 1, 0, "V$invlist_count\n" if $invlist_count;
6505        }
6506
6507        # Add anything that goes after the main body, but within the here
6508        # document,
6509        my $append_to_body = $self->append_to_body;
6510        push @OUT, $append_to_body if $append_to_body;
6511
6512        # And finish the here document.
6513        push @OUT, "END\n";
6514
6515        # Done with the main portion of the body.  Can now figure out what
6516        # should appear before it in the file.
6517        my $pre_body = $self->pre_body;
6518        push @HEADER, $pre_body, "\n" if $pre_body;
6519
6520        # All these files should have a .pl suffix added to them.
6521        my @file_with_pl = @{$file_path{$addr}};
6522        $file_with_pl[-1] .= '.pl';
6523
6524        main::write(\@file_with_pl,
6525                    $annotate,      # utf8 iff annotating
6526                    \@HEADER,
6527                    \@OUT);
6528        return;
6529    }
6530
6531    sub set_status($self, $status, $info) {    # Set the table's status
6532        # status The status enum value
6533        # info Any message associated with it.
6534        my $addr = pack 'J', refaddr $self;
6535
6536        $status{$addr} = $status;
6537        $status_info{$addr} = $info;
6538        return;
6539    }
6540
6541    sub set_fate($self, $fate, $reason=undef) {  # Set the fate of a table
6542        my $addr = pack 'J', refaddr $self;
6543
6544        return if $fate{$addr} == $fate;    # If no-op
6545
6546        # Can only change the ordinary fate, except if going to $MAP_PROXIED
6547        return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED;
6548
6549        $fate{$addr} = $fate;
6550
6551        # Don't document anything to do with a non-normal fated table
6552        if ($fate != $ORDINARY) {
6553            my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0;
6554            foreach my $alias ($self->aliases) {
6555                $alias->set_ucd($put_in_pod);
6556
6557                # MAP_PROXIED doesn't affect the match tables
6558                next if $fate == $MAP_PROXIED;
6559                $alias->set_make_re_pod_entry($put_in_pod);
6560            }
6561        }
6562
6563        # Save the reason for suppression for output
6564        if ($fate >= $SUPPRESSED) {
6565            $reason = "" unless defined $reason;
6566            $why_suppressed{$complete_name{$addr}} = $reason;
6567        }
6568
6569        return;
6570    }
6571
6572    sub lock($self) {
6573        # Don't allow changes to the table from now on.  This stores a stack
6574        # trace of where it was called, so that later attempts to modify it
6575        # can immediately show where it got locked.
6576        my $addr = pack 'J', refaddr $self;
6577
6578        $locked{$addr} = "";
6579
6580        my $line = (caller(0))[2];
6581        my $i = 1;
6582
6583        # Accumulate the stack trace
6584        while (1) {
6585            my ($pkg, $file, $caller_line, $caller) = caller $i++;
6586
6587            last unless defined $caller;
6588
6589            $locked{$addr} .= "    called from $caller() at line $line\n";
6590            $line = $caller_line;
6591        }
6592        $locked{$addr} .= "    called from main at line $line\n";
6593
6594        return;
6595    }
6596
6597    sub carp_if_locked($self) {
6598        # Return whether a table is locked or not, and, by the way, complain
6599        # if is locked
6600        my $addr = pack 'J', refaddr $self;
6601
6602        return 0 if ! $locked{$addr};
6603        Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
6604        return 1;
6605    }
6606
6607    sub set_file_path($self, @path) { # Set the final directory path for this table
6608        @{$file_path{pack 'J', refaddr $self}} = @path;
6609        return
6610    }
6611
6612    # Accessors for the range list stored in this table.  First for
6613    # unconditional
6614    for my $sub (qw(
6615                    containing_range
6616                    contains
6617                    count
6618                    each_range
6619                    hash
6620                    is_empty
6621                    matches_identically_to
6622                    max
6623                    min
6624                    range_count
6625                    reset_each_range
6626                    type_of
6627                    value_of
6628                ))
6629    {
6630        no strict "refs";
6631        *$sub = sub {
6632            use strict "refs";
6633            my $self = shift;
6634            return $self->_range_list->$sub(@_);
6635        }
6636    }
6637
6638    # Then for ones that should fail if locked
6639    for my $sub (qw(
6640                    delete_range
6641                ))
6642    {
6643        no strict "refs";
6644        *$sub = sub {
6645            use strict "refs";
6646            my $self = shift;
6647
6648            return if $self->carp_if_locked;
6649            no overloading;
6650            return $self->_range_list->$sub(@_);
6651        }
6652    }
6653
6654} # End closure
6655
6656package Map_Table;
6657use parent '-norequire', '_Base_Table';
6658
6659# A Map Table is a table that contains the mappings from code points to
6660# values.  There are two weird cases:
6661# 1) Anomalous entries are ones that aren't maps of ranges of code points, but
6662#    are written in the table's file at the end of the table nonetheless.  It
6663#    requires specially constructed code to handle these; utf8.c can not read
6664#    these in, so they should not go in $map_directory.  As of this writing,
6665#    the only case that these happen is for named sequences used in
6666#    charnames.pm.   But this code doesn't enforce any syntax on these, so
6667#    something else could come along that uses it.
6668# 2) Specials are anything that doesn't fit syntactically into the body of the
6669#    table.  The ranges for these have a map type of non-zero.  The code below
6670#    knows about and handles each possible type.   In most cases, these are
6671#    written as part of the header.
6672#
6673# A map table deliberately can't be manipulated at will unlike match tables.
6674# This is because of the ambiguities having to do with what to do with
6675# overlapping code points.  And there just isn't a need for those things;
6676# what one wants to do is just query, add, replace, or delete mappings, plus
6677# write the final result.
6678# However, there is a method to get the list of possible ranges that aren't in
6679# this table to use for defaulting missing code point mappings.  And,
6680# map_add_or_replace_non_nulls() does allow one to add another table to this
6681# one, but it is clearly very specialized, and defined that the other's
6682# non-null values replace this one's if there is any overlap.
6683
6684sub trace { return main::trace(@_); }
6685
6686{ # Closure
6687
6688    main::setup_package();
6689
6690    my %default_map;
6691    # Many input files omit some entries; this gives what the mapping for the
6692    # missing entries should be
6693    main::set_access('default_map', \%default_map, 'r');
6694
6695    my %anomalous_entries;
6696    # Things that go in the body of the table which don't fit the normal
6697    # scheme of things, like having a range.  Not much can be done with these
6698    # once there except to output them.  This was created to handle named
6699    # sequences.
6700    main::set_access('anomalous_entry', \%anomalous_entries, 'a');
6701    main::set_access('anomalous_entries',       # Append singular, read plural
6702                    \%anomalous_entries,
6703                    'readable_array');
6704    my %to_output_map;
6705    # Enum as to whether or not to write out this map table, and how:
6706    #   0               don't output
6707    #   $EXTERNAL_MAP   means its existence is noted in the documentation, and
6708    #                   it should not be removed nor its format changed.  This
6709    #                   is done for those files that have traditionally been
6710    #                   output.
6711    #   $INTERNAL_MAP   means Perl reserves the right to do anything it wants
6712    #                   with this file
6713    #   $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of
6714    #                   outputting the actual mappings as-is, we adjust things
6715    #                   to create a much more compact table. Only those few
6716    #                   tables where the mapping is convertible at least to an
6717    #                   integer and compacting makes a big difference should
6718    #                   have this.  Hence, the default is to not do this
6719    #                   unless the table's default mapping is to $CODE_POINT,
6720    #                   and the range size is not 1.
6721    main::set_access('to_output_map', \%to_output_map, 's');
6722
6723    sub new {
6724        my $class = shift;
6725        my $name = shift;
6726
6727        my %args = @_;
6728
6729        # Optional initialization data for the table.
6730        my $initialize = delete $args{'Initialize'};
6731
6732        my $default_map = delete $args{'Default_Map'};
6733        my $property = delete $args{'_Property'};
6734        my $full_name = delete $args{'Full_Name'};
6735        my $to_output_map = delete $args{'To_Output_Map'};
6736
6737        # Rest of parameters passed on
6738
6739        my $range_list = Range_Map->new(Owner => $property);
6740
6741        my $self = $class->SUPER::new(
6742                                    Name => $name,
6743                                    Complete_Name =>  $full_name,
6744                                    Full_Name => $full_name,
6745                                    _Property => $property,
6746                                    _Range_List => $range_list,
6747                                    Write_As_Invlist => 0,
6748                                    %args);
6749
6750        my $addr = pack 'J', refaddr $self;
6751
6752        $anomalous_entries{$addr} = [];
6753        $default_map{$addr} = $default_map;
6754        $to_output_map{$addr} = $to_output_map;
6755
6756        $self->initialize($initialize) if defined $initialize;
6757
6758        return $self;
6759    }
6760
6761    use overload
6762        fallback => 0,
6763        qw("") => "_operator_stringify",
6764    ;
6765
6766    sub _operator_stringify($self, $other="", $reversed=0) {
6767
6768        my $name = $self->property->full_name;
6769        $name = '""' if $name eq "";
6770        return "Map table for Property '$name'";
6771    }
6772
6773    sub add_alias {
6774        # Add a synonym for this table (which means the property itself)
6775        my $self = shift;
6776        my $name = shift;
6777        # Rest of parameters passed on.
6778
6779        $self->SUPER::add_alias($name, $self->property, @_);
6780        return;
6781    }
6782
6783    sub add_map {
6784        # Add a range of code points to the list of specially-handled code
6785        # points.  0 is assumed if the type of special is not passed
6786        # in.
6787
6788        my $self = shift;
6789        my $lower = shift;
6790        my $upper = shift;
6791        my $string = shift;
6792        my %args = @_;
6793
6794        my $type = delete $args{'Type'} || 0;
6795        # Rest of parameters passed on
6796
6797        # Can't change the table if locked.
6798        return if $self->carp_if_locked;
6799
6800        $self->_range_list->add_map($lower, $upper,
6801                                    $string,
6802                                    @_,
6803                                    Type => $type);
6804        return;
6805    }
6806
6807    sub append_to_body($self) {
6808        # Adds to the written HERE document of the table's body any anomalous
6809        # entries in the table..
6810        my $addr = pack 'J', refaddr $self;
6811
6812        return "" unless @{$anomalous_entries{$addr}};
6813        return join("\n", @{$anomalous_entries{$addr}}) . "\n";
6814    }
6815
6816    sub map_add_or_replace_non_nulls($self, $other) {
6817        # This adds the mappings in the table $other to $self.  Non-null
6818        # mappings from $other override those in $self.  It essentially merges
6819        # the two tables, with the second having priority except for null
6820        # mappings.
6821        return if $self->carp_if_locked;
6822
6823        if (! $other->isa(__PACKAGE__)) {
6824            Carp::my_carp_bug("$other should be a "
6825                        . __PACKAGE__
6826                        . ".  Not a '"
6827                        . ref($other)
6828                        . "'.  Not added;");
6829            return;
6830        }
6831
6832        local $to_trace = 0 if main::DEBUG;
6833
6834        my $self_range_list = $self->_range_list;
6835        my $other_range_list = $other->_range_list;
6836        foreach my $range ($other_range_list->ranges) {
6837            my $value = $range->value;
6838            next if $value eq "";
6839            $self_range_list->_add_delete('+',
6840                                          $range->start,
6841                                          $range->end,
6842                                          $value,
6843                                          Type => $range->type,
6844                                          Replace => $UNCONDITIONALLY);
6845        }
6846
6847        return;
6848    }
6849
6850    sub set_default_map($self, $map, $use_full_name=0) {
6851        # Define what code points that are missing from the input files should
6852        # map to.  The optional second parameter 'full_name' indicates to
6853        # force using the full name of the map instead of its standard name.
6854        if ($use_full_name && $use_full_name ne 'full_name') {
6855            Carp::my_carp_bug("Second parameter to set_default_map() if"
6856                            . " present, must be 'full_name'");
6857        }
6858
6859        my $addr = pack 'J', refaddr $self;
6860
6861        # Convert the input to the standard equivalent, if any (won't have any
6862        # for $STRING properties)
6863        my $standard = $self->property->table($map);
6864        if (defined $standard) {
6865            $map = ($use_full_name)
6866                   ? $standard->full_name
6867                   : $standard->name;
6868        }
6869
6870        # Warn if there already is a non-equivalent default map for this
6871        # property.  Note that a default map can be a ref, which means that
6872        # what it actually means is delayed until later in the program, and it
6873        # IS permissible to override it here without a message.
6874        my $default_map = $default_map{$addr};
6875        if (defined $default_map
6876            && ! ref($default_map)
6877            && $default_map ne $map
6878            && main::Standardize($map) ne $default_map)
6879        {
6880            my $property = $self->property;
6881            my $map_table = $property->table($map);
6882            my $default_table = $property->table($default_map);
6883            if (defined $map_table
6884                && defined $default_table
6885                && $map_table != $default_table)
6886            {
6887                Carp::my_carp("Changing the default mapping for "
6888                            . $property
6889                            . " from $default_map to $map'");
6890            }
6891        }
6892
6893        $default_map{$addr} = $map;
6894
6895        # Don't also create any missing table for this map at this point,
6896        # because if we did, it could get done before the main table add is
6897        # done for PropValueAliases.txt; instead the caller will have to make
6898        # sure it exists, if desired.
6899        return;
6900    }
6901
6902    sub to_output_map($self) {
6903        # Returns boolean: should we write this map table?
6904        my $addr = pack 'J', refaddr $self;
6905
6906        # If overridden, use that
6907        return $to_output_map{$addr} if defined $to_output_map{$addr};
6908
6909        my $full_name = $self->full_name;
6910        return $global_to_output_map{$full_name}
6911                                if defined $global_to_output_map{$full_name};
6912
6913        # If table says to output, do so; if says to suppress it, do so.
6914        my $fate = $self->fate;
6915        return $INTERNAL_MAP if $fate == $INTERNAL_ONLY;
6916        return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
6917        return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED;
6918
6919        my $type = $self->property->type;
6920
6921        # Don't want to output binary map tables even for debugging.
6922        return 0 if $type == $BINARY;
6923
6924        # But do want to output string ones.  All the ones that remain to
6925        # be dealt with (i.e. which haven't explicitly been set to external)
6926        # are for internal Perl use only.  The default for those that map to
6927        # $CODE_POINT and haven't been restricted to a single element range
6928        # is to use the adjusted form.
6929        if ($type == $STRING) {
6930            return $INTERNAL_MAP if $self->range_size_1
6931                                    || $default_map{$addr} ne $CODE_POINT;
6932            return $OUTPUT_ADJUSTED;
6933        }
6934
6935        # Otherwise is an $ENUM, do output it, for Perl's purposes
6936        return $INTERNAL_MAP;
6937    }
6938
6939    sub inverse_list($self) {
6940        # Returns a Range_List that is gaps of the current table.  That is,
6941        # the inversion
6942        my $current = Range_List->new(Initialize => $self->_range_list,
6943                                Owner => $self->property);
6944        return ~ $current;
6945    }
6946
6947    sub header($self) {
6948        my $return = $self->SUPER::header();
6949
6950        if ($self->to_output_map >= $INTERNAL_MAP) {
6951            $return .= $INTERNAL_ONLY_HEADER;
6952        }
6953        else {
6954            # Other properties have fixed formats.
6955            my $property_name = $self->property->full_name;
6956
6957            $return .= <<END;
6958
6959# !!!!!!!   IT IS DEPRECATED TO USE THIS FILE   !!!!!!!
6960
6961# This file is for internal use by core Perl only.  It is retained for
6962# backwards compatibility with applications that may have come to rely on it,
6963# but its format and even its name or existence are subject to change without
6964# notice in a future Perl version.  Don't use it directly.  Instead, its
6965# contents are now retrievable through a stable API in the Unicode::UCD
6966# module: Unicode::UCD::prop_invmap('$property_name') (Values for individual
6967# code points can be retrieved via Unicode::UCD::charprop());
6968END
6969        }
6970        return $return;
6971    }
6972
6973    sub set_final_comment($self) {
6974        # Just before output, create the comment that heads the file
6975        # containing this table.
6976
6977        return unless $debugging_build;
6978
6979        # No sense generating a comment if aren't going to write it out.
6980        return if ! $self->to_output_map;
6981
6982        my $addr = pack 'J', refaddr $self;
6983
6984        my $property = $self->property;
6985
6986        # Get all the possible names for this property.  Don't use any that
6987        # aren't ok for use in a file name, etc.  This is perhaps causing that
6988        # flag to do double duty, and may have to be changed in the future to
6989        # have our own flag for just this purpose; but it works now to exclude
6990        # Perl generated synonyms from the lists for properties, where the
6991        # name is always the proper Unicode one.
6992        my @property_aliases = grep { $_->ok_as_filename } $self->aliases;
6993
6994        my $count = $self->count;
6995        my $default_map = $default_map{$addr};
6996
6997        # The ranges that map to the default aren't output, so subtract that
6998        # to get those actually output.  A property with matching tables
6999        # already has the information calculated.
7000        if ($property->type != $STRING && $property->type != $FORCED_BINARY) {
7001            $count -= $property->table($default_map)->count;
7002        }
7003        elsif (defined $default_map) {
7004
7005            # But for $STRING properties, must calculate now.  Subtract the
7006            # count from each range that maps to the default.
7007            foreach my $range ($self->_range_list->ranges) {
7008                if ($range->value eq $default_map) {
7009                    $count -= $range->end +1 - $range->start;
7010                }
7011            }
7012
7013        }
7014
7015        # Get a  string version of $count with underscores in large numbers,
7016        # for clarity.
7017        my $string_count = main::clarify_code_point_count($count);
7018
7019        my $code_points = ($count == 1)
7020                        ? 'single code point'
7021                        : "$string_count code points";
7022
7023        my $mapping;
7024        my $these_mappings;
7025        my $are;
7026        if (@property_aliases <= 1) {
7027            $mapping = 'mapping';
7028            $these_mappings = 'this mapping';
7029            $are = 'is'
7030        }
7031        else {
7032            $mapping = 'synonymous mappings';
7033            $these_mappings = 'these mappings';
7034            $are = 'are'
7035        }
7036        my $cp;
7037        if ($count >= $MAX_UNICODE_CODEPOINTS) {
7038            $cp = "any code point in Unicode Version $string_version";
7039        }
7040        else {
7041            my $map_to;
7042            if ($default_map eq "") {
7043                $map_to = 'the empty string';
7044            }
7045            elsif ($default_map eq $CODE_POINT) {
7046                $map_to = "itself";
7047            }
7048            else {
7049                $map_to = "'$default_map'";
7050            }
7051            if ($count == 1) {
7052                $cp = "the single code point";
7053            }
7054            else {
7055                $cp = "one of the $code_points";
7056            }
7057            $cp .= " in Unicode Version $unicode_version for which the mapping is not to $map_to";
7058        }
7059
7060        my $comment = "";
7061
7062        my $status = $self->status;
7063        if ($status ne $NORMAL) {
7064            my $warn = uc $status_past_participles{$status};
7065            $comment .= <<END;
7066
7067!!!!!!!   $warn !!!!!!!!!!!!!!!!!!!
7068 All property or property=value combinations contained in this file are $warn.
7069 See $unicode_reference_url for what this means.
7070
7071END
7072        }
7073        $comment .= "This file returns the $mapping:\n";
7074
7075        my $ucd_accessible_name = "";
7076        my $has_underscore_name = 0;
7077        my $full_name = $self->property->full_name;
7078        for my $i (0 .. @property_aliases - 1) {
7079            my $name = $property_aliases[$i]->name;
7080            $has_underscore_name = 1 if $name =~ /^_/;
7081            $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)');
7082            if ($property_aliases[$i]->ucd) {
7083                if ($name eq $full_name) {
7084                    $ucd_accessible_name = $full_name;
7085                }
7086                elsif (! $ucd_accessible_name) {
7087                    $ucd_accessible_name = $name;
7088                }
7089            }
7090        }
7091        $comment .= "\nwhere 'cp' is $cp.";
7092        if ($ucd_accessible_name) {
7093            $comment .= "  Note that $these_mappings";
7094            if ($has_underscore_name) {
7095                $comment .= " (except for the one(s) that begin with an underscore)";
7096            }
7097            $comment .= " $are accessible via the functions prop_invmap('$full_name') or charprop() in Unicode::UCD";
7098
7099        }
7100
7101        # And append any commentary already set from the actual property.
7102        $comment .= "\n\n" . $self->comment if $self->comment;
7103        if ($self->description) {
7104            $comment .= "\n\n" . join " ", $self->description;
7105        }
7106        if ($self->note) {
7107            $comment .= "\n\n" . join " ", $self->note;
7108        }
7109        $comment .= "\n";
7110
7111        if (! $self->perl_extension) {
7112            $comment .= <<END;
7113
7114For information about what this property really means, see:
7115$unicode_reference_url
7116END
7117        }
7118
7119        if ($count) {        # Format differs for empty table
7120                $comment.= "\nThe format of the ";
7121            if ($self->range_size_1) {
7122                $comment.= <<END;
7123main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
7124is in hex; MAPPING is what CODE_POINT maps to.
7125END
7126            }
7127            else {
7128
7129                # There are tables which end up only having one element per
7130                # range, but it is not worth keeping track of for making just
7131                # this comment a little better.
7132                $comment .= <<END;
7133non-comment portions of the main body of lines of this file is:
7134START\\tSTOP\\tMAPPING where START is the starting code point of the
7135range, in hex; STOP is the ending point, or if omitted, the range has just one
7136code point; MAPPING is what each code point between START and STOP maps to.
7137END
7138                if ($self->output_range_counts) {
7139                    $comment .= <<END;
7140Numbers in comments in [brackets] indicate how many code points are in the
7141range (omitted when the range is a single code point or if the mapping is to
7142the null string).
7143END
7144                }
7145            }
7146        }
7147        $self->set_comment(main::join_lines($comment));
7148        return;
7149    }
7150
7151    my %swash_keys; # Makes sure don't duplicate swash names.
7152
7153    # The remaining variables are temporaries used while writing each table,
7154    # to output special ranges.
7155    my @multi_code_point_maps;  # Map is to more than one code point.
7156
7157    sub handle_special_range($self, $range) {
7158        # Called in the middle of write when it finds a range it doesn't know
7159        # how to handle.
7160
7161        my $addr = pack 'J', refaddr $self;
7162
7163        my $type = $range->type;
7164
7165        my $low = $range->start;
7166        my $high = $range->end;
7167        my $map = $range->value;
7168
7169        # No need to output the range if it maps to the default.
7170        return if $map eq $default_map{$addr};
7171
7172        my $property = $self->property;
7173
7174        # Switch based on the map type...
7175        if ($type == $HANGUL_SYLLABLE) {
7176
7177            # These are entirely algorithmically determinable based on
7178            # some constants furnished by Unicode; for now, just set a
7179            # flag to indicate that have them.  After everything is figured
7180            # out, we will output the code that does the algorithm.  (Don't
7181            # output them if not needed because we are suppressing this
7182            # property.)
7183            $has_hangul_syllables = 1 if $property->to_output_map;
7184        }
7185        elsif ($type == $CP_IN_NAME) {
7186
7187            # Code points whose name ends in their code point are also
7188            # algorithmically determinable, but need information about the map
7189            # to do so.  Both the map and its inverse are stored in data
7190            # structures output in the file.  They are stored in the mean time
7191            # in global lists The lists will be written out later into Name.pm,
7192            # which is created only if needed.  In order to prevent duplicates
7193            # in the list, only add to them for one property, should multiple
7194            # ones need them.
7195            if ($needing_code_points_ending_in_code_point == 0) {
7196                $needing_code_points_ending_in_code_point = $property;
7197            }
7198            if ($property == $needing_code_points_ending_in_code_point) {
7199                push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
7200                push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
7201
7202                my $squeezed = $map =~ s/[-\s]+//gr;
7203                push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}},
7204                                                                          $low;
7205                push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}},
7206                                                                         $high;
7207
7208                # Calculate the set of legal characters in names of this
7209                # series.  It includes every character in the name prefix.
7210                my %legal;
7211                $legal{$_} = 1 for split //, $map;
7212
7213                # Plus the hex code point chars, blank, and minus.  Also \n
7214                # can show up as being required due to anchoring
7215                for my $i ('0' .. '9', 'A' .. 'F', '-', ' ', "\n") {
7216                    $legal{$i} = 1;
7217                }
7218                my $legal = join "", sort { $a cmp $b } keys %legal;
7219
7220                # The legal chars can be used in match optimizations
7221                push @code_points_ending_in_code_point, { low => $low,
7222                                                        high => $high,
7223                                                        name => $map,
7224                                                        legal => $legal,
7225                                                        };
7226            }
7227        }
7228        elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
7229
7230            # Multi-code point maps and null string maps have an entry
7231            # for each code point in the range.  They use the same
7232            # output format.
7233            for my $code_point ($low .. $high) {
7234
7235                # The pack() below can't cope with surrogates.  XXX This may
7236                # no longer be true
7237                if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
7238                    Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self.  No map created");
7239                    next;
7240                }
7241
7242                # Generate the hash entries for these in the form that
7243                # utf8.c understands.
7244                my $tostr = "";
7245                my $to_name = "";
7246                my $to_chr = "";
7247                foreach my $to (split " ", $map) {
7248                    if ($to !~ /^$code_point_re$/) {
7249                        Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self.  No map created");
7250                        next;
7251                    }
7252                    $tostr .= sprintf "\\x{%s}", $to;
7253                    $to = CORE::hex $to;
7254                    if ($annotate) {
7255                        $to_name .= " + " if $to_name;
7256                        $to_chr .= main::display_chr($to);
7257                        main::populate_char_info($to)
7258                                            if ! defined $viacode[$to];
7259                        $to_name .=  $viacode[$to];
7260                    }
7261                }
7262
7263                # The unpack yields a list of the bytes that comprise the
7264                # UTF-8 of $code_point, which are each placed in \xZZ format
7265                # and output in the %s to map to $tostr, so the result looks
7266                # like:
7267                # "\xC4\xB0" => "\x{0069}\x{0307}",
7268                my $utf8 = sprintf(qq["%s" => "$tostr",],
7269                        join("", map { sprintf "\\x%02X", $_ }
7270                            unpack("U0C*", chr $code_point)));
7271
7272                # Add a comment so that a human reader can more easily
7273                # see what's going on.
7274                push @multi_code_point_maps,
7275                        sprintf("%-45s # U+%04X", $utf8, $code_point);
7276                if (! $annotate) {
7277                    $multi_code_point_maps[-1] .= " => $map";
7278                }
7279                else {
7280                    main::populate_char_info($code_point)
7281                                    if ! defined $viacode[$code_point];
7282                    $multi_code_point_maps[-1] .= " '"
7283                        . main::display_chr($code_point)
7284                        . "' => '$to_chr'; $viacode[$code_point] => $to_name";
7285                }
7286            }
7287        }
7288        else {
7289            Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self.  Not written");
7290        }
7291
7292        return;
7293    }
7294
7295    sub pre_body($self) {
7296        # Returns the string that should be output in the file before the main
7297        # body of this table.  It isn't called until the main body is
7298        # calculated, saving a pass.  The string includes some hash entries
7299        # identifying the format of the body, and what the single value should
7300        # be for all ranges missing from it.  It also includes any code points
7301        # which have map_types that don't go in the main table.
7302
7303        my $addr = pack 'J', refaddr $self;
7304
7305        my $name = $self->property->swash_name;
7306
7307        # Currently there is nothing in the pre_body unless a swash is being
7308        # generated.
7309        return unless defined $name;
7310
7311        if (defined $swash_keys{$name}) {
7312            Carp::my_carp(main::join_lines(<<END
7313Already created a swash name '$name' for $swash_keys{$name}.  This means that
7314the same name desired for $self shouldn't be used.  Bad News.  This must be
7315fixed before production use, but proceeding anyway
7316END
7317            ));
7318        }
7319        $swash_keys{$name} = "$self";
7320
7321        my $pre_body = "";
7322
7323        # Here we assume we were called after have gone through the whole
7324        # file.  If we actually generated anything for each map type, add its
7325        # respective header and trailer
7326        my $specials_name = "";
7327        if (@multi_code_point_maps) {
7328            $specials_name = "Unicode::UCD::ToSpec$name";
7329            $pre_body .= <<END;
7330
7331# Some code points require special handling because their mappings are each to
7332# multiple code points.  These do not appear in the main body, but are defined
7333# in the hash below.
7334
7335# Each key is the string of N bytes that together make up the UTF-8 encoding
7336# for the code point.  (i.e. the same as looking at the code point's UTF-8
7337# under "use bytes").  Each value is the UTF-8 of the translation, for speed.
7338\%$specials_name = (
7339END
7340            $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
7341        }
7342
7343        my $format = $self->format;
7344
7345        my $return = "";
7346
7347        my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
7348        if ($output_adjusted) {
7349            if ($specials_name) {
7350                $return .= <<END;
7351# The mappings in the non-hash portion of this file must be modified to get the
7352# correct values by adding the code point ordinal number to each one that is
7353# numeric.
7354END
7355            }
7356            else {
7357                $return .= <<END;
7358# The mappings must be modified to get the correct values by adding the code
7359# point ordinal number to each one that is numeric.
7360END
7361            }
7362        }
7363
7364        $return .= <<END;
7365
7366# The name this table is to be known by, with the format of the mappings in
7367# the main body of the table, and what all code points missing from this file
7368# map to.
7369\$Unicode::UCD::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
7370END
7371        if ($specials_name) {
7372            $return .= <<END;
7373\$Unicode::UCD::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
7374END
7375        }
7376        my $default_map = $default_map{$addr};
7377
7378        # For $CODE_POINT default maps and using adjustments, instead the default
7379        # becomes zero.
7380        $return .= "\$Unicode::UCD::SwashInfo{'To$name'}{'missing'} = '"
7381                .  (($output_adjusted && $default_map eq $CODE_POINT)
7382                   ? "0"
7383                   : $default_map)
7384                . "';";
7385
7386        if ($default_map eq $CODE_POINT) {
7387            $return .= ' # code point maps to itself';
7388        }
7389        elsif ($default_map eq "") {
7390            $return .= ' # code point maps to the empty string';
7391        }
7392        $return .= "\n";
7393
7394        $return .= $pre_body;
7395
7396        return $return;
7397    }
7398
7399    sub write($self) {
7400        # Write the table to the file.
7401
7402        my $addr = pack 'J', refaddr $self;
7403
7404        # Clear the temporaries
7405        undef @multi_code_point_maps;
7406
7407        # Calculate the format of the table if not already done.
7408        my $format = $self->format;
7409        my $type = $self->property->type;
7410        my $default_map = $self->default_map;
7411        if (! defined $format) {
7412            if ($type == $BINARY) {
7413
7414                # Don't bother checking the values, because we elsewhere
7415                # verify that a binary table has only 2 values.
7416                $format = $BINARY_FORMAT;
7417            }
7418            else {
7419                my @ranges = $self->_range_list->ranges;
7420
7421                # default an empty table based on its type and default map
7422                if (! @ranges) {
7423
7424                    # But it turns out that the only one we can say is a
7425                    # non-string (besides binary, handled above) is when the
7426                    # table is a string and the default map is to a code point
7427                    if ($type == $STRING && $default_map eq $CODE_POINT) {
7428                        $format = $HEX_FORMAT;
7429                    }
7430                    else {
7431                        $format = $STRING_FORMAT;
7432                    }
7433                }
7434                else {
7435
7436                    # Start with the most restrictive format, and as we find
7437                    # something that doesn't fit with that, change to the next
7438                    # most restrictive, and so on.
7439                    $format = $DECIMAL_FORMAT;
7440                    foreach my $range (@ranges) {
7441                        next if $range->type != 0;  # Non-normal ranges don't
7442                                                    # affect the main body
7443                        my $map = $range->value;
7444                        if ($map ne $default_map) {
7445                            last if $format eq $STRING_FORMAT;  # already at
7446                                                                # least
7447                                                                # restrictive
7448                            $format = $INTEGER_FORMAT
7449                                                if $format eq $DECIMAL_FORMAT
7450                                                    && $map !~ / ^ [0-9] $ /x;
7451                            $format = $FLOAT_FORMAT
7452                                            if $format eq $INTEGER_FORMAT
7453                                                && $map !~ / ^ -? [0-9]+ $ /x;
7454                            $format = $RATIONAL_FORMAT
7455                                if $format eq $FLOAT_FORMAT
7456                                    && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
7457                            $format = $HEX_FORMAT
7458                                if ($format eq $RATIONAL_FORMAT
7459                                       && $map !~
7460                                           m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x)
7461                                        # Assume a leading zero means hex,
7462                                        # even if all digits are 0-9
7463                                    || ($format eq $INTEGER_FORMAT
7464                                        && $map =~ /^0[0-9A-F]/);
7465                            $format = $STRING_FORMAT if $format eq $HEX_FORMAT
7466                                                       && $map =~ /[^0-9A-F]/;
7467                        }
7468                    }
7469                }
7470            }
7471        } # end of calculating format
7472
7473        if ($default_map eq $CODE_POINT
7474            && $format ne $HEX_FORMAT
7475            && ! defined $self->format)    # manual settings are always
7476                                           # considered ok
7477        {
7478            Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
7479        }
7480
7481        # If the output is to be adjusted, the format of the table that gets
7482        # output is actually 'a' or 'ax' instead of whatever it is stored
7483        # internally as.
7484        my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
7485        if ($output_adjusted) {
7486            if ($default_map eq $CODE_POINT) {
7487                $format = $HEX_ADJUST_FORMAT;
7488            }
7489            else {
7490                $format = $ADJUST_FORMAT;
7491            }
7492        }
7493
7494        $self->_set_format($format);
7495
7496        return $self->SUPER::write(
7497            $output_adjusted,
7498            $default_map);   # don't write defaulteds
7499    }
7500
7501    # Accessors for the underlying list that should fail if locked.
7502    for my $sub (qw(
7503                    add_duplicate
7504                    replace_map
7505                ))
7506    {
7507        no strict "refs";
7508        *$sub = sub {
7509            use strict "refs";
7510            my $self = shift;
7511
7512            return if $self->carp_if_locked;
7513            return $self->_range_list->$sub(@_);
7514        }
7515    }
7516} # End closure for Map_Table
7517
7518package Match_Table;
7519use parent '-norequire', '_Base_Table';
7520
7521# A Match table is one which is a list of all the code points that have
7522# the same property and property value, for use in \p{property=value}
7523# constructs in regular expressions.  It adds very little data to the base
7524# structure, but many methods, as these lists can be combined in many ways to
7525# form new ones.
7526# There are only a few concepts added:
7527# 1) Equivalents and Relatedness.
7528#    Two tables can match the identical code points, but have different names.
7529#    This always happens when there is a perl single form extension
7530#    \p{IsProperty} for the Unicode compound form \P{Property=True}.  The two
7531#    tables are set to be related, with the Perl extension being a child, and
7532#    the Unicode property being the parent.
7533#
7534#    It may be that two tables match the identical code points and we don't
7535#    know if they are related or not.  This happens most frequently when the
7536#    Block and Script properties have the exact range.  But note that a
7537#    revision to Unicode could add new code points to the script, which would
7538#    now have to be in a different block (as the block was filled, or there
7539#    would have been 'Unknown' script code points in it and they wouldn't have
7540#    been identical).  So we can't rely on any two properties from Unicode
7541#    always matching the same code points from release to release, and thus
7542#    these tables are considered coincidentally equivalent--not related.  When
7543#    two tables are unrelated but equivalent, one is arbitrarily chosen as the
7544#    'leader', and the others are 'equivalents'.  This concept is useful
7545#    to minimize the number of tables written out.  Only one file is used for
7546#    any identical set of code points, with entries in UCD.pl mapping all
7547#    the involved tables to it.
7548#
7549#    Related tables will always be identical; we set them up to be so.  Thus
7550#    if the Unicode one is deprecated, the Perl one will be too.  Not so for
7551#    unrelated tables.  Relatedness makes generating the documentation easier.
7552#
7553# 2) Complement.
7554#    Like equivalents, two tables may be the inverses of each other, the
7555#    intersection between them is null, and the union is every Unicode code
7556#    point.  The two tables that occupy a binary property are necessarily like
7557#    this.  By specifying one table as the complement of another, we can avoid
7558#    storing it on disk (using the other table and performing a fast
7559#    transform), and some memory and calculations.
7560#
7561# 3) Conflicting.  It may be that there will eventually be name clashes, with
7562#    the same name meaning different things.  For a while, there actually were
7563#    conflicts, but they have so far been resolved by changing Perl's or
7564#    Unicode's definitions to match the other, but when this code was written,
7565#    it wasn't clear that that was what was going to happen.  (Unicode changed
7566#    because of protests during their beta period.)  Name clashes are warned
7567#    about during compilation, and the documentation.  The generated tables
7568#    are sane, free of name clashes, because the code suppresses the Perl
7569#    version.  But manual intervention to decide what the actual behavior
7570#    should be may be required should this happen.  The introductory comments
7571#    have more to say about this.
7572#
7573# 4) Definition.  This is a string for human consumption that specifies the
7574#    code points that this table matches.  This is used only for the generated
7575#    pod file.  It may be specified explicitly, or automatically computed.
7576#    Only the first portion of complicated definitions is computed and
7577#    displayed.
7578
7579sub standardize { return main::standardize($_[0]); }
7580sub trace { return main::trace(@_); }
7581
7582
7583{ # Closure
7584
7585    main::setup_package();
7586
7587    my %leader;
7588    # The leader table of this one; initially $self.
7589    main::set_access('leader', \%leader, 'r');
7590
7591    my %equivalents;
7592    # An array of any tables that have this one as their leader
7593    main::set_access('equivalents', \%equivalents, 'readable_array');
7594
7595    my %parent;
7596    # The parent table to this one, initially $self.  This allows us to
7597    # distinguish between equivalent tables that are related (for which this
7598    # is set to), and those which may not be, but share the same output file
7599    # because they match the exact same set of code points in the current
7600    # Unicode release.
7601    main::set_access('parent', \%parent, 'r');
7602
7603    my %children;
7604    # An array of any tables that have this one as their parent
7605    main::set_access('children', \%children, 'readable_array');
7606
7607    my %conflicting;
7608    # Array of any tables that would have the same name as this one with
7609    # a different meaning.  This is used for the generated documentation.
7610    main::set_access('conflicting', \%conflicting, 'readable_array');
7611
7612    my %matches_all;
7613    # Set in the constructor for tables that are expected to match all code
7614    # points.
7615    main::set_access('matches_all', \%matches_all, 'r');
7616
7617    my %complement;
7618    # Points to the complement that this table is expressed in terms of; 0 if
7619    # none.
7620    main::set_access('complement', \%complement, 'r');
7621
7622    my %definition;
7623    # Human readable string of the first few ranges of code points matched by
7624    # this table
7625    main::set_access('definition', \%definition, 'r', 's');
7626
7627    sub new {
7628        my $class = shift;
7629
7630        my %args = @_;
7631
7632        # The property for which this table is a listing of property values.
7633        my $property = delete $args{'_Property'};
7634
7635        my $name = delete $args{'Name'};
7636        my $full_name = delete $args{'Full_Name'};
7637        $full_name = $name if ! defined $full_name;
7638
7639        # Optional
7640        my $initialize = delete $args{'Initialize'};
7641        my $matches_all = delete $args{'Matches_All'} || 0;
7642        my $format = delete $args{'Format'};
7643        my $definition = delete $args{'Definition'} // "";
7644        # Rest of parameters passed on.
7645
7646        my $range_list = Range_List->new(Initialize => $initialize,
7647                                         Owner => $property);
7648
7649        my $complete = $full_name;
7650        $complete = '""' if $complete eq "";  # A null name shouldn't happen,
7651                                              # but this helps debug if it
7652                                              # does
7653        # The complete name for a match table includes it's property in a
7654        # compound form 'property=table', except if the property is the
7655        # pseudo-property, perl, in which case it is just the single form,
7656        # 'table' (If you change the '=' must also change the ':' in lots of
7657        # places in this program that assume an equal sign)
7658        $complete = $property->full_name . "=$complete" if $property != $perl;
7659
7660        my $self = $class->SUPER::new(%args,
7661                                      Name => $name,
7662                                      Complete_Name => $complete,
7663                                      Full_Name => $full_name,
7664                                      _Property => $property,
7665                                      _Range_List => $range_list,
7666                                      Format => $EMPTY_FORMAT,
7667                                      Write_As_Invlist => 1,
7668                                      );
7669        my $addr = pack 'J', refaddr $self;
7670
7671        $conflicting{$addr} = [ ];
7672        $equivalents{$addr} = [ ];
7673        $children{$addr} = [ ];
7674        $matches_all{$addr} = $matches_all;
7675        $leader{$addr} = $self;
7676        $parent{$addr} = $self;
7677        $complement{$addr} = 0;
7678        $definition{$addr} = $definition;
7679
7680        if (defined $format && $format ne $EMPTY_FORMAT) {
7681            Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'.  Using '$EMPTY_FORMAT'");
7682        }
7683
7684        return $self;
7685    }
7686
7687    # See this program's beginning comment block about overloading these.
7688    use overload
7689        fallback => 0,
7690        qw("") => "_operator_stringify",
7691        '=' => sub {
7692                    my $self = shift;
7693
7694                    return if $self->carp_if_locked;
7695                    return $self;
7696                },
7697
7698        '+' => sub {
7699                        my $self = shift;
7700                        my $other = shift;
7701
7702                        return $self->_range_list + $other;
7703                    },
7704        '&' => sub {
7705                        my $self = shift;
7706                        my $other = shift;
7707
7708                        return $self->_range_list & $other;
7709                    },
7710        '+=' => sub {
7711                        my $self = shift;
7712                        my $other = shift;
7713                        my $reversed = shift;
7714
7715                        if ($reversed) {
7716                            Carp::my_carp_bug("Bad news.  Can't cope with '"
7717                            . ref($other)
7718                            . ' += '
7719                            . ref($self)
7720                            . "'.  undef returned.");
7721                            return;
7722                        }
7723
7724                        return if $self->carp_if_locked;
7725
7726                        if (ref $other) {
7727
7728                            # Change the range list of this table to be the
7729                            # union of the two.
7730                            $self->_set_range_list($self->_range_list
7731                                                    + $other);
7732                        }
7733                        else {    # $other is just a simple value
7734                            $self->add_range($other, $other);
7735                        }
7736                        return $self;
7737                    },
7738        '&=' => sub {
7739                        my $self = shift;
7740                        my $other = shift;
7741                        my $reversed = shift;
7742
7743                        if ($reversed) {
7744                            Carp::my_carp_bug("Bad news.  Can't cope with '"
7745                            . ref($other)
7746                            . ' &= '
7747                            . ref($self)
7748                            . "'.  undef returned.");
7749                            return;
7750                        }
7751
7752                        return if $self->carp_if_locked;
7753                        $self->_set_range_list($self->_range_list & $other);
7754                        return $self;
7755                    },
7756        '-' => sub { my $self = shift;
7757                    my $other = shift;
7758                    my $reversed = shift;
7759                    if ($reversed) {
7760                        Carp::my_carp_bug("Bad news.  Can't cope with '"
7761                        . ref($other)
7762                        . ' - '
7763                        . ref($self)
7764                        . "'.  undef returned.");
7765                        return;
7766                    }
7767
7768                    return $self->_range_list - $other;
7769                },
7770        '~' => sub { my $self = shift;
7771                    return ~ $self->_range_list;
7772                },
7773    ;
7774
7775    sub _operator_stringify($self, $other="", $reversed=0) {
7776
7777        my $name = $self->complete_name;
7778        return "Table '$name'";
7779    }
7780
7781    sub _range_list {
7782        # Returns the range list associated with this table, which will be the
7783        # complement's if it has one.
7784
7785        my $self = shift;
7786        my $complement = $self->complement;
7787
7788        # In order to avoid re-complementing on each access, only do the
7789        # complement the first time, and store the result in this table's
7790        # range list to use henceforth.  However, this wouldn't work if the
7791        # controlling (complement) table changed after we do this, so lock it.
7792        # Currently, the value of the complement isn't needed until after it
7793        # is fully constructed, so this works.  If this were to change, the
7794        # each_range iteration functionality would no longer work on this
7795        # complement.
7796        if ($complement != 0 && $self->SUPER::_range_list->count == 0) {
7797            $self->_set_range_list($self->SUPER::_range_list
7798                                + ~ $complement->_range_list);
7799            $complement->lock;
7800        }
7801
7802        return $self->SUPER::_range_list;
7803    }
7804
7805    sub add_alias {
7806        # Add a synonym for this table.  See the comments in the base class
7807
7808        my $self = shift;
7809        my $name = shift;
7810        # Rest of parameters passed on.
7811
7812        $self->SUPER::add_alias($name, $self, @_);
7813        return;
7814    }
7815
7816    sub add_conflicting {
7817        # Add the name of some other object to the list of ones that name
7818        # clash with this match table.
7819
7820        my $self = shift;
7821        my $conflicting_name = shift;   # The name of the conflicting object
7822        my $p = shift || 'p';           # Optional, is this a \p{} or \P{} ?
7823        my $conflicting_object = shift; # Optional, the conflicting object
7824                                        # itself.  This is used to
7825                                        # disambiguate the text if the input
7826                                        # name is identical to any of the
7827                                        # aliases $self is known by.
7828                                        # Sometimes the conflicting object is
7829                                        # merely hypothetical, so this has to
7830                                        # be an optional parameter.
7831        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7832
7833        my $addr = pack 'J', refaddr $self;
7834
7835        # Check if the conflicting name is exactly the same as any existing
7836        # alias in this table (as long as there is a real object there to
7837        # disambiguate with).
7838        if (defined $conflicting_object) {
7839            foreach my $alias ($self->aliases) {
7840                if (standardize($alias->name) eq standardize($conflicting_name)) {
7841
7842                    # Here, there is an exact match.  This results in
7843                    # ambiguous comments, so disambiguate by changing the
7844                    # conflicting name to its object's complete equivalent.
7845                    $conflicting_name = $conflicting_object->complete_name;
7846                    last;
7847                }
7848            }
7849        }
7850
7851        # Convert to the \p{...} final name
7852        $conflicting_name = "\\$p" . "{$conflicting_name}";
7853
7854        # Only add once
7855        return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
7856
7857        push @{$conflicting{$addr}}, $conflicting_name;
7858
7859        return;
7860    }
7861
7862    sub is_set_equivalent_to($self, $other=undef) {
7863        # Return boolean of whether or not the other object is a table of this
7864        # type and has been marked equivalent to this one.
7865
7866        return 0 if ! defined $other; # Can happen for incomplete early
7867                                      # releases
7868        unless ($other->isa(__PACKAGE__)) {
7869            my $ref_other = ref $other;
7870            my $ref_self = ref $self;
7871            Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'.  $other not set equivalent to $self.");
7872            return 0;
7873        }
7874
7875        # Two tables are equivalent if they have the same leader.
7876        return $leader{pack 'J', refaddr $self} == $leader{pack 'J', refaddr $other};
7877        return;
7878    }
7879
7880    sub set_equivalent_to {
7881        # Set $self equivalent to the parameter table.
7882        # The required Related => 'x' parameter is a boolean indicating
7883        # whether these tables are related or not.  If related, $other becomes
7884        # the 'parent' of $self; if unrelated it becomes the 'leader'
7885        #
7886        # Related tables share all characteristics except names; equivalents
7887        # not quite so many.
7888        # If they are related, one must be a perl extension.  This is because
7889        # we can't guarantee that Unicode won't change one or the other in a
7890        # later release even if they are identical now.
7891
7892        my $self = shift;
7893        my $other = shift;
7894
7895        my %args = @_;
7896        my $related = delete $args{'Related'};
7897
7898        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
7899
7900        return if ! defined $other;     # Keep on going; happens in some early
7901                                        # Unicode releases.
7902
7903        if (! defined $related) {
7904            Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter.  Assuming $self is not related to $other");
7905            $related = 0;
7906        }
7907
7908        # If already are equivalent, no need to re-do it;  if subroutine
7909        # returns null, it found an error, also do nothing
7910        my $are_equivalent = $self->is_set_equivalent_to($other);
7911        return if ! defined $are_equivalent || $are_equivalent;
7912
7913        my $addr = pack 'J', refaddr $self;
7914        my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
7915
7916        if ($related) {
7917            if ($current_leader->perl_extension) {
7918                if ($other->perl_extension) {
7919                    Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
7920                    return;
7921                }
7922            } elsif ($self->property != $other->property    # Depending on
7923                                                            # situation, might
7924                                                            # be better to use
7925                                                            # add_alias()
7926                                                            # instead for same
7927                                                            # property
7928                     && ! $other->perl_extension
7929
7930                         # We allow the sc and scx properties to be marked as
7931                         # related.  They are in fact related, and this allows
7932                         # the pod to show that better.  This test isn't valid
7933                         # if this is an early Unicode release without the scx
7934                         # property (having that also implies the sc property
7935                         # exists, so don't have to test for no 'sc')
7936                     && (   ! defined $scx
7937                         && ! (   (   $self->property == $script
7938                                   || $self->property == $scx)
7939                               && (   $self->property == $script
7940                                   || $self->property == $scx))))
7941            {
7942                Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties.  Assuming $self is not related to $other");
7943                $related = 0;
7944            }
7945        }
7946
7947        if (! $self->is_empty && ! $self->matches_identically_to($other)) {
7948            Carp::my_carp_bug("$self should be empty or match identically to $other.  Not setting equivalent");
7949            return;
7950        }
7951
7952        my $leader = pack 'J', refaddr $current_leader;
7953        my $other_addr = pack 'J', refaddr $other;
7954
7955        # Any tables that are equivalent to or children of this table must now
7956        # instead be equivalent to or (children) to the new leader (parent),
7957        # still equivalent.  The equivalency includes their matches_all info,
7958        # and for related tables, their fate and status.
7959        # All related tables are of necessity equivalent, but the converse
7960        # isn't necessarily true
7961        my $status = $other->status;
7962        my $status_info = $other->status_info;
7963        my $fate = $other->fate;
7964        my $matches_all = $matches_all{other_addr};
7965        my $caseless_equivalent = $other->caseless_equivalent;
7966        foreach my $table ($current_leader, @{$equivalents{$leader}}) {
7967            next if $table == $other;
7968            trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
7969
7970            my $table_addr = pack 'J', refaddr $table;
7971            $leader{$table_addr} = $other;
7972            $matches_all{$table_addr} = $matches_all;
7973            $self->_set_range_list($other->_range_list);
7974            push @{$equivalents{$other_addr}}, $table;
7975            if ($related) {
7976                $parent{$table_addr} = $other;
7977                push @{$children{$other_addr}}, $table;
7978                $table->set_status($status, $status_info);
7979
7980                # This reason currently doesn't get exposed outside; otherwise
7981                # would have to look up the parent's reason and use it instead.
7982                $table->set_fate($fate, "Parent's fate");
7983
7984                $self->set_caseless_equivalent($caseless_equivalent);
7985            }
7986        }
7987
7988        # Now that we've declared these to be equivalent, any changes to one
7989        # of the tables would invalidate that equivalency.
7990        $self->lock;
7991        $other->lock;
7992        return;
7993    }
7994
7995    sub set_complement($self, $other) {
7996        # Set $self to be the complement of the parameter table.  $self is
7997        # locked, as what it contains should all come from the other table.
7998
7999        if ($other->complement != 0) {
8000            Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement);
8001            return;
8002        }
8003        $complement{pack 'J', refaddr $self} = $other;
8004
8005        # Be sure the other property knows we are depending on them; or the
8006        # other table if it is one in the current property.
8007        if ($self->property != $other->property) {
8008            $other->property->set_has_dependency(1);
8009        }
8010        else {
8011            $other->set_has_dependency(1);
8012        }
8013        $self->lock;
8014        return;
8015    }
8016
8017    sub add_range($self, @range) { # Add a range to the list for this table.
8018        # Rest of parameters passed on
8019
8020        return if $self->carp_if_locked;
8021        return $self->_range_list->add_range(@range);
8022    }
8023
8024    sub header($self) {
8025        # All match tables are to be used only by the Perl core.
8026        return $self->SUPER::header() . $INTERNAL_ONLY_HEADER;
8027    }
8028
8029    sub pre_body {  # Does nothing for match tables.
8030        return
8031    }
8032
8033    sub append_to_body {  # Does nothing for match tables.
8034        return
8035    }
8036
8037    sub set_fate($self, $fate, $reason=undef) {
8038        $self->SUPER::set_fate($fate, $reason);
8039
8040        # All children share this fate
8041        foreach my $child ($self->children) {
8042            $child->set_fate($fate, $reason);
8043        }
8044        return;
8045    }
8046
8047    sub calculate_table_definition
8048    {
8049        # Returns a human-readable string showing some or all of the code
8050        # points matched by this table.  The string will include a
8051        # bracketed-character class for all characters matched in the 00-FF
8052        # range, and the first few ranges matched beyond that.
8053        my $max_ranges = 6;
8054
8055        my $self = shift;
8056        my $definition = $self->definition || "";
8057
8058        # Skip this if already have a definition.
8059        return $definition if $definition;
8060
8061        my $lows_string = "";   # The string representation of the 0-FF
8062                                # characters
8063        my $string_range = "";  # The string rep. of the above FF ranges
8064        my $range_count = 0;    # How many ranges in $string_rage
8065
8066        my @lows_invlist;       # The inversion list of the 0-FF code points
8067        my $first_non_control = ord(" ");   # Everything below this is a
8068                                            # control, on ASCII or EBCDIC
8069        my $max_table_code_point = $self->max;
8070
8071        # On ASCII platforms, the range 80-FF contains no printables.
8072        my $highest_printable = ((main::NON_ASCII_PLATFORM) ? 255 : 126);
8073
8074
8075        # Look through the first few ranges matched by this table.
8076        $self->reset_each_range;    # Defensive programming
8077        while (defined (my $range = $self->each_range())) {
8078            my $start = $range->start;
8079            my $end = $range->end;
8080
8081            # Accumulate an inversion list of the 00-FF code points
8082            if ($start < 256 && ($start > 0 || $end < 256)) {
8083                push @lows_invlist, $start;
8084                push @lows_invlist, 1 + (($end < 256) ? $end : 255);
8085
8086                # Get next range if there are more ranges below 256
8087                next if $end < 256 && $end < $max_table_code_point;
8088
8089                # If the range straddles the 255/256 boundary, we split it
8090                # there.  We already added above the low portion to the
8091                # inversion list
8092                $start = 256 if $end > 256;
8093            }
8094
8095            # Here, @lows_invlist contains the code points below 256, and
8096            # there is no other range, or the current one starts at or above
8097            # 256.  Generate the [char class] for the 0-255 ones.
8098            while (@lows_invlist) {
8099
8100                # If this range (necessarily the first one, by the way) starts
8101                # at 0 ...
8102                if ($lows_invlist[0] == 0) {
8103
8104                    # If it ends within the block of controls, that means that
8105                    # some controls are in it and some aren't.  Since Unicode
8106                    # properties pretty much only know about a few of the
8107                    # controls, like \n, \t, this means that its one of them
8108                    # that isn't in the range.  Complement the inversion list
8109                    # which will likely cause these to be output using their
8110                    # mnemonics, hence being clearer.
8111                    if ($lows_invlist[1] < $first_non_control) {
8112                        $lows_string .= '^';
8113                        shift @lows_invlist;
8114                        push @lows_invlist, 256;
8115                    }
8116                    elsif ($lows_invlist[1] <= $highest_printable) {
8117
8118                        # Here, it extends into the printables block.  Split
8119                        # into two ranges so that the controls are separate.
8120                        $lows_string .= sprintf "\\x00-\\x%02x",
8121                                                    $first_non_control - 1;
8122                        $lows_invlist[0] = $first_non_control;
8123                    }
8124                }
8125
8126                # If the range completely contains the printables, don't
8127                # individually spell out the printables.
8128                if (    $lows_invlist[0] <= $first_non_control
8129                    && $lows_invlist[1] > $highest_printable)
8130                {
8131                    $lows_string .= sprintf "\\x%02x-\\x%02x",
8132                                        $lows_invlist[0], $lows_invlist[1] - 1;
8133                    shift @lows_invlist;
8134                    shift @lows_invlist;
8135                    next;
8136                }
8137
8138                # Here, the range may include some but not all printables.
8139                # Look at each one individually
8140                foreach my $ord (shift @lows_invlist .. shift(@lows_invlist) - 1) {
8141                    my $char = chr $ord;
8142
8143                    # If there is already something in the list, an
8144                    # alphanumeric char could be the next in sequence.  If so,
8145                    # we start or extend a range.  That is, we could have so
8146                    # far something like 'a-c', and the next char is a 'd', so
8147                    # we change it to 'a-d'.  We use native_to_unicode()
8148                    # because a-z on EBCDIC means 26 chars, and excludes the
8149                    # gap ones.
8150                    if ($lows_string ne "" && $char =~ /[[:alnum:]]/) {
8151                        my $prev = substr($lows_string, -1);
8152                        if (   $prev !~ /[[:alnum:]]/
8153                            ||   utf8::native_to_unicode(ord $prev) + 1
8154                              != utf8::native_to_unicode(ord $char))
8155                        {
8156                            # Not extending the range
8157                            $lows_string .= $char;
8158                        }
8159                        elsif (   length $lows_string > 1
8160                               && substr($lows_string, -2, 1) eq '-')
8161                        {
8162                            # We had a sequence like '-c' and the current
8163                            # character is 'd'.  Extend the range.
8164                            substr($lows_string, -1, 1) = $char;
8165                        }
8166                        else {
8167                            # We had something like 'd' and this is 'e'.
8168                            # Start a range.
8169                            $lows_string .= "-$char";
8170                        }
8171                    }
8172                    elsif ($char =~ /[[:graph:]]/) {
8173
8174                        # We output a graphic char as-is, preceded by a
8175                        # backslash if it is a metacharacter
8176                        $lows_string .= '\\'
8177                                if $char =~ /[\\\^\$\@\%\|()\[\]\{\}\-\/"']/;
8178                        $lows_string .= $char;
8179                    } # Otherwise use mnemonic for any that have them
8180                    elsif ($char =~ /[\a]/) {
8181                        $lows_string .= '\a';
8182                    }
8183                    elsif ($char =~ /[\b]/) {
8184                        $lows_string .= '\b';
8185                    }
8186                    elsif ($char eq "\e") {
8187                        $lows_string .= '\e';
8188                    }
8189                    elsif ($char eq "\f") {
8190                        $lows_string .= '\f';
8191                    }
8192                    elsif ($char eq "\cK") {
8193                        $lows_string .= '\cK';
8194                    }
8195                    elsif ($char eq "\n") {
8196                        $lows_string .= '\n';
8197                    }
8198                    elsif ($char eq "\r") {
8199                        $lows_string .= '\r';
8200                    }
8201                    elsif ($char eq "\t") {
8202                        $lows_string .= '\t';
8203                    }
8204                    else {
8205
8206                        # Here is a non-graphic without a mnemonic.  We use \x
8207                        # notation.  But if the ordinal of this is one above
8208                        # the previous, create or extend the range
8209                        my $hex_representation = sprintf("%02x", ord $char);
8210                        if (   length $lows_string >= 4
8211                            && substr($lows_string, -4, 2) eq '\\x'
8212                            && hex(substr($lows_string, -2)) + 1 == ord $char)
8213                        {
8214                            if (       length $lows_string >= 5
8215                                &&     substr($lows_string, -5, 1) eq '-'
8216                                && (   length $lows_string == 5
8217                                    || substr($lows_string, -6, 1) ne '\\'))
8218                            {
8219                                substr($lows_string, -2) = $hex_representation;
8220                            }
8221                            else {
8222                                $lows_string .= '-\\x' . $hex_representation;
8223                            }
8224                        }
8225                        else {
8226                            $lows_string .= '\\x' . $hex_representation;
8227                        }
8228                    }
8229                }
8230            }
8231
8232            # Done with assembling the string of all lows.  If there are only
8233            # lows in the property, are completely done.
8234            if ($max_table_code_point < 256) {
8235                $self->reset_each_range;
8236                last;
8237            }
8238
8239            # Otherwise, quit if reached max number of non-lows ranges.  If
8240            # there are lows, count them as one unit towards the maximum.
8241            $range_count++;
8242            if ($range_count > (($lows_string eq "") ? $max_ranges : $max_ranges - 1)) {
8243                $string_range .= " ...";
8244                $self->reset_each_range;
8245                last;
8246            }
8247
8248            # Otherwise add this range.
8249            $string_range .= ", " if $string_range ne "";
8250            if ($start == $end) {
8251                $string_range .= sprintf("U+%04X", $start);
8252            }
8253            elsif ($end >= $MAX_WORKING_CODEPOINT)  {
8254                $string_range .= sprintf("U+%04X..infinity", $start);
8255            }
8256            else  {
8257                $string_range .= sprintf("U+%04X..%04X",
8258                                        $start, $end);
8259            }
8260        }
8261
8262        # Done with all the ranges we're going to look at.  Assemble the
8263        # definition from the lows + non-lows.
8264
8265        if ($lows_string ne "" || $string_range ne "") {
8266            if ($lows_string ne "") {
8267                $definition .= "[$lows_string]";
8268                $definition .= ", " if $string_range;
8269            }
8270            $definition .= $string_range;
8271        }
8272
8273        return $definition;
8274    }
8275
8276    sub write($self) {
8277        return $self->SUPER::write(0); # No adjustments
8278    }
8279
8280    # $leader - Should only be called on the leader table of an equivalent group
8281    sub set_final_comment($leader) {
8282        # This creates a comment for the file that is to hold the match table
8283        # $self.  It is somewhat convoluted to make the English read nicely,
8284        # but, heh, it's just a comment.
8285        # This should be called only with the leader match table of all the
8286        # ones that share the same file.  It lists all such tables, ordered so
8287        # that related ones are together.
8288
8289        return unless $debugging_build;
8290
8291        my $addr = pack 'J', refaddr $leader;
8292
8293        if ($leader{$addr} != $leader) {
8294            Carp::my_carp_bug(<<END
8295set_final_comment() must be called on a leader table, which $leader is not.
8296It is equivalent to $leader{$addr}.  No comment created
8297END
8298            );
8299            return;
8300        }
8301
8302        # Get the number of code points matched by each of the tables in this
8303        # file, and add underscores for clarity.
8304        my $count = $leader->count;
8305        my $unicode_count;
8306        my $non_unicode_string;
8307        if ($count > $MAX_UNICODE_CODEPOINTS) {
8308            $unicode_count = $count - ($MAX_WORKING_CODEPOINT
8309                                       - $MAX_UNICODE_CODEPOINT);
8310            $non_unicode_string = "All above-Unicode code points match as well, and are also returned";
8311        }
8312        else {
8313            $unicode_count = $count;
8314            $non_unicode_string = "";
8315        }
8316        my $string_count = main::clarify_code_point_count($unicode_count);
8317
8318        my $loose_count = 0;        # how many aliases loosely matched
8319        my $compound_name = "";     # ? Are any names compound?, and if so, an
8320                                    # example
8321        my $properties_with_compound_names = 0;    # count of these
8322
8323
8324        my %flags;              # The status flags used in the file
8325        my $total_entries = 0;  # number of entries written in the comment
8326        my $matches_comment = ""; # The portion of the comment about the
8327                                  # \p{}'s
8328        my @global_comments;    # List of all the tables' comments that are
8329                                # there before this routine was called.
8330        my $has_ucd_alias = 0;  # If there is an alias that is accessible via
8331                                # Unicode::UCD.  If not, then don't say it is
8332                                # in the comment
8333
8334        # Get list of all the parent tables that are equivalent to this one
8335        # (including itself).
8336        my @parents = grep { $parent{main::objaddr $_} == $_ }
8337                            main::uniques($leader, @{$equivalents{$addr}});
8338        my $has_unrelated = (@parents >= 2);  # boolean, ? are there unrelated
8339                                              # tables
8340        for my $parent (@parents) {
8341
8342            my $property = $parent->property;
8343
8344            # Special case 'N' tables in properties with two match tables when
8345            # the other is a 'Y' one.  These are likely to be binary tables,
8346            # but not necessarily.  In either case, \P{} will match the
8347            # complement of \p{}, and so if something is a synonym of \p, the
8348            # complement of that something will be the synonym of \P.  This
8349            # would be true of any property with just two match tables, not
8350            # just those whose values are Y and N; but that would require a
8351            # little extra work, and there are none such so far in Unicode.
8352            my $perl_p = 'p';        # which is it?  \p{} or \P{}
8353            my @yes_perl_synonyms;   # list of any synonyms for the 'Y' table
8354
8355            if (scalar $property->tables == 2
8356                && $parent == $property->table('N')
8357                && defined (my $yes = $property->table('Y')))
8358            {
8359                my $yes_addr = pack 'J', refaddr $yes;
8360                @yes_perl_synonyms
8361                    = grep { $_->property == $perl }
8362                                    main::uniques($yes,
8363                                                $parent{$yes_addr},
8364                                                $parent{$yes_addr}->children);
8365
8366                # But these synonyms are \P{} ,not \p{}
8367                $perl_p = 'P';
8368            }
8369
8370            my @description;        # Will hold the table description
8371            my @note;               # Will hold the table notes.
8372            my @conflicting;        # Will hold the table conflicts.
8373
8374            # Look at the parent, any yes synonyms, and all the children
8375            my $parent_addr = pack 'J', refaddr $parent;
8376            for my $table ($parent,
8377                           @yes_perl_synonyms,
8378                           @{$children{$parent_addr}})
8379            {
8380                my $table_addr = pack 'J', refaddr $table;
8381                my $table_property = $table->property;
8382
8383                # Tables are separated by a blank line to create a grouping.
8384                $matches_comment .= "\n" if $matches_comment;
8385
8386                # The table is named based on the property and value
8387                # combination it is for, like script=greek.  But there may be
8388                # a number of synonyms for each side, like 'sc' for 'script',
8389                # and 'grek' for 'greek'.  Any combination of these is a valid
8390                # name for this table.  In this case, there are three more,
8391                # 'sc=grek', 'sc=greek', and 'script='grek'.  Rather than
8392                # listing all possible combinations in the comment, we make
8393                # sure that each synonym occurs at least once, and add
8394                # commentary that the other combinations are possible.
8395                # Because regular expressions don't recognize things like
8396                # \p{jsn=}, only look at non-null right-hand-sides
8397                my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table_property->aliases;
8398                my @table_aliases = grep { $_->name ne "" } $table->aliases;
8399
8400                # The alias lists above are already ordered in the order we
8401                # want to output them.  To ensure that each synonym is listed,
8402                # we must use the max of the two numbers.  But if there are no
8403                # legal synonyms (nothing in @table_aliases), then we don't
8404                # list anything.
8405                my $listed_combos = (@table_aliases)
8406                                    ?  main::max(scalar @table_aliases,
8407                                                 scalar @property_aliases)
8408                                    : 0;
8409                trace "$listed_combos, tables=", scalar @table_aliases, "; property names=", scalar @property_aliases if main::DEBUG;
8410
8411                my $property_had_compound_name = 0;
8412
8413                for my $i (0 .. $listed_combos - 1) {
8414                    $total_entries++;
8415
8416                    # The current alias for the property is the next one on
8417                    # the list, or if beyond the end, start over.  Similarly
8418                    # for the table (\p{prop=table})
8419                    my $property_alias = $property_aliases
8420                                            [$i % @property_aliases]->name;
8421                    my $table_alias_object = $table_aliases
8422                                                        [$i % @table_aliases];
8423                    my $table_alias = $table_alias_object->name;
8424                    my $loose_match = $table_alias_object->loose_match;
8425                    $has_ucd_alias |= $table_alias_object->ucd;
8426
8427                    if ($table_alias !~ /\D/) { # Clarify large numbers.
8428                        $table_alias = main::clarify_number($table_alias)
8429                    }
8430
8431                    # Add a comment for this alias combination
8432                    my $current_match_comment;
8433                    if ($table_property == $perl) {
8434                        $current_match_comment = "\\$perl_p"
8435                                                    . "{$table_alias}";
8436                    }
8437                    else {
8438                        $current_match_comment
8439                                        = "\\p{$property_alias=$table_alias}";
8440                        $property_had_compound_name = 1;
8441                    }
8442
8443                    # Flag any abnormal status for this table.
8444                    my $flag = $property->status
8445                                || $table->status
8446                                || $table_alias_object->status;
8447                    if ($flag && $flag ne $PLACEHOLDER) {
8448                        $flags{$flag} = $status_past_participles{$flag};
8449                    }
8450
8451                    $loose_count++;
8452
8453                    # Pretty up the comment.  Note the \b; it says don't make
8454                    # this line a continuation.
8455                    $matches_comment .= sprintf("\b%-1s%-s%s\n",
8456                                        $flag,
8457                                        " " x 7,
8458                                        $current_match_comment);
8459                } # End of generating the entries for this table.
8460
8461                # Save these for output after this group of related tables.
8462                push @description, $table->description;
8463                push @note, $table->note;
8464                push @conflicting, $table->conflicting;
8465
8466                # And this for output after all the tables.
8467                push @global_comments, $table->comment;
8468
8469                # Compute an alternate compound name using the final property
8470                # synonym and the first table synonym with a colon instead of
8471                # the equal sign used elsewhere.
8472                if ($property_had_compound_name) {
8473                    $properties_with_compound_names ++;
8474                    if (! $compound_name || @property_aliases > 1) {
8475                        $compound_name = $property_aliases[-1]->name
8476                                        . ': '
8477                                        . $table_aliases[0]->name;
8478                    }
8479                }
8480            } # End of looping through all children of this table
8481
8482            # Here have assembled in $matches_comment all the related tables
8483            # to the current parent (preceded by the same info for all the
8484            # previous parents).  Put out information that applies to all of
8485            # the current family.
8486            if (@conflicting) {
8487
8488                # But output the conflicting information now, as it applies to
8489                # just this table.
8490                my $conflicting = join ", ", @conflicting;
8491                if ($conflicting) {
8492                    $matches_comment .= <<END;
8493
8494    Note that contrary to what you might expect, the above is NOT the same as
8495END
8496                    $matches_comment .= "any of: " if @conflicting > 1;
8497                    $matches_comment .= "$conflicting\n";
8498                }
8499            }
8500            if (@description) {
8501                $matches_comment .= "\n    Meaning: "
8502                                    . join('; ', @description)
8503                                    . "\n";
8504            }
8505            if (@note) {
8506                $matches_comment .= "\n    Note: "
8507                                    . join("\n    ", @note)
8508                                    . "\n";
8509            }
8510        } # End of looping through all tables
8511
8512        $matches_comment .= "\n$non_unicode_string\n" if $non_unicode_string;
8513
8514
8515        my $code_points;
8516        my $match;
8517        my $any_of_these;
8518        if ($unicode_count == 1) {
8519            $match = 'matches';
8520            $code_points = 'single code point';
8521        }
8522        else {
8523            $match = 'match';
8524            $code_points = "$string_count code points";
8525        }
8526
8527        my $synonyms;
8528        my $entries;
8529        if ($total_entries == 1) {
8530            $synonyms = "";
8531            $entries = 'entry';
8532            $any_of_these = 'this'
8533        }
8534        else {
8535            $synonyms = " any of the following regular expression constructs";
8536            $entries = 'entries';
8537            $any_of_these = 'any of these'
8538        }
8539
8540        my $comment = "";
8541        if ($has_ucd_alias) {
8542            $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
8543        }
8544        if ($has_unrelated) {
8545            $comment .= <<END;
8546This file is for tables that are not necessarily related:  To conserve
8547resources, every table that matches the identical set of code points in this
8548version of Unicode uses this file.  Each one is listed in a separate group
8549below.  It could be that the tables will match the same set of code points in
8550other Unicode releases, or it could be purely coincidence that they happen to
8551be the same in Unicode $unicode_version, and hence may not in other versions.
8552
8553END
8554        }
8555
8556        if (%flags) {
8557            foreach my $flag (sort keys %flags) {
8558                $comment .= <<END;
8559'$flag' below means that this form is $flags{$flag}.
8560END
8561                if ($flag eq $INTERNAL_ALIAS) {
8562                    $comment .= "DO NOT USE!!!";
8563                }
8564                else {
8565                    $comment .= "Consult $pod_file.pod";
8566                }
8567                $comment .= "\n";
8568            }
8569            $comment .= "\n";
8570        }
8571
8572        if ($total_entries == 0) {
8573            Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string.  Creating file anyway.");
8574            $comment .= <<END;
8575This file returns the $code_points in Unicode Version
8576$unicode_version for
8577$leader, but it is inaccessible through Perl regular expressions, as
8578"\\p{prop=}" is not recognized.
8579END
8580
8581        } else {
8582            $comment .= <<END;
8583This file returns the $code_points in Unicode Version
8584$unicode_version that
8585$match$synonyms:
8586
8587$matches_comment
8588$pod_file.pod should be consulted for the syntax rules for $any_of_these,
8589including if adding or subtracting white space, underscore, and hyphen
8590characters matters or doesn't matter, and other permissible syntactic
8591variants.  Upper/lower case distinctions never matter.
8592END
8593
8594        }
8595        if ($compound_name) {
8596            $comment .= <<END;
8597
8598A colon can be substituted for the equals sign, and
8599END
8600            if ($properties_with_compound_names > 1) {
8601                $comment .= <<END;
8602within each group above,
8603END
8604            }
8605            $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
8606
8607            # Note the \b below, it says don't make that line a continuation.
8608            $comment .= <<END;
8609anything to the left of the equals (or colon) can be combined with anything to
8610the right.  Thus, for example,
8611$compound_name
8612\bis also valid.
8613END
8614        }
8615
8616        # And append any comment(s) from the actual tables.  They are all
8617        # gathered here, so may not read all that well.
8618        if (@global_comments) {
8619            $comment .= "\n" . join("\n\n", @global_comments) . "\n";
8620        }
8621
8622        if ($count) {   # The format differs if no code points, and needs no
8623                        # explanation in that case
8624            if ($leader->write_as_invlist) {
8625                $comment.= <<END;
8626
8627The first data line of this file begins with the letter V to indicate it is in
8628inversion list format.  The number following the V gives the number of lines
8629remaining.  Each of those remaining lines is a single number representing the
8630starting code point of a range which goes up to but not including the number
8631on the next line; The 0th, 2nd, 4th... ranges are for code points that match
8632the property; the 1st, 3rd, 5th... are ranges of code points that don't match
8633the property.  The final line's range extends to the platform's infinity.
8634END
8635            }
8636            else {
8637                $comment.= <<END;
8638The format of the lines of this file is:
8639START\\tSTOP\\twhere START is the starting code point of the range, in hex;
8640STOP is the ending point, or if omitted, the range has just one code point.
8641END
8642            }
8643            if ($leader->output_range_counts) {
8644                $comment .= <<END;
8645Numbers in comments in [brackets] indicate how many code points are in the
8646range.
8647END
8648            }
8649        }
8650
8651        $leader->set_comment(main::join_lines($comment));
8652        return;
8653    }
8654
8655    # Accessors for the underlying list
8656    for my $sub (qw(
8657                    get_valid_code_point
8658                    get_invalid_code_point
8659                ))
8660    {
8661        no strict "refs";
8662        *$sub = sub {
8663            use strict "refs";
8664            my $self = shift;
8665
8666            return $self->_range_list->$sub(@_);
8667        }
8668    }
8669} # End closure for Match_Table
8670
8671package Property;
8672
8673# The Property class represents a Unicode property, or the $perl
8674# pseudo-property.  It contains a map table initialized empty at construction
8675# time, and for properties accessible through regular expressions, various
8676# match tables, created through the add_match_table() method, and referenced
8677# by the table('NAME') or tables() methods, the latter returning a list of all
8678# of the match tables.  Otherwise table operations implicitly are for the map
8679# table.
8680#
8681# Most of the data in the property is actually about its map table, so it
8682# mostly just uses that table's accessors for most methods.  The two could
8683# have been combined into one object, but for clarity because of their
8684# differing semantics, they have been kept separate.  It could be argued that
8685# the 'file' and 'directory' fields should be kept with the map table.
8686#
8687# Each property has a type.  This can be set in the constructor, or in the
8688# set_type accessor, but mostly it is figured out by the data.  Every property
8689# starts with unknown type, overridden by a parameter to the constructor, or
8690# as match tables are added, or ranges added to the map table, the data is
8691# inspected, and the type changed.  After the table is mostly or entirely
8692# filled, compute_type() should be called to finalize the analysis.
8693#
8694# There are very few operations defined.  One can safely remove a range from
8695# the map table, and property_add_or_replace_non_nulls() adds the maps from another
8696# table to this one, replacing any in the intersection of the two.
8697
8698sub standardize { return main::standardize($_[0]); }
8699sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
8700
8701{   # Closure
8702
8703    # This hash will contain as keys, all the aliases of all properties, and
8704    # as values, pointers to their respective property objects.  This allows
8705    # quick look-up of a property from any of its names.
8706    my %alias_to_property_of;
8707
8708    sub dump_alias_to_property_of {
8709        # For debugging
8710
8711        print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
8712        return;
8713    }
8714
8715    sub property_ref($name) {
8716        # This is a package subroutine, not called as a method.
8717        # If the single parameter is a literal '*' it returns a list of all
8718        # defined properties.
8719        # Otherwise, the single parameter is a name, and it returns a pointer
8720        # to the corresponding property object, or undef if none.
8721        #
8722        # Properties can have several different names.  The 'standard' form of
8723        # each of them is stored in %alias_to_property_of as they are defined.
8724        # But it's possible that this subroutine will be called with some
8725        # variant, so if the initial lookup fails, it is repeated with the
8726        # standardized form of the input name.  If found, besides returning the
8727        # result, the input name is added to the list so future calls won't
8728        # have to do the conversion again.
8729
8730        if (! defined $name) {
8731            Carp::my_carp_bug("Undefined input property.  No action taken.");
8732            return;
8733        }
8734
8735        return main::uniques(values %alias_to_property_of) if $name eq '*';
8736
8737        # Return cached result if have it.
8738        my $result = $alias_to_property_of{$name};
8739        return $result if defined $result;
8740
8741        # Convert the input to standard form.
8742        my $standard_name = standardize($name);
8743
8744        $result = $alias_to_property_of{$standard_name};
8745        return unless defined $result;        # Don't cache undefs
8746
8747        # Cache the result before returning it.
8748        $alias_to_property_of{$name} = $result;
8749        return $result;
8750    }
8751
8752
8753    main::setup_package();
8754
8755    my %map;
8756    # A pointer to the map table object for this property
8757    main::set_access('map', \%map);
8758
8759    my %full_name;
8760    # The property's full name.  This is a duplicate of the copy kept in the
8761    # map table, but is needed because stringify needs it during
8762    # construction of the map table, and then would have a chicken before egg
8763    # problem.
8764    main::set_access('full_name', \%full_name, 'r');
8765
8766    my %table_ref;
8767    # This hash will contain as keys, all the aliases of any match tables
8768    # attached to this property, and as values, the pointers to their
8769    # respective tables.  This allows quick look-up of a table from any of its
8770    # names.
8771    main::set_access('table_ref', \%table_ref);
8772
8773    my %type;
8774    # The type of the property, $ENUM, $BINARY, etc
8775    main::set_access('type', \%type, 'r');
8776
8777    my %file;
8778    # The filename where the map table will go (if actually written).
8779    # Normally defaulted, but can be overridden.
8780    main::set_access('file', \%file, 'r', 's');
8781
8782    my %directory;
8783    # The directory where the map table will go (if actually written).
8784    # Normally defaulted, but can be overridden.
8785    main::set_access('directory', \%directory, 's');
8786
8787    my %pseudo_map_type;
8788    # This is used to affect the calculation of the map types for all the
8789    # ranges in the table.  It should be set to one of the values that signify
8790    # to alter the calculation.
8791    main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
8792
8793    my %has_only_code_point_maps;
8794    # A boolean used to help in computing the type of data in the map table.
8795    main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
8796
8797    my %unique_maps;
8798    # A list of the first few distinct mappings this property has.  This is
8799    # used to disambiguate between binary and enum property types, so don't
8800    # have to keep more than three.
8801    main::set_access('unique_maps', \%unique_maps);
8802
8803    my %pre_declared_maps;
8804    # A boolean that gives whether the input data should declare all the
8805    # tables used, or not.  If the former, unknown ones raise a warning.
8806    main::set_access('pre_declared_maps',
8807                                    \%pre_declared_maps, 'r', 's');
8808
8809    my %match_subdir;
8810    # For properties whose shortest names are too long for a DOS 8.3
8811    # filesystem to distinguish between, this is used to manually give short
8812    # names for the directory name immediately under $match_tables that the
8813    # match tables for this property should be placed in.
8814    main::set_access('match_subdir', \%match_subdir, 'r');
8815
8816    my %has_dependency;
8817    # A boolean that gives whether some table somewhere is defined as the
8818    # complement of a table in this property.  This is a crude, but currently
8819    # sufficient, mechanism to make this property not get destroyed before
8820    # what is dependent on it is.  Other dependencies could be added, so the
8821    # name was chosen to reflect a more general situation than actually is
8822    # currently the case.
8823    main::set_access('has_dependency', \%has_dependency, 'r', 's');
8824
8825    sub new {
8826        # The only required parameter is the positionally first, name.  All
8827        # other parameters are key => value pairs.  See the documentation just
8828        # above for the meanings of the ones not passed directly on to the map
8829        # table constructor.
8830
8831        my $class = shift;
8832        my $name = shift || "";
8833
8834        my $self = property_ref($name);
8835        if (defined $self) {
8836            my $options_string = join ", ", @_;
8837            $options_string = ".  Ignoring options $options_string" if $options_string;
8838            Carp::my_carp("$self is already in use.  Using existing one$options_string;");
8839            return $self;
8840        }
8841
8842        my %args = @_;
8843
8844        $self = bless \do { my $anonymous_scalar }, $class;
8845        my $addr = pack 'J', refaddr $self;
8846
8847        $directory{$addr} = delete $args{'Directory'};
8848        $file{$addr} = delete $args{'File'};
8849        $full_name{$addr} = delete $args{'Full_Name'} || $name;
8850        $type{$addr} = delete $args{'Type'} || $UNKNOWN;
8851        $pseudo_map_type{$addr} = delete $args{'Map_Type'};
8852        $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
8853                                    # Starting in this release, property
8854                                    # values should be defined for all
8855                                    # properties, except those overriding this
8856                                    // $v_version ge v5.1.0;
8857        $match_subdir{$addr} = delete $args{'Match_SubDir'};
8858
8859        # Rest of parameters passed on.
8860
8861        $has_only_code_point_maps{$addr} = 1;
8862        $table_ref{$addr} = { };
8863        $unique_maps{$addr} = { };
8864        $has_dependency{$addr} = 0;
8865
8866        $map{$addr} = Map_Table->new($name,
8867                                    Full_Name => $full_name{$addr},
8868                                    _Alias_Hash => \%alias_to_property_of,
8869                                    _Property => $self,
8870                                    %args);
8871        return $self;
8872    }
8873
8874    # See this program's beginning comment block about overloading the copy
8875    # constructor.  Few operations are defined on properties, but a couple are
8876    # useful.  It is safe to take the inverse of a property, and to remove a
8877    # single code point from it.
8878    use overload
8879        fallback => 0,
8880        qw("") => "_operator_stringify",
8881        "." => \&main::_operator_dot,
8882        ".=" => \&main::_operator_dot_equal,
8883        '==' => \&main::_operator_equal,
8884        '!=' => \&main::_operator_not_equal,
8885        '=' => sub { return shift },
8886        '-=' => "_minus_and_equal",
8887    ;
8888
8889    sub _operator_stringify($self, $other="", $reversed=0) {
8890        return "Property '" .  $self->full_name . "'";
8891    }
8892
8893    sub _minus_and_equal($self, $other, $reversed=0) {
8894        # Remove a single code point from the map table of a property.
8895        if (ref $other) {
8896            Carp::my_carp_bug("Bad news.  Can't cope with a "
8897                        . ref($other)
8898                        . " argument to '-='.  Subtraction ignored.");
8899            return $self;
8900        }
8901        elsif ($reversed) {   # Shouldn't happen in a -=, but just in case
8902            Carp::my_carp_bug("Bad news.  Can't cope with subtracting a "
8903            . ref $self
8904            . " from a non-object.  undef returned.");
8905            return;
8906        }
8907        else {
8908            $map{pack 'J', refaddr $self}->delete_range($other, $other);
8909        }
8910        return $self;
8911    }
8912
8913    sub add_match_table {
8914        # Add a new match table for this property, with name given by the
8915        # parameter.  It returns a pointer to the table.
8916
8917        my $self = shift;
8918        my $name = shift;
8919        my %args = @_;
8920
8921        my $addr = pack 'J', refaddr $self;
8922
8923        my $table = $table_ref{$addr}{$name};
8924        my $standard_name = main::standardize($name);
8925        if (defined $table
8926            || (defined ($table = $table_ref{$addr}{$standard_name})))
8927        {
8928            Carp::my_carp("Table '$name' in $self is already in use.  Using existing one");
8929            $table_ref{$addr}{$name} = $table;
8930            return $table;
8931        }
8932        else {
8933
8934            # See if this is a perl extension, if not passed in.
8935            my $perl_extension = delete $args{'Perl_Extension'};
8936            $perl_extension
8937                        = $self->perl_extension if ! defined $perl_extension;
8938
8939            my $fate;
8940            my $suppression_reason = "";
8941            if ($self->name =~ /^_/) {
8942                $fate = $SUPPRESSED;
8943                $suppression_reason = "Parent property is internal only";
8944            }
8945            elsif ($self->fate >= $SUPPRESSED) {
8946                $fate = $self->fate;
8947                $suppression_reason = $why_suppressed{$self->complete_name};
8948
8949            }
8950            elsif ($name =~ /^_/) {
8951                $fate = $INTERNAL_ONLY;
8952            }
8953            $table = Match_Table->new(
8954                                Name => $name,
8955                                Perl_Extension => $perl_extension,
8956                                _Alias_Hash => $table_ref{$addr},
8957                                _Property => $self,
8958                                Fate => $fate,
8959                                Suppression_Reason => $suppression_reason,
8960                                Status => $self->status,
8961                                _Status_Info => $self->status_info,
8962                                %args);
8963            return unless defined $table;
8964        }
8965
8966        # Save the names for quick look up
8967        $table_ref{$addr}{$standard_name} = $table;
8968        $table_ref{$addr}{$name} = $table;
8969
8970        # Perhaps we can figure out the type of this property based on the
8971        # fact of adding this match table.  First, string properties don't
8972        # have match tables; second, a binary property can't have 3 match
8973        # tables
8974        if ($type{$addr} == $UNKNOWN) {
8975            $type{$addr} = $NON_STRING;
8976        }
8977        elsif ($type{$addr} == $STRING) {
8978            Carp::my_carp("$self Added a match table '$name' to a string property '$self'.  Changed it to a non-string property.  Bad News.");
8979            $type{$addr} = $NON_STRING;
8980        }
8981        elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) {
8982            if (scalar main::uniques(values %{$table_ref{$addr}}) > 2) {
8983                if ($type{$addr} == $BINARY) {
8984                    Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary.  Changing its type to 'enum'.  Bad News.");
8985                }
8986                $type{$addr} = $ENUM;
8987            }
8988        }
8989
8990        return $table;
8991    }
8992
8993    sub delete_match_table($self, $table_to_remove) {
8994        # Delete the table referred to by $2 from the property $1.
8995        my $addr = pack 'J', refaddr $self;
8996
8997        # Remove all names that refer to it.
8998        foreach my $key (keys %{$table_ref{$addr}}) {
8999            delete $table_ref{$addr}{$key}
9000                                if $table_ref{$addr}{$key} == $table_to_remove;
9001        }
9002
9003        $table_to_remove->DESTROY;
9004        return;
9005    }
9006
9007    sub table($self, $name) {
9008        # Return a pointer to the match table (with name given by the
9009        # parameter) associated with this property; undef if none.
9010        my $addr = pack 'J', refaddr $self;
9011
9012        return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
9013
9014        # If quick look-up failed, try again using the standard form of the
9015        # input name.  If that succeeds, cache the result before returning so
9016        # won't have to standardize this input name again.
9017        my $standard_name = main::standardize($name);
9018        return unless defined $table_ref{$addr}{$standard_name};
9019
9020        $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
9021        return $table_ref{$addr}{$name};
9022    }
9023
9024    sub tables {
9025        # Return a list of pointers to all the match tables attached to this
9026        # property
9027
9028        return main::uniques(values %{$table_ref{pack 'J', refaddr shift}});
9029    }
9030
9031    sub directory {
9032        # Returns the directory the map table for this property should be
9033        # output in.  If a specific directory has been specified, that has
9034        # priority;  'undef' is returned if the type isn't defined;
9035        # or $map_directory for everything else.
9036
9037        my $addr = pack 'J', refaddr shift;
9038
9039        return $directory{$addr} if defined $directory{$addr};
9040        return undef if $type{$addr} == $UNKNOWN;
9041        return $map_directory;
9042    }
9043
9044    sub swash_name($self) {
9045        # Return the name that is used to both:
9046        #   1)  Name the file that the map table is written to.
9047        #   2)  The name of swash related stuff inside that file.
9048        # The reason for this is that the Perl core historically has used
9049        # certain names that aren't the same as the Unicode property names.
9050        # To continue using these, $file is hard-coded in this file for those,
9051        # but otherwise the standard name is used.  This is different from the
9052        # external_name, so that the rest of the files, like in lib can use
9053        # the standard name always, without regard to historical precedent.
9054        my $addr = pack 'J', refaddr $self;
9055
9056        # Swash names are used only on either
9057        # 1) regular or internal-only map tables
9058        # 2) otherwise there should be no access to the
9059        #    property map table from other parts of Perl.
9060        return if $map{$addr}->fate != $ORDINARY
9061                  && ! ($map{$addr}->name =~ /^_/
9062                        && $map{$addr}->fate == $INTERNAL_ONLY);
9063
9064        return $file{$addr} if defined $file{$addr};
9065        return $map{$addr}->external_name;
9066    }
9067
9068    sub to_create_match_tables($self) {
9069        # Returns a boolean as to whether or not match tables should be
9070        # created for this property.
9071
9072        # The whole point of this pseudo property is match tables.
9073        return 1 if $self == $perl;
9074
9075        my $addr = pack 'J', refaddr $self;
9076
9077        # Don't generate tables of code points that match the property values
9078        # of a string property.  Such a list would most likely have many
9079        # property values, each with just one or very few code points mapping
9080        # to it.
9081        return 0 if $type{$addr} == $STRING;
9082
9083        # Otherwise, do.
9084        return 1;
9085    }
9086
9087    sub property_add_or_replace_non_nulls($self, $other) {
9088        # This adds the mappings in the property $other to $self.  Non-null
9089        # mappings from $other override those in $self.  It essentially merges
9090        # the two properties, with the second having priority except for null
9091        # mappings.
9092
9093        if (! $other->isa(__PACKAGE__)) {
9094            Carp::my_carp_bug("$other should be a "
9095                            . __PACKAGE__
9096                            . ".  Not a '"
9097                            . ref($other)
9098                            . "'.  Not added;");
9099            return;
9100        }
9101
9102        return $map{pack 'J', refaddr $self}->map_add_or_replace_non_nulls($map{pack 'J', refaddr $other});
9103    }
9104
9105    sub set_proxy_for {
9106        # Certain tables are not generally written out to files, but
9107        # Unicode::UCD has the intelligence to know that the file for $self
9108        # can be used to reconstruct those tables.  This routine just changes
9109        # things so that UCD pod entries for those suppressed tables are
9110        # generated, so the fact that a proxy is used is invisible to the
9111        # user.
9112
9113        my $self = shift;
9114
9115        foreach my $property_name (@_) {
9116            my $ref = property_ref($property_name);
9117            next if $ref->to_output_map;
9118            $ref->set_fate($MAP_PROXIED);
9119        }
9120    }
9121
9122    sub set_type($self, $type) {
9123        # Set the type of the property.  Mostly this is figured out by the
9124        # data in the table.  But this is used to set it explicitly.  The
9125        # reason it is not a standard accessor is that when setting a binary
9126        # property, we need to make sure that all the true/false aliases are
9127        # present, as they were omitted in early Unicode releases.
9128
9129        if ($type != $ENUM
9130            && $type != $BINARY
9131            && $type != $FORCED_BINARY
9132            && $type != $STRING)
9133        {
9134            Carp::my_carp("Unrecognized type '$type'.  Type not set");
9135            return;
9136        }
9137
9138        $type{pack 'J', refaddr $self} = $type;
9139        return if $type != $BINARY && $type != $FORCED_BINARY;
9140
9141        my $yes = $self->table('Y');
9142        $yes = $self->table('Yes') if ! defined $yes;
9143        $yes = $self->add_match_table('Y', Full_Name => 'Yes')
9144                                                            if ! defined $yes;
9145
9146        # Add aliases in order wanted, duplicates will be ignored.  We use a
9147        # binary property present in all releases for its ordered lists of
9148        # true/false aliases.  Note, that could run into problems in
9149        # outputting things in that we don't distinguish between the name and
9150        # full name of these.  Hopefully, if the table was already created
9151        # before this code is executed, it was done with these set properly.
9152        my $bm = property_ref("Bidi_Mirrored");
9153        foreach my $alias ($bm->table("Y")->aliases) {
9154            $yes->add_alias($alias->name);
9155        }
9156        my $no = $self->table('N');
9157        $no = $self->table('No') if ! defined $no;
9158        $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no;
9159        foreach my $alias ($bm->table("N")->aliases) {
9160            $no->add_alias($alias->name);
9161        }
9162
9163        return;
9164    }
9165
9166    sub add_map {
9167        # Add a map to the property's map table.  This also keeps
9168        # track of the maps so that the property type can be determined from
9169        # its data.
9170
9171        my $self = shift;
9172        my $start = shift;  # First code point in range
9173        my $end = shift;    # Final code point in range
9174        my $map = shift;    # What the range maps to.
9175        # Rest of parameters passed on.
9176
9177        my $addr = pack 'J', refaddr $self;
9178
9179        # If haven't the type of the property, gather information to figure it
9180        # out.
9181        if ($type{$addr} == $UNKNOWN) {
9182
9183            # If the map contains an interior blank or dash, or most other
9184            # nonword characters, it will be a string property.  This
9185            # heuristic may actually miss some string properties.  If so, they
9186            # may need to have explicit set_types called for them.  This
9187            # happens in the Unihan properties.
9188            if ($map =~ / (?<= . ) [ -] (?= . ) /x
9189                || $map =~ / [^\w.\/\ -]  /x)
9190            {
9191                $self->set_type($STRING);
9192
9193                # $unique_maps is used for disambiguating between ENUM and
9194                # BINARY later; since we know the property is not going to be
9195                # one of those, no point in keeping the data around
9196                undef $unique_maps{$addr};
9197            }
9198            else {
9199
9200                # Not necessarily a string.  The final decision has to be
9201                # deferred until all the data are in.  We keep track of if all
9202                # the values are code points for that eventual decision.
9203                $has_only_code_point_maps{$addr} &=
9204                                            $map =~ / ^ $code_point_re $/x;
9205
9206                # For the purposes of disambiguating between binary and other
9207                # enumerations at the end, we keep track of the first three
9208                # distinct property values.  Once we get to three, we know
9209                # it's not going to be binary, so no need to track more.
9210                if (scalar keys %{$unique_maps{$addr}} < 3) {
9211                    $unique_maps{$addr}{main::standardize($map)} = 1;
9212                }
9213            }
9214        }
9215
9216        # Add the mapping by calling our map table's method
9217        return $map{$addr}->add_map($start, $end, $map, @_);
9218    }
9219
9220    sub compute_type($self) {
9221        # Compute the type of the property: $ENUM, $STRING, or $BINARY.  This
9222        # should be called after the property is mostly filled with its maps.
9223        # We have been keeping track of what the property values have been,
9224        # and now have the necessary information to figure out the type.
9225
9226        my $addr = pack 'J', refaddr $self;
9227
9228        my $type = $type{$addr};
9229
9230        # If already have figured these out, no need to do so again, but we do
9231        # a double check on ENUMS to make sure that a string property hasn't
9232        # improperly been classified as an ENUM, so continue on with those.
9233        return if $type == $STRING
9234                  || $type == $BINARY
9235                  || $type == $FORCED_BINARY;
9236
9237        # If every map is to a code point, is a string property.
9238        if ($type == $UNKNOWN
9239            && ($has_only_code_point_maps{$addr}
9240                || (defined $map{$addr}->default_map
9241                    && $map{$addr}->default_map eq "")))
9242        {
9243            $self->set_type($STRING);
9244        }
9245        else {
9246
9247            # Otherwise, it is to some sort of enumeration.  (The case where
9248            # it is a Unicode miscellaneous property, and treated like a
9249            # string in this program is handled in add_map()).  Distinguish
9250            # between binary and some other enumeration type.  Of course, if
9251            # there are more than two values, it's not binary.  But more
9252            # subtle is the test that the default mapping is defined means it
9253            # isn't binary.  This in fact may change in the future if Unicode
9254            # changes the way its data is structured.  But so far, no binary
9255            # properties ever have @missing lines for them, so the default map
9256            # isn't defined for them.  The few properties that are two-valued
9257            # and aren't considered binary have the default map defined
9258            # starting in Unicode 5.0, when the @missing lines appeared; and
9259            # this program has special code to put in a default map for them
9260            # for earlier than 5.0 releases.
9261            if ($type == $ENUM
9262                || scalar keys %{$unique_maps{$addr}} > 2
9263                || defined $self->default_map)
9264            {
9265                my $tables = $self->tables;
9266                my $count = $self->count;
9267                if ($verbosity && $tables > 500 && $tables/$count > .1) {
9268                    Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $tables\n");
9269                }
9270                $self->set_type($ENUM);
9271            }
9272            else {
9273                $self->set_type($BINARY);
9274            }
9275        }
9276        undef $unique_maps{$addr};  # Garbage collect
9277        return;
9278    }
9279
9280    # $reaons - Ignored unless suppressing
9281    sub set_fate($self, $fate, $reason=undef) {
9282        my $addr = pack 'J', refaddr $self;
9283        if ($fate >= $SUPPRESSED) {
9284            $why_suppressed{$self->complete_name} = $reason;
9285        }
9286
9287        # Each table shares the property's fate, except that MAP_PROXIED
9288        # doesn't affect match tables
9289        $map{$addr}->set_fate($fate, $reason);
9290        if ($fate != $MAP_PROXIED) {
9291            foreach my $table ($map{$addr}, $self->tables) {
9292                $table->set_fate($fate, $reason);
9293            }
9294        }
9295        return;
9296    }
9297
9298
9299    # Most of the accessors for a property actually apply to its map table.
9300    # Setup up accessor functions for those, referring to %map
9301    for my $sub (qw(
9302                    add_alias
9303                    add_anomalous_entry
9304                    add_comment
9305                    add_conflicting
9306                    add_description
9307                    add_duplicate
9308                    add_note
9309                    aliases
9310                    comment
9311                    complete_name
9312                    containing_range
9313                    count
9314                    default_map
9315                    definition
9316                    delete_range
9317                    description
9318                    each_range
9319                    external_name
9320                    fate
9321                    file_path
9322                    format
9323                    initialize
9324                    inverse_list
9325                    is_empty
9326                    name
9327                    note
9328                    perl_extension
9329                    property
9330                    range_count
9331                    ranges
9332                    range_size_1
9333                    replace_map
9334                    reset_each_range
9335                    set_comment
9336                    set_default_map
9337                    set_file_path
9338                    set_final_comment
9339                    _set_format
9340                    set_range_size_1
9341                    set_status
9342                    set_to_output_map
9343                    short_name
9344                    status
9345                    status_info
9346                    to_output_map
9347                    type_of
9348                    value_of
9349                    write
9350                ))
9351                    # 'property' above is for symmetry, so that one can take
9352                    # the property of a property and get itself, and so don't
9353                    # have to distinguish between properties and tables in
9354                    # calling code
9355    {
9356        no strict "refs";
9357        *$sub = sub {
9358            use strict "refs";
9359            my $self = shift;
9360            return $map{pack 'J', refaddr $self}->$sub(@_);
9361        }
9362    }
9363
9364
9365} # End closure
9366
9367package main;
9368
9369sub display_chr {
9370    # Converts an ordinal printable character value to a displayable string,
9371    # using a dotted circle to hold combining characters.
9372
9373    my $ord = shift;
9374    my $chr = chr $ord;
9375    return $chr if $ccc->table(0)->contains($ord);
9376    return "\x{25CC}$chr";
9377}
9378
9379sub join_lines($input) {
9380    # Returns lines of the input joined together, so that they can be folded
9381    # properly.
9382    # This causes continuation lines to be joined together into one long line
9383    # for folding.  A continuation line is any line that doesn't begin with a
9384    # space or "\b" (the latter is stripped from the output).  This is so
9385    # lines can be in a HERE document so as to fit nicely in the terminal
9386    # width, but be joined together in one long line, and then folded with
9387    # indents, '#' prefixes, etc, properly handled.
9388    # A blank separates the joined lines except if there is a break; an extra
9389    # blank is inserted after a period ending a line.
9390
9391    # Initialize the return with the first line.
9392    my ($return, @lines) = split "\n", $input;
9393
9394    # If the first line is null, it was an empty line, add the \n back in
9395    $return = "\n" if $return eq "";
9396
9397    # Now join the remainder of the physical lines.
9398    for my $line (@lines) {
9399
9400        # An empty line means wanted a blank line, so add two \n's to get that
9401        # effect, and go to the next line.
9402        if (length $line == 0) {
9403            $return .= "\n\n";
9404            next;
9405        }
9406
9407        # Look at the last character of what we have so far.
9408        my $previous_char = substr($return, -1, 1);
9409
9410        # And at the next char to be output.
9411        my $next_char = substr($line, 0, 1);
9412
9413        if ($previous_char ne "\n") {
9414
9415            # Here didn't end wth a nl.  If the next char a blank or \b, it
9416            # means that here there is a break anyway.  So add a nl to the
9417            # output.
9418            if ($next_char eq " " || $next_char eq "\b") {
9419                $previous_char = "\n";
9420                $return .= $previous_char;
9421            }
9422
9423            # Add an extra space after periods.
9424            $return .= " " if $previous_char eq '.';
9425        }
9426
9427        # Here $previous_char is still the latest character to be output.  If
9428        # it isn't a nl, it means that the next line is to be a continuation
9429        # line, with a blank inserted between them.
9430        $return .= " " if $previous_char ne "\n";
9431
9432        # Get rid of any \b
9433        substr($line, 0, 1) = "" if $next_char eq "\b";
9434
9435        # And append this next line.
9436        $return .= $line;
9437    }
9438
9439    return $return;
9440}
9441
9442sub simple_fold( $line, $prefix="", $hanging_indent=0, $right_margin=0) {
9443    # Returns a string of the input (string or an array of strings) folded
9444    # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
9445    # a \n
9446    # This is tailored for the kind of text written by this program,
9447    # especially the pod file, which can have very long names with
9448    # underscores in the middle, or words like AbcDefgHij....  We allow
9449    # breaking in the middle of such constructs if the line won't fit
9450    # otherwise.  The break in such cases will come either just after an
9451    # underscore, or just before one of the Capital letters.
9452
9453    local $to_trace = 0 if main::DEBUG;
9454
9455    # $prefix Optional string to prepend to each output line
9456    # $hanging_indent Optional number of spaces to indent
9457    # continuation lines
9458    # $right_margin  Optional number of spaces to narrow the
9459    # total width by.
9460
9461    # The space available doesn't include what's automatically prepended
9462    # to each line, or what's reserved on the right.
9463    my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
9464    # XXX Instead of using the 'nofold' perhaps better to look up the stack
9465
9466    if (DEBUG && $hanging_indent >= $max) {
9467        Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max.  Using 0", 'nofold');
9468        $hanging_indent = 0;
9469    }
9470
9471    # First, split into the current physical lines.
9472    my @line;
9473    if (ref $line) {        # Better be an array, because not bothering to
9474                            # test
9475        foreach my $line (@{$line}) {
9476            push @line, split /\n/, $line;
9477        }
9478    }
9479    else {
9480        @line = split /\n/, $line;
9481    }
9482
9483    #local $to_trace = 1 if main::DEBUG;
9484    trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
9485
9486    # Look at each current physical line.
9487    for (my $i = 0; $i < @line; $i++) {
9488        Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
9489        #local $to_trace = 1 if main::DEBUG;
9490        trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
9491
9492        # Remove prefix, because will be added back anyway, don't want
9493        # doubled prefix
9494        $line[$i] =~ s/^$prefix//;
9495
9496        # Remove trailing space
9497        $line[$i] =~ s/\s+\Z//;
9498
9499        # If the line is too long, fold it.
9500        if (length $line[$i] > $max) {
9501            my $remainder;
9502
9503            # Here needs to fold.  Save the leading space in the line for
9504            # later.
9505            $line[$i] =~ /^ ( \s* )/x;
9506            my $leading_space = $1;
9507            trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
9508
9509            # If character at final permissible position is white space,
9510            # fold there, which will delete that white space
9511            if (substr($line[$i], $max - 1, 1) =~ /\s/) {
9512                $remainder = substr($line[$i], $max);
9513                $line[$i] = substr($line[$i], 0, $max - 1);
9514            }
9515            else {
9516
9517                # Otherwise fold at an acceptable break char closest to
9518                # the max length.  Look at just the maximal initial
9519                # segment of the line
9520                my $segment = substr($line[$i], 0, $max - 1);
9521                if ($segment =~
9522                    /^ ( .{$hanging_indent}   # Don't look before the
9523                                              #  indent.
9524                        \ *                   # Don't look in leading
9525                                              #  blanks past the indent
9526                            [^ ] .*           # Find the right-most
9527                        (?:                   #  acceptable break:
9528                            [ \s = ]          # space or equal
9529                            | - (?! [.0-9] )  # or non-unary minus.
9530                            | [^\\[(] (?= \\ )# break before single backslash
9531                                              #  not immediately after opening
9532                                              #  punctuation
9533                        )                     # $1 includes the character
9534                    )/x)
9535                {
9536                    # Split into the initial part that fits, and remaining
9537                    # part of the input
9538                    $remainder = substr($line[$i], length $1);
9539                    $line[$i] = $1;
9540                    trace $line[$i] if DEBUG && $to_trace;
9541                    trace $remainder if DEBUG && $to_trace;
9542                }
9543
9544                # If didn't find a good breaking spot, see if there is a
9545                # not-so-good breaking spot.  These are just after
9546                # underscores or where the case changes from lower to
9547                # upper.  Use \a as a soft hyphen, but give up
9548                # and don't break the line if there is actually a \a
9549                # already in the input.  We use an ascii character for the
9550                # soft-hyphen to avoid any attempt by miniperl to try to
9551                # access the files that this program is creating.
9552                elsif ($segment !~ /\a/
9553                       && ($segment =~ s/_/_\a/g
9554                       || $segment =~ s/ ( (?!\\) [a-z] ) (?= [A-Z] )/$1\a/xg))
9555                {
9556                    # Here were able to find at least one place to insert
9557                    # our substitute soft hyphen.  Find the right-most one
9558                    # and replace it by a real hyphen.
9559                    trace $segment if DEBUG && $to_trace;
9560                    substr($segment,
9561                            rindex($segment, "\a"),
9562                            1) = '-';
9563
9564                    # Then remove the soft hyphen substitutes.
9565                    $segment =~ s/\a//g;
9566                    trace $segment if DEBUG && $to_trace;
9567
9568                    # And split into the initial part that fits, and
9569                    # remainder of the line
9570                    my $pos = rindex($segment, '-');
9571                    $remainder = substr($line[$i], $pos);
9572                    trace $remainder if DEBUG && $to_trace;
9573                    $line[$i] = substr($segment, 0, $pos + 1);
9574                }
9575            }
9576
9577            # Here we know if we can fold or not.  If we can, $remainder
9578            # is what remains to be processed in the next iteration.
9579            if (defined $remainder) {
9580                trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
9581
9582                # Insert the folded remainder of the line as a new element
9583                # of the array.  (It may still be too long, but we will
9584                # deal with that next time through the loop.)  Omit any
9585                # leading space in the remainder.
9586                $remainder =~ s/^\s+//;
9587                trace "remainder='$remainder'" if main::DEBUG && $to_trace;
9588
9589                # But then indent by whichever is larger of:
9590                # 1) the leading space on the input line;
9591                # 2) the hanging indent.
9592                # This preserves indentation in the original line.
9593                my $lead = ($leading_space)
9594                            ? length $leading_space
9595                            : $hanging_indent;
9596                $lead = max($lead, $hanging_indent);
9597                splice @line, $i+1, 0, (" " x $lead) . $remainder;
9598            }
9599        }
9600
9601        # Ready to output the line. Get rid of any trailing space
9602        # And prefix by the required $prefix passed in.
9603        $line[$i] =~ s/\s+$//;
9604        $line[$i] = "$prefix$line[$i]\n";
9605    } # End of looping through all the lines.
9606
9607    return join "", @line;
9608}
9609
9610sub property_ref {  # Returns a reference to a property object.
9611    return Property::property_ref(@_);
9612}
9613
9614sub force_unlink ($filename) {
9615    return unless file_exists($filename);
9616    return if CORE::unlink($filename);
9617
9618    # We might need write permission
9619    chmod 0777, $filename;
9620    CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename.  Proceeding anyway: $!");
9621    return;
9622}
9623
9624sub write ($file, $use_utf8, @lines) {
9625    # Given a filename and references to arrays of lines, write the lines of
9626    # each array to the file
9627    # Filename can be given as an arrayref of directory names
9628
9629    # Get into a single string if an array, and get rid of, in Unix terms, any
9630    # leading '.'
9631    $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
9632    $file = File::Spec->canonpath($file);
9633
9634    # If has directories, make sure that they all exist
9635    (undef, my $directories, undef) = File::Spec->splitpath($file);
9636    File::Path::mkpath($directories) if $directories && ! -d $directories;
9637
9638    push @files_actually_output, $file;
9639
9640    force_unlink ($file);
9641
9642    my $OUT;
9643    if (not open $OUT, ">", $file) {
9644        Carp::my_carp("can't open $file for output.  Skipping this file: $!");
9645        return;
9646    }
9647
9648    binmode $OUT, ":utf8" if $use_utf8;
9649
9650    foreach my $lines_ref (@lines) {
9651        unless (@$lines_ref) {
9652            Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
9653        }
9654
9655        print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
9656    }
9657    close $OUT or die Carp::my_carp("close '$file' failed: $!");
9658
9659    print "$file written.\n" if $verbosity >= $VERBOSE;
9660
9661    return;
9662}
9663
9664
9665sub Standardize($name=undef) {
9666    # This converts the input name string into a standardized equivalent to
9667    # use internally.
9668
9669    unless (defined $name) {
9670      Carp::my_carp_bug("Standardize() called with undef.  Returning undef.");
9671      return;
9672    }
9673
9674    # Remove any leading or trailing white space
9675    $name =~ s/^\s+//g;
9676    $name =~ s/\s+$//g;
9677
9678    # Convert interior white space and hyphens into underscores.
9679    $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
9680
9681    # Capitalize the letter following an underscore, and convert a sequence of
9682    # multiple underscores to a single one
9683    $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
9684
9685    # And capitalize the first letter, but not for the special cjk ones.
9686    $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
9687    return $name;
9688}
9689
9690sub standardize ($str=undef) {
9691    # Returns a lower-cased standardized name, without underscores.  This form
9692    # is chosen so that it can distinguish between any real versus superficial
9693    # Unicode name differences.  It relies on the fact that Unicode doesn't
9694    # have interior underscores, white space, nor dashes in any
9695    # stricter-matched name.  It should not be used on Unicode code point
9696    # names (the Name property), as they mostly, but not always follow these
9697    # rules.
9698
9699    my $name = Standardize($str);
9700    return if !defined $name;
9701
9702    $name =~ s/ (?<= .) _ (?= . ) //xg;
9703    return lc $name;
9704}
9705
9706sub UCD_name ($table, $alias) {
9707    # Returns the name that Unicode::UCD will use to find a table.  XXX
9708    # perhaps this function should be placed somewhere, like UCD.pm so that
9709    # Unicode::UCD can use it directly without duplicating code that can get
9710    # out-of sync.
9711
9712    my $property = $table->property;
9713    $property = ($property == $perl)
9714                ? ""                # 'perl' is never explicitly stated
9715                : standardize($property->name) . '=';
9716    if ($alias->loose_match) {
9717        return $property . standardize($alias->name);
9718    }
9719    else {
9720        return lc ($property . $alias->name);
9721    }
9722
9723    return;
9724}
9725
9726{   # Closure
9727
9728    my $indent_increment = " " x ( $debugging_build ? 2 : 0);
9729    %main::already_output = ();
9730
9731    $main::simple_dumper_nesting = 0;
9732
9733    sub simple_dumper( $item, $indent = "" ) {
9734        # Like Simple Data::Dumper. Good enough for our needs. We can't use
9735        # the real thing as we have to run under miniperl.
9736
9737        # It is designed so that on input it is at the beginning of a line,
9738        # and the final thing output in any call is a trailing ",\n".
9739
9740        $indent = "" if ! $debugging_build;
9741
9742        # nesting level is localized, so that as the call stack pops, it goes
9743        # back to the prior value.
9744        local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
9745        local %main::already_output = %main::already_output;
9746        $main::simple_dumper_nesting++;
9747        #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
9748
9749        # Determine the indent for recursive calls.
9750        my $next_indent = $indent . $indent_increment;
9751
9752        my $output;
9753        if (! ref $item) {
9754
9755            # Dump of scalar: just output it in quotes if not a number.  To do
9756            # so we must escape certain characters, and therefore need to
9757            # operate on a copy to avoid changing the original
9758            my $copy = $item;
9759            $copy = $UNDEF unless defined $copy;
9760
9761            # Quote non-integers (integers also have optional leading '-')
9762            if ($copy eq "" || $copy !~ /^ -? \d+ $/x) {
9763
9764                # Escape apostrophe and backslash
9765                $copy =~ s/ ( ['\\] ) /\\$1/xg;
9766                $copy = "'$copy'";
9767            }
9768            $output = "$indent$copy,\n";
9769        }
9770        else {
9771
9772            # Keep track of cycles in the input, and refuse to infinitely loop
9773            my $addr = pack 'J', refaddr $item;
9774            if (defined $main::already_output{$addr}) {
9775                return "${indent}ALREADY OUTPUT: $item\n";
9776            }
9777            $main::already_output{$addr} = $item;
9778
9779            if (ref $item eq 'ARRAY') {
9780                my $using_brackets;
9781                $output = $indent;
9782                if ($main::simple_dumper_nesting > 1) {
9783                    $output .= '[';
9784                    $using_brackets = 1;
9785                }
9786                else {
9787                    $using_brackets = 0;
9788                }
9789
9790                # If the array is empty, put the closing bracket on the same
9791                # line.  Otherwise, recursively add each array element
9792                if (@$item == 0) {
9793                    $output .= " ";
9794                }
9795                else {
9796                    $output .= "\n";
9797                    for (my $i = 0; $i < @$item; $i++) {
9798
9799                        # Indent array elements one level
9800                        $output .= &simple_dumper($item->[$i], $next_indent);
9801                        next if ! $debugging_build;
9802                        $output =~ s/\n$//;      # Remove any trailing nl so
9803                        $output .= " # [$i]\n";  # as to add a comment giving
9804                                                 # the array index
9805                    }
9806                    $output .= $indent;     # Indent closing ']' to orig level
9807                }
9808                $output .= ']' if $using_brackets;
9809                $output .= ",\n";
9810            }
9811            elsif (ref $item eq 'HASH') {
9812                my $is_first_line;
9813                my $using_braces;
9814                my $body_indent;
9815
9816                # No surrounding braces at top level
9817                $output .= $indent;
9818                if ($main::simple_dumper_nesting > 1) {
9819                    $output .= "{\n";
9820                    $is_first_line = 0;
9821                    $body_indent = $next_indent;
9822                    $next_indent .= $indent_increment;
9823                    $using_braces = 1;
9824                }
9825                else {
9826                    $is_first_line = 1;
9827                    $body_indent = $indent;
9828                    $using_braces = 0;
9829                }
9830
9831                # Output hashes sorted alphabetically instead of apparently
9832                # random.  Use caseless alphabetic sort
9833                foreach my $key (sort { lc $a cmp lc $b } keys %$item)
9834                {
9835                    if ($is_first_line) {
9836                        $is_first_line = 0;
9837                    }
9838                    else {
9839                        $output .= "$body_indent";
9840                    }
9841
9842                    # The key must be a scalar, but this recursive call quotes
9843                    # it
9844                    $output .= &simple_dumper($key);
9845
9846                    # And change the trailing comma and nl to the hash fat
9847                    # comma for clarity, and so the value can be on the same
9848                    # line
9849                    $output =~ s/,\n$/ => /;
9850
9851                    # Recursively call to get the value's dump.
9852                    my $next = &simple_dumper($item->{$key}, $next_indent);
9853
9854                    # If the value is all on one line, remove its indent, so
9855                    # will follow the => immediately.  If it takes more than
9856                    # one line, start it on a new line.
9857                    if ($next !~ /\n.*\n/) {
9858                        $next =~ s/^ *//;
9859                    }
9860                    else {
9861                        $output .= "\n";
9862                    }
9863                    $output .= $next;
9864                }
9865
9866                $output .= "$indent},\n" if $using_braces;
9867            }
9868            elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
9869                $output = $indent . ref($item) . "\n";
9870                # XXX see if blessed
9871            }
9872            elsif ($item->can('dump')) {
9873
9874                # By convention in this program, objects furnish a 'dump'
9875                # method.  Since not doing any output at this level, just pass
9876                # on the input indent
9877                $output = $item->dump($indent);
9878            }
9879            else {
9880                Carp::my_carp("Can't cope with dumping a " . ref($item) . ".  Skipping.");
9881            }
9882        }
9883        return $output;
9884    }
9885}
9886
9887sub dump_inside_out( $object, $fields_ref, @args ) {
9888    # Dump inside-out hashes in an object's state by converting them to a
9889    # regular hash and then calling simple_dumper on that.
9890
9891    my $addr = pack 'J', refaddr $object;
9892
9893    my %hash;
9894    foreach my $key (keys %$fields_ref) {
9895        $hash{$key} = $fields_ref->{$key}{$addr};
9896    }
9897
9898    return simple_dumper(\%hash, @args);
9899}
9900
9901sub _operator_dot($self, $other="", $reversed=0) {
9902    # Overloaded '.' method that is common to all packages.  It uses the
9903    # package's stringify method.
9904
9905    foreach my $which (\$self, \$other) {
9906        next unless ref $$which;
9907        if ($$which->can('_operator_stringify')) {
9908            $$which = $$which->_operator_stringify;
9909        }
9910        else {
9911            my $ref = ref $$which;
9912            my $addr = pack 'J', refaddr $$which;
9913            $$which = "$ref ($addr)";
9914        }
9915    }
9916    return ($reversed)
9917            ? "$other$self"
9918            : "$self$other";
9919}
9920
9921sub _operator_dot_equal($self, $other="", $reversed=0) {
9922    # Overloaded '.=' method that is common to all packages.
9923
9924    if ($reversed) {
9925        return $other .= "$self";
9926    }
9927    else {
9928        return "$self" . "$other";
9929    }
9930}
9931
9932sub _operator_equal($self, $other, @) {
9933    # Generic overloaded '==' routine.  To be equal, they must be the exact
9934    # same object
9935
9936    return 0 unless defined $other;
9937    return 0 unless ref $other;
9938    no overloading;
9939    return $self == $other;
9940}
9941
9942sub _operator_not_equal($self, $other, @) {
9943    return ! _operator_equal($self, $other);
9944}
9945
9946sub substitute_PropertyAliases($file_object) {
9947    # Deal with early releases that don't have the crucial PropertyAliases.txt
9948    # file.
9949
9950    $file_object->insert_lines(get_old_property_aliases());
9951
9952    process_PropertyAliases($file_object);
9953}
9954
9955
9956sub process_PropertyAliases($file) {
9957    # This reads in the PropertyAliases.txt file, which contains almost all
9958    # the character properties in Unicode and their equivalent aliases:
9959    # scf       ; Simple_Case_Folding         ; sfc
9960    #
9961    # Field 0 is the preferred short name for the property.
9962    # Field 1 is the full name.
9963    # Any succeeding ones are other accepted names.
9964
9965    # Add any cjk properties that may have been defined.
9966    $file->insert_lines(@cjk_properties);
9967
9968    while ($file->next_line) {
9969
9970        my @data = split /\s*;\s*/;
9971
9972        my $full = $data[1];
9973
9974        # This line is defective in early Perls.  The property in Unihan.txt
9975        # is kRSUnicode.
9976        if ($full eq 'Unicode_Radical_Stroke' && @data < 3) {
9977            push @data, qw(cjkRSUnicode kRSUnicode);
9978        }
9979
9980        my $this = Property->new($data[0], Full_Name => $full);
9981
9982        $this->set_fate($SUPPRESSED, $why_suppressed{$full})
9983                                                    if $why_suppressed{$full};
9984
9985        # Start looking for more aliases after these two.
9986        for my $i (2 .. @data - 1) {
9987            $this->add_alias($data[$i]);
9988        }
9989
9990    }
9991
9992    my $scf = property_ref("Simple_Case_Folding");
9993    $scf->add_alias("scf");
9994    $scf->add_alias("sfc");
9995
9996    return;
9997}
9998
9999sub finish_property_setup($file) {
10000    # Finishes setting up after PropertyAliases.
10001
10002    # This entry was missing from this file in earlier Unicode versions
10003    if (-e 'Jamo.txt' && ! defined property_ref('JSN')) {
10004        Property->new('JSN', Full_Name => 'Jamo_Short_Name');
10005    }
10006
10007    # These are used so much, that we set globals for them.
10008    $gc = property_ref('General_Category');
10009    $block = property_ref('Block');
10010    $script = property_ref('Script');
10011    $age = property_ref('Age');
10012
10013    # Perl adds this alias.
10014    $gc->add_alias('Category');
10015
10016    # Unicode::Normalize expects this file with this name and directory.
10017    $ccc = property_ref('Canonical_Combining_Class');
10018    if (defined $ccc) {
10019        $ccc->set_file('CombiningClass');
10020        $ccc->set_directory(File::Spec->curdir());
10021    }
10022
10023    # These two properties aren't actually used in the core, but unfortunately
10024    # the names just above that are in the core interfere with these, so
10025    # choose different names.  These aren't a problem unless the map tables
10026    # for these files get written out.
10027    my $lowercase = property_ref('Lowercase');
10028    $lowercase->set_file('IsLower') if defined $lowercase;
10029    my $uppercase = property_ref('Uppercase');
10030    $uppercase->set_file('IsUpper') if defined $uppercase;
10031
10032    # Set up the hard-coded default mappings, but only on properties defined
10033    # for this release
10034    foreach my $property (keys %default_mapping) {
10035        my $property_object = property_ref($property);
10036        next if ! defined $property_object;
10037        my $default_map = $default_mapping{$property};
10038        $property_object->set_default_map($default_map);
10039
10040        # A map of <code point> implies the property is string.
10041        if ($property_object->type == $UNKNOWN
10042            && $default_map eq $CODE_POINT)
10043        {
10044            $property_object->set_type($STRING);
10045        }
10046    }
10047
10048    # For backwards compatibility with applications that may read the mapping
10049    # file directly (it was documented in 5.12 and 5.14 as being thusly
10050    # usable), keep it from being adjusted.  (range_size_1 is
10051    # used to force the traditional format.)
10052    if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) {
10053        $nfkc_cf->set_to_output_map($EXTERNAL_MAP);
10054        $nfkc_cf->set_range_size_1(1);
10055    }
10056    if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) {
10057        $bmg->set_to_output_map($EXTERNAL_MAP);
10058        $bmg->set_range_size_1(1);
10059    }
10060
10061    property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED);
10062
10063    # The rest of this sub is for properties that need the Multi_Default class
10064    # to create objects for defaults.  As of v15.0, this is no longer needed.
10065
10066    return if $v_version ge v15.0.0;
10067
10068    # Bidi class has a complicated default, but the derived file takes care of
10069    # the complications, leaving just 'L'.
10070    if (file_exists("${EXTRACTED}DBidiClass.txt")) {
10071        property_ref('Bidi_Class')->set_default_map('L');
10072    }
10073    else {
10074        my $default;
10075
10076        # The derived file was introduced in 3.1.1.  The values below are
10077        # taken from table 3-8, TUS 3.0
10078        my $default_R =
10079            'my $default = Range_List->new;
10080             $default->add_range(0x0590, 0x05FF);
10081             $default->add_range(0xFB1D, 0xFB4F);'
10082        ;
10083
10084        # The defaults apply only to unassigned characters
10085        $default_R .= '$gc->table("Unassigned") & $default;';
10086
10087        if ($v_version lt v3.0.0) {
10088            $default = Multi_Default->new(R => $default_R, 'L');
10089        }
10090        else {
10091
10092            # AL apparently not introduced until 3.0:  TUS 2.x references are
10093            # not on-line to check it out
10094            my $default_AL =
10095                'my $default = Range_List->new;
10096                 $default->add_range(0x0600, 0x07BF);
10097                 $default->add_range(0xFB50, 0xFDFF);
10098                 $default->add_range(0xFE70, 0xFEFF);'
10099            ;
10100
10101            # Non-character code points introduced in this release; aren't AL
10102            if ($v_version ge 3.1.0) {
10103                $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
10104            }
10105            $default_AL .= '$gc->table("Unassigned") & $default';
10106            $default = Multi_Default->new(AL => $default_AL,
10107                                          R => $default_R,
10108                                          'L');
10109        }
10110        property_ref('Bidi_Class')->set_default_map($default);
10111    }
10112
10113    # Joining type has a complicated default, but the derived file takes care
10114    # of the complications, leaving just 'U' (or Non_Joining), except the file
10115    # is bad in 3.1.0
10116    if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
10117        if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
10118            property_ref('Joining_Type')->set_default_map('Non_Joining');
10119        }
10120        else {
10121
10122            # Otherwise, there are not one, but two possibilities for the
10123            # missing defaults: T and U.
10124            # The missing defaults that evaluate to T are given by:
10125            # T = Mn + Cf - ZWNJ - ZWJ
10126            # where Mn and Cf are the general category values. In other words,
10127            # any non-spacing mark or any format control character, except
10128            # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
10129            # WIDTH JOINER (joining type C).
10130            my $default = Multi_Default->new(
10131               'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
10132               'Non_Joining');
10133            property_ref('Joining_Type')->set_default_map($default);
10134        }
10135    }
10136
10137    # Line break has a complicated default in early releases. It is 'Unknown'
10138    # for non-assigned code points; 'AL' for assigned.
10139    if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
10140        my $lb = property_ref('Line_Break');
10141        if (file_exists("${EXTRACTED}DLineBreak.txt")) {
10142            $lb->set_default_map('Unknown');
10143        }
10144        else {
10145            my $default = Multi_Default->new('AL' => '~ $gc->table("Cn")',
10146                                             'Unknown',
10147                                            );
10148            $lb->set_default_map($default);
10149        }
10150    }
10151
10152    return;
10153}
10154
10155sub get_old_property_aliases() {
10156    # Returns what would be in PropertyAliases.txt if it existed in very old
10157    # versions of Unicode.  It was derived from the one in 3.2, and pared
10158    # down based on the data that was actually in the older releases.
10159    # An attempt was made to use the existence of files to mean inclusion or
10160    # not of various aliases, but if this was not sufficient, using version
10161    # numbers was resorted to.
10162
10163    my @return;
10164
10165    # These are to be used in all versions (though some are constructed by
10166    # this program if missing)
10167    push @return, split /\n/, <<'END';
10168bc        ; Bidi_Class
10169Bidi_M    ; Bidi_Mirrored
10170cf        ; Case_Folding
10171ccc       ; Canonical_Combining_Class
10172dm        ; Decomposition_Mapping
10173dt        ; Decomposition_Type
10174gc        ; General_Category
10175isc       ; ISO_Comment
10176lc        ; Lowercase_Mapping
10177na        ; Name
10178na1       ; Unicode_1_Name
10179nt        ; Numeric_Type
10180nv        ; Numeric_Value
10181scf       ; Simple_Case_Folding
10182slc       ; Simple_Lowercase_Mapping
10183stc       ; Simple_Titlecase_Mapping
10184suc       ; Simple_Uppercase_Mapping
10185tc        ; Titlecase_Mapping
10186uc        ; Uppercase_Mapping
10187END
10188
10189    if (-e 'Blocks.txt') {
10190        push @return, "blk       ; Block\n";
10191    }
10192    if (-e 'ArabicShaping.txt') {
10193        push @return, split /\n/, <<'END';
10194jg        ; Joining_Group
10195jt        ; Joining_Type
10196END
10197    }
10198    if (-e 'PropList.txt') {
10199
10200        # This first set is in the original old-style proplist.
10201        push @return, split /\n/, <<'END';
10202Bidi_C    ; Bidi_Control
10203Dash      ; Dash
10204Dia       ; Diacritic
10205Ext       ; Extender
10206Hex       ; Hex_Digit
10207Hyphen    ; Hyphen
10208IDC       ; ID_Continue
10209Ideo      ; Ideographic
10210Join_C    ; Join_Control
10211Math      ; Math
10212QMark     ; Quotation_Mark
10213Term      ; Terminal_Punctuation
10214WSpace    ; White_Space
10215END
10216        # The next sets were added later
10217        if ($v_version ge v3.0.0) {
10218            push @return, split /\n/, <<'END';
10219Upper     ; Uppercase
10220Lower     ; Lowercase
10221END
10222        }
10223        if ($v_version ge v3.0.1) {
10224            push @return, split /\n/, <<'END';
10225NChar     ; Noncharacter_Code_Point
10226END
10227        }
10228        # The next sets were added in the new-style
10229        if ($v_version ge v3.1.0) {
10230            push @return, split /\n/, <<'END';
10231OAlpha    ; Other_Alphabetic
10232OLower    ; Other_Lowercase
10233OMath     ; Other_Math
10234OUpper    ; Other_Uppercase
10235END
10236        }
10237        if ($v_version ge v3.1.1) {
10238            push @return, "AHex      ; ASCII_Hex_Digit\n";
10239        }
10240    }
10241    if (-e 'EastAsianWidth.txt') {
10242        push @return, "ea        ; East_Asian_Width\n";
10243    }
10244    if (-e 'CompositionExclusions.txt') {
10245        push @return, "CE        ; Composition_Exclusion\n";
10246    }
10247    if (-e 'LineBreak.txt') {
10248        push @return, "lb        ; Line_Break\n";
10249    }
10250    if (-e 'BidiMirroring.txt') {
10251        push @return, "bmg       ; Bidi_Mirroring_Glyph\n";
10252    }
10253    if (-e 'Scripts.txt') {
10254        push @return, "sc        ; Script\n";
10255    }
10256    if (-e 'DNormalizationProps.txt') {
10257        push @return, split /\n/, <<'END';
10258Comp_Ex   ; Full_Composition_Exclusion
10259FC_NFKC   ; FC_NFKC_Closure
10260NFC_QC    ; NFC_Quick_Check
10261NFD_QC    ; NFD_Quick_Check
10262NFKC_QC   ; NFKC_Quick_Check
10263NFKD_QC   ; NFKD_Quick_Check
10264XO_NFC    ; Expands_On_NFC
10265XO_NFD    ; Expands_On_NFD
10266XO_NFKC   ; Expands_On_NFKC
10267XO_NFKD   ; Expands_On_NFKD
10268END
10269    }
10270    if (-e 'DCoreProperties.txt') {
10271        push @return, split /\n/, <<'END';
10272Alpha     ; Alphabetic
10273IDS       ; ID_Start
10274XIDC      ; XID_Continue
10275XIDS      ; XID_Start
10276END
10277        # These can also appear in some versions of PropList.txt
10278        push @return, "Lower     ; Lowercase\n"
10279                                    unless grep { $_ =~ /^Lower\b/} @return;
10280        push @return, "Upper     ; Uppercase\n"
10281                                    unless grep { $_ =~ /^Upper\b/} @return;
10282    }
10283
10284    # This flag requires the DAge.txt file to be copied into the directory.
10285    if (DEBUG && $compare_versions) {
10286        push @return, 'age       ; Age';
10287    }
10288
10289    return @return;
10290}
10291
10292sub substitute_PropValueAliases($file_object) {
10293    # Deal with early releases that don't have the crucial
10294    # PropValueAliases.txt file.
10295
10296    $file_object->insert_lines(get_old_property_value_aliases());
10297
10298    process_PropValueAliases($file_object);
10299}
10300
10301sub process_PropValueAliases($file) {
10302    # This file contains values that properties look like:
10303    # bc ; AL        ; Arabic_Letter
10304    # blk; n/a       ; Greek_And_Coptic                 ; Greek
10305    #
10306    # Field 0 is the property.
10307    # Field 1 is the short name of a property value or 'n/a' if no
10308    #                short name exists;
10309    # Field 2 is the full property value name;
10310    # Any other fields are more synonyms for the property value.
10311    # Purely numeric property values are omitted from the file; as are some
10312    # others, fewer and fewer in later releases
10313
10314    # Entries for the ccc property have an extra field before the
10315    # abbreviation:
10316    # ccc;   0; NR   ; Not_Reordered
10317    # It is the numeric value that the names are synonyms for.
10318
10319    # There are comment entries for values missing from this file:
10320    # # @missing: 0000..10FFFF; ISO_Comment; <none>
10321    # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
10322
10323    if ($v_version lt 4.0.0) {
10324        $file->insert_lines(split /\n/, <<'END'
10325Hangul_Syllable_Type; L                                ; Leading_Jamo
10326Hangul_Syllable_Type; LV                               ; LV_Syllable
10327Hangul_Syllable_Type; LVT                              ; LVT_Syllable
10328Hangul_Syllable_Type; NA                               ; Not_Applicable
10329Hangul_Syllable_Type; T                                ; Trailing_Jamo
10330Hangul_Syllable_Type; V                                ; Vowel_Jamo
10331END
10332        );
10333    }
10334    if ($v_version lt 4.1.0) {
10335        $file->insert_lines(split /\n/, <<'END'
10336_Perl_GCB; CN                               ; Control
10337_Perl_GCB; CR                               ; CR
10338_Perl_GCB; EX                               ; Extend
10339_Perl_GCB; L                                ; L
10340_Perl_GCB; LF                               ; LF
10341_Perl_GCB; LV                               ; LV
10342_Perl_GCB; LVT                              ; LVT
10343_Perl_GCB; T                                ; T
10344_Perl_GCB; V                                ; V
10345_Perl_GCB; XX                               ; Other
10346END
10347        );
10348    }
10349
10350    # Add any explicit cjk values
10351    $file->insert_lines(@cjk_property_values);
10352
10353    # This line is used only for testing the code that checks for name
10354    # conflicts.  There is a script Inherited, and when this line is executed
10355    # it causes there to be a name conflict with the 'Inherited' that this
10356    # program generates for this block property value
10357    #$file->insert_lines('blk; n/a; Herited');
10358
10359    # Process each line of the file ...
10360    while ($file->next_line) {
10361
10362        # Fix typo in input file
10363        s/CCC133/CCC132/g if $v_version eq v6.1.0;
10364
10365        my ($property, @data) = split /\s*;\s*/;
10366
10367        # The ccc property has an extra field at the beginning, which is the
10368        # numeric value.  Move it to be after the other two, mnemonic, fields,
10369        # so that those will be used as the property value's names, and the
10370        # number will be an extra alias.  (Rightmost splice removes field 1-2,
10371        # returning them in a slice; left splice inserts that before anything,
10372        # thus shifting the former field 0 to after them.)
10373        splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
10374
10375        if ($v_version le v5.0.0 && $property eq 'blk' && $data[1] =~ /-/) {
10376            my $new_style = $data[1] =~ s/-/_/gr;
10377            splice @data, 1, 0, $new_style;
10378        }
10379
10380        # Field 0 is a short name unless "n/a"; field 1 is the full name.  If
10381        # there is no short name, use the full one in element 1
10382        if ($data[0] eq "n/a") {
10383            $data[0] = $data[1];
10384        }
10385        elsif ($data[0] ne $data[1]
10386               && standardize($data[0]) eq standardize($data[1])
10387               && $data[1] !~ /[[:upper:]]/)
10388        {
10389            # Also, there is a bug in the file in which "n/a" is omitted, and
10390            # the two fields are identical except for case, and the full name
10391            # is all lower case.  Copy the "short" name unto the full one to
10392            # give it some upper case.
10393
10394            $data[1] = $data[0];
10395        }
10396
10397        # Earlier releases had the pseudo property 'qc' that should expand to
10398        # the ones that replace it below.
10399        if ($property eq 'qc') {
10400            if (lc $data[0] eq 'y') {
10401                $file->insert_lines('NFC_QC; Y      ; Yes',
10402                                    'NFD_QC; Y      ; Yes',
10403                                    'NFKC_QC; Y     ; Yes',
10404                                    'NFKD_QC; Y     ; Yes',
10405                                    );
10406            }
10407            elsif (lc $data[0] eq 'n') {
10408                $file->insert_lines('NFC_QC; N      ; No',
10409                                    'NFD_QC; N      ; No',
10410                                    'NFKC_QC; N     ; No',
10411                                    'NFKD_QC; N     ; No',
10412                                    );
10413            }
10414            elsif (lc $data[0] eq 'm') {
10415                $file->insert_lines('NFC_QC; M      ; Maybe',
10416                                    'NFKC_QC; M     ; Maybe',
10417                                    );
10418            }
10419            else {
10420                $file->carp_bad_line("qc followed by unexpected '$data[0]");
10421            }
10422            next;
10423        }
10424
10425        # The first field is the short name, 2nd is the full one.
10426        my $property_object = property_ref($property);
10427        my $table = $property_object->add_match_table($data[0],
10428                                                Full_Name => $data[1]);
10429
10430        # Start looking for more aliases after these two.
10431        for my $i (2 .. @data - 1) {
10432            $table->add_alias($data[$i]);
10433        }
10434    } # End of looping through the file
10435
10436    # As noted in the comments early in the program, it generates tables for
10437    # the default values for all releases, even those for which the concept
10438    # didn't exist at the time.  Here we add those if missing.
10439    if (defined $age && ! defined $age->table('Unassigned')) {
10440        $age->add_match_table('Unassigned');
10441    }
10442    $block->add_match_table('No_Block') if -e 'Blocks.txt'
10443                                    && ! defined $block->table('No_Block');
10444
10445
10446    # Now set the default mappings of the properties from the file.  This is
10447    # done after the loop because a number of properties have only @missings
10448    # entries in the file, and may not show up until the end.
10449    my @defaults = $file->get_missings;
10450    foreach my $default_ref (@defaults) {
10451        my $default = $default_ref->{default};
10452        my $property = property_ref($default_ref->{property});
10453        $property->set_default_map($default);
10454    }
10455
10456    return;
10457}
10458
10459sub get_old_property_value_aliases () {
10460    # Returns what would be in PropValueAliases.txt if it existed in very old
10461    # versions of Unicode.  It was derived from the one in 3.2, and pared
10462    # down.  An attempt was made to use the existence of files to mean
10463    # inclusion or not of various aliases, but if this was not sufficient,
10464    # using version numbers was resorted to.
10465
10466    my @return = split /\n/, <<'END';
10467bc ; AN        ; Arabic_Number
10468bc ; B         ; Paragraph_Separator
10469bc ; CS        ; Common_Separator
10470bc ; EN        ; European_Number
10471bc ; ES        ; European_Separator
10472bc ; ET        ; European_Terminator
10473bc ; L         ; Left_To_Right
10474bc ; ON        ; Other_Neutral
10475bc ; R         ; Right_To_Left
10476bc ; WS        ; White_Space
10477
10478Bidi_M; N; No; F; False
10479Bidi_M; Y; Yes; T; True
10480
10481# The standard combining classes are very much different in v1, so only use
10482# ones that look right (not checked thoroughly)
10483ccc;   0; NR   ; Not_Reordered
10484ccc;   1; OV   ; Overlay
10485ccc;   7; NK   ; Nukta
10486ccc;   8; KV   ; Kana_Voicing
10487ccc;   9; VR   ; Virama
10488ccc; 202; ATBL ; Attached_Below_Left
10489ccc; 216; ATAR ; Attached_Above_Right
10490ccc; 218; BL   ; Below_Left
10491ccc; 220; B    ; Below
10492ccc; 222; BR   ; Below_Right
10493ccc; 224; L    ; Left
10494ccc; 228; AL   ; Above_Left
10495ccc; 230; A    ; Above
10496ccc; 232; AR   ; Above_Right
10497ccc; 234; DA   ; Double_Above
10498
10499dt ; can       ; canonical
10500dt ; enc       ; circle
10501dt ; fin       ; final
10502dt ; font      ; font
10503dt ; fra       ; fraction
10504dt ; init      ; initial
10505dt ; iso       ; isolated
10506dt ; med       ; medial
10507dt ; n/a       ; none
10508dt ; nb        ; noBreak
10509dt ; sqr       ; square
10510dt ; sub       ; sub
10511dt ; sup       ; super
10512
10513gc ; C         ; Other                            # Cc | Cf | Cn | Co | Cs
10514gc ; Cc        ; Control
10515gc ; Cn        ; Unassigned
10516gc ; Co        ; Private_Use
10517gc ; L         ; Letter                           # Ll | Lm | Lo | Lt | Lu
10518gc ; LC        ; Cased_Letter                     # Ll | Lt | Lu
10519gc ; Ll        ; Lowercase_Letter
10520gc ; Lm        ; Modifier_Letter
10521gc ; Lo        ; Other_Letter
10522gc ; Lu        ; Uppercase_Letter
10523gc ; M         ; Mark                             # Mc | Me | Mn
10524gc ; Mc        ; Spacing_Mark
10525gc ; Mn        ; Nonspacing_Mark
10526gc ; N         ; Number                           # Nd | Nl | No
10527gc ; Nd        ; Decimal_Number
10528gc ; No        ; Other_Number
10529gc ; P         ; Punctuation                      # Pc | Pd | Pe | Pf | Pi | Po | Ps
10530gc ; Pd        ; Dash_Punctuation
10531gc ; Pe        ; Close_Punctuation
10532gc ; Po        ; Other_Punctuation
10533gc ; Ps        ; Open_Punctuation
10534gc ; S         ; Symbol                           # Sc | Sk | Sm | So
10535gc ; Sc        ; Currency_Symbol
10536gc ; Sm        ; Math_Symbol
10537gc ; So        ; Other_Symbol
10538gc ; Z         ; Separator                        # Zl | Zp | Zs
10539gc ; Zl        ; Line_Separator
10540gc ; Zp        ; Paragraph_Separator
10541gc ; Zs        ; Space_Separator
10542
10543nt ; de        ; Decimal
10544nt ; di        ; Digit
10545nt ; n/a       ; None
10546nt ; nu        ; Numeric
10547END
10548
10549    if (-e 'ArabicShaping.txt') {
10550        push @return, split /\n/, <<'END';
10551jg ; n/a       ; AIN
10552jg ; n/a       ; ALEF
10553jg ; n/a       ; DAL
10554jg ; n/a       ; GAF
10555jg ; n/a       ; LAM
10556jg ; n/a       ; MEEM
10557jg ; n/a       ; NO_JOINING_GROUP
10558jg ; n/a       ; NOON
10559jg ; n/a       ; QAF
10560jg ; n/a       ; SAD
10561jg ; n/a       ; SEEN
10562jg ; n/a       ; TAH
10563jg ; n/a       ; WAW
10564
10565jt ; C         ; Join_Causing
10566jt ; D         ; Dual_Joining
10567jt ; L         ; Left_Joining
10568jt ; R         ; Right_Joining
10569jt ; U         ; Non_Joining
10570jt ; T         ; Transparent
10571END
10572        if ($v_version ge v3.0.0) {
10573            push @return, split /\n/, <<'END';
10574jg ; n/a       ; ALAPH
10575jg ; n/a       ; BEH
10576jg ; n/a       ; BETH
10577jg ; n/a       ; DALATH_RISH
10578jg ; n/a       ; E
10579jg ; n/a       ; FEH
10580jg ; n/a       ; FINAL_SEMKATH
10581jg ; n/a       ; GAMAL
10582jg ; n/a       ; HAH
10583jg ; n/a       ; HAMZA_ON_HEH_GOAL
10584jg ; n/a       ; HE
10585jg ; n/a       ; HEH
10586jg ; n/a       ; HEH_GOAL
10587jg ; n/a       ; HETH
10588jg ; n/a       ; KAF
10589jg ; n/a       ; KAPH
10590jg ; n/a       ; KNOTTED_HEH
10591jg ; n/a       ; LAMADH
10592jg ; n/a       ; MIM
10593jg ; n/a       ; NUN
10594jg ; n/a       ; PE
10595jg ; n/a       ; QAPH
10596jg ; n/a       ; REH
10597jg ; n/a       ; REVERSED_PE
10598jg ; n/a       ; SADHE
10599jg ; n/a       ; SEMKATH
10600jg ; n/a       ; SHIN
10601jg ; n/a       ; SWASH_KAF
10602jg ; n/a       ; TAW
10603jg ; n/a       ; TEH_MARBUTA
10604jg ; n/a       ; TETH
10605jg ; n/a       ; YEH
10606jg ; n/a       ; YEH_BARREE
10607jg ; n/a       ; YEH_WITH_TAIL
10608jg ; n/a       ; YUDH
10609jg ; n/a       ; YUDH_HE
10610jg ; n/a       ; ZAIN
10611END
10612        }
10613    }
10614
10615
10616    if (-e 'EastAsianWidth.txt') {
10617        push @return, split /\n/, <<'END';
10618ea ; A         ; Ambiguous
10619ea ; F         ; Fullwidth
10620ea ; H         ; Halfwidth
10621ea ; N         ; Neutral
10622ea ; Na        ; Narrow
10623ea ; W         ; Wide
10624END
10625    }
10626
10627    if (-e 'LineBreak.txt' || -e 'LBsubst.txt') {
10628        my @lb = split /\n/, <<'END';
10629lb ; AI        ; Ambiguous
10630lb ; AL        ; Alphabetic
10631lb ; B2        ; Break_Both
10632lb ; BA        ; Break_After
10633lb ; BB        ; Break_Before
10634lb ; BK        ; Mandatory_Break
10635lb ; CB        ; Contingent_Break
10636lb ; CL        ; Close_Punctuation
10637lb ; CM        ; Combining_Mark
10638lb ; CR        ; Carriage_Return
10639lb ; EX        ; Exclamation
10640lb ; GL        ; Glue
10641lb ; HY        ; Hyphen
10642lb ; ID        ; Ideographic
10643lb ; IN        ; Inseperable
10644lb ; IS        ; Infix_Numeric
10645lb ; LF        ; Line_Feed
10646lb ; NS        ; Nonstarter
10647lb ; NU        ; Numeric
10648lb ; OP        ; Open_Punctuation
10649lb ; PO        ; Postfix_Numeric
10650lb ; PR        ; Prefix_Numeric
10651lb ; QU        ; Quotation
10652lb ; SA        ; Complex_Context
10653lb ; SG        ; Surrogate
10654lb ; SP        ; Space
10655lb ; SY        ; Break_Symbols
10656lb ; XX        ; Unknown
10657lb ; ZW        ; ZWSpace
10658END
10659        # If this Unicode version predates the lb property, we use our
10660        # substitute one
10661        if (-e 'LBsubst.txt') {
10662            $_ = s/^lb/_Perl_LB/r for @lb;
10663        }
10664        push @return, @lb;
10665    }
10666
10667    if (-e 'DNormalizationProps.txt') {
10668        push @return, split /\n/, <<'END';
10669qc ; M         ; Maybe
10670qc ; N         ; No
10671qc ; Y         ; Yes
10672END
10673    }
10674
10675    if (-e 'Scripts.txt') {
10676        push @return, split /\n/, <<'END';
10677sc ; Arab      ; Arabic
10678sc ; Armn      ; Armenian
10679sc ; Beng      ; Bengali
10680sc ; Bopo      ; Bopomofo
10681sc ; Cans      ; Canadian_Aboriginal
10682sc ; Cher      ; Cherokee
10683sc ; Cyrl      ; Cyrillic
10684sc ; Deva      ; Devanagari
10685sc ; Dsrt      ; Deseret
10686sc ; Ethi      ; Ethiopic
10687sc ; Geor      ; Georgian
10688sc ; Goth      ; Gothic
10689sc ; Grek      ; Greek
10690sc ; Gujr      ; Gujarati
10691sc ; Guru      ; Gurmukhi
10692sc ; Hang      ; Hangul
10693sc ; Hani      ; Han
10694sc ; Hebr      ; Hebrew
10695sc ; Hira      ; Hiragana
10696sc ; Ital      ; Old_Italic
10697sc ; Kana      ; Katakana
10698sc ; Khmr      ; Khmer
10699sc ; Knda      ; Kannada
10700sc ; Laoo      ; Lao
10701sc ; Latn      ; Latin
10702sc ; Mlym      ; Malayalam
10703sc ; Mong      ; Mongolian
10704sc ; Mymr      ; Myanmar
10705sc ; Ogam      ; Ogham
10706sc ; Orya      ; Oriya
10707sc ; Qaai      ; Inherited
10708sc ; Runr      ; Runic
10709sc ; Sinh      ; Sinhala
10710sc ; Syrc      ; Syriac
10711sc ; Taml      ; Tamil
10712sc ; Telu      ; Telugu
10713sc ; Thaa      ; Thaana
10714sc ; Thai      ; Thai
10715sc ; Tibt      ; Tibetan
10716sc ; Yiii      ; Yi
10717sc ; Zyyy      ; Common
10718END
10719    }
10720
10721    if ($v_version ge v2.0.0) {
10722        push @return, split /\n/, <<'END';
10723dt ; com       ; compat
10724dt ; nar       ; narrow
10725dt ; sml       ; small
10726dt ; vert      ; vertical
10727dt ; wide      ; wide
10728
10729gc ; Cf        ; Format
10730gc ; Cs        ; Surrogate
10731gc ; Lt        ; Titlecase_Letter
10732gc ; Me        ; Enclosing_Mark
10733gc ; Nl        ; Letter_Number
10734gc ; Pc        ; Connector_Punctuation
10735gc ; Sk        ; Modifier_Symbol
10736END
10737    }
10738    if ($v_version ge v2.1.2) {
10739        push @return, "bc ; S         ; Segment_Separator\n";
10740    }
10741    if ($v_version ge v2.1.5) {
10742        push @return, split /\n/, <<'END';
10743gc ; Pf        ; Final_Punctuation
10744gc ; Pi        ; Initial_Punctuation
10745END
10746    }
10747    if ($v_version ge v2.1.8) {
10748        push @return, "ccc; 240; IS   ; Iota_Subscript\n";
10749    }
10750
10751    if ($v_version ge v3.0.0) {
10752        push @return, split /\n/, <<'END';
10753bc ; AL        ; Arabic_Letter
10754bc ; BN        ; Boundary_Neutral
10755bc ; LRE       ; Left_To_Right_Embedding
10756bc ; LRO       ; Left_To_Right_Override
10757bc ; NSM       ; Nonspacing_Mark
10758bc ; PDF       ; Pop_Directional_Format
10759bc ; RLE       ; Right_To_Left_Embedding
10760bc ; RLO       ; Right_To_Left_Override
10761
10762ccc; 233; DB   ; Double_Below
10763END
10764    }
10765
10766    if ($v_version ge v3.1.0) {
10767        push @return, "ccc; 226; R    ; Right\n";
10768    }
10769
10770    return @return;
10771}
10772
10773sub process_NormalizationsTest($file) {
10774
10775    # Each line looks like:
10776    #      source code point; NFC; NFD; NFKC; NFKD
10777    # e.g.
10778    #       1E0A;1E0A;0044 0307;1E0A;0044 0307;
10779
10780    # Process each line of the file ...
10781    while ($file->next_line) {
10782
10783        next if /^@/;
10784
10785        my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/;
10786
10787        foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) {
10788            $$var = pack "U0U*", map { hex } split " ", $$var;
10789            $$var =~ s/(\\)/$1$1/g;
10790        }
10791
10792        push @normalization_tests,
10793                "Test_N(q$c1, q$c2, q$c3, q$c4, q$c5);\n";
10794    } # End of looping through the file
10795}
10796
10797sub output_perl_charnames_line ($code_point, $name) {
10798
10799    # Output the entries in Perl_charnames specially, using 5 digits instead
10800    # of four.  This makes the entries a constant length, and simplifies
10801    # charnames.pm which this table is for.  Unicode can have 6 digit
10802    # ordinals, but they are all private use or noncharacters which do not
10803    # have names, so won't be in this table.
10804
10805    return sprintf "%05X\n%s\n\n", $code_point, $name;
10806}
10807
10808{ # Closure
10809
10810    # These are constants to the $property_info hash in this subroutine, to
10811    # avoid using a quoted-string which might have a typo.
10812    my $TYPE  = 'type';
10813    my $DEFAULT_MAP = 'default_map';
10814    my $DEFAULT_TABLE = 'default_table';
10815    my $PSEUDO_MAP_TYPE = 'pseudo_map_type';
10816    my $MISSINGS = 'missings';
10817
10818    sub process_generic_property_file($file) {
10819        # This processes a file containing property mappings and puts them
10820        # into internal map tables.  It should be used to handle any property
10821        # files that have mappings from a code point or range thereof to
10822        # something else.  This means almost all the UCD .txt files.
10823        # each_line_handlers() should be set to adjust the lines of these
10824        # files, if necessary, to what this routine understands:
10825        #
10826        # 0374          ; NFD_QC; N
10827        # 003C..003E    ; Math
10828        #
10829        # the fields are: "codepoint-range ; property; map"
10830        #
10831        # meaning the codepoints in the range all have the value 'map' under
10832        # 'property'.
10833        # Beginning and trailing white space in each field are not significant.
10834        # Note there is not a trailing semi-colon in the above.  A trailing
10835        # semi-colon means the map is a null-string.  An omitted map, as
10836        # opposed to a null-string, is assumed to be 'Y', based on Unicode
10837        # table syntax.  (This could have been hidden from this routine by
10838        # doing it in the $file object, but that would require parsing of the
10839        # line there, so would have to parse it twice, or change the interface
10840        # to pass this an array.  So not done.)
10841        #
10842        # The map field may begin with a sequence of commands that apply to
10843        # this range.  Each such command begins and ends with $CMD_DELIM.
10844        # These are used to indicate, for example, that the mapping for a
10845        # range has a non-default type.
10846        #
10847        # This loops through the file, calling its next_line() method, and
10848        # then taking the map and adding it to the property's table.
10849        # Complications arise because any number of properties can be in the
10850        # file, in any order, interspersed in any way.  The first time a
10851        # property is seen, it gets information about that property and
10852        # caches it for quick retrieval later.  It also normalizes the maps
10853        # so that only one of many synonyms is stored.  The Unicode input
10854        # files do use some multiple synonyms.
10855
10856        my %property_info;               # To keep track of what properties
10857                                         # have already had entries in the
10858                                         # current file, and info about each,
10859                                         # so don't have to recompute.
10860        my $property_name;               # property currently being worked on
10861        my $property_type;               # and its type
10862        my $previous_property_name = ""; # name from last time through loop
10863        my $property_object;             # pointer to the current property's
10864                                         # object
10865        my $property_addr;               # the address of that object
10866        my $default_map;                 # the string that code points missing
10867                                         # from the file map to
10868        my $default_table;               # For non-string properties, a
10869                                         # reference to the match table that
10870                                         # will contain the list of code
10871                                         # points that map to $default_map.
10872
10873        # Get the next real non-comment line
10874        LINE:
10875        while ($file->next_line) {
10876
10877            # Default replacement type; means that if parts of the range have
10878            # already been stored in our tables, the new map overrides them if
10879            # they differ more than cosmetically
10880            my $replace = $IF_NOT_EQUIVALENT;
10881            my $map_type;            # Default type for the map of this range
10882
10883            #local $to_trace = 1 if main::DEBUG;
10884            trace $_ if main::DEBUG && $to_trace;
10885
10886            # Split the line into components
10887            my ($range, $property_name, $map, @remainder)
10888                = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10889
10890            # If more or less on the line than we are expecting, warn and skip
10891            # the line
10892            if (@remainder) {
10893                $file->carp_bad_line('Extra fields');
10894                next LINE;
10895            }
10896            elsif ( ! defined $property_name) {
10897                $file->carp_bad_line('Missing property');
10898                next LINE;
10899            }
10900
10901            # Examine the range.
10902            if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
10903            {
10904                $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
10905                next LINE;
10906            }
10907            my $low = hex $1;
10908            my $high = (defined $2) ? hex $2 : $low;
10909
10910            # If changing to a new property, get the things constant per
10911            # property
10912            if ($previous_property_name ne $property_name) {
10913
10914                $property_object = property_ref($property_name);
10915                if (! defined $property_object) {
10916                    $file->carp_bad_line("Unexpected property '$property_name'.  Skipped");
10917                    next LINE;
10918                }
10919                $property_addr = pack 'J', refaddr $property_object;
10920
10921                # Defer changing names until have a line that is acceptable
10922                # (the 'next' statement above means is unacceptable)
10923                $previous_property_name = $property_name;
10924
10925                # If not the first time for this property, retrieve info about
10926                # it from the cache
10927                my $this_property_info = $property_info{$property_addr};
10928                if (defined ($this_property_info->{$TYPE})) {
10929                    $property_type = $this_property_info->{$TYPE};
10930                    $default_map = $this_property_info->{$DEFAULT_MAP};
10931                    $map_type = $this_property_info->{$PSEUDO_MAP_TYPE};
10932                    $default_table = $this_property_info->{$DEFAULT_TABLE};
10933                }
10934                else {
10935
10936                    # Here, is the first time for this property.  Set up the
10937                    # cache.
10938                    $property_type = $this_property_info->{$TYPE}
10939                                   = $property_object->type;
10940                    $map_type
10941                        = $this_property_info->{$PSEUDO_MAP_TYPE}
10942                        = $property_object->pseudo_map_type;
10943
10944                    # The Unicode files are set up so that if the map is not
10945                    # defined, it is a binary property
10946                    if (! defined $map && $property_type != $BINARY) {
10947                        if ($property_type != $UNKNOWN
10948                            && $property_type != $NON_STRING)
10949                        {
10950                            $file->carp_bad_line("No mapping defined on a non-binary property.  Using 'Y' for the map");
10951                        }
10952                        else {
10953                            $property_object->set_type($BINARY);
10954                            $property_type = $this_property_info->{$TYPE}
10955                                           = $BINARY;
10956                        }
10957                    }
10958
10959                    # Get any @missings default for this property.  This
10960                    # should precede the first entry for the property in the
10961                    # input file, and is located in a comment that has been
10962                    # stored by the Input_file class until we access it here.
10963                    # It's possible that there is more than one such line
10964                    # waiting for us; collect them all, and parse
10965                    my @missings_list;
10966                    @missings_list = $file->get_missings
10967                                            if $file->has_missings_defaults;
10968
10969                    foreach my $default_ref (@missings_list) {
10970
10971                        # For now, we are only interested in the fallback
10972                        # default for the entire property. i.e., an @missing
10973                        # line that is for the whole Unicode range.
10974                        next if $default_ref->{start} != 0
10975                             || $default_ref->{end} != $MAX_UNICODE_CODEPOINT;
10976
10977                        $default_map = $default_ref->{default};
10978
10979                        # For string properties, the default is just what the
10980                        # file says, but non-string properties should already
10981                        # have set up a table for the default property value;
10982                        # use the table for these, so can resolve synonyms
10983                        # later to a single standard one.
10984                        if ($property_type == $STRING
10985                            || $property_type == $UNKNOWN)
10986                        {
10987                            $this_property_info->{$MISSINGS} = $default_map;
10988                        }
10989                        else {
10990                            $default_map =
10991                               $property_object->table($default_map)->full_name;
10992                            $this_property_info->{$MISSINGS} = $default_map;
10993                            $this_property_info->{$DEFAULT_MAP} = $default_map;
10994                            if (! defined $property_object->default_map) {
10995                                $property_object->set_default_map($default_map);
10996                            }
10997                        }
10998                    }
10999
11000                    # For later Unicode versions, multiple @missing lines for
11001                    # a single property can appear in the files.  The first
11002                    # always applies to the entire Unicode range, and was
11003                    # handled above.  The subsequent ones are for smaller
11004                    # ranges, and can be read as "But for this range, the
11005                    # default is ...".  So each overrides all the preceding
11006                    # ones for the range it applies to.  Typically they apply
11007                    # to disjoint ranges, but don't have to.  What we do is to
11008                    # set them up to work in reverse order, so that after the
11009                    # rest of the table is filled, the highest priority
11010                    # default range fills in any code points that haven't been
11011                    # specified; then the next highest priority one is
11012                    # applied, and so forth.
11013                    if (@missings_list > 1 && $v_version ge v15.0.0) {
11014                        if ($property_type != $ENUM) {
11015                            Carp::my_carp_bug("Multiple \@missings lines only"
11016                                            . " make sense for ENUM-type"
11017                                            . " properties.  Changing type to"
11018                                            . " that");
11019                            $property_type = $this_property_info->{$TYPE}
11020                                                                        = $ENUM;
11021                            $property_object->set_type($ENUM);
11022                        }
11023
11024                        my $multi = Multi_Default->new();
11025
11026                        # The overall default should be first on this list,
11027                        # and is handled differently than the rest.
11028                        $default_map = shift @missings_list;
11029                        Carp::my_carp_bug("\@missings needs to be entire range")
11030                            if $default_map->{start} != 0
11031                            || $default_map->{end} != $MAX_UNICODE_CODEPOINT;
11032
11033                        # We already have looked at this line above.  Use that
11034                        # result
11035                        $multi->set_final_default($this_property_info->
11036                                                                  {$MISSINGS});
11037
11038                        # Now get the individual range elements, and add them
11039                        # to Multi_Default object
11040                        while (@missings_list) {
11041                            my $this_entry = pop @missings_list;
11042                            my $subrange_default = $this_entry->{default};
11043
11044                            # Use the short name as a standard
11045                            $subrange_default = $property_object->
11046                                        table($subrange_default)->short_name;
11047                            $multi->append_default($subrange_default,
11048                                "Range_List->new(Initialize => Range->new("
11049                              . "$this_entry->{start}, $this_entry->{end}))");
11050                        }
11051
11052                        # Override the property's simple default with this.
11053                        $property_object->set_default_map($multi);
11054                    }
11055
11056                    if (! $default_map || $property_type != $ENUM) {
11057
11058                        # Finished storing all the @missings defaults in the
11059                        # input file so far.  Get the one for the current
11060                        # property.
11061                        my $missings = $this_property_info->{$MISSINGS};
11062
11063                        # But we likely have separately stored what the
11064                        # default should be.  (This is to accommodate versions
11065                        # of the standard where the @missings lines are absent
11066                        # or incomplete.)  Hopefully the two will match.  But
11067                        # check it out.
11068                        $default_map = $property_object->default_map;
11069
11070                        # If the map is a ref, it means that the default won't
11071                        # be processed until later, so undef it, so next few
11072                        # lines will redefine it to something that nothing
11073                        # will match
11074                        undef $default_map if ref $default_map;
11075
11076                        # Create a $default_map if don't have one; maybe a
11077                        # dummy that won't match anything.
11078                        if (! defined $default_map) {
11079
11080                            # Use any @missings line in the file.
11081                            if (defined $missings) {
11082                                if (ref $missings) {
11083                                    $default_map = $missings->full_name;
11084                                    $default_table = $missings;
11085                                }
11086                                else {
11087                                    $default_map = $missings;
11088                                }
11089
11090                                # And store it with the property for outside
11091                                # use.
11092                                $property_object->set_default_map($default_map);
11093                            }
11094                            else {
11095
11096                                # Neither an @missings nor a default map.
11097                                # Create a dummy one, so won't have to test
11098                                # definedness in the main loop.
11099                                $default_map = '_Perl This will never be in a'
11100                                             . ' file from Unicode';
11101                            }
11102                        }
11103
11104                        # Here, we have $default_map defined, possibly in
11105                        # terms of $missings, but maybe not, and possibly is a
11106                        # dummy one.
11107                        if (defined $missings) {
11108
11109                            # Make sure there is no conflict between the two.
11110                            # $missings has priority.
11111                            if (ref $missings) {
11112                                $default_table
11113                                        = $property_object->table($default_map);
11114                                if ( ! defined $default_table
11115                                    || $default_table != $missings)
11116                                {
11117                                    if (! defined $default_table) {
11118                                        $default_table = $UNDEF;
11119                                    }
11120                                    $file->carp_bad_line(<<END
11121The \@missings line for $property_name in $file says that missings default to
11122$missings, but we expect it to be $default_table.  $missings used.
11123END
11124                                    );
11125                                    $default_table = $missings;
11126                                    $default_map = $missings->full_name;
11127                                }
11128                                $this_property_info->{$DEFAULT_TABLE}
11129                                                            = $default_table;
11130                            }
11131                            elsif ($default_map ne $missings) {
11132                                $file->carp_bad_line(<<END
11133The \@missings line for $property_name in $file says that missings default to
11134$missings, but we expect it to be $default_map.  $missings used.
11135END
11136                                );
11137                                $default_map = $missings;
11138                            }
11139                        }
11140
11141                        $this_property_info->{$DEFAULT_MAP} = $default_map;
11142
11143                        # If haven't done so already, find the table
11144                        # corresponding to this map for non-string properties.
11145                        if (! defined $default_table
11146                            && $property_type != $STRING
11147                            && $property_type != $UNKNOWN)
11148                        {
11149                            $default_table
11150                                        = $this_property_info->{$DEFAULT_TABLE}
11151                                        = $property_object->table($default_map);
11152                        }
11153                    }
11154                } # End of is first time for this property
11155            } # End of switching properties.
11156
11157            # Ready to process the line.
11158            # The Unicode files are set up so that if the map is not defined,
11159            # it is a binary property with value 'Y'
11160            if (! defined $map) {
11161                $map = 'Y';
11162            }
11163            else {
11164
11165                # If the map begins with a special command to us (enclosed in
11166                # delimiters), extract the command(s).
11167                while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
11168                    my $command = $1;
11169                    if ($command =~  / ^ $REPLACE_CMD= (.*) /x) {
11170                        $replace = $1;
11171                    }
11172                    elsif ($command =~  / ^ $MAP_TYPE_CMD= (.*) /x) {
11173                        $map_type = $1;
11174                    }
11175                    else {
11176                        $file->carp_bad_line("Unknown command line: '$1'");
11177                        next LINE;
11178                    }
11179                }
11180            }
11181
11182            if (   $default_map eq $CODE_POINT
11183                && $map =~ / ^ $code_point_re $/x)
11184            {
11185
11186                # Here, we have a map to a particular code point, and the
11187                # default map is to a code point itself.  If the range
11188                # includes the particular code point, change that portion of
11189                # the range to the default.  This makes sure that in the final
11190                # table only the non-defaults are listed.
11191                my $decimal_map = hex $map;
11192                if ($low <= $decimal_map && $decimal_map <= $high) {
11193
11194                    # If the range includes stuff before or after the map
11195                    # we're changing, split it and process the split-off parts
11196                    # later.
11197                    if ($low < $decimal_map) {
11198                        $file->insert_adjusted_lines(
11199                                            sprintf("%04X..%04X; %s; %s",
11200                                                    $low,
11201                                                    $decimal_map - 1,
11202                                                    $property_name,
11203                                                    $map));
11204                    }
11205                    if ($high > $decimal_map) {
11206                        $file->insert_adjusted_lines(
11207                                            sprintf("%04X..%04X; %s; %s",
11208                                                    $decimal_map + 1,
11209                                                    $high,
11210                                                    $property_name,
11211                                                    $map));
11212                    }
11213                    $low = $high = $decimal_map;
11214                    $map = $CODE_POINT;
11215                }
11216            }
11217
11218            if ($property_type != $STRING && $property_type != $UNKNOWN) {
11219                my $table = $property_object->table($map);
11220                if (defined $table) {
11221
11222                    # Unicode isn't very consistent about which synonym they
11223                    # use in their .txt files, even within the same file, or
11224                    # two files that are for the same property.  For enum
11225                    # properties, we know already what all the synonyms are
11226                    # (because we processed PropValueAliases already).
11227                    # Therefore we can take the input and map it to a uniform
11228                    # value now, saving us trouble later.
11229                    #
11230                    # Only if the map is well-behaved do we try this:
11231                    # non-empty, all non-blank.
11232                    if ($property_type == $ENUM && $map =~ / ^ \S+ $ /x) {
11233
11234                        # Use existing practice as much as easily practicable,
11235                        # so that code that has assumptions about spelling
11236                        # doesn't have to change
11237                        my $short_name = $property_object->short_name;
11238                        if ($short_name =~ / ^ (BC | EA | GC  |HST | JT |
11239                                                Lb | BT | BPT | NFCQC |
11240                                                NFKCQC) $ /ix)
11241                        {
11242                            $map = $table->short_name;
11243                        }
11244                        elsif ($short_name !~ / ^ ( Ccc | Age | InSC | JG |
11245                                                    SB) $ /ix)
11246                        {
11247                            $map = $table->full_name;
11248                        }
11249                    }
11250                    elsif ($table == $default_table) {
11251
11252                        # When it isn't an ENUM, we we can still tell if
11253                        # this is a synonym for the default map.  If so, use
11254                        # the default one instead.
11255                        $map = $default_map;
11256                    }
11257                }
11258            }
11259
11260            # And figure out the map type if not known.
11261            if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
11262                if ($map eq "") {   # Nulls are always $NULL map type
11263                    $map_type = $NULL;
11264                } # Otherwise, non-strings, and those that don't allow
11265                  # $MULTI_CP, and those that aren't multiple code points are
11266                  # 0
11267                elsif
11268                   (($property_type != $STRING && $property_type != $UNKNOWN)
11269                   || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
11270                   || $map !~ /^ $code_point_re ( \  $code_point_re )+ $ /x)
11271                {
11272                    $map_type = 0;
11273                }
11274                else {
11275                    $map_type = $MULTI_CP;
11276                }
11277            }
11278
11279            $property_object->add_map($low, $high,
11280                                        $map,
11281                                        Type => $map_type,
11282                                        Replace => $replace);
11283        } # End of loop through file's lines
11284
11285        return;
11286    }
11287}
11288
11289{ # Closure for UnicodeData.txt handling
11290
11291    # This file was the first one in the UCD; its design leads to some
11292    # awkwardness in processing.  Here is a sample line:
11293    # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
11294    # The fields in order are:
11295    my $i = 0;            # The code point is in field 0, and is shifted off.
11296    my $CHARNAME = $i++;  # character name (e.g. "LATIN CAPITAL LETTER A")
11297    my $CATEGORY = $i++;  # category (e.g. "Lu")
11298    my $CCC = $i++;       # Canonical combining class (e.g. "230")
11299    my $BIDI = $i++;      # directional class (e.g. "L")
11300    my $PERL_DECOMPOSITION = $i++;  # decomposition mapping
11301    my $PERL_DECIMAL_DIGIT = $i++;   # decimal digit value
11302    my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
11303                                         # Dual-use in this program; see below
11304    my $NUMERIC = $i++;   # numeric value
11305    my $MIRRORED = $i++;  # ? mirrored
11306    my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
11307    my $COMMENT = $i++;   # iso comment
11308    my $UPPER = $i++;     # simple uppercase mapping
11309    my $LOWER = $i++;     # simple lowercase mapping
11310    my $TITLE = $i++;     # simple titlecase mapping
11311    my $input_field_count = $i;
11312
11313    # This routine in addition outputs these extra fields:
11314
11315    my $DECOMP_TYPE = $i++; # Decomposition type
11316
11317    # These fields are modifications of ones above, and are usually
11318    # suppressed; they must come last, as for speed, the loop upper bound is
11319    # normally set to ignore them
11320    my $NAME = $i++;        # This is the strict name field, not the one that
11321                            # charnames uses.
11322    my $DECOMP_MAP = $i++;  # Strict decomposition mapping; not the one used
11323                            # by Unicode::Normalize
11324    my $last_field = $i - 1;
11325
11326    # All these are read into an array for each line, with the indices defined
11327    # above.  The empty fields in the example line above indicate that the
11328    # value is defaulted.  The handler called for each line of the input
11329    # changes these to their defaults.
11330
11331    # Here are the official names of the properties, in a parallel array:
11332    my @field_names;
11333    $field_names[$BIDI] = 'Bidi_Class';
11334    $field_names[$CATEGORY] = 'General_Category';
11335    $field_names[$CCC] = 'Canonical_Combining_Class';
11336    $field_names[$CHARNAME] = 'Perl_Charnames';
11337    $field_names[$COMMENT] = 'ISO_Comment';
11338    $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
11339    $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
11340    $field_names[$LOWER] = 'Lowercase_Mapping';
11341    $field_names[$MIRRORED] = 'Bidi_Mirrored';
11342    $field_names[$NAME] = 'Name';
11343    $field_names[$NUMERIC] = 'Numeric_Value';
11344    $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
11345    $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
11346    $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
11347    $field_names[$TITLE] = 'Titlecase_Mapping';
11348    $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
11349    $field_names[$UPPER] = 'Uppercase_Mapping';
11350
11351    # Some of these need a little more explanation:
11352    # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
11353    #   property, but is used in calculating the Numeric_Type.  Perl however,
11354    #   creates a file from this field, so a Perl property is created from it.
11355    # Similarly, the Other_Digit field is used only for calculating the
11356    #   Numeric_Type, and so it can be safely re-used as the place to store
11357    #   the value for Numeric_Type; hence it is referred to as
11358    #   $NUMERIC_TYPE_OTHER_DIGIT.
11359    # The input field named $PERL_DECOMPOSITION is a combination of both the
11360    #   decomposition mapping and its type.  Perl creates a file containing
11361    #   exactly this field, so it is used for that.  The two properties are
11362    #   separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
11363    #   $DECOMP_MAP is usually suppressed (unless the lists are changed to
11364    #   output it), as Perl doesn't use it directly.
11365    # The input field named here $CHARNAME is used to construct the
11366    #   Perl_Charnames property, which is a combination of the Name property
11367    #   (which the input field contains), and the Unicode_1_Name property, and
11368    #   others from other files.  Since, the strict Name property is not used
11369    #   by Perl, this field is used for the table that Perl does use.  The
11370    #   strict Name property table is usually suppressed (unless the lists are
11371    #   changed to output it), so it is accumulated in a separate field,
11372    #   $NAME, which to save time is discarded unless the table is actually to
11373    #   be output
11374
11375    # This file is processed like most in this program.  Control is passed to
11376    # process_generic_property_file() which calls filter_UnicodeData_line()
11377    # for each input line.  This filter converts the input into line(s) that
11378    # process_generic_property_file() understands.  There is also a setup
11379    # routine called before any of the file is processed, and a handler for
11380    # EOF processing, all in this closure.
11381
11382    # A huge speed-up occurred at the cost of some added complexity when these
11383    # routines were altered to buffer the outputs into ranges.  Almost all the
11384    # lines of the input file apply to just one code point, and for most
11385    # properties, the map for the next code point up is the same as the
11386    # current one.  So instead of creating a line for each property for each
11387    # input line, filter_UnicodeData_line() remembers what the previous map
11388    # of a property was, and doesn't generate a line to pass on until it has
11389    # to, as when the map changes; and that passed-on line encompasses the
11390    # whole contiguous range of code points that have the same map for that
11391    # property.  This means a slight amount of extra setup, and having to
11392    # flush these buffers on EOF, testing if the maps have changed, plus
11393    # remembering state information in the closure.  But it means a lot less
11394    # real time in not having to change the data base for each property on
11395    # each line.
11396
11397    # Another complication is that there are already a few ranges designated
11398    # in the input.  There are two lines for each, with the same maps except
11399    # the code point and name on each line.  This was actually the hardest
11400    # thing to design around.  The code points in those ranges may actually
11401    # have real maps not given by these two lines.  These maps will either
11402    # be algorithmically determinable, or be in the extracted files furnished
11403    # with the UCD.  In the event of conflicts between these extracted files,
11404    # and this one, Unicode says that this one prevails.  But it shouldn't
11405    # prevail for conflicts that occur in these ranges.  The data from the
11406    # extracted files prevails in those cases.  So, this program is structured
11407    # so that those files are processed first, storing maps.  Then the other
11408    # files are processed, generally overwriting what the extracted files
11409    # stored.  But just the range lines in this input file are processed
11410    # without overwriting.  This is accomplished by adding a special string to
11411    # the lines output to tell process_generic_property_file() to turn off the
11412    # overwriting for just this one line.
11413    # A similar mechanism is used to tell it that the map is of a non-default
11414    # type.
11415
11416    sub setup_UnicodeData($file) { # Called before any lines of the input are read
11417
11418        # Create a new property specially located that is a combination of
11419        # various Name properties: Name, Unicode_1_Name, Named Sequences, and
11420        # _Perl_Name_Alias properties.  (The final one duplicates elements of the
11421        # first, and starting in v6.1, is the same as the 'Name_Alias
11422        # property.)  A comment for the new property will later be constructed
11423        # based on the actual properties present and used
11424        $perl_charname = Property->new('Perl_Charnames',
11425                       Default_Map => "",
11426                       Directory => File::Spec->curdir(),
11427                       File => 'Name',
11428                       Fate => $INTERNAL_ONLY,
11429                       Perl_Extension => 1,
11430                       Range_Size_1 => \&output_perl_charnames_line,
11431                       Type => $STRING,
11432                       );
11433        $perl_charname->set_proxy_for('Name');
11434
11435        my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
11436                                        Directory => File::Spec->curdir(),
11437                                        File => 'Decomposition',
11438                                        Format => $DECOMP_STRING_FORMAT,
11439                                        Fate => $INTERNAL_ONLY,
11440                                        Perl_Extension => 1,
11441                                        Default_Map => $CODE_POINT,
11442
11443                                        # normalize.pm can't cope with these
11444                                        Output_Range_Counts => 0,
11445
11446                                        # This is a specially formatted table
11447                                        # explicitly for normalize.pm, which
11448                                        # is expecting a particular format,
11449                                        # which means that mappings containing
11450                                        # multiple code points are in the main
11451                                        # body of the table
11452                                        Map_Type => $COMPUTE_NO_MULTI_CP,
11453                                        Type => $STRING,
11454                                        To_Output_Map => $INTERNAL_MAP,
11455                                        );
11456        $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
11457        $Perl_decomp->add_comment(join_lines(<<END
11458This mapping is a combination of the Unicode 'Decomposition_Type' and
11459'Decomposition_Mapping' properties, formatted for use by normalize.pm.  It is
11460identical to the official Unicode 'Decomposition_Mapping' property except for
11461two things:
11462 1) It omits the algorithmically determinable Hangul syllable decompositions,
11463which normalize.pm handles algorithmically.
11464 2) It contains the decomposition type as well.  Non-canonical decompositions
11465begin with a word in angle brackets, like <super>, which denotes the
11466compatible decomposition type.  If the map does not begin with the <angle
11467brackets>, the decomposition is canonical.
11468END
11469        ));
11470
11471        my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
11472                                        Default_Map => "",
11473                                        Perl_Extension => 1,
11474                                        Directory => $map_directory,
11475                                        Type => $STRING,
11476                                        To_Output_Map => $OUTPUT_ADJUSTED,
11477                                        );
11478        $Decimal_Digit->add_comment(join_lines(<<END
11479This file gives the mapping of all code points which represent a single
11480decimal digit [0-9] to their respective digits, but it has ranges of 10 code
11481points, and the mapping of each non-initial element of each range is actually
11482not to "0", but to the offset that element has from its corresponding DIGIT 0.
11483These code points are those that have Numeric_Type=Decimal; not special
11484things, like subscripts nor Roman numerals.
11485END
11486        ));
11487
11488        # These properties are not used for generating anything else, and are
11489        # usually not output.  By making them last in the list, we can just
11490        # change the high end of the loop downwards to avoid the work of
11491        # generating a table(s) that is/are just going to get thrown away.
11492        if (! property_ref('Decomposition_Mapping')->to_output_map
11493            && ! property_ref('Name')->to_output_map)
11494        {
11495            $last_field = min($NAME, $DECOMP_MAP) - 1;
11496        } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
11497            $last_field = $DECOMP_MAP;
11498        } elsif (property_ref('Name')->to_output_map) {
11499            $last_field = $NAME;
11500        }
11501        return;
11502    }
11503
11504    my $first_time = 1;                 # ? Is this the first line of the file
11505    my $in_range = 0;                   # ? Are we in one of the file's ranges
11506    my $previous_cp;                    # hex code point of previous line
11507    my $decimal_previous_cp = -1;       # And its decimal equivalent
11508    my @start;                          # For each field, the current starting
11509                                        # code point in hex for the range
11510                                        # being accumulated.
11511    my @fields;                         # The input fields;
11512    my @previous_fields;                # And those from the previous call
11513
11514    sub filter_UnicodeData_line($file) {
11515        # Handle a single input line from UnicodeData.txt; see comments above
11516        # Conceptually this takes a single line from the file containing N
11517        # properties, and converts it into N lines with one property per line,
11518        # which is what the final handler expects.  But there are
11519        # complications due to the quirkiness of the input file, and to save
11520        # time, it accumulates ranges where the property values don't change
11521        # and only emits lines when necessary.  This is about an order of
11522        # magnitude fewer lines emitted.
11523
11524        # $_ contains the input line.
11525        # -1 in split means retain trailing null fields
11526        (my $cp, @fields) = split /\s*;\s*/, $_, -1;
11527
11528        #local $to_trace = 1 if main::DEBUG;
11529        trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
11530        if (@fields > $input_field_count) {
11531            $file->carp_bad_line('Extra fields');
11532            $_ = "";
11533            return;
11534        }
11535
11536        my $decimal_cp = hex $cp;
11537
11538        # We have to output all the buffered ranges when the next code point
11539        # is not exactly one after the previous one, which means there is a
11540        # gap in the ranges.
11541        my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
11542
11543        # The decomposition mapping field requires special handling.  It looks
11544        # like either:
11545        #
11546        # <compat> 0032 0020
11547        # 0041 0300
11548        #
11549        # The decomposition type is enclosed in <brackets>; if missing, it
11550        # means the type is canonical.  There are two decomposition mapping
11551        # tables: the one for use by Perl's normalize.pm has a special format
11552        # which is this field intact; the other, for general use is of
11553        # standard format.  In either case we have to find the decomposition
11554        # type.  Empty fields have None as their type, and map to the code
11555        # point itself
11556        if ($fields[$PERL_DECOMPOSITION] eq "") {
11557            $fields[$DECOMP_TYPE] = 'None';
11558            $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
11559        }
11560        else {
11561            ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
11562                                            =~ / < ( .+? ) > \s* ( .+ ) /x;
11563            if (! defined $fields[$DECOMP_TYPE]) {
11564                $fields[$DECOMP_TYPE] = 'Canonical';
11565                $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
11566            }
11567            else {
11568                $fields[$DECOMP_MAP] = $map;
11569            }
11570        }
11571
11572        # The 3 numeric fields also require special handling.  The 2 digit
11573        # fields must be either empty or match the number field.  This means
11574        # that if it is empty, they must be as well, and the numeric type is
11575        # None, and the numeric value is 'Nan'.
11576        # The decimal digit field must be empty or match the other digit
11577        # field.  If the decimal digit field is non-empty, the code point is
11578        # a decimal digit, and the other two fields will have the same value.
11579        # If it is empty, but the other digit field is non-empty, the code
11580        # point is an 'other digit', and the number field will have the same
11581        # value as the other digit field.  If the other digit field is empty,
11582        # but the number field is non-empty, the code point is a generic
11583        # numeric type.
11584        if ($fields[$NUMERIC] eq "") {
11585            if ($fields[$PERL_DECIMAL_DIGIT] ne ""
11586                || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
11587            ) {
11588                $file->carp_bad_line("Numeric values inconsistent.  Trying to process anyway");
11589            }
11590            $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
11591            $fields[$NUMERIC] = 'NaN';
11592        }
11593        else {
11594            $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number.  Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
11595            if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
11596                $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
11597                $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'.  Processing as Decimal") if $fields[$CATEGORY] ne "Nd";
11598                $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
11599            }
11600            elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
11601                $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
11602                $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
11603            }
11604            else {
11605                $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
11606
11607                # Rationals require extra effort.
11608                if ($fields[$NUMERIC] =~ qr{/}) {
11609                    reduce_fraction(\$fields[$NUMERIC]);
11610                    register_fraction($fields[$NUMERIC])
11611                }
11612            }
11613        }
11614
11615        # For the properties that have empty fields in the file, and which
11616        # mean something different from empty, change them to that default.
11617        # Certain fields just haven't been empty so far in any Unicode
11618        # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
11619        # $CATEGORY.  This leaves just the two fields, and so we hard-code in
11620        # the defaults; which are very unlikely to ever change.
11621        $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
11622        $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
11623
11624        # UAX44 says that if title is empty, it is the same as whatever upper
11625        # is,
11626        $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
11627
11628        # There are a few pairs of lines like:
11629        #   AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
11630        #   D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
11631        # that define ranges.  These should be processed after the fields are
11632        # adjusted above, as they may override some of them; but mostly what
11633        # is left is to possibly adjust the $CHARNAME field.  The names of all the
11634        # paired lines start with a '<', but this is also true of '<control>,
11635        # which isn't one of these special ones.
11636        if ($fields[$CHARNAME] eq '<control>') {
11637
11638            # Some code points in this file have the pseudo-name
11639            # '<control>', but the official name for such ones is the null
11640            # string.
11641            $fields[$NAME] = $fields[$CHARNAME] = "";
11642
11643            # We had better not be in between range lines.
11644            if ($in_range) {
11645                $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
11646                $in_range = 0;
11647            }
11648        }
11649        elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
11650
11651            # Here is a non-range line.  We had better not be in between range
11652            # lines.
11653            if ($in_range) {
11654                $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
11655                $in_range = 0;
11656            }
11657            if ($fields[$CHARNAME] =~ s/- $cp $//x) {
11658
11659                # These are code points whose names end in their code points,
11660                # which means the names are algorithmically derivable from the
11661                # code points.  To shorten the output Name file, the algorithm
11662                # for deriving these is placed in the file instead of each
11663                # code point, so they have map type $CP_IN_NAME
11664                $fields[$CHARNAME] = $CMD_DELIM
11665                                 . $MAP_TYPE_CMD
11666                                 . '='
11667                                 . $CP_IN_NAME
11668                                 . $CMD_DELIM
11669                                 . $fields[$CHARNAME];
11670            }
11671            $fields[$NAME] = $fields[$CHARNAME];
11672        }
11673        elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
11674            $fields[$CHARNAME] = $fields[$NAME] = $1;
11675
11676            # Here we are at the beginning of a range pair.
11677            if ($in_range) {
11678                $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'.  Trying anyway");
11679            }
11680            $in_range = 1;
11681
11682            # Because the properties in the range do not overwrite any already
11683            # in the db, we must flush the buffers of what's already there, so
11684            # they get handled in the normal scheme.
11685            $force_output = 1;
11686
11687        }
11688        elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
11689            $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME].  Ignoring this line.");
11690            $_ = "";
11691            return;
11692        }
11693        else { # Here, we are at the last line of a range pair.
11694
11695            if (! $in_range) {
11696                $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one.  Ignoring this line.");
11697                $_ = "";
11698                return;
11699            }
11700            $in_range = 0;
11701
11702            $fields[$NAME] = $fields[$CHARNAME];
11703
11704            # Check that the input is valid: that the closing of the range is
11705            # the same as the beginning.
11706            foreach my $i (0 .. $last_field) {
11707                next if $fields[$i] eq $previous_fields[$i];
11708                $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'.  Bad News.  Trying anyway");
11709            }
11710
11711            # The processing differs depending on the type of range,
11712            # determined by its $CHARNAME
11713            if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
11714
11715                # Check that the data looks right.
11716                if ($decimal_previous_cp != $SBase) {
11717                    $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp.  Bad News.  Results will be wrong");
11718                }
11719                if ($decimal_cp != $SBase + $SCount - 1) {
11720                    $file->carp_bad_line("Unexpected Hangul syllable end = $cp.  Bad News.  Results will be wrong");
11721                }
11722
11723                # The Hangul syllable range has a somewhat complicated name
11724                # generation algorithm.  Each code point in it has a canonical
11725                # decomposition also computable by an algorithm.  The
11726                # perl decomposition map table built from these is used only
11727                # by normalize.pm, which has the algorithm built in it, so the
11728                # decomposition maps are not needed, and are large, so are
11729                # omitted from it.  If the full decomposition map table is to
11730                # be output, the decompositions are generated for it, in the
11731                # EOF handling code for this input file.
11732
11733                $previous_fields[$DECOMP_TYPE] = 'Canonical';
11734
11735                # This range is stored in our internal structure with its
11736                # own map type, different from all others.
11737                $previous_fields[$CHARNAME] = $previous_fields[$NAME]
11738                                        = $CMD_DELIM
11739                                          . $MAP_TYPE_CMD
11740                                          . '='
11741                                          . $HANGUL_SYLLABLE
11742                                          . $CMD_DELIM
11743                                          . $fields[$CHARNAME];
11744            }
11745            elsif ($fields[$CATEGORY] eq 'Lo') {    # Is a letter
11746
11747                # All the CJK ranges like this have the name given as a
11748                # special case in the next code line.  And for the others, we
11749                # hope that Unicode continues to use the correct name in
11750                # future releases, so we don't have to make further special
11751                # cases.
11752                my $name = ($fields[$CHARNAME] =~ /^CJK/)
11753                           ? 'CJK UNIFIED IDEOGRAPH'
11754                           : uc $fields[$CHARNAME];
11755
11756                # The name for these contains the code point itself, and all
11757                # are defined to have the same base name, regardless of what
11758                # is in the file.  They are stored in our internal structure
11759                # with a map type of $CP_IN_NAME
11760                $previous_fields[$CHARNAME] = $previous_fields[$NAME]
11761                                        = $CMD_DELIM
11762                                           . $MAP_TYPE_CMD
11763                                           . '='
11764                                           . $CP_IN_NAME
11765                                           . $CMD_DELIM
11766                                           . $name;
11767
11768            }
11769            elsif ($fields[$CATEGORY] eq 'Co'
11770                     || $fields[$CATEGORY] eq 'Cs')
11771            {
11772                # The names of all the code points in these ranges are set to
11773                # null, as there are no names for the private use and
11774                # surrogate code points.
11775
11776                $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
11777            }
11778            else {
11779                $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY].  Attempting to process it.");
11780            }
11781
11782            # The first line of the range caused everything else to be output,
11783            # and then its values were stored as the beginning values for the
11784            # next set of ranges, which this one ends.  Now, for each value,
11785            # add a command to tell the handler that these values should not
11786            # replace any existing ones in our database.
11787            foreach my $i (0 .. $last_field) {
11788                $previous_fields[$i] = $CMD_DELIM
11789                                        . $REPLACE_CMD
11790                                        . '='
11791                                        . $NO
11792                                        . $CMD_DELIM
11793                                        . $previous_fields[$i];
11794            }
11795
11796            # And change things so it looks like the entire range has been
11797            # gone through with this being the final part of it.  Adding the
11798            # command above to each field will cause this range to be flushed
11799            # during the next iteration, as it guaranteed that the stored
11800            # field won't match whatever value the next one has.
11801            $previous_cp = $cp;
11802            $decimal_previous_cp = $decimal_cp;
11803
11804            # We are now set up for the next iteration; so skip the remaining
11805            # code in this subroutine that does the same thing, but doesn't
11806            # know about these ranges.
11807            $_ = "";
11808
11809            return;
11810        }
11811
11812        # On the very first line, we fake it so the code below thinks there is
11813        # nothing to output, and initialize so that when it does get output it
11814        # uses the first line's values for the lowest part of the range.
11815        # (One could avoid this by using peek(), but then one would need to
11816        # know the adjustments done above and do the same ones in the setup
11817        # routine; not worth it)
11818        if ($first_time) {
11819            $first_time = 0;
11820            @previous_fields = @fields;
11821            @start = ($cp) x scalar @fields;
11822            $decimal_previous_cp = $decimal_cp - 1;
11823        }
11824
11825        # For each field, output the stored up ranges that this code point
11826        # doesn't fit in.  Earlier we figured out if all ranges should be
11827        # terminated because of changing the replace or map type styles, or if
11828        # there is a gap between this new code point and the previous one, and
11829        # that is stored in $force_output.  But even if those aren't true, we
11830        # need to output the range if this new code point's value for the
11831        # given property doesn't match the stored range's.
11832        #local $to_trace = 1 if main::DEBUG;
11833        foreach my $i (0 .. $last_field) {
11834            my $field = $fields[$i];
11835            if ($force_output || $field ne $previous_fields[$i]) {
11836
11837                # Flush the buffer of stored values.
11838                $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
11839
11840                # Start a new range with this code point and its value
11841                $start[$i] = $cp;
11842                $previous_fields[$i] = $field;
11843            }
11844        }
11845
11846        # Set the values for the next time.
11847        $previous_cp = $cp;
11848        $decimal_previous_cp = $decimal_cp;
11849
11850        # The input line has generated whatever adjusted lines are needed, and
11851        # should not be looked at further.
11852        $_ = "";
11853        return;
11854    }
11855
11856    sub EOF_UnicodeData($file) {
11857        # Called upon EOF to flush the buffers, and create the Hangul
11858        # decomposition mappings if needed.
11859
11860        # Flush the buffers.
11861        foreach my $i (0 .. $last_field) {
11862            $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
11863        }
11864
11865        if (-e 'Jamo.txt') {
11866
11867            # The algorithm is published by Unicode, based on values in
11868            # Jamo.txt, (which should have been processed before this
11869            # subroutine), and the results left in %Jamo
11870            unless (%Jamo) {
11871                Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt.  Hangul syllables not generated.");
11872                return;
11873            }
11874
11875            # If the full decomposition map table is being output, insert
11876            # into it the Hangul syllable mappings.  This is to avoid having
11877            # to publish a subroutine in it to compute them.  (which would
11878            # essentially be this code.)  This uses the algorithm published by
11879            # Unicode.  (No hangul syllables in version 1)
11880            if ($v_version ge v2.0.0
11881                && property_ref('Decomposition_Mapping')->to_output_map) {
11882                for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
11883                    use integer;
11884                    my $SIndex = $S - $SBase;
11885                    my $L = $LBase + $SIndex / $NCount;
11886                    my $V = $VBase + ($SIndex % $NCount) / $TCount;
11887                    my $T = $TBase + $SIndex % $TCount;
11888
11889                    trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
11890                    my $decomposition = sprintf("%04X %04X", $L, $V);
11891                    $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
11892                    $file->insert_adjusted_lines(
11893                                sprintf("%04X; Decomposition_Mapping; %s",
11894                                        $S,
11895                                        $decomposition));
11896                }
11897            }
11898        }
11899
11900        return;
11901    }
11902
11903    sub filter_v1_ucd($file) {
11904        # Fix UCD lines in version 1.  This is probably overkill, but this
11905        # fixes some glaring errors in Version 1 UnicodeData.txt.  That file:
11906        # 1)    had many Hangul (U+3400 - U+4DFF) code points that were later
11907        #       removed.  This program retains them
11908        # 2)    didn't include ranges, which it should have, and which are now
11909        #       added in @corrected_lines below.  It was hand populated by
11910        #       taking the data from Version 2, verified by analyzing
11911        #       DAge.txt.
11912        # 3)    There is a syntax error in the entry for U+09F8 which could
11913        #       cause problems for Unicode::UCD, and so is changed.  It's
11914        #       numeric value was simply a minus sign, without any number.
11915        #       (Eventually Unicode changed the code point to non-numeric.)
11916        # 4)    The decomposition types often don't match later versions
11917        #       exactly, and the whole syntax of that field is different; so
11918        #       the syntax is changed as well as the types to their later
11919        #       terminology.  Otherwise normalize.pm would be very unhappy
11920        # 5)    Many ccc classes are different.  These are left intact.
11921        # 6)    U+FF10..U+FF19 are missing their numeric values in all three
11922        #       fields.  These are unchanged because it doesn't really cause
11923        #       problems for Perl.
11924        # 7)    A number of code points, such as controls, don't have their
11925        #       Unicode Version 1 Names in this file.  These are added.
11926        # 8)    A number of Symbols were marked as Lm.  This changes those in
11927        #       the Latin1 range, so that regexes work.
11928        # 9)    The odd characters U+03DB .. U+03E1 weren't encoded but are
11929        #       referred to by their lc equivalents.  Not fixed.
11930
11931        my @corrected_lines = split /\n/, <<'END';
119324E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
119339FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
11934E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
11935F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
11936F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
11937FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
11938END
11939
11940        #local $to_trace = 1 if main::DEBUG;
11941        trace $_ if main::DEBUG && $to_trace;
11942
11943        # -1 => retain trailing null fields
11944        my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11945
11946        # At the first place that is wrong in the input, insert all the
11947        # corrections, replacing the wrong line.
11948        if ($code_point eq '4E00') {
11949            my @copy = @corrected_lines;
11950            $_ = shift @copy;
11951            ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11952
11953            $file->insert_lines(@copy);
11954        }
11955        elsif ($code_point =~ /^00/ && $fields[$CATEGORY] eq 'Lm') {
11956
11957            # There are no Lm characters in Latin1; these should be 'Sk', but
11958            # there isn't that in V1.
11959            $fields[$CATEGORY] = 'So';
11960        }
11961
11962        if ($fields[$NUMERIC] eq '-') {
11963            $fields[$NUMERIC] = '-1';  # This is what 2.0 made it.
11964        }
11965
11966        if  ($fields[$PERL_DECOMPOSITION] ne "") {
11967
11968            # Several entries have this change to superscript 2 or 3 in the
11969            # middle.  Convert these to the modern version, which is to use
11970            # the actual U+00B2 and U+00B3 (the superscript forms) instead.
11971            # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
11972            # 'HHHH HHHH 00B3 HHHH'.
11973            # It turns out that all of these that don't have another
11974            # decomposition defined at the beginning of the line have the
11975            # <square> decomposition in later releases.
11976            if ($code_point ne '00B2' && $code_point ne '00B3') {
11977                if  ($fields[$PERL_DECOMPOSITION]
11978                                    =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
11979                {
11980                    if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
11981                        $fields[$PERL_DECOMPOSITION] = '<square> '
11982                        . $fields[$PERL_DECOMPOSITION];
11983                    }
11984                }
11985            }
11986
11987            # If is like '<+circled> 0052 <-circled>', convert to
11988            # '<circled> 0052'
11989            $fields[$PERL_DECOMPOSITION] =~
11990                            s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/xg;
11991
11992            # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
11993            $fields[$PERL_DECOMPOSITION] =~
11994                            s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
11995            or $fields[$PERL_DECOMPOSITION] =~
11996                            s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
11997            or $fields[$PERL_DECOMPOSITION] =~
11998                            s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
11999            or $fields[$PERL_DECOMPOSITION] =~
12000                        s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
12001
12002            # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
12003            $fields[$PERL_DECOMPOSITION] =~
12004                    s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
12005
12006            # Change names to modern form.
12007            $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
12008            $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
12009            $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
12010            $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
12011
12012            # One entry has weird braces
12013            $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
12014
12015            # One entry at U+2116 has an extra <sup>
12016            $fields[$PERL_DECOMPOSITION] =~ s/( < .*? > .* ) < .*? > \ * /$1/x;
12017        }
12018
12019        $_ = join ';', $code_point, @fields;
12020        trace $_ if main::DEBUG && $to_trace;
12021        return;
12022    }
12023
12024    sub filter_bad_Nd_ucd {
12025        # Early versions specified a value in the decimal digit field even
12026        # though the code point wasn't a decimal digit.  Clear the field in
12027        # that situation, so that the main code doesn't think it is a decimal
12028        # digit.
12029
12030        my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12031        if ($fields[$PERL_DECIMAL_DIGIT] ne "" && $fields[$CATEGORY] ne 'Nd') {
12032            $fields[$PERL_DECIMAL_DIGIT] = "";
12033            $_ = join ';', $code_point, @fields;
12034        }
12035        return;
12036    }
12037
12038    my @U1_control_names = split /\n/, <<'END';
12039NULL
12040START OF HEADING
12041START OF TEXT
12042END OF TEXT
12043END OF TRANSMISSION
12044ENQUIRY
12045ACKNOWLEDGE
12046BELL
12047BACKSPACE
12048HORIZONTAL TABULATION
12049LINE FEED
12050VERTICAL TABULATION
12051FORM FEED
12052CARRIAGE RETURN
12053SHIFT OUT
12054SHIFT IN
12055DATA LINK ESCAPE
12056DEVICE CONTROL ONE
12057DEVICE CONTROL TWO
12058DEVICE CONTROL THREE
12059DEVICE CONTROL FOUR
12060NEGATIVE ACKNOWLEDGE
12061SYNCHRONOUS IDLE
12062END OF TRANSMISSION BLOCK
12063CANCEL
12064END OF MEDIUM
12065SUBSTITUTE
12066ESCAPE
12067FILE SEPARATOR
12068GROUP SEPARATOR
12069RECORD SEPARATOR
12070UNIT SEPARATOR
12071DELETE
12072BREAK PERMITTED HERE
12073NO BREAK HERE
12074INDEX
12075NEXT LINE
12076START OF SELECTED AREA
12077END OF SELECTED AREA
12078CHARACTER TABULATION SET
12079CHARACTER TABULATION WITH JUSTIFICATION
12080LINE TABULATION SET
12081PARTIAL LINE DOWN
12082PARTIAL LINE UP
12083REVERSE LINE FEED
12084SINGLE SHIFT TWO
12085SINGLE SHIFT THREE
12086DEVICE CONTROL STRING
12087PRIVATE USE ONE
12088PRIVATE USE TWO
12089SET TRANSMIT STATE
12090CANCEL CHARACTER
12091MESSAGE WAITING
12092START OF GUARDED AREA
12093END OF GUARDED AREA
12094START OF STRING
12095SINGLE CHARACTER INTRODUCER
12096CONTROL SEQUENCE INTRODUCER
12097STRING TERMINATOR
12098OPERATING SYSTEM COMMAND
12099PRIVACY MESSAGE
12100APPLICATION PROGRAM COMMAND
12101END
12102
12103    sub filter_early_U1_names {
12104        # Very early versions did not have the Unicode_1_name field specified.
12105        # They differed in which ones were present; make sure a U1 name
12106        # exists, so that Unicode::UCD::charinfo will work
12107
12108        my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12109
12110
12111        # @U1_control names above are entirely positional, so we pull them out
12112        # in the exact order required, with gaps for the ones that don't have
12113        # names.
12114        if ($code_point =~ /^00[01]/
12115            || $code_point eq '007F'
12116            || $code_point =~ /^008[2-9A-F]/
12117            || $code_point =~ /^009[0-8A-F]/)
12118        {
12119            my $u1_name = shift @U1_control_names;
12120            $fields[$UNICODE_1_NAME] = $u1_name unless $fields[$UNICODE_1_NAME];
12121            $_ = join ';', $code_point, @fields;
12122        }
12123        return;
12124    }
12125
12126    sub filter_v2_1_5_ucd {
12127        # A dozen entries in this 2.1.5 file had the mirrored and numeric
12128        # columns swapped;  These all had mirrored be 'N'.  So if the numeric
12129        # column appears to be N, swap it back.
12130
12131        my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12132        if ($fields[$NUMERIC] eq 'N') {
12133            $fields[$NUMERIC] = $fields[$MIRRORED];
12134            $fields[$MIRRORED] = 'N';
12135            $_ = join ';', $code_point, @fields;
12136        }
12137        return;
12138    }
12139
12140    sub filter_v6_ucd {
12141
12142        # Unicode 6.0 co-opted the name BELL for U+1F514, but until 5.17,
12143        # it wasn't accepted, to allow for some deprecation cycles.  This
12144        # function is not called after 5.16
12145
12146        return if $_ !~ /^(?:0007|1F514|070F);/;
12147
12148        my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12149        if ($code_point eq '0007') {
12150            $fields[$CHARNAME] = "";
12151        }
12152        elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
12153                            # http://www.unicode.org/versions/corrigendum8.html
12154            $fields[$BIDI] = "AL";
12155        }
12156        elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name
12157            $fields[$CHARNAME] = "";
12158        }
12159
12160        $_ = join ';', $code_point, @fields;
12161
12162        return;
12163    }
12164} # End closure for UnicodeData
12165
12166sub process_GCB_test($file) {
12167
12168    while ($file->next_line) {
12169        push @backslash_X_tests, $_;
12170    }
12171
12172    return;
12173}
12174
12175sub process_LB_test($file) {
12176
12177    while ($file->next_line) {
12178        push @LB_tests, $_;
12179    }
12180
12181    return;
12182}
12183
12184sub process_SB_test($file) {
12185
12186    while ($file->next_line) {
12187        push @SB_tests, $_;
12188    }
12189
12190    return;
12191}
12192
12193sub process_WB_test($file) {
12194
12195    while ($file->next_line) {
12196        push @WB_tests, $_;
12197    }
12198
12199    return;
12200}
12201
12202sub process_NamedSequences($file) {
12203    # NamedSequences.txt entries are just added to an array.  Because these
12204    # don't look like the other tables, they have their own handler.
12205    # An example:
12206    # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
12207    #
12208    # This just adds the sequence to an array for later handling
12209
12210    while ($file->next_line) {
12211        my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
12212        if (@remainder) {
12213            $file->carp_bad_line(
12214                "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
12215            next;
12216        }
12217
12218        # Code points need to be 5 digits long like the other entries in
12219        # Name.pl, for regcomp.c parsing; and the ones below 0x0100 need to be
12220        # converted to native
12221        $sequence = join " ", map { sprintf("%05X",
12222                                    utf8::unicode_to_native(hex $_))
12223                                  } split / /, $sequence;
12224        push @named_sequences, "$sequence\n$name\n";
12225    }
12226    return;
12227}
12228
12229{ # Closure
12230
12231    my $first_range;
12232
12233    sub  filter_early_ea_lb {
12234        # Fixes early EastAsianWidth.txt and LineBreak.txt files.  These had a
12235        # third field be the name of the code point, which can be ignored in
12236        # most cases.  But it can be meaningful if it marks a range:
12237        # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
12238        # 3400;W;<CJK Ideograph Extension A, First>
12239        #
12240        # We need to see the First in the example above to know it's a range.
12241        # They did not use the later range syntaxes.  This routine changes it
12242        # to use the modern syntax.
12243        # $1 is the Input_file object.
12244
12245        my @fields = split /\s*;\s*/;
12246        if ($fields[2] =~ /^<.*, First>/) {
12247            $first_range = $fields[0];
12248            $_ = "";
12249        }
12250        elsif ($fields[2] =~ /^<.*, Last>/) {
12251            $_ = $_ = "$first_range..$fields[0]; $fields[1]";
12252        }
12253        else {
12254            undef $first_range;
12255            $_ = "$fields[0]; $fields[1]";
12256        }
12257
12258        return;
12259    }
12260}
12261
12262sub filter_substitute_lb {
12263    # Used on Unicodes that predate the LB property, where there is a
12264    # substitute file.  This just does the regular ea_lb handling for such
12265    # files, and then substitutes the long property value name for the short
12266    # one that comes with the file.  (The other break files have the long
12267    # names in them, so this is the odd one out.)  The reason for doing this
12268    # kludge is that regen/mk_invlists.pl is expecting the long name.  This
12269    # also fixes the typo 'Inseperable' that leads to problems.
12270
12271    filter_early_ea_lb;
12272    return unless $_;
12273
12274    my @fields = split /\s*;\s*/;
12275    $fields[1] = property_ref('_Perl_LB')->table($fields[1])->full_name;
12276    $fields[1] = 'Inseparable' if lc $fields[1] eq 'inseperable';
12277    $_ = join '; ', @fields;
12278}
12279
12280sub filter_old_style_arabic_shaping {
12281    # Early versions used a different term for the later one.
12282
12283    my @fields = split /\s*;\s*/;
12284    $fields[3] =~ s/<no shaping>/No_Joining_Group/;
12285    $fields[3] =~ s/\s+/_/g;                # Change spaces to underscores
12286    $_ = join ';', @fields;
12287    return;
12288}
12289
12290{ # Closure
12291    my $lc; # Table for lowercase mapping
12292    my $tc;
12293    my $uc;
12294    my %special_casing_code_points;
12295
12296    sub setup_special_casing($file) {
12297        # SpecialCasing.txt contains the non-simple case change mappings.  The
12298        # simple ones are in UnicodeData.txt, which should already have been
12299        # read in to the full property data structures, so as to initialize
12300        # these with the simple ones.  Then the SpecialCasing.txt entries
12301        # add or overwrite the ones which have different full mappings.
12302
12303        # This routine sees if the simple mappings are to be output, and if
12304        # so, copies what has already been put into the full mapping tables,
12305        # while they still contain only the simple mappings.
12306
12307        # The reason it is done this way is that the simple mappings are
12308        # probably not going to be output, so it saves work to initialize the
12309        # full tables with the simple mappings, and then overwrite those
12310        # relatively few entries in them that have different full mappings,
12311        # and thus skip the simple mapping tables altogether.
12312
12313        $lc = property_ref('lc');
12314        $tc = property_ref('tc');
12315        $uc = property_ref('uc');
12316
12317        # For each of the case change mappings...
12318        foreach my $full_casing_table ($lc, $tc, $uc) {
12319            my $full_casing_name = $full_casing_table->name;
12320            my $full_casing_full_name = $full_casing_table->full_name;
12321            unless (defined $full_casing_table
12322                    && ! $full_casing_table->is_empty)
12323            {
12324                Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing.  Only special casing will be generated.");
12325            }
12326
12327            $full_casing_table->add_comment(join_lines( <<END
12328This file includes both the simple and full case changing maps.  The simple
12329ones are in the main body of the table below, and the full ones adding to or
12330overriding them are in the hash.
12331END
12332            ));
12333
12334            # The simple version's name in each mapping merely has an 's' in
12335            # front of the full one's
12336            my $simple_name = 's' . $full_casing_name;
12337            my $simple = property_ref($simple_name);
12338            $simple->initialize($full_casing_table) if $simple->to_output_map();
12339        }
12340
12341        return;
12342    }
12343
12344    sub filter_2_1_8_special_casing_line {
12345
12346        # This version had duplicate entries in this file.  Delete all but the
12347        # first one
12348        my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
12349                                              # fields
12350        if (exists $special_casing_code_points{$fields[0]}) {
12351            $_ = "";
12352            return;
12353        }
12354
12355        $special_casing_code_points{$fields[0]} = 1;
12356        filter_special_casing_line(@_);
12357    }
12358
12359    sub filter_special_casing_line($file) {
12360        # Change the format of $_ from SpecialCasing.txt into something that
12361        # the generic handler understands.  Each input line contains three
12362        # case mappings.  This will generate three lines to pass to the
12363        # generic handler for each of those.
12364
12365        # The input syntax (after stripping comments and trailing white space
12366        # is like one of the following (with the final two being entries that
12367        # we ignore):
12368        # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
12369        # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
12370        # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
12371        # Note the trailing semi-colon, unlike many of the input files.  That
12372        # means that there will be an extra null field generated by the split
12373
12374        my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
12375                                              # fields
12376
12377        # field #4 is when this mapping is conditional.  If any of these get
12378        # implemented, it would be by hard-coding in the casing functions in
12379        # the Perl core, not through tables.  But if there is a new condition
12380        # we don't know about, output a warning.  We know about all the
12381        # conditions through 6.0
12382        if ($fields[4] ne "") {
12383            my @conditions = split ' ', $fields[4];
12384            if ($conditions[0] ne 'tr'  # We know that these languages have
12385                                        # conditions, and some are multiple
12386                && $conditions[0] ne 'az'
12387                && $conditions[0] ne 'lt'
12388
12389                # And, we know about a single condition Final_Sigma, but
12390                # nothing else.
12391                && ($v_version gt v5.2.0
12392                    && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
12393            {
12394                $file->carp_bad_line("Unknown condition '$fields[4]'.  You should inspect it and either add code to handle it, or add to list of those that are to ignore");
12395            }
12396            elsif ($conditions[0] ne 'Final_Sigma') {
12397
12398                    # Don't print out a message for Final_Sigma, because we
12399                    # have hard-coded handling for it.  (But the standard
12400                    # could change what the rule should be, but it wouldn't
12401                    # show up here anyway.
12402
12403                    print "# SKIPPING Special Casing: $_\n"
12404                                                    if $verbosity >= $VERBOSE;
12405            }
12406            $_ = "";
12407            return;
12408        }
12409        elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
12410            $file->carp_bad_line('Extra fields');
12411            $_ = "";
12412            return;
12413        }
12414
12415        my $decimal_code_point = hex $fields[0];
12416
12417        # Loop to handle each of the three mappings in the input line, in
12418        # order, with $i indicating the current field number.
12419        my $i = 0;
12420        for my $object ($lc, $tc, $uc) {
12421            $i++;   # First time through, $i = 0 ... 3rd time = 3
12422
12423            my $value = $object->value_of($decimal_code_point);
12424            $value = ($value eq $CODE_POINT)
12425                      ? $decimal_code_point
12426                      : hex $value;
12427
12428            # If this isn't a multi-character mapping, it should already have
12429            # been read in.
12430            if ($fields[$i] !~ / /) {
12431                if ($value != hex $fields[$i]) {
12432                    Carp::my_carp("Bad news. UnicodeData.txt thinks "
12433                                  . $object->name
12434                                  . "(0x$fields[0]) is $value"
12435                                  . " and SpecialCasing.txt thinks it is "
12436                                  . hex($fields[$i])
12437                                  . ".  Good luck.  Retaining UnicodeData value, and proceeding anyway.");
12438                }
12439            }
12440            else {
12441
12442                # The mapping is additional, beyond the simple mapping.
12443                $file->insert_adjusted_lines("$fields[0]; "
12444                                             . $object->name
12445                                            . "; "
12446                                            . $CMD_DELIM
12447                                            . "$REPLACE_CMD=$MULTIPLE_BEFORE"
12448                                            . $CMD_DELIM
12449                                            . $fields[$i]);
12450            }
12451        }
12452
12453        # Everything has been handled by the insert_adjusted_lines()
12454        $_ = "";
12455
12456        return;
12457    }
12458}
12459
12460sub filter_old_style_case_folding($file) {
12461    # This transforms $_ containing the case folding style of 3.0.1, to 3.1
12462    # and later style.  Different letters were used in the earlier.
12463
12464    my @fields = split /\s*;\s*/;
12465
12466    if ($fields[1] eq 'L') {
12467        $fields[1] = 'C';             # L => C always
12468    }
12469    elsif ($fields[1] eq 'E') {
12470        if ($fields[2] =~ / /) {      # E => C if one code point; F otherwise
12471            $fields[1] = 'F'
12472        }
12473        else {
12474            $fields[1] = 'C'
12475        }
12476    }
12477    else {
12478        $file->carp_bad_line("Expecting L or E in second field");
12479        $_ = "";
12480        return;
12481    }
12482    $_ = join("; ", @fields) . ';';
12483    return;
12484}
12485
12486{ # Closure for case folding
12487
12488    # Create the map for simple only if are going to output it, for otherwise
12489    # it takes no part in anything we do.
12490    my $to_output_simple;
12491
12492    sub setup_case_folding {
12493        # Read in the case foldings in CaseFolding.txt.  This handles both
12494        # simple and full case folding.
12495
12496        $to_output_simple
12497                        = property_ref('Simple_Case_Folding')->to_output_map;
12498
12499        if (! $to_output_simple) {
12500            property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
12501        }
12502
12503        # If we ever wanted to show that these tables were combined, a new
12504        # property method could be created, like set_combined_props()
12505        property_ref('Case_Folding')->add_comment(join_lines( <<END
12506This file includes both the simple and full case folding maps.  The simple
12507ones are in the main body of the table below, and the full ones adding to or
12508overriding them are in the hash.
12509END
12510        ));
12511        return;
12512    }
12513
12514    sub filter_case_folding_line($file) {
12515        # Called for each line in CaseFolding.txt
12516        # Input lines look like:
12517        # 0041; C; 0061; # LATIN CAPITAL LETTER A
12518        # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
12519        # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
12520        #
12521        # 'C' means that folding is the same for both simple and full
12522        # 'F' that it is only for full folding
12523        # 'S' that it is only for simple folding
12524        # 'T' is locale-dependent, and ignored
12525        # 'I' is a type of 'F' used in some early releases.
12526        # Note the trailing semi-colon, unlike many of the input files.  That
12527        # means that there will be an extra null field generated by the split
12528        # below, which we ignore and hence is not an error.
12529
12530        my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
12531        if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
12532            $file->carp_bad_line('Extra fields');
12533            $_ = "";
12534            return;
12535        }
12536
12537        if ($type =~ / ^ [IT] $/x) {   # Skip Turkic case folding, is locale dependent
12538            $_ = "";
12539            return;
12540        }
12541
12542        # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
12543        # I are all full foldings; S is single-char.  For S, there is always
12544        # an F entry, so we must allow multiple values for the same code
12545        # point.  Fortunately this table doesn't need further manipulation
12546        # which would preclude using multiple-values.  The S is now included
12547        # so that _swash_inversion_hash() is able to construct closures
12548        # without having to worry about F mappings.
12549        if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
12550            $_ = "$range; Case_Folding; "
12551                 . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
12552        }
12553        else {
12554            $_ = "";
12555            $file->carp_bad_line('Expecting C F I S or T in second field');
12556        }
12557
12558        # C and S are simple foldings, but simple case folding is not needed
12559        # unless we explicitly want its map table output.
12560        if ($to_output_simple && $type eq 'C' || $type eq 'S') {
12561            $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
12562        }
12563
12564        return;
12565    }
12566
12567} # End case fold closure
12568
12569sub filter_jamo_line {
12570    # Filter Jamo.txt lines.  This routine mainly is used to populate hashes
12571    # from this file that is used in generating the Name property for Jamo
12572    # code points.  But, it also is used to convert early versions' syntax
12573    # into the modern form.  Here are two examples:
12574    # 1100; G   # HANGUL CHOSEONG KIYEOK            # Modern syntax
12575    # U+1100; G; HANGUL CHOSEONG KIYEOK             # 2.0 syntax
12576    #
12577    # The input is $_, the output is $_ filtered.
12578
12579    my @fields = split /\s*;\s*/, $_, -1;  # -1 => retain trailing null fields
12580
12581    # Let the caller handle unexpected input.  In earlier versions, there was
12582    # a third field which is supposed to be a comment, but did not have a '#'
12583    # before it.
12584    return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
12585
12586    $fields[0] =~ s/^U\+//;     # Also, early versions had this extraneous
12587                                # beginning.
12588
12589    # Some 2.1 versions had this wrong.  Causes havoc with the algorithm.
12590    $fields[1] = 'R' if $fields[0] eq '1105';
12591
12592    # Add to structure so can generate Names from it.
12593    my $cp = hex $fields[0];
12594    my $short_name = $fields[1];
12595    $Jamo{$cp} = $short_name;
12596    if ($cp <= $LBase + $LCount) {
12597        $Jamo_L{$short_name} = $cp - $LBase;
12598    }
12599    elsif ($cp <= $VBase + $VCount) {
12600        $Jamo_V{$short_name} = $cp - $VBase;
12601    }
12602    elsif ($cp <= $TBase + $TCount) {
12603        $Jamo_T{$short_name} = $cp - $TBase;
12604    }
12605    else {
12606        Carp::my_carp_bug("Unexpected Jamo code point in $_");
12607    }
12608
12609
12610    # Reassemble using just the first two fields to look like a typical
12611    # property file line
12612    $_ = "$fields[0]; $fields[1]";
12613
12614    return;
12615}
12616
12617sub register_fraction($rational) {
12618    # This registers the input rational number so that it can be passed on to
12619    # Unicode::UCD, both in rational and floating forms.
12620
12621    my $floating = eval $rational;
12622
12623    my @floats = sprintf "%.*e", $E_FLOAT_PRECISION, $floating;
12624
12625    # See if the denominator is a power of 2.
12626    $rational =~ m!.*/(.*)!;
12627    my $denominator = $1;
12628    if (defined $denominator && (($denominator & ($denominator - 1)) == 0)) {
12629
12630        # Here the denominator is a power of 2.  This means it has an exact
12631        # representation in binary, so rounding could go either way.  It turns
12632        # out that Windows doesn't necessarily round towards even, so output
12633        # an extra entry.  This happens when the final digit we output is even
12634        # and the next digits would be 50* to the precision of the machine.
12635        my $extra_digit_float = sprintf "%e", $floating;
12636        my $q = $E_FLOAT_PRECISION - 1;
12637        if ($extra_digit_float =~ / ( .* \. \d{$q} )
12638                                    ( [02468] ) 5 0* ( e .*)
12639                                  /ix)
12640        {
12641            push @floats, $1 . ($2 + 1) . $3;
12642        }
12643    }
12644
12645    foreach my $float (@floats) {
12646        # Strip off any leading zeros beyond 2 digits to make it C99
12647        # compliant.  (Windows has 3 digit exponents, contrary to C99)
12648        $float =~ s/ ( .* e [-+] ) 0* ( \d{2,}? ) /$1$2/x;
12649
12650        if (   defined $nv_floating_to_rational{$float}
12651            && $nv_floating_to_rational{$float} ne $rational)
12652        {
12653            die Carp::my_carp_bug("Both '$rational' and"
12654                            . " '$nv_floating_to_rational{$float}' evaluate to"
12655                            . " the same floating point number."
12656                            . "  \$E_FLOAT_PRECISION must be increased");
12657        }
12658        $nv_floating_to_rational{$float} = $rational;
12659    }
12660    return;
12661}
12662
12663sub gcd($a, $b) {   # Greatest-common-divisor; from
12664                # http://en.wikipedia.org/wiki/Euclidean_algorithm
12665    use integer;
12666
12667    while ($b != 0) {
12668       my $temp = $b;
12669       $b = $a % $b;
12670       $a = $temp;
12671    }
12672    return $a;
12673}
12674
12675sub reduce_fraction($fraction_ref) {
12676    # Reduce a fraction to lowest terms.  The Unicode data may be reducible,
12677    # hence this is needed.  The argument is a reference to the
12678    # string denoting the fraction, which must be of the form:
12679    if ($$fraction_ref !~ / ^ (-?) (\d+) \/ (\d+) $ /ax) {
12680        Carp::my_carp_bug("Non-fraction input '$$fraction_ref'.  Unchanged");
12681        return;
12682    }
12683
12684    my $sign = $1;
12685    my $numerator = $2;
12686    my $denominator = $3;
12687
12688    use integer;
12689
12690    # Find greatest common divisor
12691    my $gcd = gcd($numerator, $denominator);
12692
12693    # And reduce using the gcd.
12694    if ($gcd != 1) {
12695        $numerator    /= $gcd;
12696        $denominator  /= $gcd;
12697        $$fraction_ref = "$sign$numerator/$denominator";
12698    }
12699
12700    return;
12701}
12702
12703sub filter_numeric_value_line($file) {
12704    # DNumValues contains lines of a different syntax than the typical
12705    # property file:
12706    # 0F33          ; -0.5 ; ; -1/2 # No       TIBETAN DIGIT HALF ZERO
12707    #
12708    # This routine transforms $_ containing the anomalous syntax to the
12709    # typical, by filtering out the extra columns, and convert early version
12710    # decimal numbers to strings that look like rational numbers.
12711
12712    # Starting in 5.1, there is a rational field.  Just use that, omitting the
12713    # extra columns.  Otherwise convert the decimal number in the second field
12714    # to a rational, and omit extraneous columns.
12715    my @fields = split /\s*;\s*/, $_, -1;
12716    my $rational;
12717
12718    if ($v_version ge v5.1.0) {
12719        if (@fields != 4) {
12720            $file->carp_bad_line('Not 4 semi-colon separated fields');
12721            $_ = "";
12722            return;
12723        }
12724        reduce_fraction(\$fields[3]) if $fields[3] =~ qr{/};
12725        $rational = $fields[3];
12726
12727        $_ = join '; ', @fields[ 0, 3 ];
12728    }
12729    else {
12730
12731        # Here, is an older Unicode file, which has decimal numbers instead of
12732        # rationals in it.  Use the fraction to calculate the denominator and
12733        # convert to rational.
12734
12735        if (@fields != 2 && @fields != 3) {
12736            $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
12737            $_ = "";
12738            return;
12739        }
12740
12741        my $codepoints = $fields[0];
12742        my $decimal = $fields[1];
12743        if ($decimal =~ s/\.0+$//) {
12744
12745            # Anything ending with a decimal followed by nothing but 0's is an
12746            # integer
12747            $_ = "$codepoints; $decimal";
12748            $rational = $decimal;
12749        }
12750        else {
12751
12752            my $denominator;
12753            if ($decimal =~ /\.50*$/) {
12754                $denominator = 2;
12755            }
12756
12757            # Here have the hardcoded repeating decimals in the fraction, and
12758            # the denominator they imply.  There were only a few denominators
12759            # in the older Unicode versions of this file which this code
12760            # handles, so it is easy to convert them.
12761
12762            # The 4 is because of a round-off error in the Unicode 3.2 files
12763            elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
12764                $denominator = 3;
12765            }
12766            elsif ($decimal =~ /\.[27]50*$/) {
12767                $denominator = 4;
12768            }
12769            elsif ($decimal =~ /\.[2468]0*$/) {
12770                $denominator = 5;
12771            }
12772            elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
12773                $denominator = 6;
12774            }
12775            elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
12776                $denominator = 8;
12777            }
12778            if ($denominator) {
12779                my $sign = ($decimal < 0) ? "-" : "";
12780                my $numerator = int((abs($decimal) * $denominator) + .5);
12781                $rational = "$sign$numerator/$denominator";
12782                $_ = "$codepoints; $rational";
12783            }
12784            else {
12785                $file->carp_bad_line("Can't cope with number '$decimal'.");
12786                $_ = "";
12787                return;
12788            }
12789        }
12790    }
12791
12792    register_fraction($rational) if $rational =~ qr{/};
12793    return;
12794}
12795
12796{ # Closure
12797    my %unihan_properties;
12798
12799    sub construct_unihan($file_object) {
12800
12801        return unless file_exists($file_object->file);
12802
12803        if ($v_version lt v4.0.0) {
12804            push @cjk_properties, 'URS ; Unicode_Radical_Stroke';
12805            push @cjk_property_values, split "\n", <<'END';
12806# @missing: 0000..10FFFF; Unicode_Radical_Stroke; <none>
12807END
12808        }
12809
12810        if ($v_version ge v3.0.0) {
12811            push @cjk_properties, split "\n", <<'END';
12812cjkIRG_GSource; kIRG_GSource
12813cjkIRG_JSource; kIRG_JSource
12814cjkIRG_KSource; kIRG_KSource
12815cjkIRG_TSource; kIRG_TSource
12816cjkIRG_VSource; kIRG_VSource
12817END
12818        push @cjk_property_values, split "\n", <<'END';
12819# @missing: 0000..10FFFF; cjkIRG_GSource; <none>
12820# @missing: 0000..10FFFF; cjkIRG_JSource; <none>
12821# @missing: 0000..10FFFF; cjkIRG_KSource; <none>
12822# @missing: 0000..10FFFF; cjkIRG_TSource; <none>
12823# @missing: 0000..10FFFF; cjkIRG_VSource; <none>
12824END
12825        }
12826        if ($v_version ge v3.1.0) {
12827            push @cjk_properties, 'cjkIRG_HSource; kIRG_HSource';
12828            push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_HSource; <none>';
12829        }
12830        if ($v_version ge v3.1.1) {
12831            push @cjk_properties, 'cjkIRG_KPSource; kIRG_KPSource';
12832            push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_KPSource; <none>';
12833        }
12834        if ($v_version ge v3.2.0) {
12835            push @cjk_properties, split "\n", <<'END';
12836cjkAccountingNumeric; kAccountingNumeric
12837cjkCompatibilityVariant; kCompatibilityVariant
12838cjkOtherNumeric; kOtherNumeric
12839cjkPrimaryNumeric; kPrimaryNumeric
12840END
12841            push @cjk_property_values, split "\n", <<'END';
12842# @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
12843# @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
12844# @missing: 0000..10FFFF; cjkOtherNumeric; NaN
12845# @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
12846END
12847        }
12848        if ($v_version gt v4.0.0) {
12849            push @cjk_properties, 'cjkIRG_USource; kIRG_USource';
12850            push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_USource; <none>';
12851        }
12852
12853        if ($v_version ge v4.1.0) {
12854            push @cjk_properties, 'cjkIICore ; kIICore';
12855            push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIICore; <none>';
12856        }
12857    }
12858
12859    sub setup_unihan {
12860        # Do any special setup for Unihan properties.
12861
12862        # This property gives the wrong computed type, so override.
12863        my $usource = property_ref('kIRG_USource');
12864        $usource->set_type($STRING) if defined $usource;
12865
12866        # This property is to be considered binary (it says so in
12867        # http://www.unicode.org/reports/tr38/)
12868        my $iicore = property_ref('kIICore');
12869        if (defined $iicore) {
12870            $iicore->set_type($FORCED_BINARY);
12871            $iicore->table("Y")->add_note("Matches any code point which has a non-null value for this property; see unicode.org UAX #38.");
12872
12873            # Unicode doesn't include the maps for this property, so don't
12874            # warn that they are missing.
12875            $iicore->set_pre_declared_maps(0);
12876            $iicore->add_comment(join_lines( <<END
12877This property contains string values, but any non-empty ones are considered to
12878be 'core', so Perl creates tables for both: 1) its string values, plus 2)
12879tables so that \\p{kIICore} matches any code point which has a non-empty
12880value for this property.
12881END
12882            ));
12883        }
12884
12885        return;
12886    }
12887
12888    sub filter_unihan_line {
12889        # Change unihan db lines to look like the others in the db.  Here is
12890        # an input sample:
12891        #   U+341C        kCangjie        IEKN
12892
12893        # Tabs are used instead of semi-colons to separate fields; therefore
12894        # they may have semi-colons embedded in them.  Change these to periods
12895        # so won't screw up the rest of the code.
12896        s/;/./g;
12897
12898        # Remove lines that don't look like ones we accept.
12899        if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
12900            $_ = "";
12901            return;
12902        }
12903
12904        # Extract the property, and save a reference to its object.
12905        my $property = $1;
12906        if (! exists $unihan_properties{$property}) {
12907            $unihan_properties{$property} = property_ref($property);
12908        }
12909
12910        # Don't do anything unless the property is one we're handling, which
12911        # we determine by seeing if there is an object defined for it or not
12912        if (! defined $unihan_properties{$property}) {
12913            $_ = "";
12914            return;
12915        }
12916
12917        # Convert the tab separators to our standard semi-colons, and convert
12918        # the U+HHHH notation to the rest of the standard's HHHH
12919        s/\t/;/g;
12920        s/\b U \+ (?= $code_point_re )//xg;
12921
12922        #local $to_trace = 1 if main::DEBUG;
12923        trace $_ if main::DEBUG && $to_trace;
12924
12925        return;
12926    }
12927}
12928
12929sub filter_blocks_lines($file) {
12930    # In the Blocks.txt file, the names of the blocks don't quite match the
12931    # names given in PropertyValueAliases.txt, so this changes them so they
12932    # do match:  Blanks and hyphens are changed into underscores.  Also makes
12933    # early release versions look like later ones
12934    #
12935    # $_ is transformed to the correct value.
12936
12937    if ($v_version lt v3.2.0) {
12938        if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
12939            $_ = "";
12940            return;
12941        }
12942
12943        # Old versions used a different syntax to mark the range.
12944        $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
12945    }
12946
12947    my @fields = split /\s*;\s*/, $_, -1;
12948    if (@fields != 2) {
12949        $file->carp_bad_line("Expecting exactly two fields");
12950        $_ = "";
12951        return;
12952    }
12953
12954    # Change hyphens and blanks in the block name field only
12955    $fields[1] =~ s/[ -]/_/g;
12956    $fields[1] =~ s/_ ( [a-z] ) /_\u$1/xg;   # Capitalize first letter of word
12957
12958    $_ = join("; ", @fields);
12959    return;
12960}
12961
12962{ # Closure
12963    my $current_property;
12964
12965    sub filter_old_style_proplist {
12966        # PropList.txt has been in Unicode since version 2.0.  Until 3.1, it
12967        # was in a completely different syntax.  Ken Whistler of Unicode says
12968        # that it was something he used as an aid for his own purposes, but
12969        # was never an official part of the standard.  Many of the properties
12970        # in it were incorporated into the later PropList.txt, but some were
12971        # not.  This program uses this early file to generate property tables
12972        # that are otherwise not accessible in the early UCD's.  It does this
12973        # for the ones that eventually became official, and don't appear to be
12974        # too different in their contents from the later official version, and
12975        # throws away the rest.  It could be argued that the ones it generates
12976        # were probably not really official at that time, so should be
12977        # ignored.  You can easily modify things to skip all of them by
12978        # changing this function to just set $_ to "", and return; and to skip
12979        # certain of them by simply removing their declarations from
12980        # get_old_property_aliases().
12981        #
12982        # Here is a list of all the ones that are thrown away:
12983        #   Alphabetic                   The definitions for this are very
12984        #                                defective, so better to not mislead
12985        #                                people into thinking it works.
12986        #                                Instead the Perl extension of the
12987        #                                same name is constructed from first
12988        #                                principles.
12989        #   Bidi=*                       duplicates UnicodeData.txt
12990        #   Combining                    never made into official property;
12991        #                                is \P{ccc=0}
12992        #   Composite                    never made into official property.
12993        #   Currency Symbol              duplicates UnicodeData.txt: gc=sc
12994        #   Decimal Digit                duplicates UnicodeData.txt: gc=nd
12995        #   Delimiter                    never made into official property;
12996        #                                removed in 3.0.1
12997        #   Format Control               never made into official property;
12998        #                                similar to gc=cf
12999        #   High Surrogate               duplicates Blocks.txt
13000        #   Ignorable Control            never made into official property;
13001        #                                similar to di=y
13002        #   ISO Control                  duplicates UnicodeData.txt: gc=cc
13003        #   Left of Pair                 never made into official property;
13004        #   Line Separator               duplicates UnicodeData.txt: gc=zl
13005        #   Low Surrogate                duplicates Blocks.txt
13006        #   Non-break                    was actually listed as a property
13007        #                                in 3.2, but without any code
13008        #                                points.  Unicode denies that this
13009        #                                was ever an official property
13010        #   Non-spacing                  duplicate UnicodeData.txt: gc=mn
13011        #   Numeric                      duplicates UnicodeData.txt: gc=cc
13012        #   Paired Punctuation           never made into official property;
13013        #                                appears to be gc=ps + gc=pe
13014        #   Paragraph Separator          duplicates UnicodeData.txt: gc=cc
13015        #   Private Use                  duplicates UnicodeData.txt: gc=co
13016        #   Private Use High Surrogate   duplicates Blocks.txt
13017        #   Punctuation                  duplicates UnicodeData.txt: gc=p
13018        #   Space                        different definition than eventual
13019        #                                one.
13020        #   Titlecase                    duplicates UnicodeData.txt: gc=lt
13021        #   Unassigned Code Value        duplicates UnicodeData.txt: gc=cn
13022        #   Zero-width                   never made into official property;
13023        #                                subset of gc=cf
13024        # Most of the properties have the same names in this file as in later
13025        # versions, but a couple do not.
13026        #
13027        # This subroutine filters $_, converting it from the old style into
13028        # the new style.  Here's a sample of the old-style
13029        #
13030        #   *******************************************
13031        #
13032        #   Property dump for: 0x100000A0 (Join Control)
13033        #
13034        #   200C..200D  (2 chars)
13035        #
13036        # In the example, the property is "Join Control".  It is kept in this
13037        # closure between calls to the subroutine.  The numbers beginning with
13038        # 0x were internal to Ken's program that generated this file.
13039
13040        # If this line contains the property name, extract it.
13041        if (/^Property dump for: [^(]*\((.*)\)/) {
13042            $_ = $1;
13043
13044            # Convert white space to underscores.
13045            s/ /_/g;
13046
13047            # Convert the few properties that don't have the same name as
13048            # their modern counterparts
13049            s/Identifier_Part/ID_Continue/
13050            or s/Not_a_Character/NChar/;
13051
13052            # If the name matches an existing property, use it.
13053            if (defined property_ref($_)) {
13054                trace "new property=", $_ if main::DEBUG && $to_trace;
13055                $current_property = $_;
13056            }
13057            else {        # Otherwise discard it
13058                trace "rejected property=", $_ if main::DEBUG && $to_trace;
13059                undef $current_property;
13060            }
13061            $_ = "";    # The property is saved for the next lines of the
13062                        # file, but this defining line is of no further use,
13063                        # so clear it so that the caller won't process it
13064                        # further.
13065        }
13066        elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
13067
13068            # Here, the input line isn't a header defining a property for the
13069            # following section, and either we aren't in such a section, or
13070            # the line doesn't look like one that defines the code points in
13071            # such a section.  Ignore this line.
13072            $_ = "";
13073        }
13074        else {
13075
13076            # Here, we have a line defining the code points for the current
13077            # stashed property.  Anything starting with the first blank is
13078            # extraneous.  Otherwise, it should look like a normal range to
13079            # the caller.  Append the property name so that it looks just like
13080            # a modern PropList entry.
13081
13082            $_ =~ s/\s.*//;
13083            $_ .= "; $current_property";
13084        }
13085        trace $_ if main::DEBUG && $to_trace;
13086        return;
13087    }
13088} # End closure for old style proplist
13089
13090sub filter_old_style_normalization_lines {
13091    # For early releases of Unicode, the lines were like:
13092    #        74..2A76    ; NFKD_NO
13093    # For later releases this became:
13094    #        74..2A76    ; NFKD_QC; N
13095    # Filter $_ to look like those in later releases.
13096    # Similarly for MAYBEs
13097
13098    s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
13099
13100    # Also, the property FC_NFKC was abbreviated to FNC
13101    s/FNC/FC_NFKC/;
13102    return;
13103}
13104
13105sub setup_script_extensions {
13106    # The Script_Extensions property starts out with a clone of the Script
13107    # property.
13108
13109    $scx = property_ref("Script_Extensions");
13110    return unless defined $scx;
13111
13112    $scx->_set_format($STRING_WHITE_SPACE_LIST);
13113    $scx->initialize($script);
13114    $scx->set_default_map($script->default_map);
13115    $scx->set_pre_declared_maps(0);     # PropValueAliases doesn't list these
13116    $scx->add_comment(join_lines( <<END
13117The values for code points that appear in one script are just the same as for
13118the 'Script' property.  Likewise the values for those that appear in many
13119scripts are either 'Common' or 'Inherited', same as with 'Script'.  But the
13120values of code points that appear in a few scripts are a space separated list
13121of those scripts.
13122END
13123    ));
13124
13125    # Initialize scx's tables and the aliases for them to be the same as sc's
13126    foreach my $table ($script->tables) {
13127        my $scx_table = $scx->add_match_table($table->name,
13128                                Full_Name => $table->full_name);
13129        foreach my $alias ($table->aliases) {
13130            $scx_table->add_alias($alias->name);
13131        }
13132    }
13133}
13134
13135sub  filter_script_extensions_line {
13136    # The Scripts file comes with the full name for the scripts; the
13137    # ScriptExtensions, with the short name.  The final mapping file is a
13138    # combination of these, and without adjustment, would have inconsistent
13139    # entries.  This filters the latter file to convert to full names.
13140    # Entries look like this:
13141    # 064B..0655    ; Arab Syrc # Mn  [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
13142
13143    my @fields = split /\s*;\s*/;
13144
13145    # This script was erroneously omitted in this Unicode version.
13146    $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/;
13147
13148    my @full_names;
13149    foreach my $short_name (split " ", $fields[1]) {
13150        push @full_names, $script->table($short_name)->full_name;
13151    }
13152    $fields[1] = join " ", @full_names;
13153    $_ = join "; ", @fields;
13154
13155    return;
13156}
13157
13158sub setup_emojidata {
13159    my $prop_ref = Property->new('ExtPict',
13160                                 Full_Name => 'Extended_Pictographic',
13161    );
13162    $prop_ref->set_fate($PLACEHOLDER,
13163                        "Not part of the Unicode Character Database");
13164}
13165
13166sub filter_emojidata_line {
13167    # We only are interested in this single property from this non-UCD data
13168    # file, and we turn it into a Perl property, so that it isn't accessible
13169    # to the users
13170
13171    $_ = "" unless /\bExtended_Pictographic\b/;
13172
13173    return;
13174}
13175
13176sub setup_IdStatus {
13177    my $ids = Property->new('Identifier_Status',
13178                            Match_SubDir => 'IdStatus',
13179                            Default_Map => 'Restricted',
13180                           );
13181    $ids->add_match_table('Allowed');
13182}
13183
13184sub setup_IdType {
13185    $idt = Property->new('Identifier_Type',
13186                            Match_SubDir => 'IdType',
13187                            Default_Map => 'Not_Character',
13188                            Format => $STRING_WHITE_SPACE_LIST,
13189                           );
13190}
13191
13192sub  filter_IdType_line {
13193
13194    # Some code points have more than one type, separated by spaces on the
13195    # input.  For now, we just add everything as a property value.  Later when
13196    # we look for properties with format $STRING_WHITE_SPACE_LIST, we resolve
13197    # things
13198
13199    my @fields = split /\s*;\s*/;
13200    my $types = $fields[1];
13201    $idt->add_match_table($types) unless defined $idt->table($types);
13202
13203    return;
13204}
13205
13206sub generate_hst($file) {
13207
13208    # Populates the Hangul Syllable Type property from first principles
13209
13210    # These few ranges are hard-coded in.
13211    $file->insert_lines(split /\n/, <<'END'
132121100..1159    ; L
13213115F          ; L
132141160..11A2    ; V
1321511A8..11F9    ; T
13216END
13217);
13218
13219    # The Hangul syllables in version 1 are at different code points than
13220    # those that came along starting in version 2, and have different names;
13221    # they comprise about 60% of the code points of the later version.
13222    # From my (khw) research on them (see <558493EB.4000807@att.net>), the
13223    # initial set is a subset of the later version, with different English
13224    # transliterations.  I did not see an easy mapping between them.  The
13225    # later set includes essentially all possibilities, even ones that aren't
13226    # in modern use (if they ever were), and over 96% of the new ones are type
13227    # LVT.  Mathematically, the early set must also contain a preponderance of
13228    # LVT values.  In lieu of doing nothing, we just set them all to LVT, and
13229    # expect that this will be right most of the time, which is better than
13230    # not being right at all.
13231    if ($v_version lt v2.0.0) {
13232        my $property = property_ref($file->property);
13233        $file->insert_lines(sprintf("%04X..%04X; LVT\n",
13234                                    $FIRST_REMOVED_HANGUL_SYLLABLE,
13235                                    $FINAL_REMOVED_HANGUL_SYLLABLE));
13236        push @tables_that_may_be_empty, $property->table('LV')->complete_name;
13237        return;
13238    }
13239
13240    # The algorithmically derived syllables are almost all LVT ones, so
13241    # initialize the whole range with that.
13242    $file->insert_lines(sprintf "%04X..%04X; LVT\n",
13243                        $SBase, $SBase + $SCount -1);
13244
13245    # Those ones that aren't LVT are LV, and they occur at intervals of
13246    # $TCount code points, starting with the first code point, at $SBase.
13247    for (my $i = $SBase; $i < $SBase + $SCount; $i += $TCount) {
13248        $file->insert_lines(sprintf "%04X..%04X; LV\n", $i, $i);
13249    }
13250
13251    return;
13252}
13253
13254sub generate_GCB($file) {
13255
13256    # Populates the Grapheme Cluster Break property from first principles
13257
13258    # All these definitions are from
13259    # http://www.unicode.org/reports/tr29/tr29-3.html with confirmation
13260    # from http://www.unicode.org/reports/tr29/tr29-4.html
13261
13262    foreach my $range ($gc->ranges) {
13263
13264        # Extend includes gc=Me and gc=Mn, while Control includes gc=Cc
13265        # and gc=Cf
13266        if ($range->value =~ / ^ M [en] $ /x) {
13267            $file->insert_lines(sprintf "%04X..%04X; Extend",
13268                                $range->start,  $range->end);
13269        }
13270        elsif ($range->value =~ / ^ C [cf] $ /x) {
13271            $file->insert_lines(sprintf "%04X..%04X; Control",
13272                                $range->start,  $range->end);
13273        }
13274    }
13275    $file->insert_lines("2028; Control"); # Line Separator
13276    $file->insert_lines("2029; Control"); # Paragraph Separator
13277
13278    $file->insert_lines("000D; CR");
13279    $file->insert_lines("000A; LF");
13280
13281    # Also from http://www.unicode.org/reports/tr29/tr29-3.html.
13282    foreach my $code_point ( qw{
13283                                09BE 09D7 0B3E 0B57 0BBE 0BD7 0CC2 0CD5 0CD6
13284                                0D3E 0D57 0DCF 0DDF FF9E FF9F 1D165 1D16E 1D16F
13285                                }
13286    ) {
13287        my $category = $gc->value_of(hex $code_point);
13288        next if ! defined $category || $category eq 'Cn'; # But not if
13289                                                          # unassigned in this
13290                                                          # release
13291        $file->insert_lines("$code_point; Extend");
13292    }
13293
13294    my $hst = property_ref('Hangul_Syllable_Type');
13295    if ($hst->count > 0) {
13296        foreach my $range ($hst->ranges) {
13297            $file->insert_lines(sprintf "%04X..%04X; %s",
13298                                    $range->start, $range->end, $range->value);
13299        }
13300    }
13301    else {
13302        generate_hst($file);
13303    }
13304
13305    main::process_generic_property_file($file);
13306}
13307
13308
13309sub fixup_early_perl_name_alias($file) {
13310
13311    # Different versions of Unicode have varying support for the name synonyms
13312    # below.  Just include everything.  As of 6.1, all these are correct in
13313    # the Unicode-supplied file.
13314
13315    # ALERT did not come along until 6.0, at which point it became preferred
13316    # over BELL.  By inserting it last in early releases, BELL is preferred
13317    # over it; and vice-vers in 6.0
13318    my $type_for_bell = ($v_version lt v6.0.0)
13319               ? 'correction'
13320               : 'alternate';
13321    $file->insert_lines(split /\n/, <<END
133220007;BELL; $type_for_bell
13323000A;LINE FEED (LF);alternate
13324000C;FORM FEED (FF);alternate
13325000D;CARRIAGE RETURN (CR);alternate
133260085;NEXT LINE (NEL);alternate
13327END
13328
13329    );
13330
13331    # One might think that the 'Unicode_1_Name' field, could work for most
13332    # of the above names, but sadly that field varies depending on the
13333    # release.  Version 1.1.5 had no names for any of the controls; Version
13334    # 2.0 introduced names for the C0 controls, and 3.0 introduced C1 names.
13335    # 3.0.1 removed the name INDEX; and 3.2 changed some names:
13336    #   changed to parenthesized versions like "NEXT LINE" to
13337    #       "NEXT LINE (NEL)";
13338    #   changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD
13339    #   changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;;
13340    #   changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR
13341    #
13342    # All these are present in the 6.1 NameAliases.txt
13343
13344    return;
13345}
13346
13347sub filter_later_version_name_alias_line {
13348
13349    # This file has an extra entry per line for the alias type.  This is
13350    # handled by creating a compound entry: "$alias: $type";  First, split
13351    # the line into components.
13352    my ($range, $alias, $type, @remainder)
13353        = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
13354
13355    # This file contains multiple entries for some components, so tell the
13356    # downstream code to allow this in our internal tables; the
13357    # $MULTIPLE_AFTER preserves the input ordering.
13358    $_ = join ";", $range, $CMD_DELIM
13359                           . $REPLACE_CMD
13360                           . '='
13361                           . $MULTIPLE_AFTER
13362                           . $CMD_DELIM
13363                           . "$alias: $type",
13364                   @remainder;
13365    return;
13366}
13367
13368sub filter_early_version_name_alias_line {
13369
13370    # Early versions did not have the trailing alias type field; implicitly it
13371    # was 'correction'.
13372    $_ .= "; correction";
13373
13374    filter_later_version_name_alias_line;
13375    return;
13376}
13377
13378sub filter_all_caps_script_names {
13379
13380    # Some early Unicode releases had the script names in all CAPS.  This
13381    # converts them to just the first letter of each word being capital.
13382
13383    my ($range, $script, @remainder)
13384        = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
13385    my @words = split /[_-]/, $script;
13386    for my $word (@words) {
13387        $word =
13388            ucfirst(lc($word)) if $word ne 'CJK';
13389    }
13390    $script = join "_", @words;
13391    $_ = join ";", $range, $script, @remainder;
13392}
13393
13394sub finish_Unicode() {
13395    # This routine should be called after all the Unicode files have been read
13396    # in.  It:
13397    # 1) Creates properties that are missing from the version of Unicode being
13398    #    compiled, and which, for whatever reason, are needed for the Perl
13399    #    core to function properly.  These are minimally populated as
13400    #    necessary.
13401    # 2) Adds the mappings for code points missing from the files which have
13402    #    defaults specified for them.
13403    # 3) At this point all mappings are known, so it computes the type of
13404    #    each property whose type hasn't been determined yet.
13405    # 4) Calculates all the regular expression match tables based on the
13406    #    mappings.
13407    # 5) Calculates and adds the tables which are defined by Unicode, but
13408    #    which aren't derived by them, and certain derived tables that Perl
13409    #    uses.
13410
13411    # Folding information was introduced later into Unicode data.  To get
13412    # Perl's case ignore (/i) to work at all in releases that don't have
13413    # folding, use the best available alternative, which is lower casing.
13414    my $fold = property_ref('Case_Folding');
13415    if ($fold->is_empty) {
13416        $fold->initialize(property_ref('Lowercase_Mapping'));
13417        $fold->add_note(join_lines(<<END
13418WARNING: This table uses lower case as a substitute for missing fold
13419information
13420END
13421        ));
13422    }
13423
13424    # Multiple-character mapping was introduced later into Unicode data, so it
13425    # is by default the simple version.  If to output the simple versions and
13426    # not present, just use the regular (which in these Unicode versions is
13427    # the simple as well).
13428    foreach my $map (qw {   Uppercase_Mapping
13429                            Lowercase_Mapping
13430                            Titlecase_Mapping
13431                            Case_Folding
13432                        } )
13433    {
13434        my $comment = <<END;
13435
13436Note that although the Perl core uses this file, it has the standard values
13437for code points from U+0000 to U+00FF compiled in, so changing this table will
13438not change the core's behavior with respect to these code points.  Use
13439Unicode::Casing to override this table.
13440END
13441        if ($map eq 'Case_Folding') {
13442            $comment .= <<END;
13443(/i regex matching is not overridable except by using a custom regex engine)
13444END
13445        }
13446        property_ref($map)->add_comment(join_lines($comment));
13447        my $simple = property_ref("Simple_$map");
13448        next if ! $simple->is_empty;
13449        if ($simple->to_output_map) {
13450            $simple->initialize(property_ref($map));
13451        }
13452        else {
13453            property_ref($map)->set_proxy_for($simple->name);
13454        }
13455    }
13456
13457    # For each property, fill in any missing mappings, and calculate the re
13458    # match tables.  If a property has more than one missing mapping, the
13459    # default is a reference to a data structure, and may require data from
13460    # other properties to resolve.  The sort is used to cause these to be
13461    # processed last, after all the other properties have been calculated.
13462    # (Fortunately, the missing properties so far don't depend on each other.)
13463    foreach my $property
13464        (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
13465        property_ref('*'))
13466    {
13467        # $perl has been defined, but isn't one of the Unicode properties that
13468        # need to be finished up.
13469        next if $property == $perl;
13470
13471        # Nor do we need to do anything with properties that aren't going to
13472        # be output.
13473        next if $property->fate == $SUPPRESSED;
13474
13475        # Handle the properties that have more than one possible default
13476        if (ref $property->default_map) {
13477            my $default_map = $property->default_map;
13478
13479            # These properties have stored in the default_map:
13480            # One or more of:
13481            #   1)  A default map which applies to all code points in a
13482            #       certain class
13483            #   2)  an expression which will evaluate to the list of code
13484            #       points in that class
13485            # And
13486            #   3) the default map which applies to every other missing code
13487            #      point.
13488            #
13489            # Go through each list.
13490            while (my ($default, $eval) = $default_map->get_next_defaults) {
13491                last unless defined $eval;
13492
13493                # Get the class list, and intersect it with all the so-far
13494                # unspecified code points yielding all the code points
13495                # in the class that haven't been specified.
13496                my $list = eval $eval;
13497                if ($@) {
13498                    Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
13499                    last;
13500                }
13501
13502                # Narrow down the list to just those code points we don't have
13503                # maps for yet.
13504                $list = $list & $property->inverse_list;
13505
13506                # Add mappings to the property for each code point in the list
13507                foreach my $range ($list->ranges) {
13508                    $property->add_map($range->start, $range->end, $default,
13509                    Replace => $NO);
13510                }
13511            }
13512
13513            # All remaining code points have the other mapping.  Set that up
13514            # so the normal single-default mapping code will work on them
13515            $property->set_default_map($default_map->other_default);
13516
13517            # And fall through to do that
13518        }
13519
13520        # We should have enough data now to compute the type of the property.
13521        my $property_name = $property->name;
13522        $property->compute_type;
13523        my $property_type = $property->type;
13524
13525        next if ! $property->to_create_match_tables;
13526
13527        # Here want to create match tables for this property
13528
13529        # The Unicode db always (so far, and they claim into the future) have
13530        # the default for missing entries in binary properties be 'N' (unless
13531        # there is a '@missing' line that specifies otherwise)
13532        if (! defined $property->default_map) {
13533            if ($property_type == $BINARY) {
13534                $property->set_default_map('N');
13535            }
13536            elsif ($property_type == $ENUM) {
13537                Carp::my_carp("Property '$property_name doesn't have a default mapping.  Using a fake one");
13538                $property->set_default_map('XXX This makes sure there is a default map');
13539            }
13540        }
13541
13542        # Add any remaining code points to the mapping, using the default for
13543        # missing code points.
13544        my $default_table;
13545        my $default_map = $property->default_map;
13546        if ($property_type == $FORCED_BINARY) {
13547
13548            # A forced binary property creates a 'Y' table that matches all
13549            # non-default values.  The actual string values are also written out
13550            # as a map table.  (The default value will almost certainly be the
13551            # empty string, so the pod glosses over the distinction, and just
13552            # talks about empty vs non-empty.)
13553            my $yes = $property->table("Y");
13554            foreach my $range ($property->ranges) {
13555                next if $range->value eq $default_map;
13556                $yes->add_range($range->start, $range->end);
13557            }
13558            $property->table("N")->set_complement($yes);
13559        }
13560        else {
13561            if (defined $default_map) {
13562
13563                # Make sure there is a match table for the default
13564                if (! defined ($default_table = $property->table($default_map)))
13565                {
13566                    $default_table = $property->add_match_table($default_map);
13567                }
13568
13569                # And, if the property is binary, the default table will just
13570                # be the complement of the other table.
13571                if ($property_type == $BINARY) {
13572                    my $non_default_table;
13573
13574                    # Find the non-default table.
13575                    for my $table ($property->tables) {
13576                        if ($table == $default_table) {
13577                            if ($v_version le v5.0.0) {
13578                                $table->add_alias($_) for qw(N No F False);
13579                            }
13580                            next;
13581                        } elsif ($v_version le v5.0.0) {
13582                            $table->add_alias($_) for qw(Y Yes T True);
13583                        }
13584                        $non_default_table = $table;
13585                    }
13586                    $default_table->set_complement($non_default_table);
13587                }
13588                else {
13589
13590                    # This fills in any missing values with the default.  It's
13591                    # not necessary to do this with binary properties, as the
13592                    # default is defined completely in terms of the Y table.
13593                    $property->add_map(0, $MAX_WORKING_CODEPOINT,
13594                                    $default_map, Replace => $NO);
13595                }
13596            }
13597
13598            # Have all we need to populate the match tables.
13599            my $maps_should_be_defined = $property->pre_declared_maps;
13600            foreach my $range ($property->ranges) {
13601                my $map = $range->value;
13602                my $table = $property->table($map);
13603                if (! defined $table) {
13604
13605                    # Integral and rational property values are not
13606                    # necessarily defined in PropValueAliases, but whether all
13607                    # the other ones should be depends on the property.
13608                    if ($maps_should_be_defined
13609                        && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
13610                    {
13611                        Carp::my_carp("Table '$property_name=$map' should "
13612                                    . "have been defined.  Defining it now.")
13613                    }
13614                    $table = $property->add_match_table($map);
13615                }
13616
13617                next if $table->complement != 0; # Don't need to populate these
13618                $table->add_range($range->start, $range->end);
13619            }
13620        }
13621
13622        # For Perl 5.6 compatibility, all properties matchable in regexes can
13623        # have an optional 'Is_' prefix.  This is now done in Unicode::UCD.
13624        # But warn if this creates a conflict with a (new) Unicode property
13625        # name, although it appears that Unicode has made a decision never to
13626        # begin a property name with 'Is_', so this shouldn't happen.
13627        foreach my $alias ($property->aliases) {
13628            my $Is_name = 'Is_' . $alias->name;
13629            if (defined (my $pre_existing = property_ref($Is_name))) {
13630                Carp::my_carp(<<END
13631There is already an alias named $Is_name (from " . $pre_existing . "), so
13632creating one for $property won't work.  This is bad news.  If it is not too
13633late, get Unicode to back off.  Otherwise go back to the old scheme (findable
13634from the git blame log for this area of the code that suppressed individual
13635aliases that conflict with the new Unicode names.  Proceeding anyway.
13636END
13637                );
13638            }
13639        } # End of loop through aliases for this property
13640
13641
13642        # Properties that have sets of values for some characters are now
13643        # converted.  For example, the Script_Extensions property started out
13644        # as a clone of the Script property.  But processing its data file
13645        # caused some elements to be replaced with different data.  (These
13646        # elements were for the Common and Inherited properties.)  This data
13647        # is a qw() list of all the scripts that the code points in the given
13648        # range are in.  An example line is:
13649        #
13650        # 060C          ; Arab Syrc Thaa # Po       ARABIC COMMA
13651        #
13652        # Code executed earlier has created a new match table named "Arab Syrc
13653        # Thaa" which contains 060C.  (The cloned table started out with this
13654        # code point mapping to "Common".)  Now we add 060C to each of the
13655        # Arab, Syrc, and Thaa match tables.  Then we delete the now spurious
13656        # "Arab Syrc Thaa" match table.  This is repeated for all these tables
13657        # and ranges.  The map data is retained in the map table for
13658        # reference, but the spurious match tables are deleted.
13659        my $format = $property->format;
13660        if (defined $format && $format eq $STRING_WHITE_SPACE_LIST) {
13661            foreach my $table ($property->tables) {
13662
13663                # Space separates the entries which should go in multiple
13664                # tables
13665                next unless $table->name =~ /\s/;
13666
13667                # The list of the entries, hence the names of the tables that
13668                # everything in this combo table should be added to.
13669                my @list = split /\s+/, $table->name;
13670
13671                # Add the entries from the combo table to each individual
13672                # table
13673                foreach my $individual (@list) {
13674                    my $existing_table = $property->table($individual);
13675
13676                    # This should only be necessary if this particular entry
13677                    # occurs only in combo with others.
13678                    $existing_table = $property->add_match_table($individual)
13679                                                unless defined $existing_table;
13680                    $existing_table += $table;
13681                }
13682                $property->delete_match_table($table);
13683            }
13684        }
13685    } # End of loop through all Unicode properties.
13686
13687    # Fill in the mappings that Unicode doesn't completely furnish.  First the
13688    # single letter major general categories.  If Unicode were to start
13689    # delivering the values, this would be redundant, but better that than to
13690    # try to figure out if should skip and not get it right.  Ths could happen
13691    # if a new major category were to be introduced, and the hard-coded test
13692    # wouldn't know about it.
13693    # This routine depends on the standard names for the general categories
13694    # being what it thinks they are, like 'Cn'.  The major categories are the
13695    # union of all the general category tables which have the same first
13696    # letters. eg. L = Lu + Lt + Ll + Lo + Lm
13697    foreach my $minor_table ($gc->tables) {
13698        my $minor_name = $minor_table->name;
13699        next if length $minor_name == 1;
13700        if (length $minor_name != 2) {
13701            Carp::my_carp_bug("Unexpected general category '$minor_name'.  Skipped.");
13702            next;
13703        }
13704
13705        my $major_name = uc(substr($minor_name, 0, 1));
13706        my $major_table = $gc->table($major_name);
13707        $major_table += $minor_table;
13708    }
13709
13710    # LC is Ll, Lu, and Lt.  (used to be L& or L_, but PropValueAliases.txt
13711    # defines it as LC)
13712    my $LC = $gc->table('LC');
13713    $LC->add_alias('L_', Status => $DISCOURAGED);   # For backwards...
13714    $LC->add_alias('L&', Status => $DISCOURAGED);   # compatibility.
13715
13716
13717    if ($LC->is_empty) { # Assume if not empty that Unicode has started to
13718                         # deliver the correct values in it
13719        $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
13720
13721        # Lt not in release 1.
13722        if (defined $gc->table('Lt')) {
13723            $LC += $gc->table('Lt');
13724            $gc->table('Lt')->set_caseless_equivalent($LC);
13725        }
13726    }
13727    $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
13728
13729    $gc->table('Ll')->set_caseless_equivalent($LC);
13730    $gc->table('Lu')->set_caseless_equivalent($LC);
13731
13732    # Make sure this assumption in perl core code is valid in this Unicode
13733    # release, with known exceptions
13734    foreach my $range (property_ref('Numeric-Type')->table('Decimal')->ranges) {
13735        next if $range->end - $range->start == 9;
13736        next if $range->start == 0x1D7CE;   # This whole range was added in 3.1
13737        next if $range->end == 0x19DA && $v_version eq v5.2.0;
13738        next if $range->end - $range->start < 9 && $v_version le 4.0.0;
13739        Carp::my_carp("Range $range unexpectedly doesn't contain 10"
13740                    . " decimal digits.  Code in regcomp.c assumes it does,"
13741                    . " and will have to be fixed.  Proceeding anyway.");
13742    }
13743
13744    # Mark the scx table as the parent of the corresponding sc table for those
13745    # which are identical.  This causes the pod for the script table to refer
13746    # to the corresponding scx one.  This is done after everything, so as to
13747    # wait until the tables are stabilized before checking for equivalency.
13748    if (defined $scx) {
13749        if (defined $pod_directory) {
13750            foreach my $table ($scx->tables) {
13751                my $plain_sc_equiv = $script->table($table->name);
13752                if ($table->matches_identically_to($plain_sc_equiv)) {
13753                    $plain_sc_equiv->set_equivalent_to($table, Related => 1);
13754                }
13755            }
13756        }
13757    }
13758
13759    return;
13760}
13761
13762sub pre_3_dot_1_Nl () {
13763
13764    # Return a range list for gc=nl for Unicode versions prior to 3.1, which
13765    # is when Unicode's became fully usable.  These code points were
13766    # determined by inspection and experimentation.  gc=nl is important for
13767    # certain Perl-extension properties that should be available in all
13768    # releases.
13769
13770    my $Nl = Range_List->new();
13771    if (defined (my $official = $gc->table('Nl'))) {
13772        $Nl += $official;
13773    }
13774    else {
13775        $Nl->add_range(0x2160, 0x2182);
13776        $Nl->add_range(0x3007, 0x3007);
13777        $Nl->add_range(0x3021, 0x3029);
13778    }
13779    $Nl->add_range(0xFE20, 0xFE23);
13780    $Nl->add_range(0x16EE, 0x16F0) if $v_version ge v3.0.0; # 3.0 was when
13781                                                            # these were added
13782    return $Nl;
13783}
13784
13785sub calculate_Assigned() {  # Set $Assigned to the gc != Cn code points; may be
13786                            # called before the Cn's are completely filled.
13787                            # Works on Unicodes earlier than ones that
13788                            # explicitly specify Cn.
13789    return if defined $Assigned;
13790
13791    if (! defined $gc || $gc->is_empty()) {
13792        Carp::my_carp_bug("calculate_Assigned() called before $gc is populated");
13793    }
13794
13795    $Assigned = $perl->add_match_table('Assigned',
13796                                Description  => "All assigned code points",
13797                                );
13798    while (defined (my $range = $gc->each_range())) {
13799        my $standard_value = standardize($range->value);
13800        next if $standard_value eq 'cn' || $standard_value eq 'unassigned';
13801        $Assigned->add_range($range->start, $range->end);
13802    }
13803}
13804
13805sub calculate_DI() {    # Set $DI to a Range_List equivalent to the
13806                        # Default_Ignorable_Code_Point property.  Works on
13807                        # Unicodes earlier than ones that explicitly specify
13808                        # DI.
13809    return if defined $DI;
13810
13811    if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) {
13812        $DI = $di->table('Y');
13813    }
13814    else {
13815        $DI = Range_List->new(Initialize => [ 0x180B .. 0x180D,
13816                                              0x2060 .. 0x206F,
13817                                              0xFE00 .. 0xFE0F,
13818                                              0xFFF0 .. 0xFFFB,
13819                                            ]);
13820        if ($v_version ge v2.0) {
13821            $DI += $gc->table('Cf')
13822                +  $gc->table('Cs');
13823
13824            # These are above the Unicode version 1 max
13825            $DI->add_range(0xE0000, 0xE0FFF);
13826        }
13827        $DI += $gc->table('Cc')
13828             - ord("\t")
13829             - utf8::unicode_to_native(0x0A)  # LINE FEED
13830             - utf8::unicode_to_native(0x0B)  # VERTICAL TAB
13831             - ord("\f")
13832             - utf8::unicode_to_native(0x0D)  # CARRIAGE RETURN
13833             - utf8::unicode_to_native(0x85); # NEL
13834    }
13835}
13836
13837sub calculate_NChar() {  # Create a Perl extension match table which is the
13838                         # same as the Noncharacter_Code_Point property, and
13839                         # set $NChar to point to it.  Works on Unicodes
13840                         # earlier than ones that explicitly specify NChar
13841    return if defined $NChar;
13842
13843    $NChar = $perl->add_match_table('_Perl_Nchar',
13844                                    Perl_Extension => 1,
13845                                    Fate => $INTERNAL_ONLY);
13846    if (defined (my $off_nchar = property_ref('NChar'))) {
13847        $NChar->initialize($off_nchar->table('Y'));
13848    }
13849    else {
13850        $NChar->initialize([ 0xFFFE .. 0xFFFF ]);
13851        if ($v_version ge v2.0) {   # First release with these nchars
13852            for (my $i = 0x1FFFE; $i <= 0x10FFFE; $i += 0x10000) {
13853                $NChar += [ $i .. $i+1 ];
13854            }
13855        }
13856    }
13857}
13858
13859sub handle_compare_versions () {
13860    # This fixes things up for the $compare_versions capability, where we
13861    # compare Unicode version X with version Y (with Y > X), and we are
13862    # running it on the Unicode Data for version Y.
13863    #
13864    # It works by calculating the code points whose meaning has been specified
13865    # after release X, by using the Age property.  The complement of this set
13866    # is the set of code points whose meaning is unchanged between the
13867    # releases.  This is the set the program restricts itself to.  It includes
13868    # everything whose meaning has been specified by the time version X came
13869    # along, plus those still unassigned by the time of version Y.  (We will
13870    # continue to use the word 'assigned' to mean 'meaning has been
13871    # specified', as it's shorter and is accurate in all cases except the
13872    # Noncharacter code points.)
13873    #
13874    # This function is run after all the properties specified by Unicode have
13875    # been calculated for release Y.  This makes sure we get all the nuances
13876    # of Y's rules.  (It is done before the Perl extensions are calculated, as
13877    # those are based entirely on the Unicode ones.)  But doing it after the
13878    # Unicode table calculations means we have to fix up the Unicode tables.
13879    # We do this by subtracting the code points that have been assigned since
13880    # X (which is actually done by ANDing each table of assigned code points
13881    # with the set of unchanged code points).  Most Unicode properties are of
13882    # the form such that all unassigned code points have a default, grab-bag,
13883    # property value which is changed when the code point gets assigned.  For
13884    # these, we just remove the changed code points from the table for the
13885    # latter property value, and add them back in to the grab-bag one.  A few
13886    # other properties are not entirely of this form and have values for some
13887    # or all unassigned code points that are not the grab-bag one.  These have
13888    # to be handled specially, and are hard-coded in to this routine based on
13889    # manual inspection of the Unicode character database.  A list of the
13890    # outlier code points is made for each of these properties, and those
13891    # outliers are excluded from adding and removing from tables.
13892    #
13893    # Note that there are glitches when comparing against Unicode 1.1, as some
13894    # Hangul syllables in it were later ripped out and eventually replaced
13895    # with other things.
13896
13897    print "Fixing up for version comparison\n" if $verbosity >= $PROGRESS;
13898
13899    my $after_first_version = "All matching code points were added after "
13900                            . "Unicode $string_compare_versions";
13901
13902    # Calculate the delta as those code points that have been newly assigned
13903    # since the first compare version.
13904    my $delta = Range_List->new();
13905    foreach my $table ($age->tables) {
13906        use version;
13907        next if $table == $age->table('Unassigned');
13908        next if version->parse($table->name)
13909             le version->parse($string_compare_versions);
13910        $delta += $table;
13911    }
13912    if ($delta->is_empty) {
13913        die ("No changes; perhaps you need a 'DAge.txt' file?");
13914    }
13915
13916    my $unchanged = ~ $delta;
13917
13918    calculate_Assigned() if ! defined $Assigned;
13919    $Assigned &= $unchanged;
13920
13921    # $Assigned now contains the code points that were assigned as of Unicode
13922    # version X.
13923
13924    # A block is all or nothing.  If nothing is assigned in it, it all goes
13925    # back to the No_Block pool; but if even one code point is assigned, the
13926    # block is retained.
13927    my $no_block = $block->table('No_Block');
13928    foreach my $this_block ($block->tables) {
13929        next if     $this_block == $no_block
13930                ||  ! ($this_block & $Assigned)->is_empty;
13931        $this_block->set_fate($SUPPRESSED, $after_first_version);
13932        foreach my $range ($this_block->ranges) {
13933            $block->replace_map($range->start, $range->end, 'No_Block')
13934        }
13935        $no_block += $this_block;
13936    }
13937
13938    my @special_delta_properties;   # List of properties that have to be
13939                                    # handled specially.
13940    my %restricted_delta;           # Keys are the entries in
13941                                    # @special_delta_properties;  values
13942                                    # are the range list of the code points
13943                                    # that behave normally when they get
13944                                    # assigned.
13945
13946    # In the next three properties, the Default Ignorable code points are
13947    # outliers.
13948    calculate_DI();
13949    $DI &= $unchanged;
13950
13951    push @special_delta_properties, property_ref('_Perl_GCB');
13952    $restricted_delta{$special_delta_properties[-1]} = ~ $DI;
13953
13954    if (defined (my $cwnfkcc = property_ref('Changes_When_NFKC_Casefolded')))
13955    {
13956        push @special_delta_properties, $cwnfkcc;
13957        $restricted_delta{$special_delta_properties[-1]} = ~ $DI;
13958    }
13959
13960    calculate_NChar();      # Non-character code points
13961    $NChar &= $unchanged;
13962
13963    # This may have to be updated from time-to-time to get the most accurate
13964    # results.
13965    my $default_BC_non_LtoR = Range_List->new(Initialize =>
13966                        # These came from the comments in v8.0 DBidiClass.txt
13967                                                        [ # AL
13968                                                            0x0600 .. 0x07BF,
13969                                                            0x08A0 .. 0x08FF,
13970                                                            0xFB50 .. 0xFDCF,
13971                                                            0xFDF0 .. 0xFDFF,
13972                                                            0xFE70 .. 0xFEFF,
13973                                                            0x1EE00 .. 0x1EEFF,
13974                                                           # R
13975                                                            0x0590 .. 0x05FF,
13976                                                            0x07C0 .. 0x089F,
13977                                                            0xFB1D .. 0xFB4F,
13978                                                            0x10800 .. 0x10FFF,
13979                                                            0x1E800 .. 0x1EDFF,
13980                                                            0x1EF00 .. 0x1EFFF,
13981                                                           # ET
13982                                                            0x20A0 .. 0x20CF,
13983                                                         ]
13984                                          );
13985    $default_BC_non_LtoR += $DI + $NChar;
13986    push @special_delta_properties, property_ref('BidiClass');
13987    $restricted_delta{$special_delta_properties[-1]} = ~ $default_BC_non_LtoR;
13988
13989    if (defined (my $eaw = property_ref('East_Asian_Width'))) {
13990
13991        my $default_EA_width_W = Range_List->new(Initialize =>
13992                                    # From comments in v8.0 EastAsianWidth.txt
13993                                                [
13994                                                    0x3400 .. 0x4DBF,
13995                                                    0x4E00 .. 0x9FFF,
13996                                                    0xF900 .. 0xFAFF,
13997                                                    0x20000 .. 0x2A6DF,
13998                                                    0x2A700 .. 0x2B73F,
13999                                                    0x2B740 .. 0x2B81F,
14000                                                    0x2B820 .. 0x2CEAF,
14001                                                    0x2F800 .. 0x2FA1F,
14002                                                    0x20000 .. 0x2FFFD,
14003                                                    0x30000 .. 0x3FFFD,
14004                                                ]
14005                                             );
14006        push @special_delta_properties, $eaw;
14007        $restricted_delta{$special_delta_properties[-1]}
14008                                                       = ~ $default_EA_width_W;
14009
14010        # Line break came along in the same release as East_Asian_Width, and
14011        # the non-grab-bag default set is a superset of the EAW one.
14012        if (defined (my $lb = property_ref('Line_Break'))) {
14013            my $default_LB_non_XX = Range_List->new(Initialize =>
14014                                        # From comments in v8.0 LineBreak.txt
14015                                                        [ 0x20A0 .. 0x20CF ]);
14016            $default_LB_non_XX += $default_EA_width_W;
14017            push @special_delta_properties, $lb;
14018            $restricted_delta{$special_delta_properties[-1]}
14019                                                        = ~ $default_LB_non_XX;
14020        }
14021    }
14022
14023    # Go through every property, skipping those we've already worked on, those
14024    # that are immutable, and the perl ones that will be calculated after this
14025    # routine has done its fixup.
14026    foreach my $property (property_ref('*')) {
14027        next if    $property == $perl     # Done later in the program
14028                || $property == $block    # Done just above
14029                || $property == $DI       # Done just above
14030                || $property == $NChar    # Done just above
14031
14032                   # The next two are invariant across Unicode versions
14033                || $property == property_ref('Pattern_Syntax')
14034                || $property == property_ref('Pattern_White_Space');
14035
14036        #  Find the grab-bag value.
14037        my $default_map = $property->default_map;
14038
14039        if (! $property->to_create_match_tables) {
14040
14041            # Here there aren't any match tables.  So far, all such properties
14042            # have a default map, and don't require special handling.  Just
14043            # change each newly assigned code point back to the default map,
14044            # as if they were unassigned.
14045            foreach my $range ($delta->ranges) {
14046                $property->add_map($range->start,
14047                                $range->end,
14048                                $default_map,
14049                                Replace => $UNCONDITIONALLY);
14050            }
14051        }
14052        else {  # Here there are match tables.  Find the one (if any) for the
14053                # grab-bag value that unassigned code points go to.
14054            my $default_table;
14055            if (defined $default_map) {
14056                $default_table = $property->table($default_map);
14057            }
14058
14059            # If some code points don't go back to the grab-bag when they
14060            # are considered unassigned, exclude them from the list that does
14061            # that.
14062            my $this_delta = $delta;
14063            my $this_unchanged = $unchanged;
14064            if (grep { $_ == $property } @special_delta_properties) {
14065                $this_delta = $delta & $restricted_delta{$property};
14066                $this_unchanged = ~ $this_delta;
14067            }
14068
14069            # Fix up each match table for this property.
14070            foreach my $table ($property->tables) {
14071                if (defined $default_table && $table == $default_table) {
14072
14073                    # The code points assigned after release X (the ones we
14074                    # are excluding in this routine) go back on to the default
14075                    # (grab-bag) table.  However, some of these tables don't
14076                    # actually exist, but are specified solely by the other
14077                    # tables.  (In a binary property, we don't need to
14078                    # actually have an 'N' table, as it's just the complement
14079                    # of the 'Y' table.)  Such tables will be locked, so just
14080                    # skip those.
14081                    $table += $this_delta unless $table->locked;
14082                }
14083                else {
14084
14085                    # Here the table is not for the default value.  We need to
14086                    # subtract the code points we are ignoring for this
14087                    # comparison (the deltas) from it.  But if the table
14088                    # started out with nothing, no need to exclude anything,
14089                    # and want to skip it here anyway, so it gets listed
14090                    # properly in the pod.
14091                    next if $table->is_empty;
14092
14093                    # Save the deltas for later, before we do the subtraction
14094                    my $deltas = $table & $this_delta;
14095
14096                    $table &= $this_unchanged;
14097
14098                    # Suppress the table if the subtraction left it with
14099                    # nothing in it
14100                    if ($table->is_empty) {
14101                        if ($property->type == $BINARY) {
14102                            push @tables_that_may_be_empty, $table->complete_name;
14103                        }
14104                        else {
14105                            $table->set_fate($SUPPRESSED, $after_first_version);
14106                        }
14107                    }
14108
14109                    # Now we add the removed code points to the property's
14110                    # map, as they should now map to the grab-bag default
14111                    # property (which they did in the first comparison
14112                    # version).  But we don't have to do this if the map is
14113                    # only for internal use.
14114                    if (defined $default_map && $property->to_output_map) {
14115
14116                        # The gc property has pseudo property values whose names
14117                        # have length 1.  These are the union of all the
14118                        # property values whose name is longer than 1 and
14119                        # whose first letter is all the same.  The replacement
14120                        # is done once for the longer-named tables.
14121                        next if $property == $gc && length $table->name == 1;
14122
14123                        foreach my $range ($deltas->ranges) {
14124                            $property->add_map($range->start,
14125                                            $range->end,
14126                                            $default_map,
14127                                            Replace => $UNCONDITIONALLY);
14128                        }
14129                    }
14130                }
14131            }
14132        }
14133    }
14134
14135    # The above code doesn't work on 'gc=C', as it is a superset of the default
14136    # ('Cn') table.  It's easiest to just special case it here.
14137    my $C = $gc->table('C');
14138    $C += $gc->table('Cn');
14139
14140    return;
14141}
14142
14143sub compile_perl() {
14144    # Create perl-defined tables.  Almost all are part of the pseudo-property
14145    # named 'perl' internally to this program.  Many of these are recommended
14146    # in UTS#18 "Unicode Regular Expressions", and their derivations are based
14147    # on those found there.
14148    # Almost all of these are equivalent to some Unicode property.
14149    # A number of these properties have equivalents restricted to the ASCII
14150    # range, with their names prefaced by 'Posix', to signify that these match
14151    # what the Posix standard says they should match.  A couple are
14152    # effectively this, but the name doesn't have 'Posix' in it because there
14153    # just isn't any Posix equivalent.  'XPosix' are the Posix tables extended
14154    # to the full Unicode range, by our guesses as to what is appropriate.
14155
14156    # 'All' is all code points.  As an error check, instead of just setting it
14157    # to be that, construct it to be the union of all the major categories
14158    $All = $perl->add_match_table('All',
14159      Description
14160        => "All code points, including those above Unicode.  Same as qr/./s",
14161      Matches_All => 1);
14162
14163    foreach my $major_table ($gc->tables) {
14164
14165        # Major categories are the ones with single letter names.
14166        next if length($major_table->name) != 1;
14167
14168        $All += $major_table;
14169    }
14170
14171    if ($All->max != $MAX_WORKING_CODEPOINT) {
14172        Carp::my_carp_bug("Generated highest code point ("
14173           . sprintf("%X", $All->max)
14174           . ") doesn't match expected value $MAX_WORKING_CODEPOINT_STRING.")
14175    }
14176    if ($All->range_count != 1 || $All->min != 0) {
14177     Carp::my_carp_bug("Generated table 'All' doesn't match all code points.")
14178    }
14179
14180    my $Any = $perl->add_match_table('Any',
14181                                    Description  => "All Unicode code points");
14182    $Any->add_range(0, $MAX_UNICODE_CODEPOINT);
14183    $Any->add_alias('Unicode');
14184
14185    calculate_Assigned();
14186
14187    my $ASCII = $perl->add_match_table('ASCII');
14188    if (defined $block) {   # This is equivalent to the block if have it.
14189        my $Unicode_ASCII = $block->table('Basic_Latin');
14190        if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
14191            $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
14192        }
14193    }
14194
14195    # Very early releases didn't have blocks, so initialize ASCII ourselves if
14196    # necessary
14197    if ($ASCII->is_empty) {
14198        if (! NON_ASCII_PLATFORM) {
14199            $ASCII->add_range(0, 127);
14200        }
14201        else {
14202            for my $i (0 .. 127) {
14203                $ASCII->add_range(utf8::unicode_to_native($i),
14204                                  utf8::unicode_to_native($i));
14205            }
14206        }
14207    }
14208
14209    # Get the best available case definitions.  Early Unicode versions didn't
14210    # have Uppercase and Lowercase defined, so use the general category
14211    # instead for them, modified by hard-coding in the code points each is
14212    # missing.
14213    my $Lower = $perl->add_match_table('XPosixLower');
14214    my $Unicode_Lower = property_ref('Lowercase');
14215    if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
14216        $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
14217
14218    }
14219    else {
14220        $Lower += $gc->table('Lowercase_Letter');
14221
14222        # There are quite a few code points in Lower, that aren't in gc=lc,
14223        # and not all are in all releases.
14224        my $temp = Range_List->new(Initialize => [
14225                                                utf8::unicode_to_native(0xAA),
14226                                                utf8::unicode_to_native(0xBA),
14227                                                0x02B0 .. 0x02B8,
14228                                                0x02C0 .. 0x02C1,
14229                                                0x02E0 .. 0x02E4,
14230                                                0x0345,
14231                                                0x037A,
14232                                                0x1D2C .. 0x1D6A,
14233                                                0x1D78,
14234                                                0x1D9B .. 0x1DBF,
14235                                                0x2071,
14236                                                0x207F,
14237                                                0x2090 .. 0x209C,
14238                                                0x2170 .. 0x217F,
14239                                                0x24D0 .. 0x24E9,
14240                                                0x2C7C .. 0x2C7D,
14241                                                0xA770,
14242                                                0xA7F8 .. 0xA7F9,
14243                                ]);
14244        $Lower += $temp & $Assigned;
14245    }
14246    my $Posix_Lower = $perl->add_match_table("PosixLower",
14247                            Initialize => $Lower & $ASCII,
14248                            );
14249
14250    my $Upper = $perl->add_match_table("XPosixUpper");
14251    my $Unicode_Upper = property_ref('Uppercase');
14252    if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
14253        $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
14254    }
14255    else {
14256
14257        # Unlike Lower, there are only two ranges in Upper that aren't in
14258        # gc=Lu, and all code points were assigned in all releases.
14259        $Upper += $gc->table('Uppercase_Letter');
14260        $Upper->add_range(0x2160, 0x216F);  # Uppercase Roman numerals
14261        $Upper->add_range(0x24B6, 0x24CF);  # Circled Latin upper case letters
14262    }
14263    my $Posix_Upper = $perl->add_match_table("PosixUpper",
14264                            Initialize => $Upper & $ASCII,
14265                            );
14266
14267    # Earliest releases didn't have title case.  Initialize it to empty if not
14268    # otherwise present
14269    my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
14270                                       Description => '(= \p{Gc=Lt})');
14271    my $lt = $gc->table('Lt');
14272
14273    # Earlier versions of mktables had this related to $lt since they have
14274    # identical code points, but their caseless equivalents are not the same,
14275    # one being 'Cased' and the other being 'LC', and so now must be kept as
14276    # separate entities.
14277    if (defined $lt) {
14278        $Title += $lt;
14279    }
14280    else {
14281        push @tables_that_may_be_empty, $Title->complete_name;
14282    }
14283
14284    my $Unicode_Cased = property_ref('Cased');
14285    if (defined $Unicode_Cased) {
14286        my $yes = $Unicode_Cased->table('Y');
14287        my $no = $Unicode_Cased->table('N');
14288        $Title->set_caseless_equivalent($yes);
14289        if (defined $Unicode_Upper) {
14290            $Unicode_Upper->table('Y')->set_caseless_equivalent($yes);
14291            $Unicode_Upper->table('N')->set_caseless_equivalent($no);
14292        }
14293        $Upper->set_caseless_equivalent($yes);
14294        if (defined $Unicode_Lower) {
14295            $Unicode_Lower->table('Y')->set_caseless_equivalent($yes);
14296            $Unicode_Lower->table('N')->set_caseless_equivalent($no);
14297        }
14298        $Lower->set_caseless_equivalent($yes);
14299    }
14300    else {
14301        # If this Unicode version doesn't have Cased, set up the Perl
14302        # extension from first principles.  From Unicode 5.1: Definition D120:
14303        # A character C is defined to be cased if and only if C has the
14304        # Lowercase or Uppercase property or has a General_Category value of
14305        # Titlecase_Letter.
14306        my $cased = $perl->add_match_table('Cased',
14307                        Initialize => $Lower + $Upper + $Title,
14308                        Description => 'Uppercase or Lowercase or Titlecase',
14309                        );
14310        # $notcased is purely for the caseless equivalents below
14311        my $notcased = $perl->add_match_table('_Not_Cased',
14312                                Initialize => ~ $cased,
14313                                Fate => $INTERNAL_ONLY,
14314                                Description => 'All not-cased code points');
14315        $Title->set_caseless_equivalent($cased);
14316        if (defined $Unicode_Upper) {
14317            $Unicode_Upper->table('Y')->set_caseless_equivalent($cased);
14318            $Unicode_Upper->table('N')->set_caseless_equivalent($notcased);
14319        }
14320        $Upper->set_caseless_equivalent($cased);
14321        if (defined $Unicode_Lower) {
14322            $Unicode_Lower->table('Y')->set_caseless_equivalent($cased);
14323            $Unicode_Lower->table('N')->set_caseless_equivalent($notcased);
14324        }
14325        $Lower->set_caseless_equivalent($cased);
14326    }
14327
14328    # The remaining perl defined tables are mostly based on Unicode TR 18,
14329    # "Annex C: Compatibility Properties".  All of these have two versions,
14330    # one whose name generally begins with Posix that is posix-compliant, and
14331    # one that matches Unicode characters beyond the Posix, ASCII range
14332
14333    my $Alpha = $perl->add_match_table('XPosixAlpha');
14334
14335    # Alphabetic was not present in early releases
14336    my $Alphabetic = property_ref('Alphabetic');
14337    if (defined $Alphabetic && ! $Alphabetic->is_empty) {
14338        $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
14339    }
14340    else {
14341
14342        # The Alphabetic property doesn't exist for early releases, so
14343        # generate it.  The actual definition, in 5.2 terms is:
14344        #
14345        # gc=L + gc=Nl + Other_Alphabetic
14346        #
14347        # Other_Alphabetic is also not defined in these early releases, but it
14348        # contains one gc=So range plus most of gc=Mn and gc=Mc, so we add
14349        # those last two as well, then subtract the relatively few of them that
14350        # shouldn't have been added.  (The gc=So range is the circled capital
14351        # Latin characters.  Early releases mistakenly didn't also include the
14352        # lower-case versions of these characters, and so we don't either, to
14353        # maintain consistency with those releases that first had this
14354        # property.
14355        $Alpha->initialize($gc->table('Letter')
14356                           + pre_3_dot_1_Nl()
14357                           + $gc->table('Mn')
14358                           + $gc->table('Mc')
14359                        );
14360        $Alpha->add_range(0x24D0, 0x24E9);  # gc=So
14361        foreach my $range (     [ 0x0300, 0x0344 ],
14362                                [ 0x0346, 0x034E ],
14363                                [ 0x0360, 0x0362 ],
14364                                [ 0x0483, 0x0486 ],
14365                                [ 0x0591, 0x05AF ],
14366                                [ 0x06DF, 0x06E0 ],
14367                                [ 0x06EA, 0x06EC ],
14368                                [ 0x0740, 0x074A ],
14369                                0x093C,
14370                                0x094D,
14371                                [ 0x0951, 0x0954 ],
14372                                0x09BC,
14373                                0x09CD,
14374                                0x0A3C,
14375                                0x0A4D,
14376                                0x0ABC,
14377                                0x0ACD,
14378                                0x0B3C,
14379                                0x0B4D,
14380                                0x0BCD,
14381                                0x0C4D,
14382                                0x0CCD,
14383                                0x0D4D,
14384                                0x0DCA,
14385                                [ 0x0E47, 0x0E4C ],
14386                                0x0E4E,
14387                                [ 0x0EC8, 0x0ECC ],
14388                                [ 0x0F18, 0x0F19 ],
14389                                0x0F35,
14390                                0x0F37,
14391                                0x0F39,
14392                                [ 0x0F3E, 0x0F3F ],
14393                                [ 0x0F82, 0x0F84 ],
14394                                [ 0x0F86, 0x0F87 ],
14395                                0x0FC6,
14396                                0x1037,
14397                                0x1039,
14398                                [ 0x17C9, 0x17D3 ],
14399                                [ 0x20D0, 0x20DC ],
14400                                0x20E1,
14401                                [ 0x302A, 0x302F ],
14402                                [ 0x3099, 0x309A ],
14403                                [ 0xFE20, 0xFE23 ],
14404                                [ 0x1D165, 0x1D169 ],
14405                                [ 0x1D16D, 0x1D172 ],
14406                                [ 0x1D17B, 0x1D182 ],
14407                                [ 0x1D185, 0x1D18B ],
14408                                [ 0x1D1AA, 0x1D1AD ],
14409        ) {
14410            if (ref $range) {
14411                $Alpha->delete_range($range->[0], $range->[1]);
14412            }
14413            else {
14414                $Alpha->delete_range($range, $range);
14415            }
14416        }
14417        $Alpha->add_description('Alphabetic');
14418        $Alpha->add_alias('Alphabetic');
14419    }
14420    my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
14421                            Initialize => $Alpha & $ASCII,
14422                            );
14423    $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
14424    $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
14425
14426    my $Alnum = $perl->add_match_table('Alnum', Full_Name => 'XPosixAlnum',
14427                        Description => 'Alphabetic and (decimal) Numeric',
14428                        Initialize => $Alpha + $gc->table('Decimal_Number'),
14429                        );
14430    $perl->add_match_table("PosixAlnum",
14431                            Initialize => $Alnum & $ASCII,
14432                            );
14433
14434    my $Word = $perl->add_match_table('Word', Full_Name => 'XPosixWord',
14435                                Description => '\w, including beyond ASCII;'
14436                                            . ' = \p{Alnum} + \pM + \p{Pc}'
14437                                            . ' + \p{Join_Control}',
14438                                Initialize => $Alnum + $gc->table('Mark'),
14439                                );
14440    my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
14441    if (defined $Pc) {
14442        $Word += $Pc;
14443    }
14444    else {
14445        $Word += ord('_');  # Make sure this is a $Word
14446    }
14447    my $JC = property_ref('Join_Control');  # Wasn't in release 1
14448    if (defined $JC) {
14449        $Word += $JC->table('Y');
14450    }
14451    else {
14452        $Word += 0x200C + 0x200D;
14453    }
14454
14455    # This is a Perl extension, so the name doesn't begin with Posix.
14456    my $PerlWord = $perl->add_match_table('PosixWord',
14457                    Description => '\w, restricted to ASCII',
14458                    Initialize => $Word & $ASCII,
14459                    );
14460    $PerlWord->add_alias('PerlWord');
14461
14462    my $Blank = $perl->add_match_table('Blank', Full_Name => 'XPosixBlank',
14463                                Description => '\h, Horizontal white space',
14464
14465                                # 200B is Zero Width Space which is for line
14466                                # break control, and was listed as
14467                                # Space_Separator in early releases
14468                                Initialize => $gc->table('Space_Separator')
14469                                            +   ord("\t")
14470                                            -   0x200B, # ZWSP
14471                                );
14472    $Blank->add_alias('HorizSpace');        # Another name for it.
14473    $perl->add_match_table("PosixBlank",
14474                            Initialize => $Blank & $ASCII,
14475                            );
14476
14477    my $VertSpace = $perl->add_match_table('VertSpace',
14478                            Description => '\v',
14479                            Initialize =>
14480                               $gc->table('Line_Separator')
14481                             + $gc->table('Paragraph_Separator')
14482                             + utf8::unicode_to_native(0x0A)  # LINE FEED
14483                             + utf8::unicode_to_native(0x0B)  # VERTICAL TAB
14484                             + ord("\f")
14485                             + utf8::unicode_to_native(0x0D)  # CARRIAGE RETURN
14486                             + utf8::unicode_to_native(0x85)  # NEL
14487                    );
14488    # No Posix equivalent for vertical space
14489
14490    my $Space = $perl->add_match_table('XPosixSpace',
14491                Description => '\s including beyond ASCII and vertical tab',
14492                Initialize => $Blank + $VertSpace,
14493    );
14494    $Space->add_alias('XPerlSpace');    # Pre-existing synonyms
14495    $Space->add_alias('SpacePerl');
14496    $Space->add_alias('Space') if $v_version lt v4.1.0;
14497
14498    my $Posix_space = $perl->add_match_table("PosixSpace",
14499                            Initialize => $Space & $ASCII,
14500                            );
14501    $Posix_space->add_alias('PerlSpace'); # A pre-existing synonym
14502
14503    my $Cntrl = $perl->add_match_table('Cntrl', Full_Name => 'XPosixCntrl',
14504                                        Description => 'Control characters');
14505    $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
14506    $perl->add_match_table("PosixCntrl",
14507                            Description => "ASCII control characters",
14508                            Definition =>  "ACK, BEL, BS, CAN, CR, DC1, DC2,"
14509                                         . " DC3, DC4, DEL, DLE, ENQ, EOM,"
14510                                         . " EOT, ESC, ETB, ETX, FF, FS, GS,"
14511                                         . " HT, LF, NAK, NUL, RS, SI, SO,"
14512                                         . " SOH, STX, SUB, SYN, US, VT",
14513                            Initialize => $Cntrl & $ASCII,
14514                            );
14515
14516    my $perl_surrogate = $perl->add_match_table('_Perl_Surrogate');
14517    my $Cs = $gc->table('Cs');
14518    if (defined $Cs && ! $Cs->is_empty) {
14519        $perl_surrogate += $Cs;
14520    }
14521    else {
14522        push @tables_that_may_be_empty, '_Perl_Surrogate';
14523    }
14524
14525    # $controls is a temporary used to construct Graph.
14526    my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
14527                                                + $gc->table('Control')
14528                                                + $perl_surrogate);
14529
14530    # Graph is  ~space &  ~(Cc|Cs|Cn) = ~(space + $controls)
14531    my $Graph = $perl->add_match_table('Graph', Full_Name => 'XPosixGraph',
14532                        Description => 'Characters that are graphical',
14533                        Initialize => ~ ($Space + $controls),
14534                        );
14535    $perl->add_match_table("PosixGraph",
14536                            Initialize => $Graph & $ASCII,
14537                            );
14538
14539    $print = $perl->add_match_table('Print', Full_Name => 'XPosixPrint',
14540                        Description => 'Characters that are graphical plus space characters (but no controls)',
14541                        Initialize => $Blank + $Graph - $gc->table('Control'),
14542                        );
14543    $perl->add_match_table("PosixPrint",
14544                            Initialize => $print & $ASCII,
14545                            );
14546
14547    my $Punct = $perl->add_match_table('Punct');
14548    $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
14549
14550    # \p{punct} doesn't include the symbols, which posix does
14551    my $XPosixPunct = $perl->add_match_table('XPosixPunct',
14552                    Description => '\p{Punct} + ASCII-range \p{Symbol}',
14553                    Initialize => $gc->table('Punctuation')
14554                                + ($ASCII & $gc->table('Symbol')),
14555                                Perl_Extension => 1
14556        );
14557    $perl->add_match_table('PosixPunct', Perl_Extension => 1,
14558        Initialize => $ASCII & $XPosixPunct,
14559        );
14560
14561    my $Digit = $perl->add_match_table('Digit', Full_Name => 'XPosixDigit',
14562                            Description => '[0-9] + all other decimal digits');
14563    $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
14564    my $PosixDigit = $perl->add_match_table("PosixDigit",
14565                                            Initialize => $Digit & $ASCII,
14566                                            );
14567
14568    # Hex_Digit was not present in first release
14569    my $Xdigit = $perl->add_match_table('XDigit', Full_Name => 'XPosixXDigit');
14570    my $Hex = property_ref('Hex_Digit');
14571    if (defined $Hex && ! $Hex->is_empty) {
14572        $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
14573    }
14574    else {
14575        $Xdigit->initialize([ ord('0') .. ord('9'),
14576                              ord('A') .. ord('F'),
14577                              ord('a') .. ord('f'),
14578                              0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
14579    }
14580
14581    # AHex was not present in early releases
14582    my $PosixXDigit = $perl->add_match_table('PosixXDigit');
14583    my $AHex = property_ref('ASCII_Hex_Digit');
14584    if (defined $AHex && ! $AHex->is_empty) {
14585        $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
14586    }
14587    else {
14588        $PosixXDigit->initialize($Xdigit & $ASCII);
14589        $PosixXDigit->add_alias('AHex');
14590        $PosixXDigit->add_alias('Ascii_Hex_Digit');
14591    }
14592
14593    my $any_folds = $perl->add_match_table("_Perl_Any_Folds",
14594                    Description => "Code points that particpate in some fold",
14595                    );
14596    my $loc_problem_folds = $perl->add_match_table(
14597               "_Perl_Problematic_Locale_Folds",
14598               Description =>
14599                   "Code points that are in some way problematic under locale",
14600    );
14601
14602    # This allows regexec.c to skip some work when appropriate.  Some of the
14603    # entries in _Perl_Problematic_Locale_Folds are multi-character folds,
14604    my $loc_problem_folds_start = $perl->add_match_table(
14605               "_Perl_Problematic_Locale_Foldeds_Start",
14606               Description =>
14607                   "The first character of every sequence in _Perl_Problematic_Locale_Folds",
14608    );
14609
14610    my $cf = property_ref('Case_Folding');
14611
14612    # Every character 0-255 is problematic because what each folds to depends
14613    # on the current locale
14614    $loc_problem_folds->add_range(0, 255);
14615    $loc_problem_folds->add_range(0x130, 0x131);    # These are problematic in
14616                                                    # Turkic locales
14617    $loc_problem_folds_start += $loc_problem_folds;
14618
14619    # Also problematic are anything these fold to outside the range.  Likely
14620    # forever the only thing folded to by these outside the 0-255 range is the
14621    # GREEK SMALL MU (from the MICRO SIGN), but it's easy to make the code
14622    # completely general, which should catch any unexpected changes or errors.
14623    # We look at each code point 0-255, and add its fold (including each part
14624    # of a multi-char fold) to the list.  See commit message
14625    # 31f05a37c4e9c37a7263491f2fc0237d836e1a80 for a more complete description
14626    # of the MU issue.
14627    foreach my $range ($loc_problem_folds->ranges) {
14628        foreach my $code_point ($range->start .. $range->end) {
14629            my $fold_range = $cf->containing_range($code_point);
14630            next unless defined $fold_range;
14631
14632            # Skip if folds to itself
14633            next if $fold_range->value eq $CODE_POINT;
14634
14635            my @hex_folds = split " ", $fold_range->value;
14636            my $start_cp = $hex_folds[0];
14637            next if $start_cp eq $CODE_POINT;
14638            $start_cp = hex $start_cp;
14639            foreach my $i (0 .. @hex_folds - 1) {
14640                my $cp = $hex_folds[$i];
14641                next if $cp eq $CODE_POINT;
14642                $cp = hex $cp;
14643                next unless $cp > 255;    # Already have the < 256 ones
14644
14645                $loc_problem_folds->add_range($cp, $cp);
14646                $loc_problem_folds_start->add_range($start_cp, $start_cp);
14647            }
14648        }
14649    }
14650
14651    my $folds_to_multi_char = $perl->add_match_table(
14652         "_Perl_Folds_To_Multi_Char",
14653         Description =>
14654              "Code points whose fold is a string of more than one character",
14655    );
14656    my $in_multi_fold = $perl->add_match_table(
14657               "_Perl_Is_In_Multi_Char_Fold",
14658               Description =>
14659                   "Code points that are in some multiple character fold",
14660    );
14661    if ($v_version lt v3.0.1) {
14662        push @tables_that_may_be_empty, '_Perl_Folds_To_Multi_Char',
14663                                        '_Perl_Is_In_Multi_Char_Fold',
14664                                        '_Perl_Non_Final_Folds';
14665    }
14666
14667    # Look through all the known folds to populate these tables.
14668    foreach my $range ($cf->ranges) {
14669        next if $range->value eq $CODE_POINT;
14670        my $start = $range->start;
14671        my $end = $range->end;
14672        $any_folds->add_range($start, $end);
14673
14674        my @hex_folds = split " ", $range->value;
14675        if (@hex_folds > 1) {   # Is multi-char fold
14676            $folds_to_multi_char->add_range($start, $end);
14677        }
14678
14679        my $found_locale_problematic = 0;
14680
14681        my $folded_count = @hex_folds;
14682        if ($folded_count > 3) {
14683            die Carp::my_carp("Maximum number of characters in a fold should be 3: Instead, it's  $folded_count for U+" . sprintf "%04X", $range->start);
14684        }
14685
14686        # Look at each of the folded-to characters...
14687        foreach my $i (1 .. $folded_count) {
14688            my $cp = hex $hex_folds[$i-1];
14689            $any_folds->add_range($cp, $cp);
14690
14691            # The fold is problematic if any of the folded-to characters is
14692            # already considered problematic.
14693            if ($loc_problem_folds->contains($cp)) {
14694                $loc_problem_folds->add_range($start, $end);
14695                $found_locale_problematic = 1;
14696            }
14697
14698            if ($folded_count > 1) {
14699                $in_multi_fold->add_range($cp, $cp);
14700            }
14701        }
14702
14703        # If this is a problematic fold, add to the start chars the
14704        # folding-from characters and first folded-to character.
14705        if ($found_locale_problematic) {
14706            $loc_problem_folds_start->add_range($start, $end);
14707            my $cp = hex $hex_folds[0];
14708            $loc_problem_folds_start->add_range($cp, $cp);
14709        }
14710    }
14711
14712    my $dt = property_ref('Decomposition_Type');
14713    $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
14714        Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
14715        Perl_Extension => 1,
14716        Note => 'Union of all non-canonical decompositions',
14717        );
14718
14719    # For backward compatibility, Perl has its own definition for IDStart.
14720    # It is regular XID_Start plus the underscore, but all characters must be
14721    # Word characters as well
14722    my $XID_Start = property_ref('XID_Start');
14723    my $perl_xids = $perl->add_match_table('_Perl_IDStart',
14724                                            Perl_Extension => 1,
14725                                            Fate => $INTERNAL_ONLY,
14726                                            Initialize => ord('_')
14727                                            );
14728    if (defined $XID_Start
14729        || defined ($XID_Start = property_ref('ID_Start')))
14730    {
14731        $perl_xids += $XID_Start->table('Y');
14732    }
14733    else {
14734        # For Unicode versions that don't have the property, construct our own
14735        # from first principles.  The actual definition is:
14736        #     Letters
14737        #   + letter numbers (Nl)
14738        #   - Pattern_Syntax
14739        #   - Pattern_White_Space
14740        #   + stability extensions
14741        #   - NKFC modifications
14742        #
14743        # What we do in the code below is to include the identical code points
14744        # that are in the first release that had Unicode's version of this
14745        # property, essentially extrapolating backwards.  There were no
14746        # stability extensions until v4.1, so none are included; likewise in
14747        # no Unicode version so far do subtracting PatSyn and PatWS make any
14748        # difference, so those also are ignored.
14749        $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl();
14750
14751        # We do subtract the NFKC modifications that are in the first version
14752        # that had this property.  We don't bother to test if they are in the
14753        # version in question, because if they aren't, the operation is a
14754        # no-op.  The NKFC modifications are discussed in
14755        # http://www.unicode.org/reports/tr31/#NFKC_Modifications
14756        foreach my $range ( 0x037A,
14757                            0x0E33,
14758                            0x0EB3,
14759                            [ 0xFC5E, 0xFC63 ],
14760                            [ 0xFDFA, 0xFE70 ],
14761                            [ 0xFE72, 0xFE76 ],
14762                            0xFE78,
14763                            0xFE7A,
14764                            0xFE7C,
14765                            0xFE7E,
14766                            [ 0xFF9E, 0xFF9F ],
14767        ) {
14768            if (ref $range) {
14769                $perl_xids->delete_range($range->[0], $range->[1]);
14770            }
14771            else {
14772                $perl_xids->delete_range($range, $range);
14773            }
14774        }
14775    }
14776
14777    $perl_xids &= $Word;
14778
14779    my $perl_xidc = $perl->add_match_table('_Perl_IDCont',
14780                                        Perl_Extension => 1,
14781                                        Fate => $INTERNAL_ONLY);
14782    my $XIDC = property_ref('XID_Continue');
14783    if (defined $XIDC
14784        || defined ($XIDC = property_ref('ID_Continue')))
14785    {
14786        $perl_xidc += $XIDC->table('Y');
14787    }
14788    else {
14789        # Similarly, we construct our own XIDC if necessary for early Unicode
14790        # versions.  The definition is:
14791        #     everything in XIDS
14792        #   + Gc=Mn
14793        #   + Gc=Mc
14794        #   + Gc=Nd
14795        #   + Gc=Pc
14796        #   - Pattern_Syntax
14797        #   - Pattern_White_Space
14798        #   + stability extensions
14799        #   - NFKC modifications
14800        #
14801        # The same thing applies to this as with XIDS for the PatSyn, PatWS,
14802        # and stability extensions.  There is a somewhat different set of NFKC
14803        # mods to remove (and add in this case).  The ones below make this
14804        # have identical code points as in the first release that defined it.
14805        $perl_xidc += $perl_xids
14806                    + $gc->table('L')
14807                    + $gc->table('Mn')
14808                    + $gc->table('Mc')
14809                    + $gc->table('Nd')
14810                    + utf8::unicode_to_native(0xB7)
14811                    ;
14812        if (defined (my $pc = $gc->table('Pc'))) {
14813            $perl_xidc += $pc;
14814        }
14815        else {  # 1.1.5 didn't have Pc, but these should have been in it
14816            $perl_xidc += 0xFF3F;
14817            $perl_xidc->add_range(0x203F, 0x2040);
14818            $perl_xidc->add_range(0xFE33, 0xFE34);
14819            $perl_xidc->add_range(0xFE4D, 0xFE4F);
14820        }
14821
14822        # Subtract the NFKC mods
14823        foreach my $range ( 0x037A,
14824                            [ 0xFC5E, 0xFC63 ],
14825                            [ 0xFDFA, 0xFE1F ],
14826                            0xFE70,
14827                            [ 0xFE72, 0xFE76 ],
14828                            0xFE78,
14829                            0xFE7A,
14830                            0xFE7C,
14831                            0xFE7E,
14832        ) {
14833            if (ref $range) {
14834                $perl_xidc->delete_range($range->[0], $range->[1]);
14835            }
14836            else {
14837                $perl_xidc->delete_range($range, $range);
14838            }
14839        }
14840    }
14841
14842    $perl_xidc &= $Word;
14843
14844    my $charname_begin = $perl->add_match_table('_Perl_Charname_Begin',
14845                    Perl_Extension => 1,
14846                    Fate => $INTERNAL_ONLY,
14847                    Initialize => $gc->table('Letter') & $Alpha & $perl_xids,
14848                    );
14849
14850    my $charname_continue = $perl->add_match_table('_Perl_Charname_Continue',
14851                        Perl_Extension => 1,
14852                        Fate => $INTERNAL_ONLY,
14853                        Initialize => $perl_xidc
14854                                    + ord(" ")
14855                                    + ord("(")
14856                                    + ord(")")
14857                                    + ord("-")
14858                        );
14859
14860    my @composition = ('Name', 'Unicode_1_Name', '_Perl_Name_Alias');
14861
14862    if (@named_sequences) {
14863        push @composition, 'Named_Sequence';
14864        foreach my $sequence (@named_sequences) {
14865            $perl_charname->add_anomalous_entry($sequence);
14866        }
14867    }
14868
14869    my $alias_sentence = "";
14870    my %abbreviations;
14871    my $alias = property_ref('_Perl_Name_Alias');
14872    $perl_charname->set_proxy_for('_Perl_Name_Alias');
14873
14874    # Add each entry in _Perl_Name_Alias to Perl_Charnames.  Where these go
14875    # with respect to any existing entry depends on the entry type.
14876    # Corrections go before said entry, as they should be returned in
14877    # preference over the existing entry.  (A correction to a correction
14878    # should be later in the _Perl_Name_Alias table, so it will correctly
14879    # precede the erroneous correction in Perl_Charnames.)
14880    #
14881    # Abbreviations go after everything else, so they are saved temporarily in
14882    # a hash for later.
14883    #
14884    # Everything else is added afterwards, which preserves the input
14885    # ordering
14886
14887    foreach my $range ($alias->ranges) {
14888        next if $range->value eq "";
14889        my $code_point = $range->start;
14890        if ($code_point != $range->end) {
14891            Carp::my_carp_bug("Bad News.  Expecting only one code point in the range $range.  Just to keep going, using only the first code point;");
14892        }
14893        my ($value, $type) = split ': ', $range->value;
14894        my $replace_type;
14895        if ($type eq 'correction') {
14896            $replace_type = $MULTIPLE_BEFORE;
14897        }
14898        elsif ($type eq 'abbreviation') {
14899
14900            # Save for later
14901            $abbreviations{$value} = $code_point;
14902            next;
14903        }
14904        else {
14905            $replace_type = $MULTIPLE_AFTER;
14906        }
14907
14908        # Actually add; before or after current entry(ies) as determined
14909        # above.
14910
14911        $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
14912    }
14913    $alias_sentence = <<END;
14914The _Perl_Name_Alias property adds duplicate code point entries that are
14915alternatives to the original name.  If an addition is a corrected
14916name, it will be physically first in the table.  The original (less correct,
14917but still valid) name will be next; then any alternatives, in no particular
14918order; and finally any abbreviations, again in no particular order.
14919END
14920
14921    # Now add the Unicode_1 names for the controls.  The Unicode_1 names had
14922    # precedence before 6.1, including the awful ones like "LINE FEED (LF)",
14923    # so should be first in the file; the other names have precedence starting
14924    # in 6.1,
14925    my $before_or_after = ($v_version lt v6.1.0)
14926                          ? $MULTIPLE_BEFORE
14927                          : $MULTIPLE_AFTER;
14928
14929    foreach my $range (property_ref('Unicode_1_Name')->ranges) {
14930        my $code_point = $range->start;
14931        my $unicode_1_value = $range->value;
14932        next if $unicode_1_value eq "";     # Skip if name doesn't exist.
14933
14934        if ($code_point != $range->end) {
14935            Carp::my_carp_bug("Bad News.  Expecting only one code point in the range $range.  Just to keep going, using only the first code point;");
14936        }
14937
14938        # To handle EBCDIC, we don't hard code in the code points of the
14939        # controls; instead realizing that all of them are below 256.
14940        last if $code_point > 255;
14941
14942        # We only add in the controls.
14943        next if $gc->value_of($code_point) ne 'Cc';
14944
14945        # We reject this Unicode1 name for later Perls, as it is used for
14946        # another code point
14947        next if $unicode_1_value eq 'BELL' && $^V ge v5.17.0;
14948
14949        # This won't add an exact duplicate.
14950        $perl_charname->add_duplicate($code_point, $unicode_1_value,
14951                                        Replace => $before_or_after);
14952    }
14953
14954    # Now that have everything added, add in abbreviations after
14955    # everything else.  Sort so results don't change between runs of this
14956    # program
14957    foreach my $value (sort keys %abbreviations) {
14958        $perl_charname->add_duplicate($abbreviations{$value}, $value,
14959                                        Replace => $MULTIPLE_AFTER);
14960    }
14961
14962    my $comment;
14963    if (@composition <= 2) { # Always at least 2
14964        $comment = join " and ", @composition;
14965    }
14966    else {
14967        $comment = join ", ", @composition[0 .. scalar @composition - 2];
14968        $comment .= ", and $composition[-1]";
14969    }
14970
14971    $perl_charname->add_comment(join_lines( <<END
14972This file is for charnames.pm.  It is the union of the $comment properties.
14973Unicode_1_Name entries are used only for nameless code points in the Name
14974property.
14975$alias_sentence
14976This file doesn't include the algorithmically determinable names.  For those,
14977use 'unicore/Name.pm'
14978END
14979    ));
14980    property_ref('Name')->add_comment(join_lines( <<END
14981This file doesn't include the algorithmically determinable names.  For those,
14982use 'unicore/Name.pm'
14983END
14984    ));
14985
14986    # Construct the Present_In property from the Age property.
14987    if (-e 'DAge.txt' && defined $age) {
14988        my $default_map = $age->default_map;
14989        my $in = Property->new('In',
14990                                Default_Map => $default_map,
14991                                Full_Name => "Present_In",
14992                                Perl_Extension => 1,
14993                                Type => $ENUM,
14994                                Initialize => $age,
14995                                );
14996        $in->add_comment(join_lines(<<END
14997THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE.  The values in this file are the
14998same as for $age, and not for what $in really means.  This is because anything
14999defined in a given release should have multiple values: that release and all
15000higher ones.  But only one value per code point can be represented in a table
15001like this.
15002END
15003        ));
15004
15005        # The Age tables are named like 1.5, 2.0, 2.1, ....  Sort so that the
15006        # lowest numbered (earliest) come first, with the non-numeric one
15007        # last.
15008        my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
15009                                            ? 1
15010                                            : ($b->name !~ /^[\d.]*$/)
15011                                                ? -1
15012                                                : $a->name <=> $b->name
15013                                            } $age->tables;
15014
15015        # The Present_In property is the cumulative age properties.  The first
15016        # one hence is identical to the first age one.
15017        my $first_in = $in->add_match_table($first_age->name);
15018        $first_in->set_equivalent_to($first_age, Related => 1);
15019
15020        my $description_start = "Code point's usage introduced in version ";
15021        $first_age->add_description($description_start . $first_age->name);
15022        foreach my $alias ($first_age->aliases) {   # Include its aliases
15023            $first_in->add_alias($alias->name);
15024        }
15025
15026        # To construct the accumulated values, for each of the age tables
15027        # starting with the 2nd earliest, merge the earliest with it, to get
15028        # all those code points existing in the 2nd earliest.  Repeat merging
15029        # the new 2nd earliest with the 3rd earliest to get all those existing
15030        # in the 3rd earliest, and so on.
15031        my $previous_in = $first_in;
15032        foreach my $current_age (@rest_ages) {
15033            next if $current_age->name !~ /^[\d.]*$/;   # Skip the non-numeric
15034
15035            my $current_in = $in->add_match_table(
15036                                    $current_age->name,
15037                                    Initialize => $current_age + $previous_in,
15038                                    Description => $description_start
15039                                                    . $current_age->name
15040                                                    . ' or earlier',
15041                                    );
15042            foreach my $alias ($current_age->aliases) {
15043                $current_in->add_alias($alias->name);
15044            }
15045            $previous_in = $current_in;
15046
15047            # Add clarifying material for the corresponding age file.  This is
15048            # in part because of the confusing and contradictory information
15049            # given in the Standard's documentation itself, as of 5.2.
15050            $current_age->add_description(
15051                            "Code point's usage was introduced in version "
15052                            . $current_age->name);
15053            $current_age->add_note("See also $in");
15054
15055        }
15056
15057        # And finally the code points whose usages have yet to be decided are
15058        # the same in both properties.  Note that permanently unassigned code
15059        # points actually have their usage assigned (as being permanently
15060        # unassigned), so that these tables are not the same as gc=cn.
15061        my $unassigned = $in->add_match_table($default_map);
15062        my $age_default = $age->table($default_map);
15063        $age_default->add_description(<<END
15064Code point's usage has not been assigned in any Unicode release thus far.
15065END
15066        );
15067        $unassigned->set_equivalent_to($age_default, Related => 1);
15068        foreach my $alias ($age_default->aliases) {
15069            $unassigned->add_alias($alias->name);
15070        }
15071    }
15072
15073    my $patws = $perl->add_match_table('_Perl_PatWS',
15074                                       Perl_Extension => 1,
15075                                       Fate => $INTERNAL_ONLY);
15076    if (defined (my $off_patws = property_ref('Pattern_White_Space'))) {
15077        $patws->initialize($off_patws->table('Y'));
15078    }
15079    else {
15080        $patws->initialize([ ord("\t"),
15081                             ord("\n"),
15082                             utf8::unicode_to_native(0x0B), # VT
15083                             ord("\f"),
15084                             ord("\r"),
15085                             ord(" "),
15086                             utf8::unicode_to_native(0x85), # NEL
15087                             0x200E..0x200F,             # Left, Right marks
15088                             0x2028..0x2029              # Line, Paragraph seps
15089                           ] );
15090    }
15091
15092    # See L<perlfunc/quotemeta>
15093    my $quotemeta = $perl->add_match_table('_Perl_Quotemeta',
15094                                           Perl_Extension => 1,
15095                                           Fate => $INTERNAL_ONLY,
15096
15097                                           # Initialize to what's common in
15098                                           # all Unicode releases.
15099                                           Initialize =>
15100                                                  $gc->table('Control')
15101                                                + $Space
15102                                                + $patws
15103                                                + ((~ $Word) & $ASCII)
15104                           );
15105
15106    if (defined (my $patsyn = property_ref('Pattern_Syntax'))) {
15107        $quotemeta += $patsyn->table('Y');
15108    }
15109    else {
15110        $quotemeta += ((~ $Word) & Range->new(0, 255))
15111                    - utf8::unicode_to_native(0xA8)
15112                    - utf8::unicode_to_native(0xAF)
15113                    - utf8::unicode_to_native(0xB2)
15114                    - utf8::unicode_to_native(0xB3)
15115                    - utf8::unicode_to_native(0xB4)
15116                    - utf8::unicode_to_native(0xB7)
15117                    - utf8::unicode_to_native(0xB8)
15118                    - utf8::unicode_to_native(0xB9)
15119                    - utf8::unicode_to_native(0xBC)
15120                    - utf8::unicode_to_native(0xBD)
15121                    - utf8::unicode_to_native(0xBE);
15122        $quotemeta += [ # These are above-Latin1 patsyn; hence should be the
15123                        # same in all releases
15124                        0x2010 .. 0x2027,
15125                        0x2030 .. 0x203E,
15126                        0x2041 .. 0x2053,
15127                        0x2055 .. 0x205E,
15128                        0x2190 .. 0x245F,
15129                        0x2500 .. 0x2775,
15130                        0x2794 .. 0x2BFF,
15131                        0x2E00 .. 0x2E7F,
15132                        0x3001 .. 0x3003,
15133                        0x3008 .. 0x3020,
15134                        0x3030 .. 0x3030,
15135                        0xFD3E .. 0xFD3F,
15136                        0xFE45 .. 0xFE46
15137                      ];
15138    }
15139
15140    if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) {
15141        $quotemeta += $di->table('Y')
15142    }
15143    else {
15144        if ($v_version ge v2.0) {
15145            $quotemeta += $gc->table('Cf')
15146                       +  $gc->table('Cs');
15147
15148            # These are above the Unicode version 1 max
15149            $quotemeta->add_range(0xE0000, 0xE0FFF);
15150        }
15151        $quotemeta += $gc->table('Cc')
15152                    - $Space;
15153        my $temp = Range_List->new(Initialize => [ 0x180B .. 0x180D,
15154                                                   0x2060 .. 0x206F,
15155                                                   0xFE00 .. 0xFE0F,
15156                                                   0xFFF0 .. 0xFFFB,
15157                                                  ]);
15158        $temp->add_range(0xE0000, 0xE0FFF) if $v_version ge v2.0;
15159        $quotemeta += $temp;
15160    }
15161    calculate_DI();
15162    $quotemeta += $DI;
15163
15164    calculate_NChar();
15165
15166    # Finished creating all the perl properties.  All non-internal non-string
15167    # ones have a synonym of 'Is_' prefixed.  (Internal properties begin with
15168    # an underscore.)  These do not get a separate entry in the pod file
15169    foreach my $table ($perl->tables) {
15170        foreach my $alias ($table->aliases) {
15171            next if $alias->name =~ /^_/;
15172            $table->add_alias('Is_' . $alias->name,
15173                               Re_Pod_Entry => 0,
15174                               UCD => 0,
15175                               Status => $alias->status,
15176                               OK_as_Filename => 0);
15177        }
15178    }
15179
15180    # Perl tailors the WordBreak property so that \b{wb} doesn't split
15181    # adjacent spaces into separate words.  Unicode 11.0 moved in that
15182    # direction, but left TAB,  FIGURE SPACE (U+2007), and (ironically) NO
15183    # BREAK SPACE as breaking, so we retained the original Perl customization.
15184    # To do this, in the Perl copy of WB, simply replace the mappings of
15185    # horizontal space characters that otherwise would map to the default or
15186    # the 11.0 'WSegSpace' to instead map to our tailoring.
15187    my $perl_wb = property_ref('_Perl_WB');
15188    my $default = $perl_wb->default_map;
15189    for my $range ($Blank->ranges) {
15190        for my $i ($range->start .. $range->end) {
15191            my $value = $perl_wb->value_of($i);
15192
15193            next unless $value eq $default || $value eq 'WSegSpace';
15194            $perl_wb->add_map($i, $i, 'Perl_Tailored_HSpace',
15195                              Replace => $UNCONDITIONALLY);
15196        }
15197    }
15198
15199    # Also starting in Unicode 11.0, rules for some of the boundary types are
15200    # based on a non-UCD property (which we have read in if it exists).
15201    # Recall that these boundary properties partition the code points into
15202    # equivalence classes (represented as enums).
15203    #
15204    # The loop below goes through each code point that matches the non-UCD
15205    # property, and for each current equivalence class containing such a code
15206    # point, splits it so that those that are in both are now in a newly
15207    # created equivalence class whose name is a combination of the property
15208    # and the old class name, leaving unchanged everything that doesn't match
15209    # the non-UCD property.
15210    my $ep = property_ref('ExtPict');
15211    $ep = $ep->table('Y') if defined $ep;
15212    if (defined $ep) {
15213        foreach my $base_property (property_ref('GCB'),
15214                                   property_ref('WB'))
15215        {
15216            my $property = property_ref('_Perl_' . $base_property->name);
15217            foreach my $range ($ep->ranges) {
15218                foreach my $i ($range->start .. $range->end) {
15219                    my $current = $property->value_of($i);
15220                    $current = $property->table($current)->short_name;
15221                    $property->add_map($i, $i, 'ExtPict_' . $current,
15222                                       Replace => $UNCONDITIONALLY);
15223                }
15224            }
15225        }
15226    }
15227
15228    # Create a version of the LineBreak property with the mappings that are
15229    # omitted in the default algorithm remapped to what
15230    # http://www.unicode.org/reports/tr14 says they should be.
15231    #
15232    # First, create a plain copy, but with all property values written out in
15233    # their long form, as regen/mk_invlist.pl expects that, and also fix
15234    # occurrences of the typo in early Unicode versions: 'inseperable'.
15235    my $perl_lb = property_ref('_Perl_LB');
15236    if (! defined $perl_lb) {
15237        $perl_lb = Property->new('_Perl_LB',
15238                                 Fate => $INTERNAL_ONLY,
15239                                 Perl_Extension => 1,
15240                                 Directory => $map_directory,
15241                                 Type => $STRING);
15242        my $lb = property_ref('Line_Break');
15243
15244        # Populate from $lb, but use full name and fix typo.
15245        foreach my $range ($lb->ranges) {
15246            my $full_name = $lb->table($range->value)->full_name;
15247            $full_name = 'Inseparable'
15248                                if standardize($full_name) eq 'inseperable';
15249            $perl_lb->add_map($range->start, $range->end, $full_name);
15250        }
15251    }
15252
15253    # What tr14 says is this:
15254
15255    # Original 	   Resolved  General_Category
15256    # AI, SG, XX      AL      Any
15257    # SA              CM      Only Mn or Mc
15258    # SA              AL      Any except Mn and Mc
15259    # CJ              NS      Any
15260
15261    $perl_lb->set_default_map('Alphabetic', 'full_name');    # XX -> AL
15262
15263    my $ea = property_ref('East_Asian_Width');
15264    my $Cn_EP;
15265    $Cn_EP = $ep & $gc->table('Unassigned') if defined $ep;
15266
15267    for my $range ($perl_lb->ranges) {
15268        my $value = standardize($range->value);
15269        if (   $value eq standardize('Unknown')
15270            || $value eq standardize('Ambiguous')
15271            || $value eq standardize('Surrogate'))
15272        {
15273            $perl_lb->add_map($range->start, $range->end, 'Alphabetic',
15274                              Replace => $UNCONDITIONALLY);
15275        }
15276        elsif ($value eq standardize('Conditional_Japanese_Starter')) {
15277            $perl_lb->add_map($range->start, $range->end, 'Nonstarter',
15278                              Replace => $UNCONDITIONALLY);
15279        }
15280        elsif ($value eq standardize('Complex_Context')) {
15281            for my $i ($range->start .. $range->end) {
15282                my $gc_val = $gc->value_of($i);
15283                if ($gc_val eq 'Mn' || $gc_val eq 'Mc') {
15284                    $perl_lb->add_map($i, $i, 'Combining_Mark',
15285                                      Replace => $UNCONDITIONALLY);
15286                }
15287                else {
15288                    $perl_lb->add_map($i, $i, 'Alphabetic',
15289                                      Replace => $UNCONDITIONALLY);
15290                }
15291            }
15292        }
15293        elsif (defined $ep && $value eq standardize('Ideographic')) {
15294
15295            # Unicode 14 adds a rule to not break lines before any potential
15296            # EBase,  They say that any unassigned code point that is ExtPict,
15297            # is potentially an EBase.  In 14.0, all such ones are in the
15298            # ExtPict=ID category.  We must split that category for the
15299            # pairwise rule table to work.
15300            for my $i ($range->start .. $range->end) {
15301                if ($Cn_EP->contains($i)) {
15302                    $perl_lb->add_map($i, $i,
15303                                'Unassigned_Extended_Pictographic_Ideographic',
15304                                Replace => $UNCONDITIONALLY);
15305                }
15306            }
15307        }
15308        elsif (    defined $ea
15309               && (   $value eq standardize('Close_Parenthesis')
15310                   || $value eq standardize('Open_Punctuation')))
15311        {
15312            # Unicode 13 splits the OP and CP properties each into East Asian,
15313            # and non-.  We retain the (now somewhat misleading) names OP and
15314            # CP for the non-East Asian variety, as there are very few East
15315            # Asian ones.
15316            my $replace = ($value eq standardize('Open_Punctuation'))
15317                          ? 'East_Asian_OP'
15318                          : 'East_Asian_CP';
15319            for my $i ($range->start .. $range->end) {
15320                my $ea_val = $ea->value_of($i);
15321                if ($ea_val eq 'F' || $ea_val eq 'W' || $ea_val eq 'H') {
15322                    $perl_lb->add_map($i, $i, $replace,
15323                                                Replace => $UNCONDITIONALLY);
15324                }
15325            }
15326        }
15327    }
15328
15329    # This property is a modification of the scx property
15330    my $perl_scx = Property->new('_Perl_SCX',
15331                                 Fate => $INTERNAL_ONLY,
15332                                 Perl_Extension => 1,
15333                                 Directory => $map_directory,
15334                                 Type => $ENUM);
15335    my $source;
15336
15337    # Use scx if available; otherwise sc;  if neither is there (a very old
15338    # Unicode version, just say that everything is 'Common'
15339    if (defined $scx) {
15340        $source = $scx;
15341        $perl_scx->set_default_map('Unknown');
15342    }
15343    elsif (defined $script) {
15344        $source = $script;
15345
15346        # Early versions of 'sc', had everything be 'Common'
15347        if (defined $script->table('Unknown')) {
15348            $perl_scx->set_default_map('Unknown');
15349        }
15350        else {
15351            $perl_scx->set_default_map('Common');
15352        }
15353    } else {
15354        $perl_scx->add_match_table('Common');
15355        $perl_scx->add_map(0, $MAX_UNICODE_CODEPOINT, 'Common');
15356
15357        $perl_scx->add_match_table('Unknown');
15358        $perl_scx->set_default_map('Unknown');
15359    }
15360
15361    $perl_scx->_set_format($STRING_WHITE_SPACE_LIST);
15362    $perl_scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these
15363
15364    if (defined $source) {
15365        $perl_scx->initialize($source);
15366
15367        # UTS 39 says that the scx property should be modified for these
15368        # countries where certain mixed scripts are commonly used.
15369        for my $range ($perl_scx->ranges) {
15370            my $value = $range->value;
15371            my $changed = $value =~ s/ ( \b Han i? \b ) /$1 Hanb Jpan Kore/xi;
15372             $changed |=  $value =~ s/ ( \b Hira (gana)? \b ) /$1 Jpan/xi;
15373             $changed |=  $value =~ s/ ( \b Kata (kana)? \b ) /$1 Jpan/xi;
15374             $changed |=  $value =~ s{ ( \b Katakana_or_Hiragana \b ) }
15375                                     {$1 Katakana Hiragana Jpan}xi;
15376             $changed |=  $value =~ s/ ( \b Hang (ul)? \b ) /$1 Kore/xi;
15377             $changed |=  $value =~ s/ ( \b Bopo (mofo)? \b ) /$1 Hanb/xi;
15378
15379            if ($changed) {
15380                $value = join " ", uniques split " ", $value;
15381                $range->set_value($value)
15382            }
15383        }
15384
15385        foreach my $table ($source->tables) {
15386            my $scx_table = $perl_scx->add_match_table($table->name,
15387                                    Full_Name => $table->full_name);
15388            foreach my $alias ($table->aliases) {
15389                $scx_table->add_alias($alias->name);
15390            }
15391        }
15392    }
15393
15394    # Here done with all the basic stuff.  Ready to populate the information
15395    # about each character if annotating them.
15396    if ($annotate) {
15397
15398        # See comments at its declaration
15399        $annotate_ranges = Range_Map->new;
15400
15401        # This separates out the non-characters from the other unassigneds, so
15402        # can give different annotations for each.
15403        $unassigned_sans_noncharacters = Range_List->new(
15404                                    Initialize => $gc->table('Unassigned'));
15405        $unassigned_sans_noncharacters &= (~ $NChar);
15406
15407        for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT + 1; $i++ ) {
15408            $i = populate_char_info($i);    # Note sets $i so may cause skips
15409
15410        }
15411    }
15412
15413    return;
15414}
15415
15416sub add_perl_synonyms() {
15417    # A number of Unicode tables have Perl synonyms that are expressed in
15418    # the single-form, \p{name}.  These are:
15419    #   All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
15420    #       \p{Is_Name} as synonyms
15421    #   \p{Script_Extensions=Value} gets \p{Value}, \p{Is_Value} as synonyms
15422    #   \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
15423    #   \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
15424    #       conflict, \p{Value} and \p{Is_Value} as well
15425    #
15426    # This routine generates these synonyms, warning of any unexpected
15427    # conflicts.
15428
15429    # Construct the list of tables to get synonyms for.  Start with all the
15430    # binary and the General_Category ones.
15431    my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY }
15432                                                            property_ref('*');
15433    push @tables, $gc->tables;
15434
15435    # If the version of Unicode includes the Script Extensions (preferably),
15436    # or Script property, add its tables
15437    if (defined $scx) {
15438        push @tables, $scx->tables;
15439    }
15440    else {
15441        push @tables, $script->tables if defined $script;
15442    }
15443
15444    # The Block tables are kept separate because they are treated differently.
15445    # And the earliest versions of Unicode didn't include them, so add only if
15446    # there are some.
15447    my @blocks;
15448    push @blocks, $block->tables if defined $block;
15449
15450    # Here, have the lists of tables constructed.  Process blocks last so that
15451    # if there are name collisions with them, blocks have lowest priority.
15452    # Should there ever be other collisions, manual intervention would be
15453    # required.  See the comments at the beginning of the program for a
15454    # possible way to handle those semi-automatically.
15455    foreach my $table (@tables,  @blocks) {
15456
15457        # For non-binary properties, the synonym is just the name of the
15458        # table, like Greek, but for binary properties the synonym is the name
15459        # of the property, and means the code points in its 'Y' table.
15460        my $nominal = $table;
15461        my $nominal_property = $nominal->property;
15462        my $actual;
15463        if (! $nominal->isa('Property')) {
15464            $actual = $table;
15465        }
15466        else {
15467
15468            # Here is a binary property.  Use the 'Y' table.  Verify that is
15469            # there
15470            my $yes = $nominal->table('Y');
15471            unless (defined $yes) {  # Must be defined, but is permissible to
15472                                     # be empty.
15473                Carp::my_carp_bug("Undefined $nominal, 'Y'.  Skipping.");
15474                next;
15475            }
15476            $actual = $yes;
15477        }
15478
15479        foreach my $alias ($nominal->aliases) {
15480
15481            # Attempt to create a table in the perl directory for the
15482            # candidate table, using whatever aliases in it that don't
15483            # conflict.  Also add non-conflicting aliases for all these
15484            # prefixed by 'Is_' (and/or 'In_' for Block property tables)
15485            PREFIX:
15486            foreach my $prefix ("", 'Is_', 'In_') {
15487
15488                # Only Block properties can have added 'In_' aliases.
15489                next if $prefix eq 'In_' and $nominal_property != $block;
15490
15491                my $proposed_name = $prefix . $alias->name;
15492
15493                # No Is_Is, In_In, nor combinations thereof
15494                trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
15495                next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
15496
15497                trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
15498
15499                # Get a reference to any existing table in the perl
15500                # directory with the desired name.
15501                my $pre_existing = $perl->table($proposed_name);
15502
15503                if (! defined $pre_existing) {
15504
15505                    # No name collision, so OK to add the perl synonym.
15506
15507                    my $make_re_pod_entry;
15508                    my $ok_as_filename;
15509                    my $status = $alias->status;
15510                    if ($nominal_property == $block) {
15511
15512                        # For block properties, only the compound form is
15513                        # preferred for external use; the others are
15514                        # discouraged.  The pod file contains wild cards for
15515                        # the 'In' and 'Is' forms so no entries for those; and
15516                        # we don't want people using the name without any
15517                        # prefix, so discourage that.
15518                        if ($prefix eq "") {
15519                            $make_re_pod_entry = 1;
15520                            $status = $status || $DISCOURAGED;
15521                            $ok_as_filename = 0;
15522                        }
15523                        elsif ($prefix eq 'In_') {
15524                            $make_re_pod_entry = 0;
15525                            $status = $status || $DISCOURAGED;
15526                            $ok_as_filename = 1;
15527                        }
15528                        else {
15529                            $make_re_pod_entry = 0;
15530                            $status = $status || $DISCOURAGED;
15531                            $ok_as_filename = 0;
15532                        }
15533                    }
15534                    elsif ($prefix ne "") {
15535
15536                        # The 'Is' prefix is handled in the pod by a wild
15537                        # card, and we won't use it for an external name
15538                        $make_re_pod_entry = 0;
15539                        $status = $status || $NORMAL;
15540                        $ok_as_filename = 0;
15541                    }
15542                    else {
15543
15544                        # Here, is an empty prefix, non block.  This gets its
15545                        # own pod entry and can be used for an external name.
15546                        $make_re_pod_entry = 1;
15547                        $status = $status || $NORMAL;
15548                        $ok_as_filename = 1;
15549                    }
15550
15551                    # Here, there isn't a perl pre-existing table with the
15552                    # name.  Look through the list of equivalents of this
15553                    # table to see if one is a perl table.
15554                    foreach my $equivalent ($actual->leader->equivalents) {
15555                        next if $equivalent->property != $perl;
15556
15557                        # Here, have found a table for $perl.  Add this alias
15558                        # to it, and are done with this prefix.
15559                        $equivalent->add_alias($proposed_name,
15560                                        Re_Pod_Entry => $make_re_pod_entry,
15561
15562                                        # Currently don't output these in the
15563                                        # ucd pod, as are strongly discouraged
15564                                        # from being used
15565                                        UCD => 0,
15566
15567                                        Status => $status,
15568                                        OK_as_Filename => $ok_as_filename);
15569                        trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
15570                        next PREFIX;
15571                    }
15572
15573                    # Here, $perl doesn't already have a table that is a
15574                    # synonym for this property, add one.
15575                    my $added_table = $perl->add_match_table($proposed_name,
15576                                            Re_Pod_Entry => $make_re_pod_entry,
15577
15578                                            # See UCD comment just above
15579                                            UCD => 0,
15580
15581                                            Status => $status,
15582                                            OK_as_Filename => $ok_as_filename);
15583                    # And it will be related to the actual table, since it is
15584                    # based on it.
15585                    $added_table->set_equivalent_to($actual, Related => 1);
15586                    trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
15587                    next;
15588                } # End of no pre-existing.
15589
15590                # Here, there is a pre-existing table that has the proposed
15591                # name.  We could be in trouble, but not if this is just a
15592                # synonym for another table that we have already made a child
15593                # of the pre-existing one.
15594                if ($pre_existing->is_set_equivalent_to($actual)) {
15595                    trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
15596                    $pre_existing->add_alias($proposed_name);
15597                    next;
15598                }
15599
15600                # Here, there is a name collision, but it still could be OK if
15601                # the tables match the identical set of code points, in which
15602                # case, we can combine the names.  Compare each table's code
15603                # point list to see if they are identical.
15604                trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
15605                if ($pre_existing->matches_identically_to($actual)) {
15606
15607                    # Here, they do match identically.  Not a real conflict.
15608                    # Make the perl version a child of the Unicode one, except
15609                    # in the non-obvious case of where the perl name is
15610                    # already a synonym of another Unicode property.  (This is
15611                    # excluded by the test for it being its own parent.)  The
15612                    # reason for this exclusion is that then the two Unicode
15613                    # properties become related; and we don't really know if
15614                    # they are or not.  We generate documentation based on
15615                    # relatedness, and this would be misleading.  Code
15616                    # later executed in the process will cause the tables to
15617                    # be represented by a single file anyway, without making
15618                    # it look in the pod like they are necessarily related.
15619                    if ($pre_existing->parent == $pre_existing
15620                        && ($pre_existing->property == $perl
15621                            || $actual->property == $perl))
15622                    {
15623                        trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
15624                        $pre_existing->set_equivalent_to($actual, Related => 1);
15625                    }
15626                    elsif (main::DEBUG && $to_trace) {
15627                        trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
15628                        trace $pre_existing->parent;
15629                    }
15630                    next PREFIX;
15631                }
15632
15633                # Here they didn't match identically, there is a real conflict
15634                # between our new name and a pre-existing property.
15635                $actual->add_conflicting($proposed_name, 'p', $pre_existing);
15636                $pre_existing->add_conflicting($nominal->full_name,
15637                                               'p',
15638                                               $actual);
15639
15640                # Don't output a warning for aliases for the block
15641                # properties (unless they start with 'In_') as it is
15642                # expected that there will be conflicts and the block
15643                # form loses.
15644                if ($verbosity >= $NORMAL_VERBOSITY
15645                    && ($actual->property != $block || $prefix eq 'In_'))
15646                {
15647                    print simple_fold(join_lines(<<END
15648There is already an alias named $proposed_name (from $pre_existing),
15649so not creating this alias for $actual
15650END
15651                    ), "", 4);
15652                }
15653
15654                # Keep track for documentation purposes.
15655                $has_In_conflicts++ if $prefix eq 'In_';
15656                $has_Is_conflicts++ if $prefix eq 'Is_';
15657            }
15658        }
15659    }
15660
15661    # There are some properties which have No and Yes (and N and Y) as
15662    # property values, but aren't binary, and could possibly be confused with
15663    # binary ones.  So create caveats for them.  There are tables that are
15664    # named 'No', and tables that are named 'N', but confusion is not likely
15665    # unless they are the same table.  For example, N meaning Number or
15666    # Neutral is not likely to cause confusion, so don't add caveats to things
15667    # like them.
15668    foreach my $property (grep { $_->type != $BINARY
15669                                 && $_->type != $FORCED_BINARY }
15670                                                            property_ref('*'))
15671    {
15672        my $yes = $property->table('Yes');
15673        if (defined $yes) {
15674            my $y = $property->table('Y');
15675            if (defined $y && $yes == $y) {
15676                foreach my $alias ($property->aliases) {
15677                    $yes->add_conflicting($alias->name);
15678                }
15679            }
15680        }
15681        my $no = $property->table('No');
15682        if (defined $no) {
15683            my $n = $property->table('N');
15684            if (defined $n && $no == $n) {
15685                foreach my $alias ($property->aliases) {
15686                    $no->add_conflicting($alias->name, 'P');
15687                }
15688            }
15689        }
15690    }
15691
15692    return;
15693}
15694
15695sub register_file_for_name($table, $directory_ref, $file) {
15696    # Given info about a table and a datafile that it should be associated
15697    # with, register that association
15698
15699    # $directory_ref    # Array of the directory path for the file
15700    # $file             # The file name in the final directory.
15701
15702    trace "table=$table, file=$file, directory=@$directory_ref, fate=", $table->fate if main::DEBUG && $to_trace;
15703
15704    if ($table->isa('Property')) {
15705        $table->set_file_path(@$directory_ref, $file);
15706        push @map_properties, $table;
15707
15708        # No swash means don't do the rest of this.
15709        return if $table->fate != $ORDINARY
15710                  && ! ($table->name =~ /^_/ && $table->fate == $INTERNAL_ONLY);
15711
15712        # Get the path to the file
15713        my @path = $table->file_path;
15714
15715        # Use just the file name if no subdirectory.
15716        shift @path if $path[0] eq File::Spec->curdir();
15717
15718        my $file = join '/', @path;
15719
15720        # Create a hash entry for Unicode::UCD to get the file that stores this
15721        # property's map table
15722        foreach my $alias ($table->aliases) {
15723            my $name = $alias->name;
15724            if ($name =~ /^_/) {
15725                $strict_property_to_file_of{lc $name} = $file;
15726            }
15727            else {
15728                $loose_property_to_file_of{standardize($name)} = $file;
15729            }
15730        }
15731
15732        # And a way for Unicode::UCD to find the proper key in the SwashInfo
15733        # hash for this property.
15734        $file_to_swash_name{$file} = "To" . $table->swash_name;
15735        return;
15736    }
15737
15738    # Do all of the work for all equivalent tables when called with the leader
15739    # table, so skip if isn't the leader.
15740    return if $table->leader != $table;
15741
15742    # If this is a complement of another file, use that other file instead,
15743    # with a ! prepended to it.
15744    my $complement;
15745    if (($complement = $table->complement) != 0) {
15746        my @directories = $complement->file_path;
15747
15748        # This assumes that the 0th element is something like 'lib',
15749        # the 1th element the property name (in its own directory), like
15750        # 'AHex', and the 2th element the file like 'Y' which will have a .pl
15751        # appended to it later.
15752        $directories[1] =~ s/^/!/;
15753        $file = pop @directories;
15754        $directory_ref =\@directories;
15755    }
15756
15757    # Join all the file path components together, using slashes.
15758    my $full_filename = join('/', @$directory_ref, $file);
15759
15760    # All go in the same subdirectory of unicore, or the special
15761    # pseudo-directory '#'
15762    if ($directory_ref->[0] !~ / ^ $matches_directory | \# $ /x) {
15763        Carp::my_carp("Unexpected directory in "
15764                .  join('/', @{$directory_ref}, $file));
15765    }
15766
15767    # For this table and all its equivalents ...
15768    foreach my $table ($table, $table->equivalents) {
15769
15770        # Associate it with its file internally.  Don't include the
15771        # $matches_directory first component
15772        $table->set_file_path(@$directory_ref, $file);
15773
15774        # No swash means don't do the rest of this.
15775        next if $table->isa('Map_Table') && $table->fate != $ORDINARY;
15776
15777        my $sub_filename = join('/', $directory_ref->[1, -1], $file);
15778
15779        my $property = $table->property;
15780        my $property_name = ($property == $perl)
15781                             ? ""  # 'perl' is never explicitly stated
15782                             : standardize($property->name) . '=';
15783
15784        my $is_default = 0; # Is this table the default one for the property?
15785
15786        # To calculate $is_default, we find if this table is the same as the
15787        # default one for the property.  But this is complicated by the
15788        # possibility that there is a master table for this one, and the
15789        # information is stored there instead of here.
15790        my $parent = $table->parent;
15791        my $leader_prop = $parent->property;
15792        my $default_map = $leader_prop->default_map;
15793        if (defined $default_map) {
15794            my $default_table = $leader_prop->table($default_map);
15795            $is_default = 1 if defined $default_table && $parent == $default_table;
15796        }
15797
15798        # Calculate the loose name for this table.  Mostly it's just its name,
15799        # standardized.  But in the case of Perl tables that are single-form
15800        # equivalents to Unicode properties, it is the latter's name.
15801        my $loose_table_name =
15802                        ($property != $perl || $leader_prop == $perl)
15803                        ? standardize($table->name)
15804                        : standardize($parent->name);
15805
15806        my $deprecated = ($table->status eq $DEPRECATED)
15807                         ? $table->status_info
15808                         : "";
15809        my $caseless_equivalent = $table->caseless_equivalent;
15810
15811        # And for each of the table's aliases...  This inner loop eventually
15812        # goes through all aliases in the UCD that we generate regex match
15813        # files for
15814        foreach my $alias ($table->aliases) {
15815            my $standard = UCD_name($table, $alias);
15816
15817            # Generate an entry in either the loose or strict hashes, which
15818            # will translate the property and alias names combination into the
15819            # file where the table for them is stored.
15820            if ($alias->loose_match) {
15821                if (exists $loose_to_file_of{$standard}) {
15822                    Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
15823                }
15824                else {
15825                    $loose_to_file_of{$standard} = $sub_filename;
15826                }
15827            }
15828            else {
15829                if (exists $stricter_to_file_of{$standard}) {
15830                    Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
15831                }
15832                else {
15833                    $stricter_to_file_of{$standard} = $sub_filename;
15834
15835                    # Tightly coupled with how Unicode::UCD works, for a
15836                    # floating point number that is a whole number, get rid of
15837                    # the trailing decimal point and 0's, so that Unicode::UCD
15838                    # will work.  Also note that this assumes that such a
15839                    # number is matched strictly; so if that were to change,
15840                    # this would be wrong.
15841                    if ((my $integer_name = $alias->name)
15842                            =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
15843                    {
15844                        $stricter_to_file_of{$property_name . $integer_name}
15845                                                            = $sub_filename;
15846                    }
15847                }
15848            }
15849
15850            # For Unicode::UCD, create a mapping of the prop=value to the
15851            # canonical =value for that property.
15852            if ($standard =~ /=/) {
15853
15854                # This could happen if a strict name mapped into an existing
15855                # loose name.  In that event, the strict names would have to
15856                # be moved to a new hash.
15857                if (exists($loose_to_standard_value{$standard})) {
15858                    Carp::my_carp_bug("'$standard' conflicts with a pre-existing use.  Bad News.  Continuing anyway");
15859                }
15860                $loose_to_standard_value{$standard} = $loose_table_name;
15861            }
15862
15863            # Keep a list of the deprecated properties and their filenames
15864            if ($deprecated && $complement == 0) {
15865                $Unicode::UCD::why_deprecated{$sub_filename} = $deprecated;
15866            }
15867
15868            # And a substitute table, if any, for case-insensitive matching
15869            if ($caseless_equivalent != 0) {
15870                $caseless_equivalent_to{$standard} = $caseless_equivalent;
15871            }
15872
15873            # Add to defaults list if the table this alias belongs to is the
15874            # default one
15875            $loose_defaults{$standard} = 1 if $is_default;
15876        }
15877    }
15878
15879    return;
15880}
15881
15882{   # Closure
15883    my %base_names;  # Names already used for avoiding DOS 8.3 filesystem
15884                     # conflicts
15885    my %full_dir_name_of;   # Full length names of directories used.
15886
15887    sub construct_filename($name, $mutable, $directories_ref) {
15888        # Return a file name for a table, based on the table name, but perhaps
15889        # changed to get rid of non-portable characters in it, and to make
15890        # sure that it is unique on a file system that allows the names before
15891        # any period to be at most 8 characters (DOS).  While we're at it
15892        # check and complain if there are any directory conflicts.
15893
15894        # $name                 # The name to start with
15895        # $mutable              # Boolean: can it be changed?  If no, but
15896                                # yet it must be to work properly, a warning
15897                                # is given
15898        # $directories_ref      # A reference to an array containing the
15899                                # path to the file, with each element one path
15900                                # component.  This is used because the same
15901                                # name can be used in different directories.
15902
15903        my $warn = ! defined wantarray;  # If true, then if the name is
15904                                # changed, a warning is issued as well.
15905
15906        if (! defined $name) {
15907            Carp::my_carp("Undefined name in directory "
15908                          . File::Spec->join(@$directories_ref)
15909                          . ". '_' used");
15910            return '_';
15911        }
15912
15913        # Make sure that no directory names conflict with each other.  Look at
15914        # each directory in the input file's path.  If it is already in use,
15915        # assume it is correct, and is merely being re-used, but if we
15916        # truncate it to 8 characters, and find that there are two directories
15917        # that are the same for the first 8 characters, but differ after that,
15918        # then that is a problem.
15919        foreach my $directory (@$directories_ref) {
15920            my $short_dir = substr($directory, 0, 8);
15921            if (defined $full_dir_name_of{$short_dir}) {
15922                next if $full_dir_name_of{$short_dir} eq $directory;
15923                Carp::my_carp("Directory $directory conflicts with directory $full_dir_name_of{$short_dir}.  Bad News.  Continuing anyway");
15924            }
15925            else {
15926                $full_dir_name_of{$short_dir} = $directory;
15927            }
15928        }
15929
15930        my $path = join '/', @$directories_ref;
15931        $path .= '/' if $path;
15932
15933        # Remove interior underscores.
15934        (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
15935
15936        # Convert the dot in floating point numbers to an underscore
15937        $filename =~ s/\./_/ if $filename =~ / ^ \d+ \. \d+ $ /x;
15938
15939        my $suffix = "";
15940
15941        # Extract any suffix, delete any non-word character, and truncate to 3
15942        # after the dot
15943        if ($filename =~ m/ ( .*? ) ( \. .* ) /x) {
15944            $filename = $1;
15945            $suffix = $2;
15946            $suffix =~ s/\W+//g;
15947            substr($suffix, 4) = "" if length($suffix) > 4;
15948        }
15949
15950        # Change any non-word character outside the suffix into an underscore,
15951        # and truncate to 8.
15952        $filename =~ s/\W+/_/g;   # eg., "L&" -> "L_"
15953        substr($filename, 8) = "" if length($filename) > 8;
15954
15955        # Make sure the basename doesn't conflict with something we
15956        # might have already written. If we have, say,
15957        #     InGreekExtended1
15958        #     InGreekExtended2
15959        # they become
15960        #     InGreekE
15961        #     InGreek2
15962        my $warned = 0;
15963        while (my $num = $base_names{$path}{lc "$filename$suffix"}++) {
15964            $num++; # so basenames with numbers start with '2', which
15965                    # just looks more natural.
15966
15967            # Want to append $num, but if it'll make the basename longer
15968            # than 8 characters, pre-truncate $filename so that the result
15969            # is acceptable.
15970            my $delta = length($filename) + length($num) - 8;
15971            if ($delta > 0) {
15972                substr($filename, -$delta) = $num;
15973            }
15974            else {
15975                $filename .= $num;
15976            }
15977            if ($warn && ! $warned) {
15978                $warned = 1;
15979                Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS).  Proceeding anyway.");
15980            }
15981        }
15982
15983        return $filename if $mutable;
15984
15985        # If not changeable, must return the input name, but warn if needed to
15986        # change it beyond shortening it.
15987        if ($name ne $filename
15988            && substr($name, 0, length($filename)) ne $filename) {
15989            Carp::my_carp("'$path$name' had to be changed into '$filename'.  Bad News.  Proceeding anyway.");
15990        }
15991        return $name;
15992    }
15993}
15994
15995# The pod file contains a very large table.  Many of the lines in that table
15996# would exceed a typical output window's size, and so need to be wrapped with
15997# a hanging indent to make them look good.  The pod language is really
15998# insufficient here.  There is no general construct to do that in pod, so it
15999# is done here by beginning each such line with a space to cause the result to
16000# be output without formatting, and doing all the formatting here.  This leads
16001# to the result that if the eventual display window is too narrow it won't
16002# look good, and if the window is too wide, no advantage is taken of that
16003# extra width.  A further complication is that the output may be indented by
16004# the formatter so that there is less space than expected.  What I (khw) have
16005# done is to assume that that indent is a particular number of spaces based on
16006# what it is in my Linux system;  people can always resize their windows if
16007# necessary, but this is obviously less than desirable, but the best that can
16008# be expected.
16009my $automatic_pod_indent = 8;
16010
16011# Try to format so that uses fewest lines, but few long left column entries
16012# slide into the right column.  An experiment on 5.1 data yielded the
16013# following percentages that didn't cut into the other side along with the
16014# associated first-column widths
16015# 69% = 24
16016# 80% not too bad except for a few blocks
16017# 90% = 33; # , cuts 353/3053 lines from 37 = 12%
16018# 95% = 37;
16019my $indent_info_column = 27;    # 75% of lines didn't have overlap
16020
16021my $FILLER = 3;     # Length of initial boiler-plate columns in a pod line
16022                    # The 3 is because of:
16023                    #   1   for the leading space to tell the pod formatter to
16024                    #       output as-is
16025                    #   1   for the flag
16026                    #   1   for the space between the flag and the main data
16027
16028sub format_pod_line($first_column_width, $entry, $info, $status = "", $loose_match = 1 ) {
16029    # Take a pod line and return it, formatted properly
16030
16031    # $entry Contents of left column
16032    # $info Contents of right column
16033
16034    my $flags = "";
16035    $flags .= $STRICTER if ! $loose_match;
16036
16037    $flags .= $status if $status;
16038
16039    # There is a blank in the left column to cause the pod formatter to
16040    # output the line as-is.
16041    return sprintf " %-*s%-*s %s\n",
16042                    # The first * in the format is replaced by this, the -1 is
16043                    # to account for the leading blank.  There isn't a
16044                    # hard-coded blank after this to separate the flags from
16045                    # the rest of the line, so that in the unlikely event that
16046                    # multiple flags are shown on the same line, they both
16047                    # will get displayed at the expense of that separation,
16048                    # but since they are left justified, a blank will be
16049                    # inserted in the normal case.
16050                    $FILLER - 1,
16051                    $flags,
16052
16053                    # The other * in the format is replaced by this number to
16054                    # cause the first main column to right fill with blanks.
16055                    # The -1 is for the guaranteed blank following it.
16056                    $first_column_width - $FILLER - 1,
16057                    $entry,
16058                    $info;
16059}
16060
16061my @zero_match_tables;  # List of tables that have no matches in this release
16062
16063sub make_re_pod_entries($input_table) {
16064    # This generates the entries for the pod file for a given table.
16065    # Also done at this time are any children tables.  The output looks like:
16066    # \p{Common}              \p{Script=Common} (Short: \p{Zyyy}) (5178)
16067
16068    # Generate parent and all its children at the same time.
16069    return if $input_table->parent != $input_table;
16070
16071    my $property = $input_table->property;
16072    my $type = $property->type;
16073    my $full_name = $property->full_name;
16074
16075    my $count = $input_table->count;
16076    my $unicode_count;
16077    my $non_unicode_string;
16078    if ($count > $MAX_UNICODE_CODEPOINTS) {
16079        $unicode_count = $count - ($MAX_WORKING_CODEPOINT
16080                                    - $MAX_UNICODE_CODEPOINT);
16081        $non_unicode_string = " plus all above-Unicode code points";
16082    }
16083    else {
16084        $unicode_count = $count;
16085        $non_unicode_string = "";
16086    }
16087
16088    my $string_count = clarify_number($unicode_count) . $non_unicode_string;
16089
16090    my $definition = $input_table->calculate_table_definition;
16091    if ($definition) {
16092
16093        # Save the definition for later use.
16094        $input_table->set_definition($definition);
16095
16096        $definition = ": $definition";
16097    }
16098
16099    my $status = $input_table->status;
16100    my $status_info = $input_table->status_info;
16101    my $caseless_equivalent = $input_table->caseless_equivalent;
16102
16103    # Don't mention a placeholder equivalent as it isn't to be listed in the
16104    # pod
16105    $caseless_equivalent = 0 if $caseless_equivalent != 0
16106                                && $caseless_equivalent->fate > $ORDINARY;
16107
16108    my $entry_for_first_table; # The entry for the first table output.
16109                           # Almost certainly, it is the parent.
16110
16111    # For each related table (including itself), we will generate a pod entry
16112    # for each name each table goes by
16113    foreach my $table ($input_table, $input_table->children) {
16114
16115        # Unicode::UCD cannot deal with null string property values, so skip
16116        # any tables that have no non-null names.
16117        next if ! grep { $_->name ne "" } $table->aliases;
16118
16119        # First, gather all the info that applies to this table as a whole.
16120
16121        push @zero_match_tables, $table if $count == 0
16122                                            # Don't mention special tables
16123                                            # as being zero length
16124                                           && $table->fate == $ORDINARY;
16125
16126        my $table_property = $table->property;
16127
16128        # The short name has all the underscores removed, while the full name
16129        # retains them.  Later, we decide whether to output a short synonym
16130        # for the full one, we need to compare apples to apples, so we use the
16131        # short name's length including underscores.
16132        my $table_property_short_name_length;
16133        my $table_property_short_name
16134            = $table_property->short_name(\$table_property_short_name_length);
16135        my $table_property_full_name = $table_property->full_name;
16136
16137        # Get how much savings there is in the short name over the full one
16138        # (delta will always be <= 0)
16139        my $table_property_short_delta = $table_property_short_name_length
16140                                         - length($table_property_full_name);
16141        my @table_description = $table->description;
16142        my @table_note = $table->note;
16143
16144        # Generate an entry for each alias in this table.
16145        my $entry_for_first_alias;  # saves the first one encountered.
16146        foreach my $alias ($table->aliases) {
16147
16148            # Skip if not to go in pod.
16149            next unless $alias->make_re_pod_entry;
16150
16151            # Start gathering all the components for the entry
16152            my $name = $alias->name;
16153
16154            # Skip if name is empty, as can't be accessed by regexes.
16155            next if $name eq "";
16156
16157            my $entry;      # Holds the left column, may include extras
16158            my $entry_ref;  # To refer to the left column's contents from
16159                            # another entry; has no extras
16160
16161            # First the left column of the pod entry.  Tables for the $perl
16162            # property always use the single form.
16163            if ($table_property == $perl) {
16164                $entry = "\\p{$name}";
16165                $entry .= " \\p$name" if length $name == 1; # Show non-braced
16166                                                            # form too
16167                $entry_ref = "\\p{$name}";
16168            }
16169            else {    # Compound form.
16170
16171                # Only generate one entry for all the aliases that mean true
16172                # or false in binary properties.  Append a '*' to indicate
16173                # some are missing.  (The heading comment notes this.)
16174                my $rhs;
16175                if ($type == $BINARY) {
16176                    next if $name ne 'N' && $name ne 'Y';
16177                    $rhs = "$name*";
16178                }
16179                elsif ($type != $FORCED_BINARY) {
16180                    $rhs = $name;
16181                }
16182                else {
16183
16184                    # Forced binary properties require special handling.  It
16185                    # has two sets of tables, one set is true/false; and the
16186                    # other set is everything else.  Entries are generated for
16187                    # each set.  Use the Bidi_Mirrored property (which appears
16188                    # in all Unicode versions) to get a list of the aliases
16189                    # for the true/false tables.  Of these, only output the N
16190                    # and Y ones, the same as, a regular binary property.  And
16191                    # output all the rest, same as a non-binary property.
16192                    my $bm = property_ref("Bidi_Mirrored");
16193                    if ($name eq 'N' || $name eq 'Y') {
16194                        $rhs = "$name*";
16195                    } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases,
16196                                                        $bm->table("N")->aliases)
16197                    {
16198                        next;
16199                    }
16200                    else {
16201                        $rhs = $name;
16202                    }
16203                }
16204
16205                # Colon-space is used to give a little more space to be easier
16206                # to read;
16207                $entry = "\\p{"
16208                        . $table_property_full_name
16209                        . ": $rhs}";
16210
16211                # But for the reference to this entry, which will go in the
16212                # right column, where space is at a premium, use equals
16213                # without a space
16214                $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
16215            }
16216
16217            # Then the right (info) column.  This is stored as components of
16218            # an array for the moment, then joined into a string later.  For
16219            # non-internal only properties, begin the info with the entry for
16220            # the first table we encountered (if any), as things are ordered
16221            # so that that one is the most descriptive.  This leads to the
16222            # info column of an entry being a more descriptive version of the
16223            # name column
16224            my @info;
16225            if ($name =~ /^_/) {
16226                push @info,
16227                        '(For internal use by Perl, not necessarily stable)';
16228            }
16229            elsif ($entry_for_first_alias) {
16230                push @info, $entry_for_first_alias;
16231            }
16232
16233            # If this entry is equivalent to another, add that to the info,
16234            # using the first such table we encountered
16235            if ($entry_for_first_table) {
16236                if (@info) {
16237                    push @info, "(= $entry_for_first_table)";
16238                }
16239                else {
16240                    push @info, $entry_for_first_table;
16241                }
16242            }
16243
16244            # If the name is a large integer, add an equivalent with an
16245            # exponent for better readability
16246            if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
16247                push @info, sprintf "(= %.1e)", $name
16248            }
16249
16250            my $parenthesized = "";
16251            if (! $entry_for_first_alias) {
16252
16253                # This is the first alias for the current table.  The alias
16254                # array is ordered so that this is the fullest, most
16255                # descriptive alias, so it gets the fullest info.  The other
16256                # aliases are mostly merely pointers to this one, using the
16257                # information already added above.
16258
16259                # Display any status message, but only on the parent table
16260                if ($status && ! $entry_for_first_table) {
16261                    push @info, $status_info;
16262                }
16263
16264                # Put out any descriptive info
16265                if (@table_description || @table_note) {
16266                    push @info, join "; ", @table_description, @table_note;
16267                }
16268
16269                # Look to see if there is a shorter name we can point people
16270                # at
16271                my $standard_name = standardize($name);
16272                my $short_name;
16273                my $proposed_short = $table->short_name;
16274                if (defined $proposed_short) {
16275                    my $standard_short = standardize($proposed_short);
16276
16277                    # If the short name is shorter than the standard one, or
16278                    # even if it's not, but the combination of it and its
16279                    # short property name (as in \p{prop=short} ($perl doesn't
16280                    # have this form)) saves at least two characters, then,
16281                    # cause it to be listed as a shorter synonym.
16282                    if (length $standard_short < length $standard_name
16283                        || ($table_property != $perl
16284                            && (length($standard_short)
16285                                - length($standard_name)
16286                                + $table_property_short_delta)  # (<= 0)
16287                                < -2))
16288                    {
16289                        $short_name = $proposed_short;
16290                        if ($table_property != $perl) {
16291                            $short_name = $table_property_short_name
16292                                          . "=$short_name";
16293                        }
16294                        $short_name = "\\p{$short_name}";
16295                    }
16296                }
16297
16298                # And if this is a compound form name, see if there is a
16299                # single form equivalent
16300                my $single_form;
16301                if ($table_property != $perl && $table_property != $block) {
16302
16303                    # Special case the binary N tables, so that will print
16304                    # \P{single}, but use the Y table values to populate
16305                    # 'single', as we haven't likewise populated the N table.
16306                    # For forced binary tables, we can't just look at the N
16307                    # table, but must see if this table is equivalent to the N
16308                    # one, as there are two equivalent beasts in these
16309                    # properties.
16310                    my $test_table;
16311                    my $p;
16312                    if (   ($type == $BINARY
16313                            && $input_table == $property->table('No'))
16314                        || ($type == $FORCED_BINARY
16315                            && $property->table('No')->
16316                                        is_set_equivalent_to($input_table)))
16317                    {
16318                        $test_table = $property->table('Yes');
16319                        $p = 'P';
16320                    }
16321                    else {
16322                        $test_table = $input_table;
16323                        $p = 'p';
16324                    }
16325
16326                    # Look for a single form amongst all the children.
16327                    foreach my $table ($test_table->children) {
16328                        next if $table->property != $perl;
16329                        my $proposed_name = $table->short_name;
16330                        next if ! defined $proposed_name;
16331
16332                        # Don't mention internal-only properties as a possible
16333                        # single form synonym
16334                        next if substr($proposed_name, 0, 1) eq '_';
16335
16336                        $proposed_name = "\\$p\{$proposed_name}";
16337                        if (! defined $single_form
16338                            || length($proposed_name) < length $single_form)
16339                        {
16340                            $single_form = $proposed_name;
16341
16342                            # The goal here is to find a single form; not the
16343                            # shortest possible one.  We've already found a
16344                            # short name.  So, stop at the first single form
16345                            # found, which is likely to be closer to the
16346                            # original.
16347                            last;
16348                        }
16349                    }
16350                }
16351
16352                # Output both short and single in the same parenthesized
16353                # expression, but with only one of 'Single', 'Short' if there
16354                # are both items.
16355                if ($short_name || $single_form || $table->conflicting) {
16356                    $parenthesized .= "Short: $short_name" if $short_name;
16357                    if ($short_name && $single_form) {
16358                        $parenthesized .= ', ';
16359                    }
16360                    elsif ($single_form) {
16361                        $parenthesized .= 'Single: ';
16362                    }
16363                    $parenthesized .= $single_form if $single_form;
16364                }
16365            }
16366
16367            if ($caseless_equivalent != 0) {
16368                $parenthesized .=  '; ' if $parenthesized ne "";
16369                $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
16370            }
16371
16372
16373            # Warn if this property isn't the same as one that a
16374            # semi-casual user might expect.  The other components of this
16375            # parenthesized structure are calculated only for the first entry
16376            # for this table, but the conflicting is deemed important enough
16377            # to go on every entry.
16378            my $conflicting = join " NOR ", $table->conflicting;
16379            if ($conflicting) {
16380                $parenthesized .=  '; ' if $parenthesized ne "";
16381                $parenthesized .= "NOT $conflicting";
16382            }
16383
16384            push @info, "($parenthesized)" if $parenthesized;
16385
16386            if ($name =~ /_$/ && $alias->loose_match) {
16387                push @info, "Note the trailing '_' matters in spite of loose matching rules.";
16388            }
16389
16390            if ($table_property != $perl && $table->perl_extension) {
16391                push @info, '(Perl extension)';
16392            }
16393            my $definition = $table->definition // "";
16394            $definition = "" if $entry_for_first_alias;
16395            $definition = ": $definition" if $definition;
16396            push @info, "($string_count$definition)";
16397
16398            # Now, we have both the entry and info so add them to the
16399            # list of all the properties.
16400            push @match_properties,
16401                format_pod_line($indent_info_column,
16402                                $entry,
16403                                join( " ", @info),
16404                                $alias->status,
16405                                $alias->loose_match);
16406
16407            $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
16408        } # End of looping through the aliases for this table.
16409
16410        if (! $entry_for_first_table) {
16411            $entry_for_first_table = $entry_for_first_alias;
16412        }
16413    } # End of looping through all the related tables
16414    return;
16415}
16416
16417sub make_ucd_table_pod_entries($table) {
16418    # Generate the entries for the UCD section of the pod for $table.  This
16419    # also calculates if names are ambiguous, so has to be called even if the
16420    # pod is not being output
16421
16422    my $short_name = $table->name;
16423    my $standard_short_name = standardize($short_name);
16424    my $full_name = $table->full_name;
16425    my $standard_full_name = standardize($full_name);
16426
16427    my $full_info = "";     # Text of info column for full-name entries
16428    my $other_info = "";    # Text of info column for short-name entries
16429    my $short_info = "";    # Text of info column for other entries
16430    my $meaning = "";       # Synonym of this table
16431
16432    my $property = ($table->isa('Property'))
16433                   ? $table
16434                   : $table->parent->property;
16435
16436    my $perl_extension = $table->perl_extension;
16437    my $is_perl_extension_match_table_but_not_dollar_perl
16438                                                        = $property != $perl
16439                                                       && $perl_extension
16440                                                       && $property != $table;
16441
16442    # Get the more official name for perl extensions that aren't
16443    # stand-alone properties
16444    if ($is_perl_extension_match_table_but_not_dollar_perl) {
16445        if ($property->type == $BINARY) {
16446            $meaning = $property->full_name;
16447        }
16448        else {
16449            $meaning = $table->parent->complete_name;
16450        }
16451    }
16452
16453    # There are three types of info column.  One for the short name, one for
16454    # the full name, and one for everything else.  They mostly are the same,
16455    # so initialize in the same loop.
16456
16457    foreach my $info_ref (\$full_info, \$short_info, \$other_info) {
16458        if ($info_ref != \$full_info) {
16459
16460            # The non-full name columns include the full name
16461            $$info_ref .= $full_name;
16462        }
16463
16464
16465        if ($is_perl_extension_match_table_but_not_dollar_perl) {
16466
16467            # Add the synonymous name for the non-full name entries; and to
16468            # the full-name entry if it adds extra information
16469            if (   standardize($meaning) ne $standard_full_name
16470                || $info_ref == \$other_info
16471                || $info_ref == \$short_info)
16472            {
16473                my $parenthesized =  $info_ref != \$full_info;
16474                $$info_ref .= " " if $$info_ref && $parenthesized;
16475                $$info_ref .= "(=" if $parenthesized;
16476                $$info_ref .= "$meaning";
16477                $$info_ref .= ")" if $parenthesized;
16478                $$info_ref .= ".";
16479            }
16480        }
16481
16482        # And the full-name entry includes the short name, if shorter
16483        if ($info_ref == \$full_info
16484            && length $standard_short_name < length $standard_full_name)
16485        {
16486            $full_info =~ s/\.\Z//;
16487            $full_info .= "  " if $full_info;
16488            $full_info .= "(Short: $short_name)";
16489        }
16490
16491        if ($table->perl_extension) {
16492            $$info_ref =~ s/\.\Z//;
16493            $$info_ref .= ".  " if $$info_ref;
16494            $$info_ref .= "(Perl extension)";
16495        }
16496    }
16497
16498    my $definition;
16499    my $definition_table;
16500    my $type = $table->property->type;
16501    if ($type == $BINARY || $type == $FORCED_BINARY) {
16502        $definition_table = $table->property->table('Y');
16503    }
16504    elsif ($table->isa('Match_Table')) {
16505        $definition_table = $table;
16506    }
16507
16508    $definition = $definition_table->calculate_table_definition
16509                                            if defined $definition_table
16510                                                    && $definition_table != 0;
16511
16512    # Add any extra annotations to the full name entry
16513    foreach my $more_info ($table->description,
16514                            $definition,
16515                            $table->note,
16516                            $table->status_info)
16517    {
16518        next unless $more_info;
16519        $full_info =~ s/\.\Z//;
16520        $full_info .= ".  " if $full_info;
16521        $full_info .= $more_info;
16522    }
16523    if ($table->property->type == $FORCED_BINARY) {
16524        if ($full_info) {
16525            $full_info =~ s/\.\Z//;
16526            $full_info .= ".  ";
16527        }
16528        $full_info .= "This is a combination property which has both:"
16529                    . " 1) a map to various string values; and"
16530                    . " 2) a map to boolean Y/N, where 'Y' means the"
16531                    . " string value is non-empty.  Add the prefix 'is'"
16532                    . " to the prop_invmap() call to get the latter";
16533    }
16534
16535    # These keep track if have created full and short name pod entries for the
16536    # property
16537    my $done_full = 0;
16538    my $done_short = 0;
16539
16540    # Every possible name is kept track of, even those that aren't going to be
16541    # output.  This way we can be sure to find the ambiguities.
16542    foreach my $alias ($table->aliases) {
16543        my $name = $alias->name;
16544        my $standard = standardize($name);
16545        my $info;
16546        my $output_this = $alias->ucd;
16547
16548        # If the full and short names are the same, we want to output the full
16549        # one's entry, so it has priority.
16550        if ($standard eq $standard_full_name) {
16551            next if $done_full;
16552            $done_full = 1;
16553            $info = $full_info;
16554        }
16555        elsif ($standard eq $standard_short_name) {
16556            next if $done_short;
16557            $done_short = 1;
16558            next if $standard_short_name eq $standard_full_name;
16559            $info = $short_info;
16560        }
16561        else {
16562            $info = $other_info;
16563        }
16564
16565        $combination_property{$standard} = 1
16566                                  if $table->property->type == $FORCED_BINARY;
16567
16568        # Here, we have set up the two columns for this entry.  But if an
16569        # entry already exists for this name, we have to decide which one
16570        # we're going to later output.
16571        if (exists $ucd_pod{$standard}) {
16572
16573            # If the two entries refer to the same property, it's not going to
16574            # be ambiguous.  (Likely it's because the names when standardized
16575            # are the same.)  But that means if they are different properties,
16576            # there is ambiguity.
16577            if ($ucd_pod{$standard}->{'property'} != $property) {
16578
16579                # Here, we have an ambiguity.  This code assumes that one is
16580                # scheduled to be output and one not and that one is a perl
16581                # extension (which is not to be output) and the other isn't.
16582                # If those assumptions are wrong, things have to be rethought.
16583                if ($ucd_pod{$standard}{'output_this'} == $output_this
16584                    || $ucd_pod{$standard}{'perl_extension'} == $perl_extension
16585                    || $output_this == $perl_extension)
16586                {
16587                    Carp::my_carp("Bad news.  $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations.  Proceeding anyway.");
16588                }
16589
16590                # We modify the info column of the one being output to
16591                # indicate the ambiguity.  Set $which to point to that one's
16592                # info.
16593                my $which;
16594                if ($ucd_pod{$standard}{'output_this'}) {
16595                    $which = \$ucd_pod{$standard}->{'info'};
16596                }
16597                else {
16598                    $which = \$info;
16599                    $meaning = $ucd_pod{$standard}{'meaning'};
16600                }
16601
16602                chomp $$which;
16603                $$which =~ s/\.\Z//;
16604                $$which .= "; NOT '$standard' meaning '$meaning'";
16605
16606                $ambiguous_names{$standard} = 1;
16607            }
16608
16609            # Use the non-perl-extension variant
16610            next unless $ucd_pod{$standard}{'perl_extension'};
16611        }
16612
16613        # Store enough information about this entry that we can later look for
16614        # ambiguities, and output it properly.
16615        $ucd_pod{$standard} = { 'name' => $name,
16616                                'info' => $info,
16617                                'meaning' => $meaning,
16618                                'output_this' => $output_this,
16619                                'perl_extension' => $perl_extension,
16620                                'property' => $property,
16621                                'status' => $alias->status,
16622        };
16623    } # End of looping through all this table's aliases
16624
16625    return;
16626}
16627
16628sub pod_alphanumeric_sort {
16629    # Sort pod entries alphanumerically.
16630
16631    # The first few character columns are filler, plus the '\p{'; and get rid
16632    # of all the trailing stuff, starting with the trailing '}', so as to sort
16633    # on just 'Name=Value'
16634    (my $a = lc $a) =~ s/^ .*? \{ //x;
16635    $a =~ s/}.*//;
16636    (my $b = lc $b) =~ s/^ .*? \{ //x;
16637    $b =~ s/}.*//;
16638
16639    # Determine if the two operands are both internal only or both not.
16640    # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
16641    # should be the underscore that begins internal only
16642    my $a_is_internal = (substr($a, 0, 1) eq '_');
16643    my $b_is_internal = (substr($b, 0, 1) eq '_');
16644
16645    # Sort so the internals come last in the table instead of first (which the
16646    # leading underscore would otherwise indicate).
16647    if ($a_is_internal != $b_is_internal) {
16648        return 1 if $a_is_internal;
16649        return -1
16650    }
16651
16652    # Determine if the two operands are compound or not, and if so if are
16653    # "numeric" property values or not, like \p{Age: 3.0}.  But there are also
16654    # things like \p{Canonical_Combining_Class: CCC133} and \p{Age: V10_0},
16655    # all of which this considers numeric, and for sorting, looks just at the
16656    # numeric parts.  It can also be a rational like \p{Numeric Value=-1/2}.
16657    my $split_re = qr/
16658        ^ ( [^:=]+ ) # $1 is undef if not a compound form, otherwise is the
16659                     # property name
16660        [:=] \s*     # The syntax for the compound form
16661        (?:          # followed by ...
16662            (        # $2 gets defined if what follows is a "numeric"
16663                     # expression, which is ...
16664              ( -? \d+ (?: [.\/] \d+)?  # An integer, float, or rational
16665                                        # number, optionally signed
16666               | [[:alpha:]]{2,} \d+ $ ) # or something like CCC131.  Either
16667                                         # of these go into $3
16668             | ( V \d+ _ \d+ )           # or a Unicode's Age property version
16669                                         # number, into $4
16670            )
16671            | .* $    # If not "numeric", accept anything so that $1 gets
16672                      # defined if it is any compound form
16673        ) /ix;
16674    my ($a_initial, $a_numeric, $a_number, $a_version) = ($a =~ $split_re);
16675    my ($b_initial, $b_numeric, $b_number, $b_version) = ($b =~ $split_re);
16676
16677    # Sort alphabeticlly on the whole property name if either operand isn't
16678    # compound, or they differ.
16679    return $a cmp $b if   ! defined $a_initial
16680                       || ! defined $b_initial
16681                       || $a_initial ne $b_initial;
16682
16683    if (! defined $a_numeric) {
16684
16685        # If neither is numeric, use alpha sort
16686        return $a cmp $b if ! defined $b_numeric;
16687        return 1;  # Sort numeric ahead of alpha
16688    }
16689
16690    # Here $a is numeric
16691    return -1 if ! defined $b_numeric;  # Numeric sorts before alpha
16692
16693    # Here they are both numeric in the same property.
16694    # Convert version numbers into regular numbers
16695    if (defined $a_version) {
16696        ($a_number = $a_version) =~ s/^V//i;
16697        $a_number =~ s/_/./;
16698    }
16699    else {  # Otherwise get rid of the, e.g., CCC in CCC9 */
16700        $a_number =~ s/ ^ [[:alpha:]]+ //x;
16701    }
16702    if (defined $b_version) {
16703        ($b_number = $b_version) =~ s/^V//i;
16704        $b_number =~ s/_/./;
16705    }
16706    else {
16707        $b_number =~ s/ ^ [[:alpha:]]+ //x;
16708    }
16709
16710    # Convert rationals to floating for the comparison.
16711    $a_number = eval $a_number if $a_number =~ qr{/};
16712    $b_number = eval $b_number if $b_number =~ qr{/};
16713
16714    return $a_number <=> $b_number || $a cmp $b;
16715}
16716
16717sub make_pod () {
16718    # Create the .pod file.  This generates the various subsections and then
16719    # combines them in one big HERE document.
16720
16721    my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
16722
16723    return unless defined $pod_directory;
16724    print "Making pod file\n" if $verbosity >= $PROGRESS;
16725
16726    my $exception_message =
16727    '(Any exceptions are individually noted beginning with the word NOT.)';
16728    my @block_warning;
16729    if (-e 'Blocks.txt') {
16730
16731        # Add the line: '\p{In_*}    \p{Block: *}', with the warning message
16732        # if the global $has_In_conflicts indicates we have them.
16733        push @match_properties, format_pod_line($indent_info_column,
16734                                                '\p{In_*}',
16735                                                '\p{Block: *}'
16736                                                    . (($has_In_conflicts)
16737                                                      ? " $exception_message"
16738                                                      : ""),
16739                                                 $DISCOURAGED);
16740        @block_warning = << "END";
16741
16742In particular, matches in the Block property have single forms
16743defined by Perl that begin with C<"In_">, C<"Is_>, or even with no prefix at
16744all,  Like all B<DISCOURAGED> forms, these are not stable.  For example,
16745C<\\p{Block=Deseret}> can currently be written as C<\\p{In_Deseret}>,
16746C<\\p{Is_Deseret}>, or C<\\p{Deseret}>.  But, a new Unicode version may
16747come along that would force Perl to change the meaning of one or more of
16748these, and your program would no longer be correct.  Currently there are no
16749such conflicts with the form that begins C<"In_">, but there are many with the
16750other two shortcuts, and Unicode continues to define new properties that begin
16751with C<"In">, so it's quite possible that a conflict will occur in the future.
16752The compound form is guaranteed to not become obsolete, and its meaning is
16753clearer anyway.  See L<perlunicode/"Blocks"> for more information about this.
16754
16755User-defined properties must begin with "In" or "Is".  These override any
16756Unicode property of the same name.
16757END
16758    }
16759    my $text = $Is_flags_text;
16760    $text = "$exception_message $text" if $has_Is_conflicts;
16761
16762    # And the 'Is_ line';
16763    push @match_properties, format_pod_line($indent_info_column,
16764                                            '\p{Is_*}',
16765                                            "\\p{*} $text");
16766    push @match_properties, format_pod_line($indent_info_column,
16767            '\p{Name=*}',
16768            "Combination of Name and Name_Alias properties; has special"
16769          . " loose matching rules, for which see Unicode UAX #44");
16770    push @match_properties, format_pod_line($indent_info_column,
16771                                            '\p{Na=*}',
16772                                            '\p{Name=*}');
16773
16774    # Sort the properties array for output.  It is sorted alphabetically
16775    # except numerically for numeric properties, and only output unique lines.
16776    @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
16777
16778    my $formatted_properties = simple_fold(\@match_properties,
16779                                        "",
16780                                        # indent succeeding lines by two extra
16781                                        # which looks better
16782                                        $indent_info_column + 2,
16783
16784                                        # shorten the line length by how much
16785                                        # the formatter indents, so the folded
16786                                        # line will fit in the space
16787                                        # presumably available
16788                                        $automatic_pod_indent);
16789    # Add column headings, indented to be a little more centered, but not
16790    # exactly
16791    $formatted_properties =  format_pod_line($indent_info_column,
16792                                                    '    NAME',
16793                                                    '           INFO')
16794                                    . "\n"
16795                                    . $formatted_properties;
16796
16797    # Generate pod documentation lines for the tables that match nothing
16798    my $zero_matches = "";
16799    if (@zero_match_tables) {
16800        @zero_match_tables = uniques(@zero_match_tables);
16801        $zero_matches = join "\n\n",
16802                        map { $_ = '=item \p{' . $_->complete_name . "}" }
16803                            sort { $a->complete_name cmp $b->complete_name }
16804                            @zero_match_tables;
16805
16806        $zero_matches = <<END;
16807
16808=head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
16809
16810Unicode has some property-value pairs that currently don't match anything.
16811This happens generally either because they are obsolete, or they exist for
16812symmetry with other forms, but no language has yet been encoded that uses
16813them.  In this version of Unicode, the following match zero code points:
16814
16815=over 4
16816
16817$zero_matches
16818
16819=back
16820
16821END
16822    }
16823
16824    # Generate list of properties that we don't accept, grouped by the reasons
16825    # why.  This is so only put out the 'why' once, and then list all the
16826    # properties that have that reason under it.
16827
16828    my %why_list;   # The keys are the reasons; the values are lists of
16829                    # properties that have the key as their reason
16830
16831    # For each property, add it to the list that are suppressed for its reason
16832    # The sort will cause the alphabetically first properties to be added to
16833    # each list first, so each list will be sorted.
16834    foreach my $property (sort keys %why_suppressed) {
16835        next unless $why_suppressed{$property};
16836        push @{$why_list{$why_suppressed{$property}}}, $property;
16837    }
16838
16839    # For each reason (sorted by the first property that has that reason)...
16840    my @bad_re_properties;
16841    foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
16842                     keys %why_list)
16843    {
16844        # Add to the output, all the properties that have that reason.
16845        my $has_item = 0;   # Flag if actually output anything.
16846        foreach my $name (@{$why_list{$why}}) {
16847
16848            # Split compound names into $property and $table components
16849            my $property = $name;
16850            my $table;
16851            if ($property =~ / (.*) = (.*) /x) {
16852                $property = $1;
16853                $table = $2;
16854            }
16855
16856            # This release of Unicode may not have a property that is
16857            # suppressed, so don't reference a non-existent one.
16858            $property = property_ref($property);
16859            next if ! defined $property;
16860
16861            # And since this list is only for match tables, don't list the
16862            # ones that don't have match tables.
16863            next if ! $property->to_create_match_tables;
16864
16865            # Find any abbreviation, and turn it into a compound name if this
16866            # is a property=value pair.
16867            my $short_name = $property->name;
16868            $short_name .= '=' . $property->table($table)->name if $table;
16869
16870            # Start with an empty line.
16871            push @bad_re_properties, "\n\n" unless $has_item;
16872
16873            # And add the property as an item for the reason.
16874            push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
16875            $has_item = 1;
16876        }
16877
16878        # And add the reason under the list of properties, if such a list
16879        # actually got generated.  Note that the header got added
16880        # unconditionally before.  But pod ignores extra blank lines, so no
16881        # harm.
16882        push @bad_re_properties, "\n$why\n" if $has_item;
16883
16884    } # End of looping through each reason.
16885
16886    if (! @bad_re_properties) {
16887        push @bad_re_properties,
16888                "*** This installation accepts ALL non-Unihan properties ***";
16889    }
16890    else {
16891        # Add =over only if non-empty to avoid an empty =over/=back section,
16892        # which is considered bad form.
16893        unshift @bad_re_properties, "\n=over 4\n";
16894        push @bad_re_properties, "\n=back\n";
16895    }
16896
16897    # Similarly, generate a list of files that we don't use, grouped by the
16898    # reasons why (Don't output if the reason is empty).  First, create a hash
16899    # whose keys are the reasons, and whose values are anonymous arrays of all
16900    # the files that share that reason.
16901    my %grouped_by_reason;
16902    foreach my $file (keys %skipped_files) {
16903        next unless $skipped_files{$file};
16904        push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
16905    }
16906
16907    # Then, sort each group.
16908    foreach my $group (keys %grouped_by_reason) {
16909        @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b }
16910                                        @{$grouped_by_reason{$group}} ;
16911    }
16912
16913    # Finally, create the output text.  For each reason (sorted by the
16914    # alphabetically first file that has that reason)...
16915    my @unused_files;
16916    foreach my $reason (sort { lc $grouped_by_reason{$a}->[0]
16917                               cmp lc $grouped_by_reason{$b}->[0]
16918                              }
16919                         keys %grouped_by_reason)
16920    {
16921        # Add all the files that have that reason to the output.  Start
16922        # with an empty line.
16923        push @unused_files, "\n\n";
16924        push @unused_files, map { "\n=item F<$_> \n" }
16925                            @{$grouped_by_reason{$reason}};
16926        # And add the reason under the list of files
16927        push @unused_files, "\n$reason\n";
16928    }
16929
16930    # Similarly, create the output text for the UCD section of the pod
16931    my @ucd_pod;
16932    foreach my $key (keys %ucd_pod) {
16933        next unless $ucd_pod{$key}->{'output_this'};
16934        push @ucd_pod, format_pod_line($indent_info_column,
16935                                       $ucd_pod{$key}->{'name'},
16936                                       $ucd_pod{$key}->{'info'},
16937                                       $ucd_pod{$key}->{'status'},
16938                                      );
16939    }
16940
16941    # Sort alphabetically, and fold for output
16942    @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod;
16943    my $ucd_pod = simple_fold(\@ucd_pod,
16944                           ' ',
16945                           $indent_info_column,
16946                           $automatic_pod_indent);
16947    $ucd_pod =  format_pod_line($indent_info_column, 'NAME', '  INFO')
16948                . "\n"
16949                . $ucd_pod;
16950    my $space_hex = sprintf("%02x", ord " ");
16951    local $" = "";
16952
16953    # Everything is ready to assemble.
16954    my @OUT = << "END";
16955=begin comment
16956
16957$HEADER
16958
16959To change this file, edit $0 instead.
16960
16961=end comment
16962
16963=head1 NAME
16964
16965$pod_file - Index of Unicode Version $unicode_version character properties in Perl
16966
16967=head1 DESCRIPTION
16968
16969This document provides information about the portion of the Unicode database
16970that deals with character properties, that is the portion that is defined on
16971single code points.  (L</Other information in the Unicode data base>
16972below briefly mentions other data that Unicode provides.)
16973
16974Perl can provide access to all non-provisional Unicode character properties,
16975though not all are enabled by default.  The omitted ones are the Unihan
16976properties and certain
16977deprecated or Unicode-internal properties.  (An installation may choose to
16978recompile Perl's tables to change this.  See L</Unicode character
16979properties that are NOT accepted by Perl>.)
16980
16981For most purposes, access to Unicode properties from the Perl core is through
16982regular expression matches, as described in the next section.
16983For some special purposes, and to access the properties that are not suitable
16984for regular expression matching, all the Unicode character properties that
16985Perl handles are accessible via the standard L<Unicode::UCD> module, as
16986described in the section L</Properties accessible through Unicode::UCD>.
16987
16988Perl also provides some additional extensions and short-cut synonyms
16989for Unicode properties.
16990
16991This document merely lists all available properties and does not attempt to
16992explain what each property really means.  There is a brief description of each
16993Perl extension; see L<perlunicode/Other Properties> for more information on
16994these.  There is some detail about Blocks, Scripts, General_Category,
16995and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
16996official Unicode properties, refer to the Unicode standard.  A good starting
16997place is L<$unicode_reference_url>.
16998
16999Note that you can define your own properties; see
17000L<perlunicode/"User-Defined Character Properties">.
17001
17002=head1 Properties accessible through C<\\p{}> and C<\\P{}>
17003
17004The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
17005most of the Unicode character properties.  The table below shows all these
17006constructs, both single and compound forms.
17007
17008B<Compound forms> consist of two components, separated by an equals sign or a
17009colon.  The first component is the property name, and the second component is
17010the particular value of the property to match against, for example,
17011C<\\p{Script_Extensions: Greek}> and C<\\p{Script_Extensions=Greek}> both mean
17012to match characters whose Script_Extensions property value is Greek.
17013(C<Script_Extensions> is an improved version of the C<Script> property.)
17014
17015B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
17016their equivalent compound forms.  The table shows these equivalences.  (In our
17017example, C<\\p{Greek}> is a just a shortcut for
17018C<\\p{Script_Extensions=Greek}>).  There are also a few Perl-defined single
17019forms that are not shortcuts for a compound form.  One such is C<\\p{Word}>.
17020These are also listed in the table.
17021
17022In parsing these constructs, Perl always ignores Upper/lower case differences
17023everywhere within the {braces}.  Thus C<\\p{Greek}> means the same thing as
17024C<\\p{greek}>.  But note that changing the case of the C<"p"> or C<"P"> before
17025the left brace completely changes the meaning of the construct, from "match"
17026(for C<\\p{}>) to "doesn't match" (for C<\\P{}>).  Casing in this document is
17027for improved legibility.
17028
17029Also, white space, hyphens, and underscores are normally ignored
17030everywhere between the {braces}, and hence can be freely added or removed
17031even if the C</x> modifier hasn't been specified on the regular expression.
17032But in the table below $a_bold_stricter at the beginning of an entry
17033means that tighter (stricter) rules are used for that entry:
17034
17035=over 4
17036
17037=over 4
17038
17039=item Single form (C<\\p{name}>) tighter rules:
17040
17041White space, hyphens, and underscores ARE significant
17042except for:
17043
17044=over 4
17045
17046=item * white space adjacent to a non-word character
17047
17048=item * underscores separating digits in numbers
17049
17050=back
17051
17052That means, for example, that you can freely add or remove white space
17053adjacent to (but within) the braces without affecting the meaning.
17054
17055=item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
17056
17057The tighter rules given above for the single form apply to everything to the
17058right of the colon or equals; the looser rules still apply to everything to
17059the left.
17060
17061That means, for example, that you can freely add or remove white space
17062adjacent to (but within) the braces and the colon or equal sign.
17063
17064=back
17065
17066=back
17067
17068Some properties are considered obsolete by Unicode, but still available.
17069There are several varieties of obsolescence:
17070
17071=over 4
17072
17073=over 4
17074
17075=item Stabilized
17076
17077A property may be stabilized.  Such a determination does not indicate
17078that the property should or should not be used; instead it is a declaration
17079that the property will not be maintained nor extended for newly encoded
17080characters.  Such properties are marked with $a_bold_stabilized in the
17081table.
17082
17083=item Deprecated
17084
17085A property may be deprecated, perhaps because its original intent
17086has been replaced by another property, or because its specification was
17087somehow defective.  This means that its use is strongly
17088discouraged, so much so that a warning will be issued if used, unless the
17089regular expression is in the scope of a C<S<no warnings 'deprecated'>>
17090statement.  $A_bold_deprecated flags each such entry in the table, and
17091the entry there for the longest, most descriptive version of the property will
17092give the reason it is deprecated, and perhaps advice.  Perl may issue such a
17093warning, even for properties that aren't officially deprecated by Unicode,
17094when there used to be characters or code points that were matched by them, but
17095no longer.  This is to warn you that your program may not work like it did on
17096earlier Unicode releases.
17097
17098A deprecated property may be made unavailable in a future Perl version, so it
17099is best to move away from them.
17100
17101A deprecated property may also be stabilized, but this fact is not shown.
17102
17103=item Obsolete
17104
17105Properties marked with $a_bold_obsolete in the table are considered (plain)
17106obsolete.  Generally this designation is given to properties that Unicode once
17107used for internal purposes (but not any longer).
17108
17109=item Discouraged
17110
17111This is not actually a Unicode-specified obsolescence, but applies to certain
17112Perl extensions that are present for backwards compatibility, but are
17113discouraged from being used.  These are not obsolete, but their meanings are
17114not stable.  Future Unicode versions could force any of these extensions to be
17115removed without warning, replaced by another property with the same name that
17116means something different.  $A_bold_discouraged flags each such entry in the
17117table.  Use the equivalent shown instead.
17118
17119@block_warning
17120
17121=back
17122
17123=back
17124
17125The table below has two columns.  The left column contains the C<\\p{}>
17126constructs to look up, possibly preceded by the flags mentioned above; and
17127the right column contains information about them, like a description, or
17128synonyms.  The table shows both the single and compound forms for each
17129property that has them.  If the left column is a short name for a property,
17130the right column will give its longer, more descriptive name; and if the left
17131column is the longest name, the right column will show any equivalent shortest
17132name, in both single and compound forms if applicable.
17133
17134If braces are not needed to specify a property (e.g., C<\\pL>), the left
17135column contains both forms, with and without braces.
17136
17137The right column will also caution you if a property means something different
17138than what might normally be expected.
17139
17140All single forms are Perl extensions; a few compound forms are as well, and
17141are noted as such.
17142
17143Numbers in (parentheses) indicate the total number of Unicode code points
17144matched by the property.  For the entries that give the longest, most
17145descriptive version of the property, the count is followed by a list of some
17146of the code points matched by it.  The list includes all the matched
17147characters in the 0-255 range, enclosed in the familiar [brackets] the same as
17148a regular expression bracketed character class.  Following that, the next few
17149higher matching ranges are also given.  To avoid visual ambiguity, the SPACE
17150character is represented as C<\\x$space_hex>.
17151
17152For emphasis, those properties that match no code points at all are listed as
17153well in a separate section following the table.
17154
17155Most properties match the same code points regardless of whether C<"/i">
17156case-insensitive matching is specified or not.  But a few properties are
17157affected.  These are shown with the notation S<C<(/i= I<other_property>)>>
17158in the second column.  Under case-insensitive matching they match the
17159same code pode points as the property I<other_property>.
17160
17161There is no description given for most non-Perl defined properties (See
17162L<$unicode_reference_url> for that).
17163
17164For compactness, 'B<*>' is used as a wildcard instead of showing all possible
17165combinations.  For example, entries like:
17166
17167 \\p{Gc: *}                                  \\p{General_Category: *}
17168
17169mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
17170for the latter is also valid for the former.  Similarly,
17171
17172 \\p{Is_*}                                   \\p{*}
17173
17174means that if and only if, for example, C<\\p{Foo}> exists, then
17175C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
17176And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
17177C<\\p{IsFoo=Bar}>.  "*" here is restricted to something not beginning with an
17178underscore.
17179
17180Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
17181And 'No', 'F', and 'False' are all synonyms for 'N'.  The table shows 'Y*' and
17182'N*' to indicate this, and doesn't have separate entries for the other
17183possibilities.  Note that not all properties which have values 'Yes' and 'No'
17184are binary, and they have all their values spelled out without using this wild
17185card, and a C<NOT> clause in their description that highlights their not being
17186binary.  These also require the compound form to match them, whereas true
17187binary properties have both single and compound forms available.
17188
17189Note that all non-essential underscores are removed in the display of the
17190short names below.
17191
17192B<Legend summary:>
17193
17194=over 4
17195
17196=item Z<>B<*> is a wild-card
17197
17198=item B<(\\d+)> in the info column gives the number of Unicode code points matched
17199by this property.
17200
17201=item B<$DEPRECATED> means this is deprecated.
17202
17203=item B<$OBSOLETE> means this is obsolete.
17204
17205=item B<$STABILIZED> means this is stabilized.
17206
17207=item B<$STRICTER> means tighter (stricter) name matching applies.
17208
17209=item B<$DISCOURAGED> means use of this form is discouraged, and may not be
17210stable.
17211
17212=back
17213
17214$formatted_properties
17215
17216$zero_matches
17217
17218=head1 Properties accessible through Unicode::UCD
17219
17220The value of any Unicode (not including Perl extensions) character
17221property mentioned above for any single code point is available through
17222L<Unicode::UCD/charprop()>.  L<Unicode::UCD/charprops_all()> returns the
17223values of all the Unicode properties for a given code point.
17224
17225Besides these, all the Unicode character properties mentioned above
17226(except for those marked as for internal use by Perl) are also
17227accessible by L<Unicode::UCD/prop_invlist()>.
17228
17229Due to their nature, not all Unicode character properties are suitable for
17230regular expression matches, nor C<prop_invlist()>.  The remaining
17231non-provisional, non-internal ones are accessible via
17232L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation
17233hasn't included; see L<below for which those are|/Unicode character properties
17234that are NOT accepted by Perl>).
17235
17236For compatibility with other parts of Perl, all the single forms given in the
17237table in the L<section above|/Properties accessible through \\p{} and \\P{}>
17238are recognized.  BUT, there are some ambiguities between some Perl extensions
17239and the Unicode properties, all of which are silently resolved in favor of the
17240official Unicode property.  To avoid surprises, you should only use
17241C<prop_invmap()> for forms listed in the table below, which omits the
17242non-recommended ones.  The affected forms are the Perl single form equivalents
17243of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of
17244C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property,
17245whose short name is C<sc>.  The table indicates the current ambiguities in the
17246INFO column, beginning with the word C<"NOT">.
17247
17248The standard Unicode properties listed below are documented in
17249L<$unicode_reference_url>; Perl_Decimal_Digit is documented in
17250L<Unicode::UCD/prop_invmap()>.  The other Perl extensions are in
17251L<perlunicode/Other Properties>;
17252
17253The first column in the table is a name for the property; the second column is
17254an alternative name, if any, plus possibly some annotations.  The alternative
17255name is the property's full name, unless that would simply repeat the first
17256column, in which case the second column indicates the property's short name
17257(if different).  The annotations are given only in the entry for the full
17258name.  The annotations for binary properties include a list of the first few
17259ranges that the property matches.  To avoid any ambiguity, the SPACE character
17260is represented as C<\\x$space_hex>.
17261
17262If a property is obsolete, etc, the entry will be flagged with the same
17263characters used in the table in the L<section above|/Properties accessible
17264through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>.
17265
17266$ucd_pod
17267
17268=head1 Properties accessible through other means
17269
17270Certain properties are accessible also via core function calls.  These are:
17271
17272 Lowercase_Mapping          lc() and lcfirst()
17273 Titlecase_Mapping          ucfirst()
17274 Uppercase_Mapping          uc()
17275
17276Also, Case_Folding is accessible through the C</i> modifier in regular
17277expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>>
17278operator.
17279
17280Besides being able to say C<\\p{Name=...}>, the Name and Name_Aliases
17281properties are accessible through the C<\\N{}> interpolation in double-quoted
17282strings and regular expressions; and functions C<charnames::viacode()>,
17283C<charnames::vianame()>, and C<charnames::string_vianame()> (which require a
17284C<use charnames ();> to be specified.
17285
17286Finally, most properties related to decomposition are accessible via
17287L<Unicode::Normalize>.
17288
17289=head1 Unicode character properties that are NOT accepted by Perl
17290
17291Perl will generate an error for a few character properties in Unicode when
17292used in a regular expression.  The non-Unihan ones are listed below, with the
17293reasons they are not accepted, perhaps with work-arounds.  The short names for
17294the properties are listed enclosed in (parentheses).
17295As described after the list, an installation can change the defaults and choose
17296to accept any of these.  The list is machine generated based on the
17297choices made for the installation that generated this document.
17298
17299@bad_re_properties
17300
17301An installation can choose to allow any of these to be matched by downloading
17302the Unicode database from L<http://www.unicode.org/Public/> to
17303C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
17304controlling lists contained in the program
17305C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
17306(C<\%Config> is available from the Config module).
17307
17308Also, perl can be recompiled to operate on an earlier version of the Unicode
17309standard.  Further information is at
17310C<\$Config{privlib}>/F<unicore/README.perl>.
17311
17312=head1 Other information in the Unicode data base
17313
17314The Unicode data base is delivered in two different formats.  The XML version
17315is valid for more modern Unicode releases.  The other version is a collection
17316of files.  The two are intended to give equivalent information.  Perl uses the
17317older form; this allows you to recompile Perl to use early Unicode releases.
17318
17319The only non-character property that Perl currently supports is Named
17320Sequences, in which a sequence of code points
17321is given a name and generally treated as a single entity.  (Perl supports
17322these via the C<\\N{...}> double-quotish construct,
17323L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>.
17324
17325Below is a list of the files in the Unicode data base that Perl doesn't
17326currently use, along with very brief descriptions of their purposes.
17327Some of the names of the files have been shortened from those that Unicode
17328uses, in order to allow them to be distinguishable from similarly named files
17329on file systems for which only the first 8 characters of a name are
17330significant.
17331
17332=over 4
17333
17334@unused_files
17335
17336=back
17337
17338=head1 SEE ALSO
17339
17340L<$unicode_reference_url>
17341
17342L<perlrecharclass>
17343
17344L<perlunicode>
17345
17346END
17347
17348    # And write it.  The 0 means no utf8.
17349    main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
17350    return;
17351}
17352
17353sub make_Name_pm () {
17354    # Create and write Name.pm, which contains subroutines and data to use in
17355    # conjunction with Name.pl
17356
17357    # Maybe there's nothing to do.
17358    return unless $has_hangul_syllables || @code_points_ending_in_code_point;
17359
17360    my @name = <<END;
17361$HEADER
17362$INTERNAL_ONLY_HEADER
17363
17364END
17365
17366    # Convert these structures to output format.
17367    my $code_points_ending_in_code_point =
17368        main::simple_dumper(\@code_points_ending_in_code_point,
17369                            ' ' x 8);
17370    my $names = main::simple_dumper(\%names_ending_in_code_point,
17371                                    ' ' x 8);
17372    my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
17373                                    ' ' x 8);
17374
17375    # Do the same with the Hangul names,
17376    my $jamo;
17377    my $jamo_l;
17378    my $jamo_v;
17379    my $jamo_t;
17380    my $jamo_re;
17381    if ($has_hangul_syllables) {
17382
17383        # Construct a regular expression of all the possible
17384        # combinations of the Hangul syllables.
17385        my @L_re;   # Leading consonants
17386        for my $i ($LBase .. $LBase + $LCount - 1) {
17387            push @L_re, $Jamo{$i}
17388        }
17389        my @V_re;   # Middle vowels
17390        for my $i ($VBase .. $VBase + $VCount - 1) {
17391            push @V_re, $Jamo{$i}
17392        }
17393        my @T_re;   # Trailing consonants
17394        for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
17395            push @T_re, $Jamo{$i}
17396        }
17397
17398        # The whole re is made up of the L V T combination.
17399        $jamo_re = '('
17400                    . join ('|', sort @L_re)
17401                    . ')('
17402                    . join ('|', sort @V_re)
17403                    . ')('
17404                    . join ('|', sort @T_re)
17405                    . ')?';
17406
17407        # These hashes needed by the algorithm were generated
17408        # during reading of the Jamo.txt file
17409        $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
17410        $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
17411        $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
17412        $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
17413    }
17414
17415    push @name, <<END;
17416
17417package charnames;
17418
17419# This module contains machine-generated tables and code for the
17420# algorithmically-determinable Unicode character names.  The following
17421# routines can be used to translate between name and code point and vice versa
17422
17423{ # Closure
17424
17425    # Matches legal code point.  4-6 hex numbers, If there are 6, the first
17426    # two must be 10; if there are 5, the first must not be a 0.  Written this
17427    # way to decrease backtracking.  The first regex allows the code point to
17428    # be at the end of a word, but to work properly, the word shouldn't end
17429    # with a valid hex character.  The second one won't match a code point at
17430    # the end of a word, and doesn't have the run-on issue
17431    my \$run_on_code_point_re = qr/$run_on_code_point_re/;
17432    my \$code_point_re = qr/$code_point_re/;
17433
17434    # In the following hash, the keys are the bases of names which include
17435    # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The value
17436    # of each key is another hash which is used to get the low and high ends
17437    # for each range of code points that apply to the name.
17438    my %names_ending_in_code_point = (
17439$names
17440    );
17441
17442    # The following hash is a copy of the previous one, except is for loose
17443    # matching, so each name has blanks and dashes squeezed out
17444    my %loose_names_ending_in_code_point = (
17445$loose_names
17446    );
17447
17448    # And the following array gives the inverse mapping from code points to
17449    # names.  Lowest code points are first
17450    \@code_points_ending_in_code_point = (
17451$code_points_ending_in_code_point
17452    );
17453
17454    # Is exportable, make read-only
17455    Internals::SvREADONLY(\@code_points_ending_in_code_point, 1);
17456END
17457    # Earlier releases didn't have Jamos.  No sense outputting
17458    # them unless will be used.
17459    if ($has_hangul_syllables) {
17460        push @name, <<END;
17461
17462    # Convert from code point to Jamo short name for use in composing Hangul
17463    # syllable names
17464    my %Jamo = (
17465$jamo
17466    );
17467
17468    # Leading consonant (can be null)
17469    my %Jamo_L = (
17470$jamo_l
17471    );
17472
17473    # Vowel
17474    my %Jamo_V = (
17475$jamo_v
17476    );
17477
17478    # Optional trailing consonant
17479    my %Jamo_T = (
17480$jamo_t
17481    );
17482
17483    # Computed re that splits up a Hangul name into LVT or LV syllables
17484    my \$syllable_re = qr/$jamo_re/;
17485
17486    my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
17487    my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
17488
17489    # These constants names and values were taken from the Unicode standard,
17490    # version 5.1, section 3.12.  They are used in conjunction with Hangul
17491    # syllables
17492    my \$SBase = $SBase_string;
17493    my \$LBase = $LBase_string;
17494    my \$VBase = $VBase_string;
17495    my \$TBase = $TBase_string;
17496    my \$SCount = $SCount;
17497    my \$LCount = $LCount;
17498    my \$VCount = $VCount;
17499    my \$TCount = $TCount;
17500    my \$NCount = \$VCount * \$TCount;
17501END
17502    } # End of has Jamos
17503
17504    push @name, << 'END';
17505
17506    sub name_to_code_point_special {
17507        my ($name, $loose) = @_;
17508
17509        # Returns undef if not one of the specially handled names; otherwise
17510        # returns the code point equivalent to the input name
17511        # $loose is non-zero if to use loose matching, 'name' in that case
17512        # must be input as upper case with all blanks and dashes squeezed out.
17513END
17514    if ($has_hangul_syllables) {
17515        push @name, << 'END';
17516
17517        if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
17518            || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
17519        {
17520            return if $name !~ qr/^$syllable_re$/;
17521            my $L = $Jamo_L{$1};
17522            my $V = $Jamo_V{$2};
17523            my $T = (defined $3) ? $Jamo_T{$3} : 0;
17524            return ($L * $VCount + $V) * $TCount + $T + $SBase;
17525        }
17526END
17527    }
17528    push @name, << 'END';
17529
17530        # Name must end in 'code_point' for this to handle.
17531        return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
17532                   || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
17533
17534        my $base = $1;
17535        my $code_point = CORE::hex $2;
17536        my $names_ref;
17537
17538        if ($loose) {
17539            $names_ref = \%loose_names_ending_in_code_point;
17540        }
17541        else {
17542            return if $base !~ s/-$//;
17543            $names_ref = \%names_ending_in_code_point;
17544        }
17545
17546        # Name must be one of the ones which has the code point in it.
17547        return if ! $names_ref->{$base};
17548
17549        # Look through the list of ranges that apply to this name to see if
17550        # the code point is in one of them.
17551        for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
17552            return if $names_ref->{$base}{'low'}->[$i] > $code_point;
17553            next if $names_ref->{$base}{'high'}->[$i] < $code_point;
17554
17555            # Here, the code point is in the range.
17556            return $code_point;
17557        }
17558
17559        # Here, looked like the name had a code point number in it, but
17560        # did not match one of the valid ones.
17561        return;
17562    }
17563
17564    sub code_point_to_name_special {
17565        my $code_point = shift;
17566
17567        # Returns the name of a code point if algorithmically determinable;
17568        # undef if not
17569END
17570    if ($has_hangul_syllables) {
17571        push @name, << 'END';
17572
17573        # If in the Hangul range, calculate the name based on Unicode's
17574        # algorithm
17575        if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
17576            use integer;
17577            my $SIndex = $code_point - $SBase;
17578            my $L = $LBase + $SIndex / $NCount;
17579            my $V = $VBase + ($SIndex % $NCount) / $TCount;
17580            my $T = $TBase + $SIndex % $TCount;
17581            $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
17582            $name .= $Jamo{$T} if $T != $TBase;
17583            return $name;
17584        }
17585END
17586    }
17587    push @name, << 'END';
17588
17589        # Look through list of these code points for one in range.
17590        foreach my $hash (@code_points_ending_in_code_point) {
17591            return if $code_point < $hash->{'low'};
17592            if ($code_point <= $hash->{'high'}) {
17593                return sprintf("%s-%04X", $hash->{'name'}, $code_point);
17594            }
17595        }
17596        return;            # None found
17597    }
17598} # End closure
17599
176001;
17601END
17602
17603    main::write("Name.pm", 0, \@name);  # The 0 means no utf8.
17604    return;
17605}
17606
17607sub make_UCD () {
17608    # Create and write UCD.pl, which passes info about the tables to
17609    # Unicode::UCD
17610
17611    # Stringify structures for output
17612    my $loose_property_name_of
17613                           = simple_dumper(\%loose_property_name_of, ' ' x 4);
17614    chomp $loose_property_name_of;
17615
17616    my $strict_property_name_of
17617                           = simple_dumper(\%strict_property_name_of, ' ' x 4);
17618    chomp $strict_property_name_of;
17619
17620    my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
17621    chomp $stricter_to_file_of;
17622
17623    my $inline_definitions = simple_dumper(\@inline_definitions, " " x 4);
17624    chomp $inline_definitions;
17625
17626    my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
17627    chomp $loose_to_file_of;
17628
17629    my $nv_floating_to_rational
17630                           = simple_dumper(\%nv_floating_to_rational, ' ' x 4);
17631    chomp $nv_floating_to_rational;
17632
17633    my $why_deprecated = simple_dumper(\%Unicode::UCD::why_deprecated, ' ' x 4);
17634    chomp $why_deprecated;
17635
17636    # We set the key to the file when we associated files with tables, but we
17637    # couldn't do the same for the value then, as we might not have the file
17638    # for the alternate table figured out at that time.
17639    foreach my $cased (keys %caseless_equivalent_to) {
17640        my @path = $caseless_equivalent_to{$cased}->file_path;
17641        my $path;
17642        if ($path[0] eq "#") {  # Pseudo-directory '#'
17643            $path = join '/', @path;
17644        }
17645        else {  # Gets rid of lib/
17646            $path = join '/', @path[1, -1];
17647        }
17648        $caseless_equivalent_to{$cased} = $path;
17649    }
17650    my $caseless_equivalent_to
17651                           = simple_dumper(\%caseless_equivalent_to, ' ' x 4);
17652    chomp $caseless_equivalent_to;
17653
17654    my $loose_property_to_file_of
17655                        = simple_dumper(\%loose_property_to_file_of, ' ' x 4);
17656    chomp $loose_property_to_file_of;
17657
17658    my $strict_property_to_file_of
17659                        = simple_dumper(\%strict_property_to_file_of, ' ' x 4);
17660    chomp $strict_property_to_file_of;
17661
17662    my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
17663    chomp $file_to_swash_name;
17664
17665    # Create a mapping from each alias of Perl single-form extensions to all
17666    # its equivalent aliases, for quick look-up.
17667    my %perlprop_to_aliases;
17668    foreach my $table ($perl->tables) {
17669
17670        # First create the list of the aliases of each extension
17671        my @aliases_list;    # List of legal aliases for this extension
17672
17673        my $table_name = $table->name;
17674        my $standard_table_name = standardize($table_name);
17675        my $table_full_name = $table->full_name;
17676        my $standard_table_full_name = standardize($table_full_name);
17677
17678        # Make sure that the list has both the short and full names
17679        push @aliases_list, $table_name, $table_full_name;
17680
17681        my $found_ucd = 0;  # ? Did we actually get an alias that should be
17682                            # output for this table
17683
17684        # Go through all the aliases (including the two just added), and add
17685        # any new unique ones to the list
17686        foreach my $alias ($table->aliases) {
17687
17688            # Skip non-legal names
17689            next unless $alias->ok_as_filename;
17690            next unless $alias->ucd;
17691
17692            $found_ucd = 1;     # have at least one legal name
17693
17694            my $name = $alias->name;
17695            my $standard = standardize($name);
17696
17697            # Don't repeat a name that is equivalent to one already on the
17698            # list
17699            next if $standard eq $standard_table_name;
17700            next if $standard eq $standard_table_full_name;
17701
17702            push @aliases_list, $name;
17703        }
17704
17705        # If there were no legal names, don't output anything.
17706        next unless $found_ucd;
17707
17708        # To conserve memory in the program reading these in, omit full names
17709        # that are identical to the short name, when those are the only two
17710        # aliases for the property.
17711        if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) {
17712            pop @aliases_list;
17713        }
17714
17715        # Here, @aliases_list is the list of all the aliases that this
17716        # extension legally has.  Now can create a map to it from each legal
17717        # standardized alias
17718        foreach my $alias ($table->aliases) {
17719            next unless $alias->ucd;
17720            next unless $alias->ok_as_filename;
17721            push @{$perlprop_to_aliases{standardize($alias->name)}},
17722                 uniques @aliases_list;
17723        }
17724    }
17725
17726    # Make a list of all combinations of properties/values that are suppressed.
17727    my @suppressed;
17728    if (! $debug_skip) {    # This tends to fail in this debug mode
17729        foreach my $property_name (keys %why_suppressed) {
17730
17731            # Just the value
17732            my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
17733
17734            # The hash may contain properties not in this release of Unicode
17735            next unless defined (my $property = property_ref($property_name));
17736
17737            # Find all combinations
17738            foreach my $prop_alias ($property->aliases) {
17739                my $prop_alias_name = standardize($prop_alias->name);
17740
17741                # If no =value, there's just one combination possible for this
17742                if (! $value_name) {
17743
17744                    # The property may be suppressed, but there may be a proxy
17745                    # for it, so it shouldn't be listed as suppressed
17746                    next if $prop_alias->ucd;
17747                    push @suppressed, $prop_alias_name;
17748                }
17749                else {  # Otherwise
17750                    foreach my $value_alias
17751                                    ($property->table($value_name)->aliases)
17752                    {
17753                        next if $value_alias->ucd;
17754
17755                        push @suppressed, "$prop_alias_name="
17756                                        .  standardize($value_alias->name);
17757                    }
17758                }
17759            }
17760        }
17761    }
17762    @suppressed = sort @suppressed; # So doesn't change between runs of this
17763                                    # program
17764
17765    # Convert the structure below (designed for Name.pm) to a form that UCD
17766    # wants, so it doesn't have to modify it at all; i.e. so that it includes
17767    # an element for the Hangul syllables in the appropriate place, and
17768    # otherwise changes the name to include the "-<code point>" suffix.
17769    my @algorithm_names;
17770    my $done_hangul = $v_version lt v2.0.0;  # Hanguls as we know them came
17771                                             # along in this version
17772    # Copy it linearly.
17773    for my $i (0 .. @code_points_ending_in_code_point - 1) {
17774
17775        # Insert the hanguls in the correct place.
17776        if (! $done_hangul
17777            && $code_points_ending_in_code_point[$i]->{'low'} > $SBase)
17778        {
17779            $done_hangul = 1;
17780            push @algorithm_names, { low => $SBase,
17781                                     high => $SBase + $SCount - 1,
17782                                     name => '<hangul syllable>',
17783                                    };
17784        }
17785
17786        # Copy the current entry, modified.
17787        push @algorithm_names, {
17788            low => $code_points_ending_in_code_point[$i]->{'low'},
17789            high => $code_points_ending_in_code_point[$i]->{'high'},
17790            name =>
17791               "$code_points_ending_in_code_point[$i]->{'name'}-<code point>",
17792        };
17793    }
17794
17795    # Serialize these structures for output.
17796    my $loose_to_standard_value
17797                          = simple_dumper(\%loose_to_standard_value, ' ' x 4);
17798    chomp $loose_to_standard_value;
17799
17800    my $string_property_loose_to_name
17801                    = simple_dumper(\%string_property_loose_to_name, ' ' x 4);
17802    chomp $string_property_loose_to_name;
17803
17804    my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4);
17805    chomp $perlprop_to_aliases;
17806
17807    my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4);
17808    chomp $prop_aliases;
17809
17810    my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4);
17811    chomp $prop_value_aliases;
17812
17813    my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : "";
17814    chomp $suppressed;
17815
17816    my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4);
17817    chomp $algorithm_names;
17818
17819    my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4);
17820    chomp $ambiguous_names;
17821
17822    my $combination_property = simple_dumper(\%combination_property, ' ' x 4);
17823    chomp $combination_property;
17824
17825    my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4);
17826    chomp $loose_defaults;
17827
17828    my @ucd = <<END;
17829$HEADER
17830$INTERNAL_ONLY_HEADER
17831
17832# This file is for the use of Unicode::UCD
17833
17834# Highest legal Unicode code point
17835\$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING;
17836
17837# Hangul syllables
17838\$Unicode::UCD::HANGUL_BEGIN = $SBase_string;
17839\$Unicode::UCD::HANGUL_COUNT = $SCount;
17840
17841# Maps Unicode (not Perl single-form extensions) property names in loose
17842# standard form to their corresponding standard names
17843\%Unicode::UCD::loose_property_name_of = (
17844$loose_property_name_of
17845);
17846
17847# Same, but strict names
17848\%Unicode::UCD::strict_property_name_of = (
17849$strict_property_name_of
17850);
17851
17852# Gives the definitions (in the form of inversion lists) for those properties
17853# whose definitions aren't kept in files
17854\@Unicode::UCD::inline_definitions = (
17855$inline_definitions
17856);
17857
17858# Maps property, table to file for those using stricter matching.  For paths
17859# whose directory is '#', the file is in the form of a numeric index into
17860# \@inline_definitions
17861\%Unicode::UCD::stricter_to_file_of = (
17862$stricter_to_file_of
17863);
17864
17865# Maps property, table to file for those using loose matching.  For paths
17866# whose directory is '#', the file is in the form of a numeric index into
17867# \@inline_definitions
17868\%Unicode::UCD::loose_to_file_of = (
17869$loose_to_file_of
17870);
17871
17872# Maps floating point to fractional form
17873\%Unicode::UCD::nv_floating_to_rational = (
17874$nv_floating_to_rational
17875);
17876
17877# If a %e floating point number doesn't have this number of digits in it after
17878# the decimal point to get this close to a fraction, it isn't considered to be
17879# that fraction even if all the digits it does have match.
17880\$Unicode::UCD::e_precision = $E_FLOAT_PRECISION;
17881
17882# Deprecated tables to generate a warning for.  The key is the file containing
17883# the table, so as to avoid duplication, as many property names can map to the
17884# file, but we only need one entry for all of them.
17885\%Unicode::UCD::why_deprecated = (
17886$why_deprecated
17887);
17888
17889# A few properties have different behavior under /i matching.  This maps
17890# those to substitute files to use under /i.
17891\%Unicode::UCD::caseless_equivalent = (
17892$caseless_equivalent_to
17893);
17894
17895# Property names to mapping files
17896\%Unicode::UCD::loose_property_to_file_of = (
17897$loose_property_to_file_of
17898);
17899
17900# Property names to mapping files
17901\%Unicode::UCD::strict_property_to_file_of = (
17902$strict_property_to_file_of
17903);
17904
17905# Files to the swash names within them.
17906\%Unicode::UCD::file_to_swash_name = (
17907$file_to_swash_name
17908);
17909
17910# Keys are all the possible "prop=value" combinations, in loose form; values
17911# are the standard loose name for the 'value' part of the key
17912\%Unicode::UCD::loose_to_standard_value = (
17913$loose_to_standard_value
17914);
17915
17916# String property loose names to standard loose name
17917\%Unicode::UCD::string_property_loose_to_name = (
17918$string_property_loose_to_name
17919);
17920
17921# Keys are Perl extensions in loose form; values are each one's list of
17922# aliases
17923\%Unicode::UCD::loose_perlprop_to_name = (
17924$perlprop_to_aliases
17925);
17926
17927# Keys are standard property name; values are each one's aliases
17928\%Unicode::UCD::prop_aliases = (
17929$prop_aliases
17930);
17931
17932# Keys of top level are standard property name; values are keys to another
17933# hash,  Each one is one of the property's values, in standard form.  The
17934# values are that prop-val's aliases.  If only one specified, the short and
17935# long alias are identical.
17936\%Unicode::UCD::prop_value_aliases = (
17937$prop_value_aliases
17938);
17939
17940# Ordered (by code point ordinal) list of the ranges of code points whose
17941# names are algorithmically determined.  Each range entry is an anonymous hash
17942# of the start and end points and a template for the names within it.
17943\@Unicode::UCD::algorithmic_named_code_points = (
17944$algorithm_names
17945);
17946
17947# The properties that as-is have two meanings, and which must be disambiguated
17948\%Unicode::UCD::ambiguous_names = (
17949$ambiguous_names
17950);
17951
17952# Keys are the prop-val combinations which are the default values for the
17953# given property, expressed in standard loose form
17954\%Unicode::UCD::loose_defaults = (
17955$loose_defaults
17956);
17957
17958# The properties that are combinations, in that they have both a map table and
17959# a match table.  This is actually for UCD.t, so it knows how to test for
17960# these.
17961\%Unicode::UCD::combination_property = (
17962$combination_property
17963);
17964
17965# All combinations of names that are suppressed.
17966# This is actually for UCD.t, so it knows which properties shouldn't have
17967# entries.  If it got any bigger, would probably want to put it in its own
17968# file to use memory only when it was needed, in testing.
17969\@Unicode::UCD::suppressed_properties = (
17970$suppressed
17971);
17972
179731;
17974END
17975
17976    main::write("UCD.pl", 0, \@ucd);  # The 0 means no utf8.
17977    return;
17978}
17979
17980sub write_all_tables() {
17981    # Write out all the tables generated by this program to files, as well as
17982    # the supporting data structures, pod file, and .t file.
17983
17984    my @writables;              # List of tables that actually get written
17985    my %match_tables_to_write;  # Used to collapse identical match tables
17986                                # into one file.  Each key is a hash function
17987                                # result to partition tables into buckets.
17988                                # Each value is an array of the tables that
17989                                # fit in the bucket.
17990
17991    # For each property ...
17992    # (sort so that if there is an immutable file name, it has precedence, so
17993    # some other property can't come in and take over its file name.  (We
17994    # don't care if both defined, as they had better be different anyway.)
17995    # The property named 'Perl' needs to be first (it doesn't have any
17996    # immutable file name) because empty properties are defined in terms of
17997    # its table named 'All' under the -annotate option.)   We also sort by
17998    # the property's name.  This is just for repeatability of the outputs
17999    # between runs of this program, but does not affect correctness.
18000    PROPERTY:
18001    foreach my $property ($perl,
18002                          sort { return -1 if defined $a->file;
18003                                 return 1 if defined $b->file;
18004                                 return $a->name cmp $b->name;
18005                                } grep { $_ != $perl } property_ref('*'))
18006    {
18007        my $type = $property->type;
18008
18009        # And for each table for that property, starting with the mapping
18010        # table for it ...
18011        TABLE:
18012        foreach my $table($property,
18013
18014                        # and all the match tables for it (if any), sorted so
18015                        # the ones with the shortest associated file name come
18016                        # first.  The length sorting prevents problems of a
18017                        # longer file taking a name that might have to be used
18018                        # by a shorter one.  The alphabetic sorting prevents
18019                        # differences between releases
18020                        sort {  my $ext_a = $a->external_name;
18021                                return 1 if ! defined $ext_a;
18022                                my $ext_b = $b->external_name;
18023                                return -1 if ! defined $ext_b;
18024
18025                                # But return the non-complement table before
18026                                # the complement one, as the latter is defined
18027                                # in terms of the former, and needs to have
18028                                # the information for the former available.
18029                                return 1 if $a->complement != 0;
18030                                return -1 if $b->complement != 0;
18031
18032                                # Similarly, return a subservient table after
18033                                # a leader
18034                                return 1 if $a->leader != $a;
18035                                return -1 if $b->leader != $b;
18036
18037                                my $cmp = length $ext_a <=> length $ext_b;
18038
18039                                # Return result if lengths not equal
18040                                return $cmp if $cmp;
18041
18042                                # Alphabetic if lengths equal
18043                                return $ext_a cmp $ext_b
18044                        } $property->tables
18045                    )
18046        {
18047
18048            # Here we have a table associated with a property.  It could be
18049            # the map table (done first for each property), or one of the
18050            # other tables.  Determine which type.
18051            my $is_property = $table->isa('Property');
18052
18053            my $name = $table->name;
18054            my $complete_name = $table->complete_name;
18055
18056            # See if should suppress the table if is empty, but warn if it
18057            # contains something.
18058            my $suppress_if_empty_warn_if_not
18059                    = $why_suppress_if_empty_warn_if_not{$complete_name} || 0;
18060
18061            # Calculate if this table should have any code points associated
18062            # with it or not.
18063            my $expected_empty =
18064
18065                # $perl should be empty
18066                ($is_property && ($table == $perl))
18067
18068                # Match tables in properties we skipped populating should be
18069                # empty
18070                || (! $is_property && ! $property->to_create_match_tables)
18071
18072                # Tables and properties that are expected to have no code
18073                # points should be empty
18074                || $suppress_if_empty_warn_if_not
18075            ;
18076
18077            # Set a boolean if this table is the complement of an empty binary
18078            # table
18079            my $is_complement_of_empty_binary =
18080                $type == $BINARY &&
18081                (($table == $property->table('Y')
18082                    && $property->table('N')->is_empty)
18083                || ($table == $property->table('N')
18084                    && $property->table('Y')->is_empty));
18085
18086            if ($table->is_empty) {
18087
18088                if ($suppress_if_empty_warn_if_not) {
18089                    $table->set_fate($SUPPRESSED,
18090                                     $suppress_if_empty_warn_if_not);
18091                }
18092
18093                # Suppress (by skipping them) expected empty tables.
18094                next TABLE if $expected_empty;
18095
18096                # And setup to later output a warning for those that aren't
18097                # known to be allowed to be empty.  Don't do the warning if
18098                # this table is a child of another one to avoid duplicating
18099                # the warning that should come from the parent one.
18100                if (($table == $property || $table->parent == $table)
18101                    && $table->fate != $SUPPRESSED
18102                    && $table->fate != $MAP_PROXIED
18103                    && ! grep { $complete_name =~ /^$_$/ }
18104                                                    @tables_that_may_be_empty)
18105                {
18106                    push @unhandled_properties, "$table";
18107                }
18108
18109                # The old way of expressing an empty match list was to
18110                # complement the list that matches everything.  The new way is
18111                # to create an empty inversion list, but this doesn't work for
18112                # annotating, so use the old way then.
18113                $table->set_complement($All) if $annotate
18114                                                && $table != $property;
18115            }
18116            elsif ($expected_empty) {
18117                my $because = "";
18118                if ($suppress_if_empty_warn_if_not) {
18119                    $because = " because $suppress_if_empty_warn_if_not";
18120                }
18121
18122                Carp::my_carp("Not expecting property $table$because.  Generating file for it anyway.");
18123            }
18124
18125            # Some tables should match everything
18126            my $expected_full =
18127                ($table->fate == $SUPPRESSED)
18128                ? 0
18129                : ($is_property)
18130                  ? # All these types of map tables will be full because
18131                    # they will have been populated with defaults
18132                    ($type == $ENUM)
18133
18134                  : # A match table should match everything if its method
18135                    # shows it should
18136                    ($table->matches_all
18137
18138                    # The complement of an empty binary table will match
18139                    # everything
18140                    || $is_complement_of_empty_binary
18141                    )
18142            ;
18143
18144            my $count = $table->count;
18145            if ($expected_full) {
18146                if ($count != $MAX_WORKING_CODEPOINTS) {
18147                    Carp::my_carp("$table matches only "
18148                    . clarify_number($count)
18149                    . " Unicode code points but should match "
18150                    . clarify_number($MAX_WORKING_CODEPOINTS)
18151                    . " (off by "
18152                    .  clarify_number(abs($MAX_WORKING_CODEPOINTS - $count))
18153                    . ").  Proceeding anyway.");
18154                }
18155
18156                # Here is expected to be full.  If it is because it is the
18157                # complement of an (empty) binary table that is to be
18158                # suppressed, then suppress this one as well.
18159                if ($is_complement_of_empty_binary) {
18160                    my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
18161                    my $opposing = $property->table($opposing_name);
18162                    my $opposing_status = $opposing->status;
18163                    if ($opposing_status) {
18164                        $table->set_status($opposing_status,
18165                                           $opposing->status_info);
18166                    }
18167                }
18168            }
18169            elsif ($count == $MAX_UNICODE_CODEPOINTS
18170                   && $name ne "Any"
18171                   && ($table == $property || $table->leader == $table)
18172                   && $table->property->status ne $NORMAL)
18173            {
18174                    Carp::my_carp("$table unexpectedly matches all Unicode code points.  Proceeding anyway.");
18175            }
18176
18177            if ($table->fate >= $SUPPRESSED) {
18178                if (! $is_property) {
18179                    my @children = $table->children;
18180                    foreach my $child (@children) {
18181                        if ($child->fate < $SUPPRESSED) {
18182                            Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
18183                        }
18184                    }
18185                }
18186                next TABLE;
18187
18188            }
18189
18190            if (! $is_property) {
18191
18192                make_ucd_table_pod_entries($table) if $table->property == $perl;
18193
18194                # Several things need to be done just once for each related
18195                # group of match tables.  Do them on the parent.
18196                if ($table->parent == $table) {
18197
18198                    # Add an entry in the pod file for the table; it also does
18199                    # the children.
18200                    make_re_pod_entries($table) if defined $pod_directory;
18201
18202                    # See if the table matches identical code points with
18203                    # something that has already been processed and is ready
18204                    # for output.  In that case, no need to have two files
18205                    # with the same code points in them.  We use the table's
18206                    # hash() method to store these in buckets, so that it is
18207                    # quite likely that if two tables are in the same bucket
18208                    # they will be identical, so don't have to compare tables
18209                    # frequently.  The tables have to have the same status to
18210                    # share a file, so add this to the bucket hash.  (The
18211                    # reason for this latter is that UCD.pm associates a
18212                    # status with a file.) We don't check tables that are
18213                    # inverses of others, as it would lead to some coding
18214                    # complications, and checking all the regular ones should
18215                    # find everything.
18216                    if ($table->complement == 0) {
18217                        my $hash = $table->hash . ';' . $table->status;
18218
18219                        # Look at each table that is in the same bucket as
18220                        # this one would be.
18221                        foreach my $comparison
18222                                            (@{$match_tables_to_write{$hash}})
18223                        {
18224                            # If the table doesn't point back to this one, we
18225                            # see if it matches identically
18226                            if (   $comparison->leader != $table
18227                                && $table->matches_identically_to($comparison))
18228                            {
18229                                $table->set_equivalent_to($comparison,
18230                                                                Related => 0);
18231                                next TABLE;
18232                            }
18233                        }
18234
18235                        # Here, not equivalent, add this table to the bucket.
18236                        push @{$match_tables_to_write{$hash}}, $table;
18237                    }
18238                }
18239            }
18240            else {
18241
18242                # Here is the property itself.
18243                # Don't write out or make references to the $perl property
18244                next if $table == $perl;
18245
18246                make_ucd_table_pod_entries($table);
18247
18248                # There is a mapping stored of the various synonyms to the
18249                # standardized name of the property for Unicode::UCD.
18250                # Also, the pod file contains entries of the form:
18251                # \p{alias: *}         \p{full: *}
18252                # rather than show every possible combination of things.
18253
18254                my @property_aliases = $property->aliases;
18255
18256                my $full_property_name = $property->full_name;
18257                my $property_name = $property->name;
18258                my $standard_property_name = standardize($property_name);
18259                my $standard_property_full_name
18260                                        = standardize($full_property_name);
18261
18262                # We also create for Unicode::UCD a list of aliases for
18263                # the property.  The list starts with the property name;
18264                # then its full name.
18265                my @property_list;
18266                my @standard_list;
18267                if ( $property->fate <= $MAP_PROXIED) {
18268                    @property_list = ($property_name, $full_property_name);
18269                    @standard_list = ($standard_property_name,
18270                                        $standard_property_full_name);
18271                }
18272
18273                # For each synonym ...
18274                for my $i (0 .. @property_aliases - 1)  {
18275                    my $alias = $property_aliases[$i];
18276                    my $alias_name = $alias->name;
18277                    my $alias_standard = standardize($alias_name);
18278
18279
18280                    # Add other aliases to the list of property aliases
18281                    if ($property->fate <= $MAP_PROXIED
18282                        && ! grep { $alias_standard eq $_ } @standard_list)
18283                    {
18284                        push @property_list, $alias_name;
18285                        push @standard_list, $alias_standard;
18286                    }
18287
18288                    # For Unicode::UCD, set the mapping of the alias to the
18289                    # property
18290                    if ($type == $STRING) {
18291                        if ($property->fate <= $MAP_PROXIED) {
18292                            $string_property_loose_to_name{$alias_standard}
18293                                            = $standard_property_name;
18294                        }
18295                    }
18296                    else {
18297                        my $hash_ref = ($alias_standard =~ /^_/)
18298                                       ? \%strict_property_name_of
18299                                       : \%loose_property_name_of;
18300                        if (exists $hash_ref->{$alias_standard}) {
18301                            Carp::my_carp("There already is a property with the same standard name as $alias_name: $hash_ref->{$alias_standard}.  Old name is retained");
18302                        }
18303                        else {
18304                            $hash_ref->{$alias_standard}
18305                                                = $standard_property_name;
18306                        }
18307
18308                        # Now for the re pod entry for this alias.  Skip if not
18309                        # outputting a pod; skip the first one, which is the
18310                        # full name so won't have an entry like: '\p{full: *}
18311                        # \p{full: *}', and skip if don't want an entry for
18312                        # this one.
18313                        next if $i == 0
18314                                || ! defined $pod_directory
18315                                || ! $alias->make_re_pod_entry;
18316
18317                        my $rhs = "\\p{$full_property_name: *}";
18318                        if ($property != $perl && $table->perl_extension) {
18319                            $rhs .= ' (Perl extension)';
18320                        }
18321                        push @match_properties,
18322                            format_pod_line($indent_info_column,
18323                                        '\p{' . $alias->name . ': *}',
18324                                        $rhs,
18325                                        $alias->status);
18326                    }
18327                }
18328
18329                # The list of all possible names is attached to each alias, so
18330                # lookup is easy
18331                if (@property_list) {
18332                    push @{$prop_aliases{$standard_list[0]}}, @property_list;
18333                }
18334
18335                if ($property->fate <= $MAP_PROXIED) {
18336
18337                    # Similarly, we create for Unicode::UCD a list of
18338                    # property-value aliases.
18339
18340                    # Look at each table in the property...
18341                    foreach my $table ($property->tables) {
18342                        my @values_list;
18343                        my $table_full_name = $table->full_name;
18344                        my $standard_table_full_name
18345                                              = standardize($table_full_name);
18346                        my $table_name = $table->name;
18347                        my $standard_table_name = standardize($table_name);
18348
18349                        # The list starts with the table name and its full
18350                        # name.
18351                        push @values_list, $table_name, $table_full_name;
18352
18353                        # We add to the table each unique alias that isn't
18354                        # discouraged from use.
18355                        foreach my $alias ($table->aliases) {
18356                            next if $alias->status
18357                                 && $alias->status eq $DISCOURAGED;
18358                            my $name = $alias->name;
18359                            my $standard = standardize($name);
18360                            next if $standard eq $standard_table_name;
18361                            next if $standard eq $standard_table_full_name;
18362                            push @values_list, $name;
18363                        }
18364
18365                        # Here @values_list is a list of all the aliases for
18366                        # the table.  That is, all the property-values given
18367                        # by this table.  By agreement with Unicode::UCD,
18368                        # if the name and full name are identical, and there
18369                        # are no other names, drop the duplicate entry to save
18370                        # memory.
18371                        if (@values_list == 2
18372                            && $values_list[0] eq $values_list[1])
18373                        {
18374                            pop @values_list
18375                        }
18376
18377                        # To save memory, unlike the similar list for property
18378                        # aliases above, only the standard forms have the list.
18379                        # This forces an extra step of converting from input
18380                        # name to standard name, but the savings are
18381                        # considerable.  (There is only marginal savings if we
18382                        # did this with the property aliases.)
18383                        push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list;
18384                    }
18385                }
18386
18387                # Don't write out a mapping file if not desired.
18388                next if ! $property->to_output_map;
18389            }
18390
18391            # Here, we know we want to write out the table, but don't do it
18392            # yet because there may be other tables that come along and will
18393            # want to share the file, and the file's comments will change to
18394            # mention them.  So save for later.
18395            push @writables, $table;
18396
18397        } # End of looping through the property and all its tables.
18398    } # End of looping through all properties.
18399
18400    # Now have all the tables that will have files written for them.  Do it.
18401    foreach my $table (@writables) {
18402        my @directory;
18403        my $filename;
18404        my $property = $table->property;
18405        my $is_property = ($table == $property);
18406
18407        # For very short tables, instead of writing them out to actual files,
18408        # we in-line their inversion list definitions into UCD.pm.  The
18409        # definition replaces the file name, and the special pseudo-directory
18410        # '#' is used to signal this.  This significantly cuts down the number
18411        # of files written at little extra cost to the hashes in UCD.pm.
18412        # And it means, no run-time files to read to get the definitions.
18413        if (! $is_property
18414            && ! $annotate  # For annotation, we want to explicitly show
18415                            # everything, so keep in files
18416            && $table->ranges <= 3)
18417        {
18418            my @ranges = $table->ranges;
18419            my $count = @ranges;
18420            if ($count == 0) {  # 0th index reserved for 0-length lists
18421                $filename = 0;
18422            }
18423            elsif ($table->leader != $table) {
18424
18425                # Here, is a table that is equivalent to another; code
18426                # in register_file_for_name() causes its leader's definition
18427                # to be used
18428
18429                next;
18430            }
18431            else {  # No equivalent table so far.
18432
18433                # Build up its definition range-by-range.
18434                my $definition = "";
18435                while (defined (my $range = shift @ranges)) {
18436                    my $end = $range->end;
18437                    if ($end < $MAX_WORKING_CODEPOINT) {
18438                        $count++;
18439                        $end = "\n" . ($end + 1);
18440                    }
18441                    else {  # Extends to infinity, hence no 'end'
18442                        $end = "";
18443                    }
18444                    $definition .= "\n" . $range->start . $end;
18445                }
18446                $definition = "V$count" . $definition;
18447                $filename = @inline_definitions;
18448                push @inline_definitions, $definition;
18449            }
18450            @directory = "#";
18451            register_file_for_name($table, \@directory, $filename);
18452            next;
18453        }
18454
18455        if (! $is_property) {
18456            # Match tables for the property go in lib/$subdirectory, which is
18457            # the property's name.  Don't use the standard file name for this,
18458            # as may get an unfamiliar alias
18459            @directory = ($matches_directory, ($property->match_subdir)
18460                                              ? $property->match_subdir
18461                                              : $property->external_name);
18462        }
18463        else {
18464
18465            @directory = $table->directory;
18466            $filename = $table->file;
18467        }
18468
18469        # Use specified filename if available, or default to property's
18470        # shortest name.  We need an 8.3 safe filename (which means "an 8
18471        # safe" filename, since after the dot is only 'pl', which is < 3)
18472        # The 2nd parameter is if the filename shouldn't be changed, and
18473        # it shouldn't iff there is a hard-coded name for this table.
18474        $filename = construct_filename(
18475                                $filename || $table->external_name,
18476                                ! $filename,    # mutable if no filename
18477                                \@directory);
18478
18479        register_file_for_name($table, \@directory, $filename);
18480
18481        # Only need to write one file when shared by more than one
18482        # property
18483        next if ! $is_property
18484                && ($table->leader != $table || $table->complement != 0);
18485
18486        # Construct a nice comment to add to the file
18487        $table->set_final_comment;
18488
18489        $table->write;
18490    }
18491
18492
18493    # Write out the pod file
18494    make_pod;
18495
18496    # And Name.pm, UCD.pl
18497    make_Name_pm;
18498    make_UCD;
18499
18500    make_property_test_script() if $make_test_script;
18501    make_normalization_test_script() if $make_norm_test_script;
18502    return;
18503}
18504
18505my @white_space_separators = ( # This used only for making the test script.
18506                            "",
18507                            ' ',
18508                            "\t",
18509                            '   '
18510                        );
18511
18512sub generate_separator($lhs) {
18513    # This used only for making the test script.  It generates the colon or
18514    # equal separator between the property and property value, with random
18515    # white space surrounding the separator
18516
18517    return "" if $lhs eq "";  # No separator if there's only one (the r) side
18518
18519    # Choose space before and after randomly
18520    my $spaces_before =$white_space_separators[rand(@white_space_separators)];
18521    my $spaces_after = $white_space_separators[rand(@white_space_separators)];
18522
18523    # And return the whole complex, half the time using a colon, half the
18524    # equals
18525    return $spaces_before
18526            . (rand() < 0.5) ? '=' : ':'
18527            . $spaces_after;
18528}
18529
18530sub generate_tests($lhs, $rhs, $valid_code, $invalid_code, $warning) {
18531    # This used only for making the test script.  It generates test cases that
18532    # are expected to compile successfully in perl.  Note that the LHS and
18533    # RHS are assumed to already be as randomized as the caller wants.
18534
18535    # $lhs          # The property: what's to the left of the colon
18536                    #  or equals separator
18537    # $rhs          # The property value; what's to the right
18538    # $valid_code   # A code point that's known to be in the
18539                        # table given by LHS=RHS; undef if table is
18540                        # empty
18541    # $invalid_code # A code point known to not be in the table;
18542                    # undef if the table is all code points
18543    # $warning
18544
18545    # Get the colon or equal
18546    my $separator = generate_separator($lhs);
18547
18548    # The whole 'property=value'
18549    my $name = "$lhs$separator$rhs";
18550
18551    my @output;
18552    # Create a complete set of tests, with complements.
18553    if (defined $valid_code) {
18554        push @output, <<"EOC"
18555Expect(1, $valid_code, '\\p{$name}', $warning);
18556Expect(0, $valid_code, '\\p{^$name}', $warning);
18557Expect(0, $valid_code, '\\P{$name}', $warning);
18558Expect(1, $valid_code, '\\P{^$name}', $warning);
18559EOC
18560    }
18561    if (defined $invalid_code) {
18562        push @output, <<"EOC"
18563Expect(0, $invalid_code, '\\p{$name}', $warning);
18564Expect(1, $invalid_code, '\\p{^$name}', $warning);
18565Expect(1, $invalid_code, '\\P{$name}', $warning);
18566Expect(0, $invalid_code, '\\P{^$name}', $warning);
18567EOC
18568    }
18569    return @output;
18570}
18571
18572sub generate_wildcard_tests($lhs, $rhs, $valid_code, $invalid_code, $warning) {
18573    # This used only for making the test script.  It generates wildcardl
18574    # matching test cases that are expected to compile successfully in perl.
18575
18576    # $lhs           # The property: what's to the left of the
18577                     # or equals separator
18578    # $rhs           # The property value; what's to the right
18579    # $valid_code    # A code point that's known to be in the
18580                     # table given by LHS=RHS; undef if table is
18581                     # empty
18582    # $invalid_code  # A code point known to not be in the table;
18583                     # undef if the table is all code points
18584    # $warning
18585
18586    return if $lhs eq "";
18587    return if $lhs =~ / ^ Is_ /x;   # These are not currently supported
18588
18589    # Generate a standardized pattern, with colon being the delimitter
18590    my $wildcard = "$lhs=:\\A$rhs\\z:";
18591
18592    my @output;
18593    push @output, "Expect(1, $valid_code, '\\p{$wildcard}', $warning);"
18594                                                        if defined $valid_code;
18595    push @output, "Expect(0, $invalid_code, '\\p{$wildcard}', $warning);"
18596                                                      if defined $invalid_code;
18597    return @output;
18598}
18599
18600sub generate_error($lhs, $rhs, $already_in_error=0) {
18601    # This used only for making the test script.  It generates test cases that
18602    # are expected to not only not match, but to be syntax or similar errors
18603
18604    # $lhs                # The property: what's to the left of the
18605                          # colon or equals separator
18606    # $rhs                # The property value; what's to the right
18607    # $already_in_error   # Boolean; if true it's known that the
18608                          # unmodified LHS and RHS will cause an error.
18609                          # This routine should not force another one
18610    # Get the colon or equal
18611    my $separator = generate_separator($lhs);
18612
18613    # Since this is an error only, don't bother to randomly decide whether to
18614    # put the error on the left or right side; and assume that the RHS is
18615    # loosely matched, again for convenience rather than rigor.
18616    $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
18617
18618    my $property = $lhs . $separator . $rhs;
18619
18620    return <<"EOC";
18621Error('\\p{$property}');
18622Error('\\P{$property}');
18623EOC
18624}
18625
18626# These are used only for making the test script
18627# XXX Maybe should also have a bad strict seps, which includes underscore.
18628
18629my @good_loose_seps = (
18630            " ",
18631            "-",
18632            "\t",
18633            "",
18634            "_",
18635           );
18636my @bad_loose_seps = (
18637           "/a/",
18638           ':=',
18639          );
18640
18641sub randomize_stricter_name($name) {
18642    # This used only for making the test script.  Take the input name and
18643    # return a randomized, but valid version of it under the stricter matching
18644    # rules.
18645
18646    # If the name looks like a number (integer, floating, or rational), do
18647    # some extra work
18648    if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
18649        my $sign = $1;
18650        my $number = $2;
18651        my $separator = $3;
18652
18653        # If there isn't a sign, part of the time add a plus
18654        # Note: Not testing having any denominator having a minus sign
18655        if (! $sign) {
18656            $sign = '+' if rand() <= .3;
18657        }
18658
18659        # And add 0 or more leading zeros.
18660        $name = $sign . ('0' x int rand(10)) . $number;
18661
18662        if (defined $separator) {
18663            my $extra_zeros = '0' x int rand(10);
18664
18665            if ($separator eq '.') {
18666
18667                # Similarly, add 0 or more trailing zeros after a decimal
18668                # point
18669                $name .= $extra_zeros;
18670            }
18671            else {
18672
18673                # Or, leading zeros before the denominator
18674                $name =~ s,/,/$extra_zeros,;
18675            }
18676        }
18677    }
18678
18679    # For legibility of the test, only change the case of whole sections at a
18680    # time.  To do this, first split into sections.  The split returns the
18681    # delimiters
18682    my @sections;
18683    for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
18684        trace $section if main::DEBUG && $to_trace;
18685
18686        if (length $section > 1 && $section !~ /\D/) {
18687
18688            # If the section is a sequence of digits, about half the time
18689            # randomly add underscores between some of them.
18690            if (rand() > .5) {
18691
18692                # Figure out how many underscores to add.  max is 1 less than
18693                # the number of digits.  (But add 1 at the end to make sure
18694                # result isn't 0, and compensate earlier by subtracting 2
18695                # instead of 1)
18696                my $num_underscores = int rand(length($section) - 2) + 1;
18697
18698                # And add them evenly throughout, for convenience, not rigor
18699                use integer;
18700                my $spacing = (length($section) - 1)/ $num_underscores;
18701                my $temp = $section;
18702                $section = "";
18703                for my $i (1 .. $num_underscores) {
18704                    $section .= substr($temp, 0, $spacing, "") . '_';
18705                }
18706                $section .= $temp;
18707            }
18708            push @sections, $section;
18709        }
18710        else {
18711
18712            # Here not a sequence of digits.  Change the case of the section
18713            # randomly
18714            my $switch = int rand(4);
18715            if ($switch == 0) {
18716                push @sections, uc $section;
18717            }
18718            elsif ($switch == 1) {
18719                push @sections, lc $section;
18720            }
18721            elsif ($switch == 2) {
18722                push @sections, ucfirst $section;
18723            }
18724            else {
18725                push @sections, $section;
18726            }
18727        }
18728    }
18729    trace "returning", join "", @sections if main::DEBUG && $to_trace;
18730    return join "", @sections;
18731}
18732
18733sub randomize_loose_name($name, $want_error=0) {
18734    # This used only for making the test script
18735
18736    $name = randomize_stricter_name($name);
18737
18738    my @parts;
18739    push @parts, $good_loose_seps[rand(@good_loose_seps)];
18740
18741    # Preserve trailing ones for the sake of not stripping the underscore from
18742    # 'L_'
18743    for my $part (split /[-\s_]+ (?= . )/, $name) {
18744        if (@parts) {
18745            if ($want_error and rand() < 0.3) {
18746                push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
18747                $want_error = 0;
18748            }
18749            else {
18750                push @parts, $good_loose_seps[rand(@good_loose_seps)];
18751            }
18752        }
18753        push @parts, $part;
18754    }
18755    my $new = join("", @parts);
18756    trace "$name => $new" if main::DEBUG && $to_trace;
18757
18758    if ($want_error) {
18759        if (rand() >= 0.5) {
18760            $new .= $bad_loose_seps[rand(@bad_loose_seps)];
18761        }
18762        else {
18763            $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
18764        }
18765    }
18766    return $new;
18767}
18768
18769# Used to make sure don't generate duplicate test cases.
18770my %test_generated;
18771
18772sub make_property_test_script() {
18773    # This used only for making the test script
18774    # this written directly -- it's huge.
18775
18776    print "Making test script\n" if $verbosity >= $PROGRESS;
18777
18778    # This uses randomness to test different possibilities without testing all
18779    # possibilities.  To ensure repeatability, set the seed to 0.  But if
18780    # tests are added, it will perturb all later ones in the .t file
18781    srand 0;
18782
18783    $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
18784
18785    # Create a list of what the %f representation is for each rational number.
18786    # This will be used below.
18787    my @valid_base_floats = '0.0';
18788    foreach my $e_representation (keys %nv_floating_to_rational) {
18789        push @valid_base_floats,
18790                            eval $nv_floating_to_rational{$e_representation};
18791    }
18792
18793    # It doesn't matter whether the elements of this array contain single lines
18794    # or multiple lines. main::write doesn't count the lines.
18795    my @output;
18796
18797    push @output, <<'EOF_CODE';
18798Error('\p{Script=InGreek}');    # Bug #69018
18799Test_GCB("1100 $nobreak 1161");  # Bug #70940
18800Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
18801Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
18802Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726
18803Error('\p{InKana}');    # 'Kana' is not a block so InKana shouldn't compile
18804Expect(1, 0xB6, '\p{In=V1_1}', ""); # Didn't use to work
18805Expect(1, 0x3A2,'\p{In=NA}', "");   # Didn't use to work
18806
18807# Make sure this gets tested; it was not part of the official test suite at
18808# the time this was added.  Note that this is as it would appear in the
18809# official suite, and gets modified to check for the perl tailoring by
18810# Test_WB()
18811Test_WB("$breakable 0020 $breakable 0020 $breakable 0308 $breakable");
18812Test_LB("$nobreak 200B $nobreak 0020 $nobreak 0020 $breakable 2060 $breakable");
18813Expect(1, ord(" "), '\p{gc=:(?aa)s:}', "");     # /aa is valid
18814Expect(1, ord(" "), '\p{gc=:(?-s)s:}', "");     # /-s is valid
18815EOF_CODE
18816
18817    # Sort these so get results in same order on different runs of this
18818    # program
18819    foreach my $property (sort { $a->has_dependency <=> $b->has_dependency
18820                                    or
18821                                 lc $a->name cmp lc $b->name
18822                               } property_ref('*'))
18823    {
18824        # Non-binary properties should not match \p{};  Test all for that.
18825        if ($property->type != $BINARY && $property->type != $FORCED_BINARY) {
18826            my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS }
18827                                                            $property->aliases;
18828            foreach my $property_alias ($property->aliases) {
18829                my $name = standardize($property_alias->name);
18830
18831                # But some names are ambiguous, meaning a binary property with
18832                # the same name when used in \p{}, and a different
18833                # (non-binary) property in other contexts.
18834                next if grep { $name eq $_ } keys %ambiguous_names;
18835
18836                push @output, <<"EOF_CODE";
18837Error('\\p{$name}');
18838Error('\\P{$name}');
18839EOF_CODE
18840            }
18841        }
18842        foreach my $table (sort { $a->has_dependency <=> $b->has_dependency
18843                                    or
18844                                  lc $a->name cmp lc $b->name
18845                                } $property->tables)
18846        {
18847
18848            # Find code points that match, and don't match this table.
18849            my $valid = $table->get_valid_code_point;
18850            my $invalid = $table->get_invalid_code_point;
18851            my $warning = ($table->status eq $DEPRECATED)
18852                            ? "'deprecated'"
18853                            : '""';
18854
18855            # Test each possible combination of the property's aliases with
18856            # the table's.  If this gets to be too many, could do what is done
18857            # in the set_final_comment() for Tables
18858            my @table_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->aliases;
18859            next unless @table_aliases;
18860            my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->property->aliases;
18861            next unless @property_aliases;
18862
18863            # Every property can be optionally be prefixed by 'Is_', so test
18864            # that those work, by creating such a new alias for each
18865            # pre-existing one.
18866            push @property_aliases, map { Alias->new("Is_" . $_->name,
18867                                                    $_->loose_match,
18868                                                    $_->make_re_pod_entry,
18869                                                    $_->ok_as_filename,
18870                                                    $_->status,
18871                                                    $_->ucd,
18872                                                    )
18873                                         } @property_aliases;
18874            my $max = max(scalar @table_aliases, scalar @property_aliases);
18875            for my $j (0 .. $max - 1) {
18876
18877                # The current alias for property is the next one on the list,
18878                # or if beyond the end, start over.  Similarly for table
18879                my $property_name
18880                            = $property_aliases[$j % @property_aliases]->name;
18881
18882                $property_name = "" if $table->property == $perl;
18883                my $table_alias = $table_aliases[$j % @table_aliases];
18884                my $table_name = $table_alias->name;
18885                my $loose_match = $table_alias->loose_match;
18886
18887                # If the table doesn't have a file, any test for it is
18888                # already guaranteed to be in error
18889                my $already_error = ! $table->file_path;
18890
18891                # A table that begins with these could actually be a
18892                # user-defined property, so won't be compile time errors, as
18893                # the definitions of those can be deferred until runtime
18894                next if $already_error && $table_name =~ / ^ I[ns] /x;
18895
18896                # Generate error cases for this alias.
18897                push @output, generate_error($property_name,
18898                                             $table_name,
18899                                             $already_error);
18900
18901                # If the table is guaranteed to always generate an error,
18902                # quit now without generating success cases.
18903                next if $already_error;
18904
18905                # Now for the success cases.  First, wildcard matching, as it
18906                # shouldn't have any randomization.
18907                if ($table_alias->status eq $NORMAL) {
18908                    push @output, generate_wildcard_tests($property_name,
18909                                                          $table_name,
18910                                                          $valid,
18911                                                          $invalid,
18912                                                          $warning,
18913                                                         );
18914                }
18915                my $random;
18916                if ($loose_match) {
18917
18918                    # For loose matching, create an extra test case for the
18919                    # standard name.
18920                    my $standard = standardize($table_name);
18921
18922                    # $test_name should be a unique combination for each test
18923                    # case; used just to avoid duplicate tests
18924                    my $test_name = "$property_name=$standard";
18925
18926                    # Don't output duplicate test cases.
18927                    if (! exists $test_generated{$test_name}) {
18928                        $test_generated{$test_name} = 1;
18929                        push @output, generate_tests($property_name,
18930                                                     $standard,
18931                                                     $valid,
18932                                                     $invalid,
18933                                                     $warning,
18934                                                 );
18935                        if ($table_alias->status eq $NORMAL) {
18936                            push @output, generate_wildcard_tests(
18937                                                     $property_name,
18938                                                     $standard,
18939                                                     $valid,
18940                                                     $invalid,
18941                                                     $warning,
18942                                                 );
18943                        }
18944                    }
18945                    $random = randomize_loose_name($table_name)
18946                }
18947                else { # Stricter match
18948                    $random = randomize_stricter_name($table_name);
18949                }
18950
18951                # Now for the main test case for this alias.
18952                my $test_name = "$property_name=$random";
18953                if (! exists $test_generated{$test_name}) {
18954                    $test_generated{$test_name} = 1;
18955                    push @output, generate_tests($property_name,
18956                                                 $random,
18957                                                 $valid,
18958                                                 $invalid,
18959                                                 $warning,
18960                                             );
18961
18962                    if ($property->name eq 'nv') {
18963                        if ($table_name !~ qr{/}) {
18964                            push @output, generate_tests($property_name,
18965                                                sprintf("%.15e", $table_name),
18966                                                $valid,
18967                                                $invalid,
18968                                                $warning,
18969                                            );
18970                    }
18971                    else {
18972                        # If the name is a rational number, add tests for a
18973                        # non-reduced form, and for a floating point equivalent.
18974
18975                        # 60 is a number divisible by a bunch of things
18976                        my ($numerator, $denominator) = $table_name
18977                                                        =~ m! (.+) / (.+) !x;
18978                        $numerator *= 60;
18979                        $denominator *= 60;
18980                        push @output, generate_tests($property_name,
18981                                                    "$numerator/$denominator",
18982                                                    $valid,
18983                                                    $invalid,
18984                                                    $warning,
18985                                    );
18986
18987                        # Calculate the float, and the %e representation
18988                        my $float = eval $table_name;
18989                        my $e_representation = sprintf("%.*e",
18990                                                $E_FLOAT_PRECISION, $float);
18991                        # Parse that
18992                        my ($non_zeros, $zeros, $exponent_sign, $exponent)
18993                           = $e_representation
18994                               =~ / -? [1-9] \. (\d*?) (0*) e ([+-]) (\d+) /x;
18995                        my $min_e_precision;
18996                        my $min_f_precision;
18997
18998                        if ($exponent_sign eq '+' && $exponent != 0) {
18999                            Carp::my_carp_bug("Not yet equipped to handle"
19000                                            . " positive exponents");
19001                            return;
19002                        }
19003                        else {
19004                            # We're trying to find the minimum precision that
19005                            # is needed to indicate this particular rational
19006                            # for the given $E_FLOAT_PRECISION.  For %e, any
19007                            # trailing zeros, like 1.500e-02 aren't needed, so
19008                            # the correct value is how many non-trailing zeros
19009                            # there are after the decimal point.
19010                            $min_e_precision = length $non_zeros;
19011
19012                            # For %f, like .01500, we want at least
19013                            # $E_FLOAT_PRECISION digits, but any trailing
19014                            # zeros aren't needed, so we can subtract the
19015                            # length of those.  But we also need to include
19016                            # the zeros after the decimal point, but before
19017                            # the first significant digit.
19018                            $min_f_precision = $E_FLOAT_PRECISION
19019                                             + $exponent
19020                                             - length $zeros;
19021                        }
19022
19023                        # Make tests for each possible precision from 1 to
19024                        # just past the worst case.
19025                        my $upper_limit = ($min_e_precision > $min_f_precision)
19026                                           ? $min_e_precision
19027                                           : $min_f_precision;
19028
19029                        for my $i (1 .. $upper_limit + 1) {
19030                            for my $format ("e", "f") {
19031                                my $this_table
19032                                          = sprintf("%.*$format", $i, $float);
19033
19034                                # If we don't have enough precision digits,
19035                                # make a fail test; otherwise a pass test.
19036                                my $pass = ($format eq "e")
19037                                            ? $i >= $min_e_precision
19038                                            : $i >= $min_f_precision;
19039                                if ($pass) {
19040                                    push @output, generate_tests($property_name,
19041                                                                $this_table,
19042                                                                $valid,
19043                                                                $invalid,
19044                                                                $warning,
19045                                                );
19046                                }
19047                                elsif (   $format eq "e"
19048
19049                                          # Here we would fail, but in the %f
19050                                          # case, the representation at this
19051                                          # precision could actually be a
19052                                          # valid one for some other rational
19053                                       || ! grep { $this_table
19054                                                            =~ / ^ $_ 0* $ /x }
19055                                                            @valid_base_floats)
19056                                {
19057                                    push @output,
19058                                        generate_error($property_name,
19059                                                       $this_table,
19060                                                       1   # 1 => already an
19061                                                           # error
19062                                                );
19063                                }
19064                            }
19065                        }
19066                    }
19067                    }
19068                }
19069            }
19070            $table->DESTROY();
19071        }
19072        $property->DESTROY();
19073    }
19074
19075    # Make any test of the boundary (break) properties TODO if the code
19076    # doesn't match the version being compiled
19077    my $TODO_FAILING_BREAKS = ($version_of_mk_invlist_bounds ne $v_version)
19078                             ? "\nsub TODO_FAILING_BREAKS { 1 }\n"
19079                             : "\nsub TODO_FAILING_BREAKS { 0 }\n";
19080
19081    @output= map {
19082        map s/^/    /mgr,
19083        map "$_;\n",
19084        split /;\n/, $_
19085    } @output;
19086
19087    # Cause there to be 'if' statements to only execute a portion of this
19088    # long-running test each time, so that we can have a bunch of .t's running
19089    # in parallel
19090    my $chunks = 10     # Number of test files
19091               - 1      # For GCB & SB
19092               - 1      # For WB
19093               - 4;     # LB split into this many files
19094    my @output_chunked;
19095    my $chunk_count=0;
19096    my $chunk_size= int(@output / $chunks) + 1;
19097    while (@output) {
19098        $chunk_count++;
19099        my @chunk= splice @output, 0, $chunk_size;
19100        push @output_chunked,
19101            "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19102                @chunk,
19103            "}\n";
19104    }
19105
19106    $chunk_count++;
19107    push @output_chunked,
19108        "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19109            (map {"    Test_GCB('$_');\n"} @backslash_X_tests),
19110            (map {"    Test_SB('$_');\n"} @SB_tests),
19111        "}\n";
19112
19113
19114    $chunk_size= int(@LB_tests / 4) + 1;
19115    @LB_tests = map {"    Test_LB('$_');\n"} @LB_tests;
19116    while (@LB_tests) {
19117        $chunk_count++;
19118        my @chunk= splice @LB_tests, 0, $chunk_size;
19119        push @output_chunked,
19120            "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19121                @chunk,
19122            "}\n";
19123    }
19124
19125    $chunk_count++;
19126    push @output_chunked,
19127        "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19128            (map {"    Test_WB('$_');\n"} @WB_tests),
19129        "}\n";
19130
19131    &write($t_path,
19132           0,           # Not utf8;
19133           [$HEADER,
19134            $TODO_FAILING_BREAKS,
19135            <DATA>,
19136            @output_chunked,
19137            "Finished();\n",
19138           ]);
19139
19140    return;
19141}
19142
19143sub make_normalization_test_script() {
19144    print "Making normalization test script\n" if $verbosity >= $PROGRESS;
19145
19146    my $n_path = 'TestNorm.pl';
19147
19148    unshift @normalization_tests, <<'END';
19149use utf8;
19150use Test::More;
19151
19152sub ord_string {    # Convert packed ords to printable string
19153    use charnames ();
19154    return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' }
19155                                                unpack "U*", shift) .  "'";
19156    #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) .  "'";
19157}
19158
19159sub Test_N {
19160    my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_;
19161    my $display_source = ord_string($source);
19162    my $display_nfc = ord_string($nfc);
19163    my $display_nfd = ord_string($nfd);
19164    my $display_nfkc = ord_string($nfkc);
19165    my $display_nfkd = ord_string($nfkd);
19166
19167    use Unicode::Normalize;
19168    #    NFC
19169    #      nfc ==  toNFC(source) ==  toNFC(nfc) ==  toNFC(nfd)
19170    #      nfkc ==  toNFC(nfkc) ==  toNFC(nfkd)
19171    #
19172    #    NFD
19173    #      nfd ==  toNFD(source) ==  toNFD(nfc) ==  toNFD(nfd)
19174    #      nfkd ==  toNFD(nfkc) ==  toNFD(nfkd)
19175    #
19176    #    NFKC
19177    #      nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) ==
19178    #      toNFKC(nfkc) == toNFKC(nfkd)
19179    #
19180    #    NFKD
19181    #      nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) ==
19182    #      toNFKD(nfkc) == toNFKD(nfkd)
19183
19184    is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc");
19185    is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc");
19186    is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc");
19187    is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc");
19188    is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc");
19189
19190    is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd");
19191    is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd");
19192    is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd");
19193    is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd");
19194    is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd");
19195
19196    is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc");
19197    is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc");
19198    is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc");
19199    is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc");
19200    is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc");
19201
19202    is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd");
19203    is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd");
19204    is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd");
19205    is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd");
19206    is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd");
19207}
19208END
19209
19210    &write($n_path,
19211           1,           # Is utf8;
19212           [
19213            @normalization_tests,
19214            'done_testing();'
19215            ]);
19216    return;
19217}
19218
19219# Skip reasons, so will be exact same text and hence the files with each
19220# reason will get grouped together in perluniprops.
19221my $Documentation = "Documentation";
19222my $Indic_Skip
19223            = "Provisional; for the analysis and processing of Indic scripts";
19224my $Validation = "Validation Tests";
19225my $Validation_Documentation = "Documentation of validation Tests";
19226my $Unused_Skip = "Currently unused by Perl";
19227
19228# This is a list of the input files and how to handle them.  The files are
19229# processed in their order in this list.  Some reordering is possible if
19230# desired, but the PropertyAliases and PropValueAliases files should be first,
19231# and the extracted before the others (as data in an extracted file can be
19232# over-ridden by the non-extracted.  Some other files depend on data derived
19233# from an earlier file, like UnicodeData requires data from Jamo, and the case
19234# changing and folding requires data from Unicode.  Mostly, it is safest to
19235# order by first version releases in (except the Jamo).
19236#
19237# The version strings allow the program to know whether to expect a file or
19238# not, but if a file exists in the directory, it will be processed, even if it
19239# is in a version earlier than expected, so you can copy files from a later
19240# release into an earlier release's directory.
19241my @input_file_objects = (
19242    Input_file->new('PropertyAliases.txt', v3.2,
19243                    Handler => \&process_PropertyAliases,
19244                    Early => [ \&substitute_PropertyAliases ],
19245                    Required_Even_in_Debug_Skip => 1,
19246                   ),
19247    Input_file->new(undef, v0,  # No file associated with this
19248                    Progress_Message => 'Finishing property setup',
19249                    Handler => \&finish_property_setup,
19250                   ),
19251    Input_file->new('PropValueAliases.txt', v3.2,
19252                     Handler => \&process_PropValueAliases,
19253                     Early => [ \&substitute_PropValueAliases ],
19254                     Has_Missings_Defaults => $NOT_IGNORED,
19255                     Required_Even_in_Debug_Skip => 1,
19256                    ),
19257    Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
19258                    Property => 'General_Category',
19259                   ),
19260    Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
19261                    Property => 'Canonical_Combining_Class',
19262                    Has_Missings_Defaults => $NOT_IGNORED,
19263                   ),
19264    Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
19265                    Property => 'Numeric_Type',
19266                    Has_Missings_Defaults => $NOT_IGNORED,
19267                   ),
19268    Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
19269                    Property => 'East_Asian_Width',
19270                    Has_Missings_Defaults => $NOT_IGNORED,
19271                   ),
19272    Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
19273                    Property => 'Line_Break',
19274                    Has_Missings_Defaults => $NOT_IGNORED,
19275                   ),
19276    Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
19277                    Property => 'Bidi_Class',
19278                    Has_Missings_Defaults => $NOT_IGNORED,
19279                   ),
19280    Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
19281                    Property => 'Decomposition_Type',
19282                    Has_Missings_Defaults => $NOT_IGNORED,
19283                   ),
19284    Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
19285    Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
19286                    Property => 'Numeric_Value',
19287                    Each_Line_Handler => \&filter_numeric_value_line,
19288                    Has_Missings_Defaults => $NOT_IGNORED,
19289                   ),
19290    Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
19291                    Property => 'Joining_Group',
19292                    Has_Missings_Defaults => $NOT_IGNORED,
19293                   ),
19294
19295    Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
19296                    Property => 'Joining_Type',
19297                    Has_Missings_Defaults => $NOT_IGNORED,
19298                   ),
19299    Input_file->new("${EXTRACTED}DName.txt", v10.0.0,
19300                    Skip => 'This file adds no new information not already'
19301                          . ' present in other files',
19302                    # And it's unnecessary programmer work to handle this new
19303                    # format.  Previous Derived files actually had bug fixes
19304                    # in them that were useful, but that should not be the
19305                    # case here.
19306                   ),
19307    Input_file->new('Jamo.txt', v2.0.0,
19308                    Property => 'Jamo_Short_Name',
19309                    Each_Line_Handler => \&filter_jamo_line,
19310                   ),
19311    Input_file->new('UnicodeData.txt', v1.1.5,
19312                    Pre_Handler => \&setup_UnicodeData,
19313
19314                    # We clean up this file for some early versions.
19315                    Each_Line_Handler => [ (($v_version lt v2.0.0 )
19316                                            ? \&filter_v1_ucd
19317                                            : ($v_version eq v2.1.5)
19318                                                ? \&filter_v2_1_5_ucd
19319
19320                                                # And for 5.14 Perls with 6.0,
19321                                                # have to also make changes
19322                                                : ($v_version ge v6.0.0
19323                                                   && $^V lt v5.17.0)
19324                                                    ? \&filter_v6_ucd
19325                                                    : undef),
19326
19327                                            # Early versions did not have the
19328                                            # proper Unicode_1 names for the
19329                                            # controls
19330                                            (($v_version lt v3.0.0)
19331                                            ? \&filter_early_U1_names
19332                                            : undef),
19333
19334                                            # Early versions did not correctly
19335                                            # use the later method for giving
19336                                            # decimal digit values
19337                                            (($v_version le v3.2.0)
19338                                            ? \&filter_bad_Nd_ucd
19339                                            : undef),
19340
19341                                            # And the main filter
19342                                            \&filter_UnicodeData_line,
19343                                         ],
19344                    EOF_Handler => \&EOF_UnicodeData,
19345                   ),
19346    Input_file->new('CJKXREF.TXT', v1.1.5,
19347                    Withdrawn => v2.0.0,
19348                    Skip => 'Gives the mapping of CJK code points '
19349                          . 'between Unicode and various other standards',
19350                   ),
19351    Input_file->new('ArabicShaping.txt', v2.0.0,
19352                    Each_Line_Handler =>
19353                        ($v_version lt 4.1.0)
19354                                    ? \&filter_old_style_arabic_shaping
19355                                    : undef,
19356                    # The first field after the range is a "schematic name"
19357                    # not used by Perl
19358                    Properties => [ '<ignored>', 'Joining_Type', 'Joining_Group' ],
19359                    Has_Missings_Defaults => $NOT_IGNORED,
19360                   ),
19361    Input_file->new('Blocks.txt', v2.0.0,
19362                    Property => 'Block',
19363                    Has_Missings_Defaults => $NOT_IGNORED,
19364                    Each_Line_Handler => \&filter_blocks_lines
19365                   ),
19366    Input_file->new('Index.txt', v2.0.0,
19367                    Skip => 'Alphabetical index of Unicode characters',
19368                   ),
19369    Input_file->new('NamesList.txt', v2.0.0,
19370                    Skip => 'Annotated list of characters',
19371                   ),
19372    Input_file->new('PropList.txt', v2.0.0,
19373                    Each_Line_Handler => (($v_version lt v3.1.0)
19374                                            ? \&filter_old_style_proplist
19375                                            : undef),
19376                   ),
19377    Input_file->new('Props.txt', v2.0.0,
19378                    Withdrawn => v3.0.0,
19379                    Skip => 'A subset of F<PropList.txt> (which is used instead)',
19380                   ),
19381    Input_file->new('ReadMe.txt', v2.0.0,
19382                    Skip => $Documentation,
19383                   ),
19384    Input_file->new('Unihan.txt', v2.0.0,
19385                    Withdrawn => v5.2.0,
19386                    Construction_Time_Handler => \&construct_unihan,
19387                    Pre_Handler => \&setup_unihan,
19388                    Optional => [ "",
19389                                  'Unicode_Radical_Stroke'
19390                                ],
19391                    Each_Line_Handler => \&filter_unihan_line,
19392                   ),
19393    Input_file->new('SpecialCasing.txt', v2.1.8,
19394                    Each_Line_Handler => ($v_version eq 2.1.8)
19395                                         ? \&filter_2_1_8_special_casing_line
19396                                         : \&filter_special_casing_line,
19397                    Pre_Handler => \&setup_special_casing,
19398                    Has_Missings_Defaults => $IGNORED,
19399                   ),
19400    Input_file->new(
19401                    'LineBreak.txt', v3.0.0,
19402                    Has_Missings_Defaults => $NOT_IGNORED,
19403                    Property => 'Line_Break',
19404                    # Early versions had problematic syntax
19405                    Each_Line_Handler => ($v_version ge v3.1.0)
19406                                          ? undef
19407                                          : ($v_version lt v3.0.0)
19408                                            ? \&filter_substitute_lb
19409                                            : \&filter_early_ea_lb,
19410                    # Must use long names for property values see comments at
19411                    # sub filter_substitute_lb
19412                    Early => [ "LBsubst.txt", '_Perl_LB', 'Alphabetic',
19413                               'Alphabetic', # default to this because XX ->
19414                                             # AL
19415
19416                               # Don't use _Perl_LB as a synonym for
19417                               # Line_Break in later perls, as it is tailored
19418                               # and isn't the same as Line_Break
19419                               'ONLY_EARLY' ],
19420                   ),
19421    Input_file->new('EastAsianWidth.txt', v3.0.0,
19422                    Property => 'East_Asian_Width',
19423                    Has_Missings_Defaults => $NOT_IGNORED,
19424                    # Early versions had problematic syntax
19425                    Each_Line_Handler => (($v_version lt v3.1.0)
19426                                        ? \&filter_early_ea_lb
19427                                        : undef),
19428                   ),
19429    Input_file->new('CompositionExclusions.txt', v3.0.0,
19430                    Property => 'Composition_Exclusion',
19431                   ),
19432    Input_file->new('UnicodeData.html', v3.0.0,
19433                    Withdrawn => v4.0.1,
19434                    Skip => $Documentation,
19435                   ),
19436    Input_file->new('BidiMirroring.txt', v3.0.1,
19437                    Property => 'Bidi_Mirroring_Glyph',
19438                    Has_Missings_Defaults => ($v_version lt v6.2.0)
19439                                              ? $NO_DEFAULTS
19440                                              # Is <none> which doesn't mean
19441                                              # anything to us, we will use the
19442                                              # null string
19443                                              : $IGNORED,
19444                   ),
19445    Input_file->new('NamesList.html', v3.0.0,
19446                    Skip => 'Describes the format and contents of '
19447                          . 'F<NamesList.txt>',
19448                   ),
19449    Input_file->new('UnicodeCharacterDatabase.html', v3.0.0,
19450                    Withdrawn => v5.1,
19451                    Skip => $Documentation,
19452                   ),
19453    Input_file->new('CaseFolding.txt', v3.0.1,
19454                    Pre_Handler => \&setup_case_folding,
19455                    Each_Line_Handler =>
19456                        [ ($v_version lt v3.1.0)
19457                                 ? \&filter_old_style_case_folding
19458                                 : undef,
19459                           \&filter_case_folding_line
19460                        ],
19461                    Has_Missings_Defaults => $IGNORED,
19462                   ),
19463    Input_file->new("NormTest.txt", v3.0.1,
19464                     Handler => \&process_NormalizationsTest,
19465                     Skip => ($make_norm_test_script) ? 0 : $Validation,
19466                   ),
19467    Input_file->new('DCoreProperties.txt', v3.1.0,
19468                    # 5.2 changed this file
19469                    Has_Missings_Defaults => (($v_version ge v5.2.0)
19470                                            ? $NOT_IGNORED
19471                                            : $NO_DEFAULTS),
19472                   ),
19473    Input_file->new('DProperties.html', v3.1.0,
19474                    Withdrawn => v3.2.0,
19475                    Skip => $Documentation,
19476                   ),
19477    Input_file->new('PropList.html', v3.1.0,
19478                    Withdrawn => v5.1,
19479                    Skip => $Documentation,
19480                   ),
19481    Input_file->new('Scripts.txt', v3.1.0,
19482                    Property => 'Script',
19483                    Each_Line_Handler => (($v_version le v4.0.0)
19484                                          ? \&filter_all_caps_script_names
19485                                          : undef),
19486                    Has_Missings_Defaults => $NOT_IGNORED,
19487                   ),
19488    Input_file->new('DNormalizationProps.txt', v3.1.0,
19489                    Has_Missings_Defaults => $NOT_IGNORED,
19490                    Each_Line_Handler => (($v_version lt v4.0.1)
19491                                      ? \&filter_old_style_normalization_lines
19492                                      : undef),
19493                   ),
19494    Input_file->new('DerivedProperties.html', v3.1.1,
19495                    Withdrawn => v5.1,
19496                    Skip => $Documentation,
19497                   ),
19498    Input_file->new('DAge.txt', v3.2.0,
19499                    Has_Missings_Defaults => $NOT_IGNORED,
19500                    Property => 'Age'
19501                   ),
19502    Input_file->new('HangulSyllableType.txt', v4.0,
19503                    Has_Missings_Defaults => $NOT_IGNORED,
19504                    Early => [ \&generate_hst, 'Hangul_Syllable_Type' ],
19505                    Property => 'Hangul_Syllable_Type'
19506                   ),
19507    Input_file->new('NormalizationCorrections.txt', v3.2.0,
19508                     # This documents the cumulative fixes to erroneous
19509                     # normalizations in earlier Unicode versions.  Its main
19510                     # purpose is so that someone running on an earlier
19511                     # version can use this file to override what got
19512                     # published in that earlier release.  It would be easy
19513                     # for mktables to handle this file.  But all the
19514                     # corrections in it should already be in the other files
19515                     # for the release it is.  To get it to actually mean
19516                     # something useful, someone would have to be using an
19517                     # earlier Unicode release, and copy it into the directory
19518                     # for that release and recompile.  So far there has been
19519                     # no demand to do that, so this hasn't been implemented.
19520                    Skip => 'Documentation of corrections already '
19521                          . 'incorporated into the Unicode data base',
19522                   ),
19523    Input_file->new('StandardizedVariants.html', v3.2.0,
19524                    Skip => 'Obsoleted as of Unicode 9.0, but previously '
19525                          . 'provided a visual display of the standard '
19526                          . 'variant sequences derived from '
19527                          . 'F<StandardizedVariants.txt>.',
19528                        # I don't know why the html came earlier than the
19529                        # .txt, but both are skipped anyway, so it doesn't
19530                        # matter.
19531                   ),
19532    Input_file->new('StandardizedVariants.txt', v4.0.0,
19533                    Skip => 'Certain glyph variations for character display '
19534                          . 'are standardized.  This lists the non-Unihan '
19535                          . 'ones; the Unihan ones are also not used by '
19536                          . 'Perl, and are in a separate Unicode data base '
19537                          . 'L<http://www.unicode.org/ivd>',
19538                   ),
19539    Input_file->new('UCD.html', v4.0.0,
19540                    Withdrawn => v5.2,
19541                    Skip => $Documentation,
19542                   ),
19543    Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
19544                    Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter' ],
19545                    Property => 'Word_Break',
19546                    Has_Missings_Defaults => $NOT_IGNORED,
19547                   ),
19548    Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1,
19549                    Early => [ \&generate_GCB, '_Perl_GCB' ],
19550                    Property => 'Grapheme_Cluster_Break',
19551                    Has_Missings_Defaults => $NOT_IGNORED,
19552                   ),
19553    Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
19554                    Handler => \&process_GCB_test,
19555                    retain_trailing_comments => 1,
19556                   ),
19557    Input_file->new("$AUXILIARY/GraphemeBreakTest.html", v4.1.0,
19558                    Skip => $Validation_Documentation,
19559                   ),
19560    Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
19561                    Handler => \&process_SB_test,
19562                    retain_trailing_comments => 1,
19563                   ),
19564    Input_file->new("$AUXILIARY/SentenceBreakTest.html", v4.1.0,
19565                    Skip => $Validation_Documentation,
19566                   ),
19567    Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
19568                    Handler => \&process_WB_test,
19569                    retain_trailing_comments => 1,
19570                   ),
19571    Input_file->new("$AUXILIARY/WordBreakTest.html", v4.1.0,
19572                    Skip => $Validation_Documentation,
19573                   ),
19574    Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
19575                    Property => 'Sentence_Break',
19576                    Early => [ "SBsubst.txt", '_Perl_SB', 'OLetter' ],
19577                    Has_Missings_Defaults => $NOT_IGNORED,
19578                   ),
19579    Input_file->new('NamedSequences.txt', v4.1.0,
19580                    Handler => \&process_NamedSequences
19581                   ),
19582    Input_file->new('Unihan.html', v4.1.0,
19583                    Withdrawn => v5.2,
19584                    Skip => $Documentation,
19585                   ),
19586    Input_file->new('NameAliases.txt', v5.0,
19587                    Property => 'Name_Alias',
19588                    Each_Line_Handler => ($v_version le v6.0.0)
19589                                   ? \&filter_early_version_name_alias_line
19590                                   : \&filter_later_version_name_alias_line,
19591                   ),
19592        # NameAliases.txt came along in v5.0.  The above constructor handles
19593        # this.  But until 6.1, it was lacking some information needed by core
19594        # perl.  The constructor below handles that.  It is either a kludge or
19595        # clever, depending on your point of view.  The 'Withdrawn' parameter
19596        # indicates not to use it at all starting in 6.1 (so the above
19597        # constructor applies), and the 'v6.1' parameter indicates to use the
19598        # Early parameter before 6.1.  Therefore 'Early" is always used,
19599        # yielding the internal-only property '_Perl_Name_Alias', which it
19600        # gets from a NameAliases.txt from 6.1 or later stored in
19601        # N_Asubst.txt.  In combination with the above constructor,
19602        # 'Name_Alias' is publicly accessible starting with v5.0, and the
19603        # better 6.1 version is accessible to perl core in all releases.
19604    Input_file->new("NameAliases.txt", v6.1,
19605                    Withdrawn => v6.1,
19606                    Early => [ "N_Asubst.txt", '_Perl_Name_Alias', "" ],
19607                    Property => 'Name_Alias',
19608                    EOF_Handler => \&fixup_early_perl_name_alias,
19609                    Each_Line_Handler =>
19610                                       \&filter_later_version_name_alias_line,
19611                   ),
19612    Input_file->new('NamedSqProv.txt', v5.0.0,
19613                    Skip => 'Named sequences proposed for inclusion in a '
19614                          . 'later version of the Unicode Standard; if you '
19615                          . 'need them now, you can append this file to '
19616                          . 'F<NamedSequences.txt> and recompile perl',
19617                   ),
19618    Input_file->new("$AUXILIARY/LBTest.txt", v5.1.0,
19619                    Handler => \&process_LB_test,
19620                    retain_trailing_comments => 1,
19621                   ),
19622    Input_file->new("$AUXILIARY/LineBreakTest.html", v5.1.0,
19623                    Skip => $Validation_Documentation,
19624                   ),
19625    Input_file->new("BidiTest.txt", v5.2.0,
19626                    Skip => $Validation,
19627                   ),
19628    Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
19629                    Optional => "",
19630                    Each_Line_Handler => \&filter_unihan_line,
19631                   ),
19632    Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
19633                    Optional => "",
19634                    Each_Line_Handler => \&filter_unihan_line,
19635                   ),
19636    Input_file->new('UnihanIRGSources.txt', v5.2.0,
19637                    Optional => [ "",
19638                                  'kCompatibilityVariant',
19639                                  'kIICore',
19640                                  'kIRG_GSource',
19641                                  'kIRG_HSource',
19642                                  'kIRG_JSource',
19643                                  'kIRG_KPSource',
19644                                  'kIRG_MSource',
19645                                  'kIRG_KSource',
19646                                  'kIRG_SSource',
19647                                  'kIRG_TSource',
19648                                  'kIRG_USource',
19649                                  'kIRG_UKSource',
19650                                  'kIRG_VSource',
19651                               ],
19652                    Pre_Handler => \&setup_unihan,
19653                    Each_Line_Handler => \&filter_unihan_line,
19654                   ),
19655    Input_file->new('UnihanNumericValues.txt', v5.2.0,
19656                    Optional => [ "",
19657                                  'kAccountingNumeric',
19658                                  'kOtherNumeric',
19659                                  'kPrimaryNumeric',
19660                                ],
19661                    Each_Line_Handler => \&filter_unihan_line,
19662                   ),
19663    Input_file->new('UnihanOtherMappings.txt', v5.2.0,
19664                    Optional => "",
19665                    Each_Line_Handler => \&filter_unihan_line,
19666                   ),
19667    Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
19668                    Optional => [ "",
19669                                  'Unicode_Radical_Stroke'
19670                                ],
19671                    Each_Line_Handler => \&filter_unihan_line,
19672                   ),
19673    Input_file->new('UnihanReadings.txt', v5.2.0,
19674                    Optional => "",
19675                    Each_Line_Handler => \&filter_unihan_line,
19676                   ),
19677    Input_file->new('UnihanVariants.txt', v5.2.0,
19678                    Optional => "",
19679                    Each_Line_Handler => \&filter_unihan_line,
19680                   ),
19681    Input_file->new('CJKRadicals.txt', v5.2.0,
19682                    Skip => 'Maps the kRSUnicode property values to '
19683                          . 'corresponding code points',
19684                   ),
19685    Input_file->new('EmojiSources.txt', v6.0.0,
19686                    Skip => 'Maps certain Unicode code points to their '
19687                          . 'legacy Japanese cell-phone values',
19688                   ),
19689    # This file is actually not usable as-is until 6.1.0, because the property
19690    # is provisional, so its name is missing from PropertyAliases.txt until
19691    # that release, so that further work would have to be done to get it to
19692    # work properly
19693    Input_file->new('ScriptExtensions.txt', v6.0.0,
19694                    Property => 'Script_Extensions',
19695                    Early => [ sub {} ], # Doesn't do anything but ensures
19696                                         # that this isn't skipped for early
19697                                         # versions
19698                    Pre_Handler => \&setup_script_extensions,
19699                    Each_Line_Handler => \&filter_script_extensions_line,
19700                    Has_Missings_Defaults => (($v_version le v6.0.0)
19701                                            ? $NO_DEFAULTS
19702                                            : $IGNORED),
19703                   ),
19704    # These two Indic files are actually not usable as-is until 6.1.0,
19705    # because they are provisional, so their property values are missing from
19706    # PropValueAliases.txt until that release, so that further work would have
19707    # to be done to get them to work properly.
19708    Input_file->new('IndicMatraCategory.txt', v6.0.0,
19709                    Withdrawn => v8.0.0,
19710                    Property => 'Indic_Matra_Category',
19711                    Has_Missings_Defaults => $NOT_IGNORED,
19712                    Skip => $Indic_Skip,
19713                   ),
19714    Input_file->new('IndicSyllabicCategory.txt', v6.0.0,
19715                    Property => 'Indic_Syllabic_Category',
19716                    Has_Missings_Defaults => $NOT_IGNORED,
19717                    Skip => (($v_version lt v8.0.0)
19718                              ? $Indic_Skip
19719                              : 0),
19720                   ),
19721    Input_file->new('USourceData.txt', v6.2.0,
19722                    Skip => 'Documentation of status and cross reference of '
19723                          . 'proposals for encoding by Unicode of Unihan '
19724                          . 'characters',
19725                   ),
19726    Input_file->new('USourceGlyphs.pdf', v6.2.0,
19727                    Skip => 'Pictures of the characters in F<USourceData.txt>',
19728                   ),
19729    Input_file->new('BidiBrackets.txt', v6.3.0,
19730                    Properties => [ 'Bidi_Paired_Bracket',
19731                                    'Bidi_Paired_Bracket_Type'
19732                                  ],
19733                    Has_Missings_Defaults => $NO_DEFAULTS,
19734                   ),
19735    Input_file->new("BidiCharacterTest.txt", v6.3.0,
19736                    Skip => $Validation,
19737                   ),
19738    Input_file->new('IndicPositionalCategory.txt', v8.0.0,
19739                    Property => 'Indic_Positional_Category',
19740                    Has_Missings_Defaults => $NOT_IGNORED,
19741                   ),
19742    Input_file->new('TangutSources.txt', v9.0.0,
19743                    Skip => 'Specifies source mappings for Tangut ideographs'
19744                          . ' and components. This data file also includes'
19745                          . ' informative radical-stroke values that are used'
19746                          . ' internally by Unicode',
19747                   ),
19748    Input_file->new('VerticalOrientation.txt', v10.0.0,
19749                    Property => 'Vertical_Orientation',
19750                    Has_Missings_Defaults => $NOT_IGNORED,
19751                   ),
19752    Input_file->new('NushuSources.txt', v10.0.0,
19753                    Skip => 'Specifies source material for Nushu characters',
19754                   ),
19755    Input_file->new('EquivalentUnifiedIdeograph.txt', v11.0.0,
19756                    Property => 'Equivalent_Unified_Ideograph',
19757                    Has_Missings_Defaults => $NOT_IGNORED,
19758                   ),
19759    Input_file->new('EmojiData.txt', v11.0.0,
19760                    # Is in UAX #51 and not the UCD, so must be updated
19761                    # separately, and the first line edited to indicate the
19762                    # UCD release we're pretending it to be in.  The UTC says
19763                    # this is a transitional state, and in fact was moved to
19764                    # the UCD in 13.0
19765                    Withdrawn => v13.0.0,
19766                    Pre_Handler => \&setup_emojidata,
19767                    Has_Missings_Defaults => $NOT_IGNORED,
19768                    Each_Line_Handler => \&filter_emojidata_line,
19769                    UCD => 0,
19770                   ),
19771    Input_file->new("$EMOJI/emoji.txt", v13.0.0,
19772                    Has_Missings_Defaults => $NOT_IGNORED,
19773                    UCD => 0,
19774                   ),
19775    Input_file->new("$EMOJI/ReadMe.txt", v13.0.0,
19776                    Skip => $Documentation,
19777                    UCD => 0,
19778                   ),
19779    Input_file->new('IdStatus.txt', v13.0.0,
19780                    Pre_Handler => \&setup_IdStatus,
19781                    Property => 'Identifier_Status',
19782                    UCD => 0,
19783                   ),
19784    Input_file->new('IdType.txt', v13.0.0,
19785                    Pre_Handler => \&setup_IdType,
19786                    Each_Line_Handler => \&filter_IdType_line,
19787                    Property => 'Identifier_Type',
19788                    UCD => 0,
19789                   ),
19790    Input_file->new('confusables.txt', v15.0.0,
19791                    Skip => $Unused_Skip,
19792                    UCD => 0,
19793                   ),
19794    Input_file->new('confusablesSummary.txt', v15.0.0,
19795                    Skip => $Unused_Skip,
19796                    UCD => 0,
19797                   ),
19798    Input_file->new('intentional.txt', v15.0.0,
19799                    Skip => $Unused_Skip,
19800                    UCD => 0,
19801                   ),
19802);
19803
19804# End of all the preliminaries.
19805# Do it...
19806
19807if (@missing_early_files) {
19808    print simple_fold(join_lines(<<END
19809
19810The compilation cannot be completed because one or more required input files,
19811listed below, are missing.  This is because you are compiling Unicode version
19812$unicode_version, which predates the existence of these file(s).  To fully
19813function, perl needs the data that these files would have contained if they
19814had been in this release.  To work around this, create copies of later
19815versions of the missing files in the directory containing '$0'.  (Perl will
19816make the necessary adjustments to the data to compensate for it not being the
19817same version as is being compiled.)  The files are available from unicode.org,
19818via either ftp or http.  If using http, they will be under
19819www.unicode.org/versions/.  Below are listed the source file name of each
19820missing file, the Unicode version to copy it from, and the name to store it
19821as.  (Note that the listed source file name may not be exactly the one that
19822Unicode calls it.  If you don't find it, you can look it up in 'README.perl'
19823to get the correct name.)
19824END
19825    ));
19826    print simple_fold(join_lines("\n$_")) for @missing_early_files;
19827    exit 2;
19828}
19829
19830if ($compare_versions) {
19831    Carp::my_carp(<<END
19832Warning.  \$compare_versions is set.  Output is not suitable for production
19833END
19834    );
19835}
19836
19837# Put into %potential_files a list of all the files in the directory structure
19838# that could be inputs to this program
19839File::Find::find({
19840    wanted=>sub {
19841        return unless / \. ( txt | htm l? ) $ /xi;  # Some platforms change the
19842                                                    # name's case
19843        my $full = lc(File::Spec->rel2abs($_));
19844        $potential_files{$full} = 1;
19845        return;
19846    }
19847}, File::Spec->curdir());
19848
19849my @mktables_list_output_files;
19850my $old_start_time = 0;
19851my $old_options = "";
19852
19853if (! -e $file_list) {
19854    print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
19855    $write_unchanged_files = 1;
19856} elsif ($write_unchanged_files) {
19857    print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
19858}
19859else {
19860    print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
19861    my $file_handle;
19862    if (! open $file_handle, "<", $file_list) {
19863        Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
19864        $glob_list = 1;
19865    }
19866    else {
19867        my @input;
19868
19869        # Read and parse mktables.lst, placing the results from the first part
19870        # into @input, and the second part into @mktables_list_output_files
19871        for my $list ( \@input, \@mktables_list_output_files ) {
19872            while (<$file_handle>) {
19873                s/^ \s+ | \s+ $//xg;
19874                if (/^ \s* \# \s* Autogenerated\ starting\ on\ (\d+)/x) {
19875                    $old_start_time = $1;
19876                    next;
19877                }
19878                if (/^ \s* \# \s* From\ options\ (.+) /x) {
19879                    $old_options = $1;
19880                    next;
19881                }
19882                next if /^ \s* (?: \# .* )? $/x;
19883                last if /^ =+ $/x;
19884                my ( $file ) = split /\t/;
19885                push @$list, $file;
19886            }
19887            @$list = uniques(@$list);
19888            next;
19889        }
19890
19891        # Look through all the input files
19892        foreach my $input (@input) {
19893            next if $input eq 'version'; # Already have checked this.
19894
19895            # Ignore if doesn't exist.  The checking about whether we care or
19896            # not is done via the Input_file object.
19897            next if ! file_exists($input);
19898
19899            # The paths are stored with relative names, and with '/' as the
19900            # delimiter; convert to absolute on this machine
19901            my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
19902            $potential_files{lc $full} = 1;
19903        }
19904    }
19905
19906    close $file_handle;
19907}
19908
19909if ($glob_list) {
19910
19911    # Here wants to process all .txt files in the directory structure.
19912    # Convert them to full path names.  They are stored in the platform's
19913    # relative style
19914    my @known_files;
19915    foreach my $object (@input_file_objects) {
19916        my $file = $object->file;
19917        next unless defined $file;
19918        push @known_files, File::Spec->rel2abs($file);
19919    }
19920
19921    my @unknown_input_files;
19922    foreach my $file (keys %potential_files) {  # The keys are stored in lc
19923        next if grep { $file eq lc($_) } @known_files;
19924
19925        # Here, the file is unknown to us.  Get relative path name
19926        $file = File::Spec->abs2rel($file);
19927        push @unknown_input_files, $file;
19928
19929        # What will happen is we create a data structure for it, and add it to
19930        # the list of input files to process.  First get the subdirectories
19931        # into an array
19932        my (undef, $directories, undef) = File::Spec->splitpath($file);
19933        $directories =~ s;/$;;;     # Can have extraneous trailing '/'
19934        my @directories = File::Spec->splitdir($directories);
19935
19936        # If the file isn't extracted (meaning none of the directories is the
19937        # extracted one), just add it to the end of the list of inputs.
19938        if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
19939            push @input_file_objects, Input_file->new($file, v0);
19940        }
19941        else {
19942
19943            # Here, the file is extracted.  It needs to go ahead of most other
19944            # processing.  Search for the first input file that isn't a
19945            # special required property (that is, find one whose first_release
19946            # is non-0), and isn't extracted.  Also, the Age property file is
19947            # processed before the extracted ones, just in case
19948            # $compare_versions is set.
19949            for (my $i = 0; $i < @input_file_objects; $i++) {
19950                if ($input_file_objects[$i]->first_released ne v0
19951                    && lc($input_file_objects[$i]->file) ne 'dage.txt'
19952                    && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
19953                {
19954                    splice @input_file_objects, $i, 0,
19955                                                Input_file->new($file, v0);
19956                    last;
19957                }
19958            }
19959
19960        }
19961    }
19962    if (@unknown_input_files) {
19963        print STDERR simple_fold(join_lines(<<END
19964
19965The following files are unknown as to how to handle.  Assuming they are
19966typical property files.  You'll know by later error messages if it worked or
19967not:
19968END
19969        ) . " " . join(", ", @unknown_input_files) . "\n\n");
19970    }
19971} # End of looking through directory structure for more .txt files.
19972
19973# Create the list of input files from the objects we have defined, plus
19974# version
19975my @input_files = qw(version Makefile);
19976foreach my $object (@input_file_objects) {
19977    my $file = $object->file;
19978    next if ! defined $file;    # Not all objects have files
19979    next if defined $object->skip;;
19980    push @input_files,  $file;
19981}
19982
19983if ( $verbosity >= $VERBOSE ) {
19984    print "Expecting ".scalar( @input_files )." input files. ",
19985         "Checking ".scalar( @mktables_list_output_files )." output files.\n";
19986}
19987
19988# We set $most_recent to be the most recently changed input file, including
19989# this program itself (done much earlier in this file)
19990foreach my $in (@input_files) {
19991    next unless -e $in;        # Keep going even if missing a file
19992    my $mod_time = (stat $in)[9];
19993    $most_recent = $mod_time if $mod_time > $most_recent;
19994
19995    # See that the input files have distinct names, to warn someone if they
19996    # are adding a new one
19997    if ($make_list) {
19998        my ($volume, $directories, $file ) = File::Spec->splitpath($in);
19999        $directories =~ s;/$;;;     # Can have extraneous trailing '/'
20000        my @directories = File::Spec->splitdir($directories);
20001        construct_filename($file, 'mutable', \@directories);
20002    }
20003}
20004
20005# We use 'Makefile' just to see if it has changed since the last time we
20006# rebuilt.  Now discard it.
20007@input_files = grep { $_ ne 'Makefile' } @input_files;
20008
20009my $rebuild = $write_unchanged_files    # Rebuild: if unconditional rebuild
20010              || ! scalar @mktables_list_output_files  # or if no outputs known
20011              || $old_start_time < $most_recent        # or out-of-date
20012              || $old_options ne $command_line_arguments; # or with different
20013                                                          # options
20014
20015# Now we check to see if any output files are older than youngest, if
20016# they are, we need to continue on, otherwise we can presumably bail.
20017if (! $rebuild) {
20018    foreach my $out (@mktables_list_output_files) {
20019        if ( ! file_exists($out)) {
20020            print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
20021            $rebuild = 1;
20022            last;
20023         }
20024        #local $to_trace = 1 if main::DEBUG;
20025        trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
20026        if ( (stat $out)[9] <= $most_recent ) {
20027            #trace "$out:  most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
20028            print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
20029            $rebuild = 1;
20030            last;
20031        }
20032    }
20033}
20034if (! $rebuild) {
20035    print "$0: Files seem to be ok, not bothering to rebuild.  Add '-w' option to force build\n";
20036    exit(0);
20037}
20038print "$0: Must rebuild tables.\n" if $verbosity >= $VERBOSE;
20039
20040# Ready to do the major processing.  First create the perl pseudo-property.
20041$perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
20042
20043# Process each input file
20044foreach my $file (@input_file_objects) {
20045    $file->run;
20046}
20047
20048# Finish the table generation.
20049
20050print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
20051finish_Unicode();
20052
20053# For the very specialized case of comparing two Unicode versions...
20054if (DEBUG && $compare_versions) {
20055    handle_compare_versions();
20056}
20057
20058print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
20059compile_perl();
20060
20061print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
20062add_perl_synonyms();
20063
20064print "Writing tables\n" if $verbosity >= $PROGRESS;
20065write_all_tables();
20066
20067# Write mktables.lst
20068if ( $file_list and $make_list ) {
20069
20070    print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
20071    foreach my $file (@input_files, @files_actually_output) {
20072        my (undef, $directories, $basefile) = File::Spec->splitpath($file);
20073        my @directories = grep length, File::Spec->splitdir($directories);
20074        $file = join '/', @directories, $basefile;
20075    }
20076
20077    my $ofh;
20078    if (! open $ofh,">",$file_list) {
20079        Carp::my_carp("Can't write to '$file_list'.  Skipping: $!");
20080        return
20081    }
20082    else {
20083        my $localtime = localtime $start_time;
20084        print $ofh <<"END";
20085#
20086# $file_list -- File list for $0.
20087#
20088#   Autogenerated starting on $start_time ($localtime)
20089#   From options $command_line_arguments
20090#
20091# - First section is input files
20092#   ($0 itself is not listed but is automatically considered an input)
20093# - Section separator is /^=+\$/
20094# - Second section is a list of output files.
20095# - Lines matching /^\\s*#/ are treated as comments
20096#   which along with blank lines are ignored.
20097#
20098
20099# Input files:
20100
20101END
20102        print $ofh "$_\n" for sort(@input_files);
20103        print $ofh "\n=================================\n# Output files:\n\n";
20104        print $ofh "$_\n" for sort @files_actually_output;
20105        print $ofh "\n# ",scalar(@input_files)," input files\n",
20106                "# ",scalar(@files_actually_output)+1," output files\n\n",
20107                "# End list\n";
20108        close $ofh
20109            or Carp::my_carp("Failed to close $ofh: $!");
20110
20111        print "Filelist has ",scalar(@input_files)," input files and ",
20112            scalar(@files_actually_output)+1," output files\n"
20113            if $verbosity >= $VERBOSE;
20114    }
20115}
20116
20117# Output these warnings unless -q explicitly specified.
20118if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
20119    if (@unhandled_properties) {
20120        print "\nProperties and tables that unexpectedly have no code points\n";
20121        foreach my $property (sort @unhandled_properties) {
20122            print $property, "\n";
20123        }
20124    }
20125
20126    if (%potential_files) {
20127        print "\nInput files that are not considered:\n";
20128        foreach my $file (sort keys %potential_files) {
20129            print File::Spec->abs2rel($file), "\n";
20130        }
20131    }
20132    print "\nAll done\n" if $verbosity >= $VERBOSE;
20133}
20134
20135if ($version_of_mk_invlist_bounds lt $v_version) {
20136    Carp::my_carp("WARNING: \\b{} algorithms (regen/mk_invlist.pl) need"
20137                . " to be checked and possibly updated to Unicode"
20138                . " $string_version.  Failing tests will be marked TODO");
20139}
20140
20141exit(0);
20142
20143# TRAILING CODE IS USED BY make_property_test_script()
20144__DATA__
20145
20146use strict;
20147use warnings;
20148
20149use feature 'signatures';
20150
20151no warnings 'experimental::uniprop_wildcards';
20152
20153# Test qr/\X/ and the \p{} regular expression constructs.  This file is
20154# constructed by mktables from the tables it generates, so if mktables is
20155# buggy, this won't necessarily catch those bugs.  Tests are generated for all
20156# feasible properties; a few aren't currently feasible; see
20157# is_code_point_usable() in mktables for details.
20158
20159# Standard test packages are not used because this manipulates SIG_WARN.  It
20160# exits 0 if every non-skipped test succeeded; -1 if any failed.
20161
20162my $Tests = 0;
20163my $Fails = 0;
20164
20165# loc_tools.pl requires this function to be defined
20166sub ok($pass, @msg) {
20167    print "not " unless $pass;
20168    print "ok ";
20169    print ++$Tests;
20170    print " - ", join "", @msg if @msg;
20171    print "\n";
20172}
20173
20174sub Expect($expected, $ord, $regex, $warning_type='') {
20175    my $line   = (caller)[2];
20176
20177    # Convert the code point to hex form
20178    my $string = sprintf "\"\\x{%04X}\"", $ord;
20179
20180    my @tests = "";
20181
20182    # The first time through, use all warnings.  If the input should generate
20183    # a warning, add another time through with them turned off
20184    push @tests, "no warnings '$warning_type';" if $warning_type;
20185
20186    foreach my $no_warnings (@tests) {
20187
20188        # Store any warning messages instead of outputting them
20189        local $SIG{__WARN__} = $SIG{__WARN__};
20190        my $warning_message;
20191        $SIG{__WARN__} = sub { $warning_message = $_[0] };
20192
20193        $Tests++;
20194
20195        # A string eval is needed because of the 'no warnings'.
20196        # Assumes no parentheses in the regular expression
20197        my $result = eval "$no_warnings
20198                            my \$RegObj = qr($regex);
20199                            $string =~ \$RegObj ? 1 : 0";
20200        if (not defined $result) {
20201            print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
20202            $Fails++;
20203        }
20204        elsif ($result ^ $expected) {
20205            print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
20206            $Fails++;
20207        }
20208        elsif ($warning_message) {
20209            if (! $warning_type || ($warning_type && $no_warnings)) {
20210                print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
20211                $Fails++;
20212            }
20213            else {
20214                print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
20215            }
20216        }
20217        elsif ($warning_type && ! $no_warnings) {
20218            print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
20219            $Fails++;
20220        }
20221        else {
20222            print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
20223        }
20224    }
20225    return;
20226}
20227
20228sub Error($regex) {
20229    $Tests++;
20230    if (eval { 'x' =~ qr/$regex/; 1 }) {
20231        $Fails++;
20232        my $line = (caller)[2];
20233        print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
20234    }
20235    else {
20236        my $line = (caller)[2];
20237        print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
20238    }
20239    return;
20240}
20241
20242# Break test files (e.g. GCBTest.txt) character that break allowed here
20243my $breakable_utf8 = my $breakable = chr(utf8::unicode_to_native(0xF7));
20244utf8::upgrade($breakable_utf8);
20245
20246# Break test files (e.g. GCBTest.txt) character that indicates can't break
20247# here
20248my $nobreak_utf8 = my $nobreak = chr(utf8::unicode_to_native(0xD7));
20249utf8::upgrade($nobreak_utf8);
20250
20251my $are_ctype_locales_available;
20252my $utf8_locale;
20253chdir 't' if -d 't';
20254eval { require "./loc_tools.pl" };
20255if (defined &locales_enabled) {
20256    $are_ctype_locales_available = locales_enabled('LC_CTYPE');
20257    if ($are_ctype_locales_available) {
20258        $utf8_locale = &find_utf8_ctype_locale;
20259    }
20260}
20261
20262# Eval'd so can run on versions earlier than the property is available in
20263my $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}\p{WB=ZWJ}]/';
20264if (! defined $WB_Extend_or_Format_re) {
20265    $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}]/';
20266}
20267
20268sub _test_break($template, $break_type) {
20269    # Test various break property matches.  The 2nd parameter gives the
20270    # property name.  The input is a line from auxiliary/*Test.txt for the
20271    # given property.  Each such line is a sequence of Unicode (not native)
20272    # code points given by their hex numbers, separated by the two characters
20273    # defined just before this subroutine that indicate that either there can
20274    # or cannot be a break between the adjacent code points.  All these are
20275    # tested.
20276    #
20277    # For the gcb property extra tests are made.  if there isn't a break, that
20278    # means the sequence forms an extended grapheme cluster, which means that
20279    # \X should match the whole thing.  If there is a break, \X should stop
20280    # there.  This is all converted by this routine into a match: $string =~
20281    # /(\X)/, Each \X should match the next cluster; and that is what is
20282    # checked.
20283
20284    my $line   = (caller 1)[2];   # Line number
20285    my $comment = "";
20286
20287    if ($template =~ / ( .*? ) \s* \# (.*) /x) {
20288        $template = $1;
20289        $comment = $2;
20290
20291        # Replace leading spaces with a single one.
20292        $comment =~ s/ ^ \s* / # /x;
20293    }
20294
20295    # The line contains characters above the ASCII range, but in Latin1.  It
20296    # may or may not be in utf8, and if it is, it may or may not know it.  So,
20297    # convert these characters to 8 bits.  If knows is in utf8, simply
20298    # downgrade.
20299    if (utf8::is_utf8($template)) {
20300        utf8::downgrade($template);
20301    } else {
20302
20303        # Otherwise, if it is in utf8, but doesn't know it, the next lines
20304        # convert the two problematic characters to their 8-bit equivalents.
20305        # If it isn't in utf8, they don't harm anything.
20306        use bytes;
20307        $template =~ s/$nobreak_utf8/$nobreak/g;
20308        $template =~ s/$breakable_utf8/$breakable/g;
20309    }
20310
20311    # Perl customizes wb.  So change the official tests accordingly
20312    if ($break_type eq 'wb' && $WB_Extend_or_Format_re) {
20313
20314        # Split into elements that alternate between code point and
20315        # break/no-break
20316        my @line = split / +/, $template;
20317
20318        # Look at each code point and its following one
20319        for (my $i = 1; $i <  @line - 1 - 1; $i+=2) {
20320
20321            # The customization only involves changing some breaks to
20322            # non-breaks.
20323            next if $line[$i+1] =~ /$nobreak/;
20324
20325            my $lhs = chr utf8::unicode_to_native(hex $line[$i]);
20326            my $rhs = chr utf8::unicode_to_native(hex $line[$i+2]);
20327
20328            # And it only affects adjacent space characters.
20329            next if $lhs !~ /\s/u;
20330
20331            # But, we want to make sure to test spaces followed by a Extend
20332            # or Format.
20333            next if $rhs !~ /\s|$WB_Extend_or_Format_re/;
20334
20335            # To test the customization, add some white-space before this to
20336            # create a span.  The $lhs white space may or may not be bound to
20337            # that span, and also with the $rhs.  If the $rhs is a binding
20338            # character, the $lhs is bound to it and not to the span, unless
20339            # $lhs is vertical space.  In all other cases, the $lhs is bound
20340            # to the span.  If the $rhs is white space, it is bound to the
20341            # $lhs
20342            my $bound;
20343            my $span;
20344            if ($rhs =~ /$WB_Extend_or_Format_re/) {
20345                if ($lhs =~ /\v/) {
20346                    $bound = $breakable;
20347                    $span = $nobreak;
20348                }
20349                else {
20350                    $bound = $nobreak;
20351                    $span = $breakable;
20352                }
20353            }
20354            else {
20355                $span = $nobreak;
20356                $bound = $nobreak;
20357            }
20358
20359            splice @line, $i, 0, ( '0020', $nobreak, '0020', $span);
20360            $i += 4;
20361            $line[$i+1] = $bound;
20362        }
20363        $template = join " ", @line;
20364    }
20365
20366    # The input is just the break/no-break symbols and sequences of Unicode
20367    # code points as hex digits separated by spaces for legibility. e.g.:
20368    # ÷ 0020 × 0308 ÷ 0020 ÷
20369    # Convert to native \x format
20370    $template =~ s/ \s* ( [[:xdigit:]]+ ) \s* /sprintf("\\x{%02X}", utf8::unicode_to_native(hex $1))/gex;
20371    $template =~ s/ \s* //gx;   # Probably the line above removed all spaces;
20372                                # but be sure
20373
20374    # Make a copy of the input with the symbols replaced by \b{} and \B{} as
20375    # appropriate
20376    my $break_pattern = $template =~ s/ $breakable /\\b{$break_type}/grx;
20377    $break_pattern =~ s/ $nobreak /\\B{$break_type}/gx;
20378
20379    my $display_string = $template =~ s/[$breakable$nobreak]//gr;
20380    my $string = eval "\"$display_string\"";
20381
20382    # The remaining massaging of the input is for the \X tests.  Get rid of
20383    # the leading and trailing breakables
20384    $template =~ s/^ \s* $breakable \s* //x;
20385    $template =~ s/ \s* $breakable \s* $ //x;
20386
20387    # Delete no-breaks
20388    $template =~ s/ \s* $nobreak \s* //xg;
20389
20390    # Split the input into segments that are breakable between them.
20391    my @should_display = split /\s*$breakable\s*/, $template;
20392    my @should_match = map { eval "\"$_\"" } @should_display;
20393
20394    # If a string can be represented in both non-ut8 and utf8, test both cases
20395    my $display_upgrade = "";
20396    UPGRADE:
20397    for my $to_upgrade (0 .. 1) {
20398
20399        if ($to_upgrade) {
20400
20401            # If already in utf8, would just be a repeat
20402            next UPGRADE if utf8::is_utf8($string);
20403
20404            utf8::upgrade($string);
20405            $display_upgrade = " (utf8-upgraded)";
20406        }
20407
20408        my @modifiers = qw(a aa d u i);
20409        if ($are_ctype_locales_available) {
20410            push @modifiers, "l$utf8_locale" if defined $utf8_locale;
20411
20412            # The /l modifier has C after it to indicate the locale to try
20413            push @modifiers, "lC";
20414        }
20415
20416        # Test for each of the regex modifiers.
20417        for my $modifier (@modifiers) {
20418            my $display_locale = "";
20419
20420            # For /l, set the locale to what it says to.
20421            if ($modifier =~ / ^ l (.*) /x) {
20422                my $locale = $1;
20423                $display_locale = "(locale = $locale)";
20424                POSIX::setlocale(POSIX::LC_CTYPE(), $locale);
20425                $modifier = 'l';
20426            }
20427
20428            no warnings qw(locale regexp surrogate);
20429            my $pattern = "(?$modifier:$break_pattern)";
20430
20431            # Actually do the test
20432            my $matched_text;
20433            my $matched = $string =~ qr/$pattern/;
20434            if ($matched) {
20435                $matched_text = "matched";
20436            }
20437            else {
20438                $matched_text = "failed to match";
20439                print "not ";
20440
20441                if (TODO_FAILING_BREAKS) {
20442                    $comment = " # $comment" unless $comment =~ / ^ \s* \# /x;
20443                    $comment =~ s/#/# TODO/;
20444                }
20445            }
20446            print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$pattern/$display_upgrade; line $line $display_locale$comment\n";
20447
20448            # Only print the comment on the first use of this line
20449            $comment = "";
20450
20451            # Repeat with the first \B{} in the pattern.  This makes sure the
20452            # code in regexec.c:find_byclass() for \B gets executed
20453            if ($pattern =~ / ( .*? : ) .* ( \\B\{ .* ) /x) {
20454                my $B_pattern = "$1$2";
20455                $matched = $string =~ qr/$B_pattern/;
20456                print "not " unless $matched;
20457                $matched_text = ($matched) ? "matched" : "failed to match";
20458                print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$B_pattern/$display_upgrade; line $line $display_locale";
20459                print " # TODO" if TODO_FAILING_BREAKS && ! $matched;
20460                print "\n";
20461            }
20462        }
20463
20464        next if $break_type ne 'gcb';
20465
20466        # Finally, do the \X match.
20467        my @matches = $string =~ /(\X)/g;
20468
20469        # Look through each matched cluster to verify that it matches what we
20470        # expect.
20471        my $min = (@matches < @should_match) ? @matches : @should_match;
20472        for my $i (0 .. $min - 1) {
20473            $Tests++;
20474            if ($matches[$i] eq $should_match[$i]) {
20475                print "ok $Tests - ";
20476                if ($i == 0) {
20477                    print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
20478                } else {
20479                    print "And \\X #", $i + 1,
20480                }
20481                print " correctly matched $should_display[$i]; line $line\n";
20482            } else {
20483                $matches[$i] = join("", map { sprintf "\\x{%04X}", ord $_ }
20484                                                    split "", $matches[$i]);
20485                print "not ok $Tests -";
20486                print " # TODO" if TODO_FAILING_BREAKS;
20487                print " In \"$display_string\" =~ /(\\X)/g, \\X #",
20488                    $i + 1,
20489                    " should have matched $should_display[$i]",
20490                    " but instead matched $matches[$i]",
20491                    ".  Abandoning rest of line $line\n";
20492                next UPGRADE;
20493            }
20494        }
20495
20496        # And the number of matches should equal the number of expected matches.
20497        $Tests++;
20498        if (@matches == @should_match) {
20499            print "ok $Tests - Nothing was left over; line $line\n";
20500        } else {
20501            print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line";
20502            print " # TODO" if TODO_FAILING_BREAKS;
20503            print "\n";
20504        }
20505    }
20506
20507    return;
20508}
20509
20510sub Test_GCB($t) {
20511    _test_break($t, 'gcb');
20512}
20513
20514sub Test_LB($t) {
20515    _test_break($t, 'lb');
20516}
20517
20518sub Test_SB($t) {
20519    _test_break($t, 'sb');
20520}
20521
20522sub Test_WB($t) {
20523    _test_break($t, 'wb');
20524}
20525
20526sub Finished() {
20527    print "1..$Tests\n";
20528    exit($Fails ? -1 : 0);
20529}
20530
20531