xref: /openbsd-src/gnu/usr.bin/perl/lib/unicore/mktables (revision 9f11ffb7133c203312a01e4b986886bc88c7d74b)
1#!/usr/bin/perl -w
2
3# !!!!!!!!!!!!!!       IF YOU MODIFY THIS FILE       !!!!!!!!!!!!!!!!!!!!!!!!!
4# Any files created or read by this program should be listed in 'mktables.lst'
5# Use -makelist to regenerate it.
6
7# There was an attempt when this was first rewritten to make it 5.8
8# compatible, but that has now been abandoned, and newer constructs are used
9# as convenient.
10
11# NOTE: this script can run quite slowly in older/slower systems.
12# It can also consume a lot of memory (128 MB or more), you may need
13# to raise your process resource limits (e.g. in bash, "ulimit -a"
14# to inspect, and "ulimit -d ..." or "ulimit -m ..." to set)
15
16my $start_time;
17BEGIN { # Get the time the script started running; do it at compilation to
18        # get it as close as possible
19    $start_time= time;
20}
21
22require 5.010_001;
23use strict;
24use warnings;
25use Carp;
26use Config;
27use File::Find;
28use File::Path;
29use File::Spec;
30use Text::Tabs;
31use re "/aa";
32use feature 'state';
33
34sub DEBUG () { 0 }  # Set to 0 for production; 1 for development
35my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
36
37sub NON_ASCII_PLATFORM { ord("A") != 65 }
38
39# When a new version of Unicode is published, unfortunately the algorithms for
40# dealing with various bounds, like \b{gcb}, \b{lb} may have to be updated
41# manually.  The changes may or may not be backward compatible with older
42# releases.  The code is in regen/mk_invlist.pl and regexec.c.  Make the
43# changes, then come back here and set the variable below to what version the
44# code is expecting.  If a newer version of Unicode is being compiled than
45# expected, a warning will be generated.  If an older version is being
46# compiled, any bounds tests that fail in the generated test file (-maketest
47# option) will be marked as TODO.
48my $version_of_mk_invlist_bounds = v10.0.0;
49
50##########################################################################
51#
52# mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
53# from the Unicode database files (lib/unicore/.../*.txt),  It also generates
54# a pod file and .t files, depending on option parameters.
55#
56# The structure of this file is:
57#   First these introductory comments; then
58#   code needed for everywhere, such as debugging stuff; then
59#   code to handle input parameters; then
60#   data structures likely to be of external interest (some of which depend on
61#       the input parameters, so follows them; then
62#   more data structures and subroutine and package (class) definitions; then
63#   the small actual loop to process the input files and finish up; then
64#   a __DATA__ section, for the .t tests
65#
66# This program works on all releases of Unicode so far.  The outputs have been
67# scrutinized most intently for release 5.1.  The others have been checked for
68# somewhat more than just sanity.  It can handle all non-provisional Unicode
69# character properties in those releases.
70#
71# This program is mostly about Unicode character (or code point) properties.
72# A property describes some attribute or quality of a code point, like if it
73# is lowercase or not, its name, what version of Unicode it was first defined
74# in, or what its uppercase equivalent is.  Unicode deals with these disparate
75# possibilities by making all properties into mappings from each code point
76# into some corresponding value.  In the case of it being lowercase or not,
77# the mapping is either to 'Y' or 'N' (or various synonyms thereof).  Each
78# property maps each Unicode code point to a single value, called a "property
79# value".  (Some more recently defined properties, map a code point to a set
80# of values.)
81#
82# When using a property in a regular expression, what is desired isn't the
83# mapping of the code point to its property's value, but the reverse (or the
84# mathematical "inverse relation"): starting with the property value, "Does a
85# code point map to it?"  These are written in a "compound" form:
86# \p{property=value}, e.g., \p{category=punctuation}.  This program generates
87# files containing the lists of code points that map to each such regular
88# expression property value, one file per list
89#
90# There is also a single form shortcut that Perl adds for many of the commonly
91# used properties.  This happens for all binary properties, plus script,
92# general_category, and block properties.
93#
94# Thus the outputs of this program are files.  There are map files, mostly in
95# the 'To' directory; and there are list files for use in regular expression
96# matching, all in subdirectories of the 'lib' directory, with each
97# subdirectory being named for the property that the lists in it are for.
98# Bookkeeping, test, and documentation files are also generated.
99
100my $matches_directory = 'lib';   # Where match (\p{}) files go.
101my $map_directory = 'To';        # Where map files go.
102
103# DATA STRUCTURES
104#
105# The major data structures of this program are Property, of course, but also
106# Table.  There are two kinds of tables, very similar to each other.
107# "Match_Table" is the data structure giving the list of code points that have
108# a particular property value, mentioned above.  There is also a "Map_Table"
109# data structure which gives the property's mapping from code point to value.
110# There are two structures because the match tables need to be combined in
111# various ways, such as constructing unions, intersections, complements, etc.,
112# and the map ones don't.  And there would be problems, perhaps subtle, if
113# a map table were inadvertently operated on in some of those ways.
114# The use of separate classes with operations defined on one but not the other
115# prevents accidentally confusing the two.
116#
117# At the heart of each table's data structure is a "Range_List", which is just
118# an ordered list of "Ranges", plus ancillary information, and methods to
119# operate on them.  A Range is a compact way to store property information.
120# Each range has a starting code point, an ending code point, and a value that
121# is meant to apply to all the code points between the two end points,
122# inclusive.  For a map table, this value is the property value for those
123# code points.  Two such ranges could be written like this:
124#   0x41 .. 0x5A, 'Upper',
125#   0x61 .. 0x7A, 'Lower'
126#
127# Each range also has a type used as a convenience to classify the values.
128# Most ranges in this program will be Type 0, or normal, but there are some
129# ranges that have a non-zero type.  These are used only in map tables, and
130# are for mappings that don't fit into the normal scheme of things.  Mappings
131# that require a hash entry to communicate with utf8.c are one example;
132# another example is mappings for charnames.pm to use which indicate a name
133# that is algorithmically determinable from its code point (and the reverse).
134# These are used to significantly compact these tables, instead of listing
135# each one of the tens of thousands individually.
136#
137# In a match table, the value of a range is irrelevant (and hence the type as
138# well, which will always be 0), and arbitrarily set to the empty string.
139# Using the example above, there would be two match tables for those two
140# entries, one named Upper would contain the 0x41..0x5A range, and the other
141# named Lower would contain 0x61..0x7A.
142#
143# Actually, there are two types of range lists, "Range_Map" is the one
144# associated with map tables, and "Range_List" with match tables.
145# Again, this is so that methods can be defined on one and not the others so
146# as to prevent operating on them in incorrect ways.
147#
148# Eventually, most tables are written out to files to be read by utf8_heavy.pl
149# in the perl core.  All tables could in theory be written, but some are
150# suppressed because there is no current practical use for them.  It is easy
151# to change which get written by changing various lists that are near the top
152# of the actual code in this file.  The table data structures contain enough
153# ancillary information to allow them to be treated as separate entities for
154# writing, such as the path to each one's file.  There is a heading in each
155# map table that gives the format of its entries, and what the map is for all
156# the code points missing from it.  (This allows tables to be more compact.)
157#
158# The Property data structure contains one or more tables.  All properties
159# contain a map table (except the $perl property which is a
160# pseudo-property containing only match tables), and any properties that
161# are usable in regular expression matches also contain various matching
162# tables, one for each value the property can have.  A binary property can
163# have two values, True and False (or Y and N, which are preferred by Unicode
164# terminology).  Thus each of these properties will have a map table that
165# takes every code point and maps it to Y or N (but having ranges cuts the
166# number of entries in that table way down), and two match tables, one
167# which has a list of all the code points that map to Y, and one for all the
168# code points that map to N.  (For each binary property, a third table is also
169# generated for the pseudo Perl property.  It contains the identical code
170# points as the Y table, but can be written in regular expressions, not in the
171# compound form, but in a "single" form like \p{IsUppercase}.)  Many
172# properties are binary, but some properties have several possible values,
173# some have many, and properties like Name have a different value for every
174# named code point.  Those will not, unless the controlling lists are changed,
175# have their match tables written out.  But all the ones which can be used in
176# regular expression \p{} and \P{} constructs will.  Prior to 5.14, generally
177# a property would have either its map table or its match tables written but
178# not both.  Again, what gets written is controlled by lists which can easily
179# be changed.  Starting in 5.14, advantage was taken of this, and all the map
180# tables needed to reconstruct the Unicode db are now written out, while
181# suppressing the Unicode .txt files that contain the data.  Our tables are
182# much more compact than the .txt files, so a significant space savings was
183# achieved.  Also, tables are not written out that are trivially derivable
184# from tables that do get written.  So, there typically is no file containing
185# the code points not matched by a binary property (the table for \P{} versus
186# lowercase \p{}), since you just need to invert the True table to get the
187# False table.
188
189# Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on
190# how many match tables there are and the content of the maps.  This 'Type' is
191# different than a range 'Type', so don't get confused by the two concepts
192# having the same name.
193#
194# For information about the Unicode properties, see Unicode's UAX44 document:
195
196my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
197
198# As stated earlier, this program will work on any release of Unicode so far.
199# Most obvious problems in earlier data have NOT been corrected except when
200# necessary to make Perl or this program work reasonably, and to keep out
201# potential security issues.  For example, no folding information was given in
202# early releases, so this program substitutes lower case instead, just so that
203# a regular expression with the /i option will do something that actually
204# gives the right results in many cases.  There are also a couple other
205# corrections for version 1.1.5, commented at the point they are made.  As an
206# example of corrections that weren't made (but could be) is this statement
207# from DerivedAge.txt: "The supplementary private use code points and the
208# non-character code points were assigned in version 2.0, but not specifically
209# listed in the UCD until versions 3.0 and 3.1 respectively."  (To be precise
210# it was 3.0.1 not 3.0.0)  More information on Unicode version glitches is
211# further down in these introductory comments.
212#
213# This program works on all non-provisional properties as of the current
214# Unicode release, though the files for some are suppressed for various
215# reasons.  You can change which are output by changing lists in this program.
216#
217# The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
218# loose matchings rules (from Unicode TR18):
219#
220#    The recommended names for UCD properties and property values are in
221#    PropertyAliases.txt [Prop] and PropertyValueAliases.txt
222#    [PropValue]. There are both abbreviated names and longer, more
223#    descriptive names. It is strongly recommended that both names be
224#    recognized, and that loose matching of property names be used,
225#    whereby the case distinctions, whitespace, hyphens, and underbar
226#    are ignored.
227#
228# The program still allows Fuzzy to override its determination of if loose
229# matching should be used, but it isn't currently used, as it is no longer
230# needed; the calculations it makes are good enough.
231#
232# SUMMARY OF HOW IT WORKS:
233#
234#   Process arguments
235#
236#   A list is constructed containing each input file that is to be processed
237#
238#   Each file on the list is processed in a loop, using the associated handler
239#   code for each:
240#        The PropertyAliases.txt and PropValueAliases.txt files are processed
241#            first.  These files name the properties and property values.
242#            Objects are created of all the property and property value names
243#            that the rest of the input should expect, including all synonyms.
244#        The other input files give mappings from properties to property
245#           values.  That is, they list code points and say what the mapping
246#           is under the given property.  Some files give the mappings for
247#           just one property; and some for many.  This program goes through
248#           each file and populates the properties and their map tables from
249#           them.  Some properties are listed in more than one file, and
250#           Unicode has set up a precedence as to which has priority if there
251#           is a conflict.  Thus the order of processing matters, and this
252#           program handles the conflict possibility by processing the
253#           overriding input files last, so that if necessary they replace
254#           earlier values.
255#        After this is all done, the program creates the property mappings not
256#            furnished by Unicode, but derivable from what it does give.
257#        The tables of code points that match each property value in each
258#            property that is accessible by regular expressions are created.
259#        The Perl-defined properties are created and populated.  Many of these
260#            require data determined from the earlier steps
261#        Any Perl-defined synonyms are created, and name clashes between Perl
262#            and Unicode are reconciled and warned about.
263#        All the properties are written to files
264#        Any other files are written, and final warnings issued.
265#
266# For clarity, a number of operators have been overloaded to work on tables:
267#   ~ means invert (take all characters not in the set).  The more
268#       conventional '!' is not used because of the possibility of confusing
269#       it with the actual boolean operation.
270#   + means union
271#   - means subtraction
272#   & means intersection
273# The precedence of these is the order listed.  Parentheses should be
274# copiously used.  These are not a general scheme.  The operations aren't
275# defined for a number of things, deliberately, to avoid getting into trouble.
276# Operations are done on references and affect the underlying structures, so
277# that the copy constructors for them have been overloaded to not return a new
278# clone, but the input object itself.
279#
280# The bool operator is deliberately not overloaded to avoid confusion with
281# "should it mean if the object merely exists, or also is non-empty?".
282#
283# WHY CERTAIN DESIGN DECISIONS WERE MADE
284#
285# This program needs to be able to run under miniperl.  Therefore, it uses a
286# minimum of other modules, and hence implements some things itself that could
287# be gotten from CPAN
288#
289# This program uses inputs published by the Unicode Consortium.  These can
290# change incompatibly between releases without the Perl maintainers realizing
291# it.  Therefore this program is now designed to try to flag these.  It looks
292# at the directories where the inputs are, and flags any unrecognized files.
293# It keeps track of all the properties in the files it handles, and flags any
294# that it doesn't know how to handle.  It also flags any input lines that
295# don't match the expected syntax, among other checks.
296#
297# It is also designed so if a new input file matches one of the known
298# templates, one hopefully just needs to add it to a list to have it
299# processed.
300#
301# As mentioned earlier, some properties are given in more than one file.  In
302# particular, the files in the extracted directory are supposedly just
303# reformattings of the others.  But they contain information not easily
304# derivable from the other files, including results for Unihan (which isn't
305# usually available to this program) and for unassigned code points.  They
306# also have historically had errors or been incomplete.  In an attempt to
307# create the best possible data, this program thus processes them first to
308# glean information missing from the other files; then processes those other
309# files to override any errors in the extracted ones.  Much of the design was
310# driven by this need to store things and then possibly override them.
311#
312# It tries to keep fatal errors to a minimum, to generate something usable for
313# testing purposes.  It always looks for files that could be inputs, and will
314# warn about any that it doesn't know how to handle (the -q option suppresses
315# the warning).
316#
317# Why is there more than one type of range?
318#   This simplified things.  There are some very specialized code points that
319#   have to be handled specially for output, such as Hangul syllable names.
320#   By creating a range type (done late in the development process), it
321#   allowed this to be stored with the range, and overridden by other input.
322#   Originally these were stored in another data structure, and it became a
323#   mess trying to decide if a second file that was for the same property was
324#   overriding the earlier one or not.
325#
326# Why are there two kinds of tables, match and map?
327#   (And there is a base class shared by the two as well.)  As stated above,
328#   they actually are for different things.  Development proceeded much more
329#   smoothly when I (khw) realized the distinction.  Map tables are used to
330#   give the property value for every code point (actually every code point
331#   that doesn't map to a default value).  Match tables are used for regular
332#   expression matches, and are essentially the inverse mapping.  Separating
333#   the two allows more specialized methods, and error checks so that one
334#   can't just take the intersection of two map tables, for example, as that
335#   is nonsensical.
336#
337# What about 'fate' and 'status'.  The concept of a table's fate was created
338#   late when it became clear that something more was needed.  The difference
339#   between this and 'status' is unclean, and could be improved if someone
340#   wanted to spend the effort.
341#
342# DEBUGGING
343#
344# This program is written so it will run under miniperl.  Occasionally changes
345# will cause an error where the backtrace doesn't work well under miniperl.
346# To diagnose the problem, you can instead run it under regular perl, if you
347# have one compiled.
348#
349# There is a good trace facility.  To enable it, first sub DEBUG must be set
350# to return true.  Then a line like
351#
352# local $to_trace = 1 if main::DEBUG;
353#
354# can be added to enable tracing in its lexical scope (plus dynamic) or until
355# you insert another line:
356#
357# local $to_trace = 0 if main::DEBUG;
358#
359# To actually trace, use a line like "trace $a, @b, %c, ...;
360#
361# Some of the more complex subroutines already have trace statements in them.
362# Permanent trace statements should be like:
363#
364# trace ... if main::DEBUG && $to_trace;
365#
366# main::stack_trace() will display what its name implies
367#
368# If there is just one or a few files that you're debugging, you can easily
369# cause most everything else to be skipped.  Change the line
370#
371# my $debug_skip = 0;
372#
373# to 1, and every file whose object is in @input_file_objects and doesn't have
374# a, 'non_skip => 1,' in its constructor will be skipped.  However, skipping
375# Jamo.txt or UnicodeData.txt will likely cause fatal errors.
376#
377# To compare the output tables, it may be useful to specify the -annotate
378# flag.  (As of this writing, this can't be done on a clean workspace, due to
379# requirements in Text::Tabs used in this option; so first run mktables
380# without this option.)  This option adds comment lines to each table, one for
381# each non-algorithmically named character giving, currently its code point,
382# name, and graphic representation if printable (and you have a font that
383# knows about it).  This makes it easier to see what the particular code
384# points are in each output table.  Non-named code points are annotated with a
385# description of their status, and contiguous ones with the same description
386# will be output as a range rather than individually.  Algorithmically named
387# characters are also output as ranges, except when there are just a few
388# contiguous ones.
389#
390# FUTURE ISSUES
391#
392# The program would break if Unicode were to change its names so that
393# interior white space, underscores, or dashes differences were significant
394# within property and property value names.
395#
396# It might be easier to use the xml versions of the UCD if this program ever
397# would need heavy revision, and the ability to handle old versions was not
398# required.
399#
400# There is the potential for name collisions, in that Perl has chosen names
401# that Unicode could decide it also likes.  There have been such collisions in
402# the past, with mostly Perl deciding to adopt the Unicode definition of the
403# name.  However in the 5.2 Unicode beta testing, there were a number of such
404# collisions, which were withdrawn before the final release, because of Perl's
405# and other's protests.  These all involved new properties which began with
406# 'Is'.  Based on the protests, Unicode is unlikely to try that again.  Also,
407# many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
408# Unicode document, so they are unlikely to be used by Unicode for another
409# purpose.  However, they might try something beginning with 'In', or use any
410# of the other Perl-defined properties.  This program will warn you of name
411# collisions, and refuse to generate tables with them, but manual intervention
412# will be required in this event.  One scheme that could be implemented, if
413# necessary, would be to have this program generate another file, or add a
414# field to mktables.lst that gives the date of first definition of a property.
415# Each new release of Unicode would use that file as a basis for the next
416# iteration.  And the Perl synonym addition code could sort based on the age
417# of the property, so older properties get priority, and newer ones that clash
418# would be refused; hence existing code would not be impacted, and some other
419# synonym would have to be used for the new property.  This is ugly, and
420# manual intervention would certainly be easier to do in the short run; lets
421# hope it never comes to this.
422#
423# A NOTE ON UNIHAN
424#
425# This program can generate tables from the Unihan database.  But that DB
426# isn't normally available, so it is marked as optional.  Prior to version
427# 5.2, this database was in a single file, Unihan.txt.  In 5.2 the database
428# was split into 8 different files, all beginning with the letters 'Unihan'.
429# If you plunk those files down into the directory mktables ($0) is in, this
430# program will read them and automatically create tables for the properties
431# from it that are listed in PropertyAliases.txt and PropValueAliases.txt,
432# plus any you add to the @cjk_properties array and the @cjk_property_values
433# array, being sure to add necessary '# @missings' lines to the latter.  For
434# Unicode versions earlier than 5.2, most of the Unihan properties are not
435# listed at all in PropertyAliases nor PropValueAliases.  This program assumes
436# for these early releases that you want the properties that are specified in
437# the 5.2 release.
438#
439# You may need to adjust the entries to suit your purposes.  setup_unihan(),
440# and filter_unihan_line() are the functions where this is done.  This program
441# already does some adjusting to make the lines look more like the rest of the
442# Unicode DB;  You can see what that is in filter_unihan_line()
443#
444# There is a bug in the 3.2 data file in which some values for the
445# kPrimaryNumeric property have commas and an unexpected comment.  A filter
446# could be added to correct these; or for a particular installation, the
447# Unihan.txt file could be edited to fix them.
448#
449# HOW TO ADD A FILE TO BE PROCESSED
450#
451# A new file from Unicode needs to have an object constructed for it in
452# @input_file_objects, probably at the end or at the end of the extracted
453# ones.  The program should warn you if its name will clash with others on
454# restrictive file systems, like DOS.  If so, figure out a better name, and
455# add lines to the README.perl file giving that.  If the file is a character
456# property, it should be in the format that Unicode has implicitly
457# standardized for such files for the more recently introduced ones.
458# If so, the Input_file constructor for @input_file_objects can just be the
459# file name and release it first appeared in.  If not, then it should be
460# possible to construct an each_line_handler() to massage the line into the
461# standardized form.
462#
463# For non-character properties, more code will be needed.  You can look at
464# the existing entries for clues.
465#
466# UNICODE VERSIONS NOTES
467#
468# The Unicode UCD has had a number of errors in it over the versions.  And
469# these remain, by policy, in the standard for that version.  Therefore it is
470# risky to correct them, because code may be expecting the error.  So this
471# program doesn't generally make changes, unless the error breaks the Perl
472# core.  As an example, some versions of 2.1.x Jamo.txt have the wrong value
473# for U+1105, which causes real problems for the algorithms for Jamo
474# calculations, so it is changed here.
475#
476# But it isn't so clear cut as to what to do about concepts that are
477# introduced in a later release; should they extend back to earlier releases
478# where the concept just didn't exist?  It was easier to do this than to not,
479# so that's what was done.  For example, the default value for code points not
480# in the files for various properties was probably undefined until changed by
481# some version.  No_Block for blocks is such an example.  This program will
482# assign No_Block even in Unicode versions that didn't have it.  This has the
483# benefit that code being written doesn't have to special case earlier
484# versions; and the detriment that it doesn't match the Standard precisely for
485# the affected versions.
486#
487# Here are some observations about some of the issues in early versions:
488#
489# Prior to version 3.0, there were 3 character decompositions.  These are not
490# handled by Unicode::Normalize, nor will it compile when presented a version
491# that has them.  However, you can trivially get it to compile by simply
492# ignoring those decompositions, by changing the croak to a carp.  At the time
493# of this writing, the line (in dist/Unicode-Normalize/Normalize.pm or
494# dist/Unicode-Normalize/mkheader) reads
495#
496#   croak("Weird Canonical Decomposition of U+$h");
497#
498# Simply comment it out.  It will compile, but will not know about any three
499# character decompositions.
500
501# The number of code points in \p{alpha=True} halved in 2.1.9.  It turns out
502# that the reason is that the CJK block starting at 4E00 was removed from
503# PropList, and was not put back in until 3.1.0.  The Perl extension (the
504# single property name \p{alpha}) has the correct values.  But the compound
505# form is simply not generated until 3.1, as it can be argued that prior to
506# this release, this was not an official property.  The comments for
507# filter_old_style_proplist() give more details.
508#
509# Unicode introduced the synonym Space for White_Space in 4.1.  Perl has
510# always had a \p{Space}.  In release 3.2 only, they are not synonymous.  The
511# reason is that 3.2 introduced U+205F=medium math space, which was not
512# classed as white space, but Perl figured out that it should have been. 4.0
513# reclassified it correctly.
514#
515# Another change between 3.2 and 4.0 is the CCC property value ATBL.  In 3.2
516# this was erroneously a synonym for 202 (it should be 200).  In 4.0, ATB
517# became 202, and ATBL was left with no code points, as all the ones that
518# mapped to 202 stayed mapped to 202.  Thus if your program used the numeric
519# name for the class, it would not have been affected, but if it used the
520# mnemonic, it would have been.
521#
522# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1.  Before that, code
523# points which eventually came to have this script property value, instead
524# mapped to "Unknown".  But in the next release all these code points were
525# moved to \p{sc=common} instead.
526
527# The tests furnished  by Unicode for testing WordBreak and SentenceBreak
528# generate errors in 5.0 and earlier.
529#
530# The default for missing code points for BidiClass is complicated.  Starting
531# in 3.1.1, the derived file DBidiClass.txt handles this, but this program
532# tries to do the best it can for earlier releases.  It is done in
533# process_PropertyAliases()
534#
535# In version 2.1.2, the entry in UnicodeData.txt:
536#   0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F;
537# should instead be
538#   0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F
539# Without this change, there are casing problems for this character.
540#
541# Search for $string_compare_versions to see how to compare changes to
542# properties between Unicode versions
543#
544##############################################################################
545
546my $UNDEF = ':UNDEF:';  # String to print out for undefined values in tracing
547                        # and errors
548my $MAX_LINE_WIDTH = 78;
549
550# Debugging aid to skip most files so as to not be distracted by them when
551# concentrating on the ones being debugged.  Add
552# non_skip => 1,
553# to the constructor for those files you want processed when you set this.
554# Files with a first version number of 0 are special: they are always
555# processed regardless of the state of this flag.  Generally, Jamo.txt and
556# UnicodeData.txt must not be skipped if you want this program to not die
557# before normal completion.
558my $debug_skip = 0;
559
560
561# Normally these are suppressed.
562my $write_Unicode_deprecated_tables = 0;
563
564# Set to 1 to enable tracing.
565our $to_trace = 0;
566
567{ # Closure for trace: debugging aid
568    my $print_caller = 1;        # ? Include calling subroutine name
569    my $main_with_colon = 'main::';
570    my $main_colon_length = length($main_with_colon);
571
572    sub trace {
573        return unless $to_trace;        # Do nothing if global flag not set
574
575        my @input = @_;
576
577        local $DB::trace = 0;
578        $DB::trace = 0;          # Quiet 'used only once' message
579
580        my $line_number;
581
582        # Loop looking up the stack to get the first non-trace caller
583        my $caller_line;
584        my $caller_name;
585        my $i = 0;
586        do {
587            $line_number = $caller_line;
588            (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
589            $caller = $main_with_colon unless defined $caller;
590
591            $caller_name = $caller;
592
593            # get rid of pkg
594            $caller_name =~ s/.*:://;
595            if (substr($caller_name, 0, $main_colon_length)
596                eq $main_with_colon)
597            {
598                $caller_name = substr($caller_name, $main_colon_length);
599            }
600
601        } until ($caller_name ne 'trace');
602
603        # If the stack was empty, we were called from the top level
604        $caller_name = 'main' if ($caller_name eq ""
605                                    || $caller_name eq 'trace');
606
607        my $output = "";
608        #print STDERR __LINE__, ": ", join ", ", @input, "\n";
609        foreach my $string (@input) {
610            if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
611                $output .= simple_dumper($string);
612            }
613            else {
614                $string = "$string" if ref $string;
615                $string = $UNDEF unless defined $string;
616                chomp $string;
617                $string = '""' if $string eq "";
618                $output .= " " if $output ne ""
619                                && $string ne ""
620                                && substr($output, -1, 1) ne " "
621                                && substr($string, 0, 1) ne " ";
622                $output .= $string;
623            }
624        }
625
626        print STDERR sprintf "%4d: ", $line_number if defined $line_number;
627        print STDERR "$caller_name: " if $print_caller;
628        print STDERR $output, "\n";
629        return;
630    }
631}
632
633sub stack_trace() {
634    local $to_trace = 1 if main::DEBUG;
635    my $line = (caller(0))[2];
636    my $i = 1;
637
638    # Accumulate the stack trace
639    while (1) {
640        my ($pkg, $file, $caller_line, $caller) = caller $i++;
641
642        last unless defined $caller;
643
644        trace "called from $caller() at line $line";
645        $line = $caller_line;
646    }
647}
648
649# This is for a rarely used development feature that allows you to compare two
650# versions of the Unicode standard without having to deal with changes caused
651# by the code points introduced in the later version.  You probably also want
652# to use the -annotate option when using this.  Run this program on a unicore
653# containing the starting release you want to compare.  Save that output
654# structure.  Then, switching to a unicore with the ending release, change the
655# 0 in the $string_compare_versions definition just below to a string
656# containing a SINGLE dotted Unicode release number (e.g. "2.1") corresponding
657# to the starting release.  This program will then compile, but throw away all
658# code points introduced after the starting release.  Finally use a diff tool
659# to compare the two directory structures.  They include only the code points
660# common to both releases, and you can see the changes caused just by the
661# underlying release semantic changes.  For versions earlier than 3.2, you
662# must copy a version of DAge.txt into the directory.
663my $string_compare_versions = DEBUG && ""; #  e.g., "2.1";
664my $compare_versions = DEBUG
665                       && $string_compare_versions
666                       && pack "C*", split /\./, $string_compare_versions;
667
668sub uniques {
669    # Returns non-duplicated input values.  From "Perl Best Practices:
670    # Encapsulated Cleverness".  p. 455 in first edition.
671
672    my %seen;
673    # Arguably this breaks encapsulation, if the goal is to permit multiple
674    # distinct objects to stringify to the same value, and be interchangeable.
675    # However, for this program, no two objects stringify identically, and all
676    # lists passed to this function are either objects or strings. So this
677    # doesn't affect correctness, but it does give a couple of percent speedup.
678    no overloading;
679    return grep { ! $seen{$_}++ } @_;
680}
681
682$0 = File::Spec->canonpath($0);
683
684my $make_test_script = 0;      # ? Should we output a test script
685my $make_norm_test_script = 0; # ? Should we output a normalization test script
686my $write_unchanged_files = 0; # ? Should we update the output files even if
687                               #    we don't think they have changed
688my $use_directory = "";        # ? Should we chdir somewhere.
689my $pod_directory;             # input directory to store the pod file.
690my $pod_file = 'perluniprops';
691my $t_path;                     # Path to the .t test file
692my $file_list = 'mktables.lst'; # File to store input and output file names.
693                               # This is used to speed up the build, by not
694                               # executing the main body of the program if
695                               # nothing on the list has changed since the
696                               # previous build
697my $make_list = 1;             # ? Should we write $file_list.  Set to always
698                               # make a list so that when the pumpking is
699                               # preparing a release, s/he won't have to do
700                               # special things
701my $glob_list = 0;             # ? Should we try to include unknown .txt files
702                               # in the input.
703my $output_range_counts = $debugging_build;   # ? Should we include the number
704                                              # of code points in ranges in
705                                              # the output
706my $annotate = 0;              # ? Should character names be in the output
707
708# Verbosity levels; 0 is quiet
709my $NORMAL_VERBOSITY = 1;
710my $PROGRESS = 2;
711my $VERBOSE = 3;
712
713my $verbosity = $NORMAL_VERBOSITY;
714
715# Stored in mktables.lst so that if this program is called with different
716# options, will regenerate even if the files otherwise look like they're
717# up-to-date.
718my $command_line_arguments = join " ", @ARGV;
719
720# Process arguments
721while (@ARGV) {
722    my $arg = shift @ARGV;
723    if ($arg eq '-v') {
724        $verbosity = $VERBOSE;
725    }
726    elsif ($arg eq '-p') {
727        $verbosity = $PROGRESS;
728        $| = 1;     # Flush buffers as we go.
729    }
730    elsif ($arg eq '-q') {
731        $verbosity = 0;
732    }
733    elsif ($arg eq '-w') {
734        # update the files even if they haven't changed
735        $write_unchanged_files = 1;
736    }
737    elsif ($arg eq '-check') {
738        my $this = shift @ARGV;
739        my $ok = shift @ARGV;
740        if ($this ne $ok) {
741            print "Skipping as check params are not the same.\n";
742            exit(0);
743        }
744    }
745    elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
746        -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
747    }
748    elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
749    {
750        $make_test_script = 1;
751    }
752    elsif ($arg eq '-makenormtest')
753    {
754        $make_norm_test_script = 1;
755    }
756    elsif ($arg eq '-makelist') {
757        $make_list = 1;
758    }
759    elsif ($arg eq '-C' && defined ($use_directory = shift)) {
760        -d $use_directory or croak "Unknown directory '$use_directory'";
761    }
762    elsif ($arg eq '-L') {
763
764        # Existence not tested until have chdir'd
765        $file_list = shift;
766    }
767    elsif ($arg eq '-globlist') {
768        $glob_list = 1;
769    }
770    elsif ($arg eq '-c') {
771        $output_range_counts = ! $output_range_counts
772    }
773    elsif ($arg eq '-annotate') {
774        $annotate = 1;
775        $debugging_build = 1;
776        $output_range_counts = 1;
777    }
778    else {
779        my $with_c = 'with';
780        $with_c .= 'out' if $output_range_counts;   # Complements the state
781        croak <<END;
782usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
783          [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
784          [-check A B ]
785  -c          : Output comments $with_c number of code points in ranges
786  -q          : Quiet Mode: Only output serious warnings.
787  -p          : Set verbosity level to normal plus show progress.
788  -v          : Set Verbosity level high:  Show progress and non-serious
789                warnings
790  -w          : Write files regardless
791  -C dir      : Change to this directory before proceeding. All relative paths
792                except those specified by the -P and -T options will be done
793                with respect to this directory.
794  -P dir      : Output $pod_file file to directory 'dir'.
795  -T path     : Create a test script as 'path'; overrides -maketest
796  -L filelist : Use alternate 'filelist' instead of standard one
797  -globlist   : Take as input all non-Test *.txt files in current and sub
798                directories
799  -maketest   : Make test script 'TestProp.pl' in current (or -C directory),
800                overrides -T
801  -makelist   : Rewrite the file list $file_list based on current setup
802  -annotate   : Output an annotation for each character in the table files;
803                useful for debugging mktables, looking at diffs; but is slow
804                and memory intensive
805  -check A B  : Executes $0 only if A and B are the same
806END
807    }
808}
809
810# Stores the most-recently changed file.  If none have changed, can skip the
811# build
812my $most_recent = (stat $0)[9];   # Do this before the chdir!
813
814# Change directories now, because need to read 'version' early.
815if ($use_directory) {
816    if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
817        $pod_directory = File::Spec->rel2abs($pod_directory);
818    }
819    if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
820        $t_path = File::Spec->rel2abs($t_path);
821    }
822    chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
823    if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
824        $pod_directory = File::Spec->abs2rel($pod_directory);
825    }
826    if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
827        $t_path = File::Spec->abs2rel($t_path);
828    }
829}
830
831# Get Unicode version into regular and v-string.  This is done now because
832# various tables below get populated based on it.  These tables are populated
833# here to be near the top of the file, and so easily seeable by those needing
834# to modify things.
835open my $VERSION, "<", "version"
836                    or croak "$0: can't open required file 'version': $!\n";
837my $string_version = <$VERSION>;
838close $VERSION;
839chomp $string_version;
840my $v_version = pack "C*", split /\./, $string_version;        # v string
841
842my $unicode_version = ($compare_versions)
843                      ? (  "$string_compare_versions (using "
844                         . "$string_version rules)")
845                      : $string_version;
846
847# The following are the complete names of properties with property values that
848# are known to not match any code points in some versions of Unicode, but that
849# may change in the future so they should be matchable, hence an empty file is
850# generated for them.
851my @tables_that_may_be_empty;
852push @tables_that_may_be_empty, 'Joining_Type=Left_Joining'
853                                                    if $v_version lt v6.3.0;
854push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
855push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
856push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
857                                                    if $v_version ge v4.1.0;
858push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
859                                                    if $v_version ge v6.0.0;
860push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
861                                                    if $v_version ge v6.1.0;
862push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133'
863                                                    if $v_version ge v6.2.0;
864
865# The lists below are hashes, so the key is the item in the list, and the
866# value is the reason why it is in the list.  This makes generation of
867# documentation easier.
868
869my %why_suppressed;  # No file generated for these.
870
871# Files aren't generated for empty extraneous properties.  This is arguable.
872# Extraneous properties generally come about because a property is no longer
873# used in a newer version of Unicode.  If we generated a file without code
874# points, programs that used to work on that property will still execute
875# without errors.  It just won't ever match (or will always match, with \P{}).
876# This means that the logic is now likely wrong.  I (khw) think its better to
877# find this out by getting an error message.  Just move them to the table
878# above to change this behavior
879my %why_suppress_if_empty_warn_if_not = (
880
881   # It is the only property that has ever officially been removed from the
882   # Standard.  The database never contained any code points for it.
883   'Special_Case_Condition' => 'Obsolete',
884
885   # Apparently never official, but there were code points in some versions of
886   # old-style PropList.txt
887   'Non_Break' => 'Obsolete',
888);
889
890# These would normally go in the warn table just above, but they were changed
891# a long time before this program was written, so warnings about them are
892# moot.
893if ($v_version gt v3.2.0) {
894    push @tables_that_may_be_empty,
895                                'Canonical_Combining_Class=Attached_Below_Left'
896}
897
898# Enum values for to_output_map() method in the Map_Table package. (0 is don't
899# output)
900my $EXTERNAL_MAP = 1;
901my $INTERNAL_MAP = 2;
902my $OUTPUT_ADJUSTED = 3;
903
904# To override computed values for writing the map tables for these properties.
905# The default for enum map tables is to write them out, so that the Unicode
906# .txt files can be removed, but all the data to compute any property value
907# for any code point is available in a more compact form.
908my %global_to_output_map = (
909    # Needed by UCD.pm, but don't want to publicize that it exists, so won't
910    # get stuck supporting it if things change.  Since it is a STRING
911    # property, it normally would be listed in the pod, but INTERNAL_MAP
912    # suppresses that.
913    Unicode_1_Name => $INTERNAL_MAP,
914
915    Present_In => 0,                # Suppress, as easily computed from Age
916    Block => (NON_ASCII_PLATFORM) ? 1 : 0,  # Suppress, as Blocks.txt is
917                                            # retained, but needed for
918                                            # non-ASCII
919
920    # Suppress, as mapping can be found instead from the
921    # Perl_Decomposition_Mapping file
922    Decomposition_Type => 0,
923);
924
925# There are several types of obsolete properties defined by Unicode.  These
926# must be hand-edited for every new Unicode release.
927my %why_deprecated;  # Generates a deprecated warning message if used.
928my %why_stabilized;  # Documentation only
929my %why_obsolete;    # Documentation only
930
931{   # Closure
932    my $simple = 'Perl uses the more complete version';
933    my $unihan = 'Unihan properties are by default not enabled in the Perl core.  Instead use CPAN: Unicode::Unihan';
934
935    my $other_properties = 'other properties';
936    my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
937    my $why_no_expand  = "Deprecated by Unicode.  These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used.  For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
938
939    %why_deprecated = (
940        'Grapheme_Link' => 'Deprecated by Unicode:  Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
941        'Jamo_Short_Name' => $contributory,
942        'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
943        'Other_Alphabetic' => $contributory,
944        'Other_Default_Ignorable_Code_Point' => $contributory,
945        'Other_Grapheme_Extend' => $contributory,
946        'Other_ID_Continue' => $contributory,
947        'Other_ID_Start' => $contributory,
948        'Other_Lowercase' => $contributory,
949        'Other_Math' => $contributory,
950        'Other_Uppercase' => $contributory,
951        'Expands_On_NFC' => $why_no_expand,
952        'Expands_On_NFD' => $why_no_expand,
953        'Expands_On_NFKC' => $why_no_expand,
954        'Expands_On_NFKD' => $why_no_expand,
955    );
956
957    %why_suppressed = (
958        # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
959        # contains the same information, but without the algorithmically
960        # determinable Hangul syllables'.  This file is not published, so it's
961        # existence is not noted in the comment.
962        'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::',
963
964        # Don't suppress ISO_Comment, as otherwise special handling is needed
965        # to differentiate between it and gc=c, which can be written as 'isc',
966        # which is the same characters as ISO_Comment's short name.
967
968        'Name' => "Accessible via \\N{...} or 'use charnames;' or charprop() or prop_invmap() in Unicode::UCD::",
969
970        'Simple_Case_Folding' => "$simple.  Can access this through casefold(), charprop(), or prop_invmap() in Unicode::UCD",
971        'Simple_Lowercase_Mapping' => "$simple.  Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
972        'Simple_Titlecase_Mapping' => "$simple.  Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
973        'Simple_Uppercase_Mapping' => "$simple.  Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
974
975        FC_NFKC_Closure => 'Deprecated by Unicode, and supplanted in usage by NFKC_Casefold; otherwise not useful',
976    );
977
978    foreach my $property (
979
980            # The following are suppressed because they were made contributory
981            # or deprecated by Unicode before Perl ever thought about
982            # supporting them.
983            'Jamo_Short_Name',
984            'Grapheme_Link',
985            'Expands_On_NFC',
986            'Expands_On_NFD',
987            'Expands_On_NFKC',
988            'Expands_On_NFKD',
989
990            # The following are suppressed because they have been marked
991            # as deprecated for a sufficient amount of time
992            'Other_Alphabetic',
993            'Other_Default_Ignorable_Code_Point',
994            'Other_Grapheme_Extend',
995            'Other_ID_Continue',
996            'Other_ID_Start',
997            'Other_Lowercase',
998            'Other_Math',
999            'Other_Uppercase',
1000    ) {
1001        $why_suppressed{$property} = $why_deprecated{$property};
1002    }
1003
1004    # Customize the message for all the 'Other_' properties
1005    foreach my $property (keys %why_deprecated) {
1006        next if (my $main_property = $property) !~ s/^Other_//;
1007        $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
1008    }
1009}
1010
1011if ($write_Unicode_deprecated_tables) {
1012    foreach my $property (keys %why_suppressed) {
1013        delete $why_suppressed{$property} if $property =~
1014                                                    / ^ Other | Grapheme /x;
1015    }
1016}
1017
1018if ($v_version ge 4.0.0) {
1019    $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
1020    if ($v_version ge 6.0.0) {
1021        $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
1022    }
1023}
1024if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
1025    $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
1026    if ($v_version ge 6.0.0) {
1027        $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed';
1028    }
1029}
1030
1031# Probably obsolete forever
1032if ($v_version ge v4.1.0) {
1033    $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete.  All code points previously matched by this have been moved to "Script=Common".';
1034}
1035if ($v_version ge v6.0.0) {
1036    $why_suppressed{'Script=Katakana_Or_Hiragana'} .= '  Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana" (or both)';
1037    $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
1038}
1039
1040# This program can create files for enumerated-like properties, such as
1041# 'Numeric_Type'.  This file would be the same format as for a string
1042# property, with a mapping from code point to its value, so you could look up,
1043# for example, the script a code point is in.  But no one so far wants this
1044# mapping, or they have found another way to get it since this is a new
1045# feature.  So no file is generated except if it is in this list.
1046my @output_mapped_properties = split "\n", <<END;
1047END
1048
1049# If you want more Unihan properties than the default, you need to add them to
1050# these arrays.  Depending on the property type, @missing lines might have to
1051# be added to the second array.  A sample entry would be (including the '#'):
1052# @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
1053my @cjk_properties = split "\n", <<'END';
1054END
1055my @cjk_property_values = split "\n", <<'END';
1056END
1057
1058# The input files don't list every code point.  Those not listed are to be
1059# defaulted to some value.  Below are hard-coded what those values are for
1060# non-binary properties as of 5.1.  Starting in 5.0, there are
1061# machine-parsable comment lines in the files that give the defaults; so this
1062# list shouldn't have to be extended.  The claim is that all missing entries
1063# for binary properties will default to 'N'.  Unicode tried to change that in
1064# 5.2, but the beta period produced enough protest that they backed off.
1065#
1066# The defaults for the fields that appear in UnicodeData.txt in this hash must
1067# be in the form that it expects.  The others may be synonyms.
1068my $CODE_POINT = '<code point>';
1069my %default_mapping = (
1070    Age => "Unassigned",
1071    # Bidi_Class => Complicated; set in code
1072    Bidi_Mirroring_Glyph => "",
1073    Block => 'No_Block',
1074    Canonical_Combining_Class => 0,
1075    Case_Folding => $CODE_POINT,
1076    Decomposition_Mapping => $CODE_POINT,
1077    Decomposition_Type => 'None',
1078    East_Asian_Width => "Neutral",
1079    FC_NFKC_Closure => $CODE_POINT,
1080    General_Category => ($v_version le 6.3.0) ? 'Cn' : 'Unassigned',
1081    Grapheme_Cluster_Break => 'Other',
1082    Hangul_Syllable_Type => 'NA',
1083    ISO_Comment => "",
1084    Jamo_Short_Name => "",
1085    Joining_Group => "No_Joining_Group",
1086    # Joining_Type => Complicated; set in code
1087    kIICore => 'N',   #                       Is converted to binary
1088    #Line_Break => Complicated; set in code
1089    Lowercase_Mapping => $CODE_POINT,
1090    Name => "",
1091    Name_Alias => "",
1092    NFC_QC => 'Yes',
1093    NFD_QC => 'Yes',
1094    NFKC_QC => 'Yes',
1095    NFKD_QC => 'Yes',
1096    Numeric_Type => 'None',
1097    Numeric_Value => 'NaN',
1098    Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1099    Sentence_Break => 'Other',
1100    Simple_Case_Folding => $CODE_POINT,
1101    Simple_Lowercase_Mapping => $CODE_POINT,
1102    Simple_Titlecase_Mapping => $CODE_POINT,
1103    Simple_Uppercase_Mapping => $CODE_POINT,
1104    Titlecase_Mapping => $CODE_POINT,
1105    Unicode_1_Name => "",
1106    Unicode_Radical_Stroke => "",
1107    Uppercase_Mapping => $CODE_POINT,
1108    Word_Break => 'Other',
1109);
1110
1111### End of externally interesting definitions, except for @input_file_objects
1112
1113my $HEADER=<<"EOF";
1114# !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
1115# This file is machine-generated by $0 from the Unicode
1116# database, Version $unicode_version.  Any changes made here will be lost!
1117EOF
1118
1119my $INTERNAL_ONLY_HEADER = <<"EOF";
1120
1121# !!!!!!!   INTERNAL PERL USE ONLY   !!!!!!!
1122# This file is for internal use by core Perl only.  The format and even the
1123# name or existence of this file are subject to change without notice.  Don't
1124# use it directly.  Use Unicode::UCD to access the Unicode character data
1125# base.
1126EOF
1127
1128my $DEVELOPMENT_ONLY=<<"EOF";
1129# !!!!!!!   DEVELOPMENT USE ONLY   !!!!!!!
1130# This file contains information artificially constrained to code points
1131# present in Unicode release $string_compare_versions.
1132# IT CANNOT BE RELIED ON.  It is for use during development only and should
1133# not be used for production.
1134
1135EOF
1136
1137my $MAX_UNICODE_CODEPOINT_STRING = ($v_version ge v2.0.0)
1138                                   ? "10FFFF"
1139                                   : "FFFF";
1140my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
1141my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
1142
1143# We work with above-Unicode code points, up to IV_MAX, but we may want to use
1144# sentinels above that number.  Therefore for internal use, we use a much
1145# smaller number, translating it to IV_MAX only for output.  The exact number
1146# is immaterial (all above-Unicode code points are treated exactly the same),
1147# but the algorithm requires it to be at least
1148# 2 * $MAX_UNICODE_CODEPOINTS + 1
1149my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8;
1150my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1;
1151my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT);
1152
1153my $MAX_PLATFORM_CODEPOINT = ~0 >> 1;
1154
1155# Matches legal code point.  4-6 hex numbers, If there are 6, the first
1156# two must be 10; if there are 5, the first must not be a 0.  Written this way
1157# to decrease backtracking.  The first regex allows the code point to be at
1158# the end of a word, but to work properly, the word shouldn't end with a valid
1159# hex character.  The second one won't match a code point at the end of a
1160# word, and doesn't have the run-on issue
1161my $run_on_code_point_re =
1162            qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1163my $code_point_re = qr/\b$run_on_code_point_re/;
1164
1165# This matches the beginning of the line in the Unicode DB files that give the
1166# defaults for code points not listed (i.e., missing) in the file.  The code
1167# depends on this ending with a semi-colon, so it can assume it is a valid
1168# field when the line is split() by semi-colons
1169my $missing_defaults_prefix = qr/^#\s+\@missing:\s+0000\.\.10FFFF\s*;/;
1170
1171# Property types.  Unicode has more types, but these are sufficient for our
1172# purposes.
1173my $UNKNOWN = -1;   # initialized to illegal value
1174my $NON_STRING = 1; # Either binary or enum
1175my $BINARY = 2;
1176my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal
1177                       # tables, additional true and false tables are
1178                       # generated so that false is anything matching the
1179                       # default value, and true is everything else.
1180my $ENUM = 4;       # Include catalog
1181my $STRING = 5;     # Anything else: string or misc
1182
1183# Some input files have lines that give default values for code points not
1184# contained in the file.  Sometimes these should be ignored.
1185my $NO_DEFAULTS = 0;        # Must evaluate to false
1186my $NOT_IGNORED = 1;
1187my $IGNORED = 2;
1188
1189# Range types.  Each range has a type.  Most ranges are type 0, for normal,
1190# and will appear in the main body of the tables in the output files, but
1191# there are other types of ranges as well, listed below, that are specially
1192# handled.   There are pseudo-types as well that will never be stored as a
1193# type, but will affect the calculation of the type.
1194
1195# 0 is for normal, non-specials
1196my $MULTI_CP = 1;           # Sequence of more than code point
1197my $HANGUL_SYLLABLE = 2;
1198my $CP_IN_NAME = 3;         # The NAME contains the code point appended to it.
1199my $NULL = 4;               # The map is to the null string; utf8.c can't
1200                            # handle these, nor is there an accepted syntax
1201                            # for them in \p{} constructs
1202my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
1203                             # otherwise be $MULTI_CP type are instead type 0
1204
1205# process_generic_property_file() can accept certain overrides in its input.
1206# Each of these must begin AND end with $CMD_DELIM.
1207my $CMD_DELIM = "\a";
1208my $REPLACE_CMD = 'replace';    # Override the Replace
1209my $MAP_TYPE_CMD = 'map_type';  # Override the Type
1210
1211my $NO = 0;
1212my $YES = 1;
1213
1214# Values for the Replace argument to add_range.
1215# $NO                      # Don't replace; add only the code points not
1216                           # already present.
1217my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1218                           # the comments at the subroutine definition.
1219my $UNCONDITIONALLY = 2;   # Replace without conditions.
1220my $MULTIPLE_BEFORE = 4;   # Don't replace, but add a duplicate record if
1221                           # already there
1222my $MULTIPLE_AFTER = 5;    # Don't replace, but add a duplicate record if
1223                           # already there
1224my $CROAK = 6;             # Die with an error if is already there
1225
1226# Flags to give property statuses.  The phrases are to remind maintainers that
1227# if the flag is changed, the indefinite article referring to it in the
1228# documentation may need to be as well.
1229my $NORMAL = "";
1230my $DEPRECATED = 'D';
1231my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1232my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1233my $DISCOURAGED = 'X';
1234my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1235my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1236my $STRICTER = 'T';
1237my $a_bold_stricter = "a 'B<$STRICTER>'";
1238my $A_bold_stricter = "A 'B<$STRICTER>'";
1239my $STABILIZED = 'S';
1240my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1241my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1242my $OBSOLETE = 'O';
1243my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1244my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1245
1246# Aliases can also have an extra status:
1247my $INTERNAL_ALIAS = 'P';
1248
1249my %status_past_participles = (
1250    $DISCOURAGED => 'discouraged',
1251    $STABILIZED => 'stabilized',
1252    $OBSOLETE => 'obsolete',
1253    $DEPRECATED => 'deprecated',
1254    $INTERNAL_ALIAS => 'reserved for Perl core internal use only',
1255);
1256
1257# Table fates.  These are somewhat ordered, so that fates < $MAP_PROXIED should be
1258# externally documented.
1259my $ORDINARY = 0;       # The normal fate.
1260my $MAP_PROXIED = 1;    # The map table for the property isn't written out,
1261                        # but there is a file written that can be used to
1262                        # reconstruct this table
1263my $INTERNAL_ONLY = 2;  # The file for this table is written out, but it is
1264                        # for Perl's internal use only
1265my $LEGACY_ONLY = 3;    # Like $INTERNAL_ONLY, but not actually used by Perl.
1266                        # Is for backwards compatibility for applications that
1267                        # read the file directly, so it's format is
1268                        # unchangeable.
1269my $SUPPRESSED = 4;     # The file for this table is not written out, and as a
1270                        # result, we don't bother to do many computations on
1271                        # it.
1272my $PLACEHOLDER = 5;    # Like $SUPPRESSED, but we go through all the
1273                        # computations anyway, as the values are needed for
1274                        # things to work.  This happens when we have Perl
1275                        # extensions that depend on Unicode tables that
1276                        # wouldn't normally be in a given Unicode version.
1277
1278# The format of the values of the tables:
1279my $EMPTY_FORMAT = "";
1280my $BINARY_FORMAT = 'b';
1281my $DECIMAL_FORMAT = 'd';
1282my $FLOAT_FORMAT = 'f';
1283my $INTEGER_FORMAT = 'i';
1284my $HEX_FORMAT = 'x';
1285my $RATIONAL_FORMAT = 'r';
1286my $STRING_FORMAT = 's';
1287my $ADJUST_FORMAT = 'a';
1288my $HEX_ADJUST_FORMAT = 'ax';
1289my $DECOMP_STRING_FORMAT = 'c';
1290my $STRING_WHITE_SPACE_LIST = 'sw';
1291
1292my %map_table_formats = (
1293    $BINARY_FORMAT => 'binary',
1294    $DECIMAL_FORMAT => 'single decimal digit',
1295    $FLOAT_FORMAT => 'floating point number',
1296    $INTEGER_FORMAT => 'integer',
1297    $HEX_FORMAT => 'non-negative hex whole number; a code point',
1298    $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1299    $STRING_FORMAT => 'string',
1300    $ADJUST_FORMAT => 'some entries need adjustment',
1301    $HEX_ADJUST_FORMAT => 'mapped value in hex; some entries need adjustment',
1302    $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
1303    $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
1304);
1305
1306# Unicode didn't put such derived files in a separate directory at first.
1307my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1308my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1309my $AUXILIARY = 'auxiliary';
1310
1311# Hashes and arrays that will eventually go into Heavy.pl for the use of
1312# utf8_heavy.pl and into UCD.pl for the use of UCD.pm
1313my %loose_to_file_of;       # loosely maps table names to their respective
1314                            # files
1315my %stricter_to_file_of;    # same; but for stricter mapping.
1316my %loose_property_to_file_of; # Maps a loose property name to its map file
1317my %strict_property_to_file_of; # Same, but strict
1318my @inline_definitions = "V0"; # Each element gives a definition of a unique
1319                            # inversion list.  When a definition is inlined,
1320                            # its value in the hash it's in (one of the two
1321                            # defined just above) will include an index into
1322                            # this array.  The 0th element is initialized to
1323                            # the definition for a zero length inversion list
1324my %file_to_swash_name;     # Maps the file name to its corresponding key name
1325                            # in the hash %utf8::SwashInfo
1326my %nv_floating_to_rational; # maps numeric values floating point numbers to
1327                             # their rational equivalent
1328my %loose_property_name_of; # Loosely maps (non_string) property names to
1329                            # standard form
1330my %strict_property_name_of; # Strictly maps (non_string) property names to
1331                            # standard form
1332my %string_property_loose_to_name; # Same, for string properties.
1333my %loose_defaults;         # keys are of form "prop=value", where 'prop' is
1334                            # the property name in standard loose form, and
1335                            # 'value' is the default value for that property,
1336                            # also in standard loose form.
1337my %loose_to_standard_value; # loosely maps table names to the canonical
1338                            # alias for them
1339my %ambiguous_names;        # keys are alias names (in standard form) that
1340                            # have more than one possible meaning.
1341my %combination_property;   # keys are alias names (in standard form) that
1342                            # have both a map table, and a binary one that
1343                            # yields true for all non-null maps.
1344my %prop_aliases;           # Keys are standard property name; values are each
1345                            # one's aliases
1346my %prop_value_aliases;     # Keys of top level are standard property name;
1347                            # values are keys to another hash,  Each one is
1348                            # one of the property's values, in standard form.
1349                            # The values are that prop-val's aliases.
1350my %skipped_files;          # List of files that we skip
1351my %ucd_pod;    # Holds entries that will go into the UCD section of the pod
1352
1353# Most properties are immune to caseless matching, otherwise you would get
1354# nonsensical results, as properties are a function of a code point, not
1355# everything that is caselessly equivalent to that code point.  For example,
1356# Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1357# be true because 's' and 'S' are equivalent caselessly.  However,
1358# traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1359# extend that concept to those very few properties that are like this.  Each
1360# such property will match the full range caselessly.  They are hard-coded in
1361# the program; it's not worth trying to make it general as it's extremely
1362# unlikely that they will ever change.
1363my %caseless_equivalent_to;
1364
1365# This is the range of characters that were in Release 1 of Unicode, and
1366# removed in Release 2 (replaced with the current Hangul syllables starting at
1367# U+AC00).  The range was reused starting in Release 3 for other purposes.
1368my $FIRST_REMOVED_HANGUL_SYLLABLE = 0x3400;
1369my $FINAL_REMOVED_HANGUL_SYLLABLE = 0x4DFF;
1370
1371# These constants names and values were taken from the Unicode standard,
1372# version 5.1, section 3.12.  They are used in conjunction with Hangul
1373# syllables.  The '_string' versions are so generated tables can retain the
1374# hex format, which is the more familiar value
1375my $SBase_string = "0xAC00";
1376my $SBase = CORE::hex $SBase_string;
1377my $LBase_string = "0x1100";
1378my $LBase = CORE::hex $LBase_string;
1379my $VBase_string = "0x1161";
1380my $VBase = CORE::hex $VBase_string;
1381my $TBase_string = "0x11A7";
1382my $TBase = CORE::hex $TBase_string;
1383my $SCount = 11172;
1384my $LCount = 19;
1385my $VCount = 21;
1386my $TCount = 28;
1387my $NCount = $VCount * $TCount;
1388
1389# For Hangul syllables;  These store the numbers from Jamo.txt in conjunction
1390# with the above published constants.
1391my %Jamo;
1392my %Jamo_L;     # Leading consonants
1393my %Jamo_V;     # Vowels
1394my %Jamo_T;     # Trailing consonants
1395
1396# For code points whose name contains its ordinal as a '-ABCD' suffix.
1397# The key is the base name of the code point, and the value is an
1398# array giving all the ranges that use this base name.  Each range
1399# is actually a hash giving the 'low' and 'high' values of it.
1400my %names_ending_in_code_point;
1401my %loose_names_ending_in_code_point;   # Same as above, but has blanks, dashes
1402                                        # removed from the names
1403# Inverse mapping.  The list of ranges that have these kinds of
1404# names.  Each element contains the low, high, and base names in an
1405# anonymous hash.
1406my @code_points_ending_in_code_point;
1407
1408# To hold Unicode's normalization test suite
1409my @normalization_tests;
1410
1411# Boolean: does this Unicode version have the hangul syllables, and are we
1412# writing out a table for them?
1413my $has_hangul_syllables = 0;
1414
1415# Does this Unicode version have code points whose names end in their
1416# respective code points, and are we writing out a table for them?  0 for no;
1417# otherwise points to first property that a table is needed for them, so that
1418# if multiple tables are needed, we don't create duplicates
1419my $needing_code_points_ending_in_code_point = 0;
1420
1421my @backslash_X_tests;     # List of tests read in for testing \X
1422my @LB_tests;              # List of tests read in for testing \b{lb}
1423my @SB_tests;              # List of tests read in for testing \b{sb}
1424my @WB_tests;              # List of tests read in for testing \b{wb}
1425my @unhandled_properties;  # Will contain a list of properties found in
1426                           # the input that we didn't process.
1427my @match_properties;      # Properties that have match tables, to be
1428                           # listed in the pod
1429my @map_properties;        # Properties that get map files written
1430my @named_sequences;       # NamedSequences.txt contents.
1431my %potential_files;       # Generated list of all .txt files in the directory
1432                           # structure so we can warn if something is being
1433                           # ignored.
1434my @missing_early_files;   # Generated list of absent files that we need to
1435                           # proceed in compiling this early Unicode version
1436my @files_actually_output; # List of files we generated.
1437my @more_Names;            # Some code point names are compound; this is used
1438                           # to store the extra components of them.
1439my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1440                           # the minimum before we consider it equivalent to a
1441                           # candidate rational
1442my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1443
1444# These store references to certain commonly used property objects
1445my $age;
1446my $ccc;
1447my $gc;
1448my $perl;
1449my $block;
1450my $perl_charname;
1451my $print;
1452my $All;
1453my $Assigned;   # All assigned characters in this Unicode release
1454my $DI;         # Default_Ignorable_Code_Point property
1455my $NChar;      # Noncharacter_Code_Point property
1456my $script;
1457my $scx;        # Script_Extensions property
1458
1459# Are there conflicting names because of beginning with 'In_', or 'Is_'
1460my $has_In_conflicts = 0;
1461my $has_Is_conflicts = 0;
1462
1463sub internal_file_to_platform ($) {
1464    # Convert our file paths which have '/' separators to those of the
1465    # platform.
1466
1467    my $file = shift;
1468    return undef unless defined $file;
1469
1470    return File::Spec->join(split '/', $file);
1471}
1472
1473sub file_exists ($) {   # platform independent '-e'.  This program internally
1474                        # uses slash as a path separator.
1475    my $file = shift;
1476    return 0 if ! defined $file;
1477    return -e internal_file_to_platform($file);
1478}
1479
1480sub objaddr($) {
1481    # Returns the address of the blessed input object.
1482    # It doesn't check for blessedness because that would do a string eval
1483    # every call, and the program is structured so that this is never called
1484    # for a non-blessed object.
1485
1486    no overloading; # If overloaded, numifying below won't work.
1487
1488    # Numifying a ref gives its address.
1489    return pack 'J', $_[0];
1490}
1491
1492# These are used only if $annotate is true.
1493# The entire range of Unicode characters is examined to populate these
1494# after all the input has been processed.  But most can be skipped, as they
1495# have the same descriptive phrases, such as being unassigned
1496my @viacode;            # Contains the 1 million character names
1497my @age;                # And their ages ("" if none)
1498my @printable;          # boolean: And are those characters printable?
1499my @annotate_char_type; # Contains a type of those characters, specifically
1500                        # for the purposes of annotation.
1501my $annotate_ranges;    # A map of ranges of code points that have the same
1502                        # name for the purposes of annotation.  They map to the
1503                        # upper edge of the range, so that the end point can
1504                        # be immediately found.  This is used to skip ahead to
1505                        # the end of a range, and avoid processing each
1506                        # individual code point in it.
1507my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1508                                   # characters, but excluding those which are
1509                                   # also noncharacter code points
1510
1511# The annotation types are an extension of the regular range types, though
1512# some of the latter are folded into one.  Make the new types negative to
1513# avoid conflicting with the regular types
1514my $SURROGATE_TYPE = -1;
1515my $UNASSIGNED_TYPE = -2;
1516my $PRIVATE_USE_TYPE = -3;
1517my $NONCHARACTER_TYPE = -4;
1518my $CONTROL_TYPE = -5;
1519my $ABOVE_UNICODE_TYPE = -6;
1520my $UNKNOWN_TYPE = -7;  # Used only if there is a bug in this program
1521
1522sub populate_char_info ($) {
1523    # Used only with the $annotate option.  Populates the arrays with the
1524    # input code point's info that are needed for outputting more detailed
1525    # comments.  If calling context wants a return, it is the end point of
1526    # any contiguous range of characters that share essentially the same info
1527
1528    my $i = shift;
1529    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1530
1531    $viacode[$i] = $perl_charname->value_of($i) || "";
1532    $age[$i] = (defined $age)
1533               ? (($age->value_of($i) =~ / ^ \d+ \. \d+ $ /x)
1534                  ? $age->value_of($i)
1535                  : "")
1536               : "";
1537
1538    # A character is generally printable if Unicode says it is,
1539    # but below we make sure that most Unicode general category 'C' types
1540    # aren't.
1541    $printable[$i] = $print->contains($i);
1542
1543    # But the characters in this range were removed in v2.0 and replaced by
1544    # different ones later.  Modern fonts will be for the replacement
1545    # characters, so suppress printing them.
1546    if (($v_version lt v2.0
1547         || ($compare_versions && $compare_versions lt v2.0))
1548        && (   $i >= $FIRST_REMOVED_HANGUL_SYLLABLE
1549            && $i <= $FINAL_REMOVED_HANGUL_SYLLABLE))
1550    {
1551        $printable[$i] = 0;
1552    }
1553
1554    $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1555
1556    # Only these two regular types are treated specially for annotations
1557    # purposes
1558    $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1559                                && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1560
1561    # Give a generic name to all code points that don't have a real name.
1562    # We output ranges, if applicable, for these.  Also calculate the end
1563    # point of the range.
1564    my $end;
1565    if (! $viacode[$i]) {
1566        if ($i > $MAX_UNICODE_CODEPOINT) {
1567            $viacode[$i] = 'Above-Unicode';
1568            $annotate_char_type[$i] = $ABOVE_UNICODE_TYPE;
1569            $printable[$i] = 0;
1570            $end = $MAX_WORKING_CODEPOINT;
1571        }
1572        elsif ($gc-> table('Private_use')->contains($i)) {
1573            $viacode[$i] = 'Private Use';
1574            $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1575            $printable[$i] = 0;
1576            $end = $gc->table('Private_Use')->containing_range($i)->end;
1577        }
1578        elsif ($NChar->contains($i)) {
1579            $viacode[$i] = 'Noncharacter';
1580            $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1581            $printable[$i] = 0;
1582            $end = $NChar->containing_range($i)->end;
1583        }
1584        elsif ($gc-> table('Control')->contains($i)) {
1585            my $name_ref = property_ref('Name_Alias');
1586            $name_ref = property_ref('Unicode_1_Name') if ! defined $name_ref;
1587            $viacode[$i] = (defined $name_ref)
1588                           ? $name_ref->value_of($i)
1589                           : 'Control';
1590            $annotate_char_type[$i] = $CONTROL_TYPE;
1591            $printable[$i] = 0;
1592        }
1593        elsif ($gc-> table('Unassigned')->contains($i)) {
1594            $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1595            $printable[$i] = 0;
1596            $viacode[$i] = 'Unassigned';
1597
1598            if (defined $block) { # No blocks in earliest releases
1599                $viacode[$i] .= ', block=' . $block-> value_of($i);
1600                $end = $gc-> table('Unassigned')->containing_range($i)->end;
1601
1602                # Because we name the unassigned by the blocks they are in, it
1603                # can't go past the end of that block, and it also can't go
1604                # past the unassigned range it is in.  The special table makes
1605                # sure that the non-characters, which are unassigned, are
1606                # separated out.
1607                $end = min($block->containing_range($i)->end,
1608                           $unassigned_sans_noncharacters->
1609                                                    containing_range($i)->end);
1610            }
1611            else {
1612                $end = $i + 1;
1613                while ($unassigned_sans_noncharacters->contains($end)) {
1614                    $end++;
1615                }
1616                $end--;
1617            }
1618        }
1619        elsif ($perl->table('_Perl_Surrogate')->contains($i)) {
1620            $viacode[$i] = 'Surrogate';
1621            $annotate_char_type[$i] = $SURROGATE_TYPE;
1622            $printable[$i] = 0;
1623            $end = $gc->table('Surrogate')->containing_range($i)->end;
1624        }
1625        else {
1626            Carp::my_carp_bug("Can't figure out how to annotate "
1627                              . sprintf("U+%04X", $i)
1628                              . ".  Proceeding anyway.");
1629            $viacode[$i] = 'UNKNOWN';
1630            $annotate_char_type[$i] = $UNKNOWN_TYPE;
1631            $printable[$i] = 0;
1632        }
1633    }
1634
1635    # Here, has a name, but if it's one in which the code point number is
1636    # appended to the name, do that.
1637    elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1638        $viacode[$i] .= sprintf("-%04X", $i);
1639
1640        my $limit = $perl_charname->containing_range($i)->end;
1641        if (defined $age) {
1642            # Do all these as groups of the same age, instead of individually,
1643            # because their names are so meaningless, and there are typically
1644            # large quantities of them.
1645            $end = $i + 1;
1646            while ($end <= $limit && $age->value_of($end) == $age[$i]) {
1647                $end++;
1648            }
1649            $end--;
1650        }
1651        else {
1652            $end = $limit;
1653        }
1654    }
1655
1656    # And here, has a name, but if it's a hangul syllable one, replace it with
1657    # the correct name from the Unicode algorithm
1658    elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1659        use integer;
1660        my $SIndex = $i - $SBase;
1661        my $L = $LBase + $SIndex / $NCount;
1662        my $V = $VBase + ($SIndex % $NCount) / $TCount;
1663        my $T = $TBase + $SIndex % $TCount;
1664        $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1665        $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1666        $end = $perl_charname->containing_range($i)->end;
1667    }
1668
1669    return if ! defined wantarray;
1670    return $i if ! defined $end;    # If not a range, return the input
1671
1672    # Save this whole range so can find the end point quickly
1673    $annotate_ranges->add_map($i, $end, $end);
1674
1675    return $end;
1676}
1677
1678# Commented code below should work on Perl 5.8.
1679## This 'require' doesn't necessarily work in miniperl, and even if it does,
1680## the native perl version of it (which is what would operate under miniperl)
1681## is extremely slow, as it does a string eval every call.
1682#my $has_fast_scalar_util = $^X !~ /miniperl/
1683#                            && defined eval "require Scalar::Util";
1684#
1685#sub objaddr($) {
1686#    # Returns the address of the blessed input object.  Uses the XS version if
1687#    # available.  It doesn't check for blessedness because that would do a
1688#    # string eval every call, and the program is structured so that this is
1689#    # never called for a non-blessed object.
1690#
1691#    return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1692#
1693#    # Check at least that is a ref.
1694#    my $pkg = ref($_[0]) or return undef;
1695#
1696#    # Change to a fake package to defeat any overloaded stringify
1697#    bless $_[0], 'main::Fake';
1698#
1699#    # Numifying a ref gives its address.
1700#    my $addr = pack 'J', $_[0];
1701#
1702#    # Return to original class
1703#    bless $_[0], $pkg;
1704#    return $addr;
1705#}
1706
1707sub max ($$) {
1708    my $a = shift;
1709    my $b = shift;
1710    return $a if $a >= $b;
1711    return $b;
1712}
1713
1714sub min ($$) {
1715    my $a = shift;
1716    my $b = shift;
1717    return $a if $a <= $b;
1718    return $b;
1719}
1720
1721sub clarify_number ($) {
1722    # This returns the input number with underscores inserted every 3 digits
1723    # in large (5 digits or more) numbers.  Input must be entirely digits, not
1724    # checked.
1725
1726    my $number = shift;
1727    my $pos = length($number) - 3;
1728    return $number if $pos <= 1;
1729    while ($pos > 0) {
1730        substr($number, $pos, 0) = '_';
1731        $pos -= 3;
1732    }
1733    return $number;
1734}
1735
1736sub clarify_code_point_count ($) {
1737    # This is like clarify_number(), but the input is assumed to be a count of
1738    # code points, rather than a generic number.
1739
1740    my $append = "";
1741
1742    my $number = shift;
1743    if ($number > $MAX_UNICODE_CODEPOINTS) {
1744        $number -= ($MAX_WORKING_CODEPOINTS - $MAX_UNICODE_CODEPOINTS);
1745        return "All above-Unicode code points" if $number == 0;
1746        $append = " + all above-Unicode code points";
1747    }
1748    return clarify_number($number) . $append;
1749}
1750
1751package Carp;
1752
1753# These routines give a uniform treatment of messages in this program.  They
1754# are placed in the Carp package to cause the stack trace to not include them,
1755# although an alternative would be to use another package and set @CARP_NOT
1756# for it.
1757
1758our $Verbose = 1 if main::DEBUG;  # Useful info when debugging
1759
1760# This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1761# and overload trying to load Scalar:Util under miniperl.  See
1762# http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1763undef $overload::VERSION;
1764
1765sub my_carp {
1766    my $message = shift || "";
1767    my $nofold = shift || 0;
1768
1769    if ($message) {
1770        $message = main::join_lines($message);
1771        $message =~ s/^$0: *//;     # Remove initial program name
1772        $message =~ s/[.;,]+$//;    # Remove certain ending punctuation
1773        $message = "\n$0: $message;";
1774
1775        # Fold the message with program name, semi-colon end punctuation
1776        # (which looks good with the message that carp appends to it), and a
1777        # hanging indent for continuation lines.
1778        $message = main::simple_fold($message, "", 4) unless $nofold;
1779        $message =~ s/\n$//;        # Remove the trailing nl so what carp
1780                                    # appends is to the same line
1781    }
1782
1783    return $message if defined wantarray;   # If a caller just wants the msg
1784
1785    carp $message;
1786    return;
1787}
1788
1789sub my_carp_bug {
1790    # This is called when it is clear that the problem is caused by a bug in
1791    # this program.
1792
1793    my $message = shift;
1794    $message =~ s/^$0: *//;
1795    $message = my_carp("Bug in $0.  Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1796    carp $message;
1797    return;
1798}
1799
1800sub carp_too_few_args {
1801    if (@_ != 2) {
1802        my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'.  No action taken.");
1803        return;
1804    }
1805
1806    my $args_ref = shift;
1807    my $count = shift;
1808
1809    my_carp_bug("Need at least $count arguments to "
1810        . (caller 1)[3]
1811        . ".  Instead got: '"
1812        . join ', ', @$args_ref
1813        . "'.  No action taken.");
1814    return;
1815}
1816
1817sub carp_extra_args {
1818    my $args_ref = shift;
1819    my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . ");  Extras ignored.") if @_;
1820
1821    unless (ref $args_ref) {
1822        my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref.  Not checking arguments.");
1823        return;
1824    }
1825    my ($package, $file, $line) = caller;
1826    my $subroutine = (caller 1)[3];
1827
1828    my $list;
1829    if (ref $args_ref eq 'HASH') {
1830        foreach my $key (keys %$args_ref) {
1831            $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
1832        }
1833        $list = join ', ', each %{$args_ref};
1834    }
1835    elsif (ref $args_ref eq 'ARRAY') {
1836        foreach my $arg (@$args_ref) {
1837            $arg = $UNDEF unless defined $arg;
1838        }
1839        $list = join ', ', @$args_ref;
1840    }
1841    else {
1842        my_carp_bug("Can't cope with ref "
1843                . ref($args_ref)
1844                . " . argument to 'carp_extra_args'.  Not checking arguments.");
1845        return;
1846    }
1847
1848    my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine.  Skipped.");
1849    return;
1850}
1851
1852package main;
1853
1854{ # Closure
1855
1856    # This program uses the inside-out method for objects, as recommended in
1857    # "Perl Best Practices".  (This is the best solution still, since this has
1858    # to run under miniperl.)  This closure aids in generating those.  There
1859    # are two routines.  setup_package() is called once per package to set
1860    # things up, and then set_access() is called for each hash representing a
1861    # field in the object.  These routines arrange for the object to be
1862    # properly destroyed when no longer used, and for standard accessor
1863    # functions to be generated.  If you need more complex accessors, just
1864    # write your own and leave those accesses out of the call to set_access().
1865    # More details below.
1866
1867    my %constructor_fields; # fields that are to be used in constructors; see
1868                            # below
1869
1870    # The values of this hash will be the package names as keys to other
1871    # hashes containing the name of each field in the package as keys, and
1872    # references to their respective hashes as values.
1873    my %package_fields;
1874
1875    sub setup_package {
1876        # Sets up the package, creating standard DESTROY and dump methods
1877        # (unless already defined).  The dump method is used in debugging by
1878        # simple_dumper().
1879        # The optional parameters are:
1880        #   a)  a reference to a hash, that gets populated by later
1881        #       set_access() calls with one of the accesses being
1882        #       'constructor'.  The caller can then refer to this, but it is
1883        #       not otherwise used by these two routines.
1884        #   b)  a reference to a callback routine to call during destruction
1885        #       of the object, before any fields are actually destroyed
1886
1887        my %args = @_;
1888        my $constructor_ref = delete $args{'Constructor_Fields'};
1889        my $destroy_callback = delete $args{'Destroy_Callback'};
1890        Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1891
1892        my %fields;
1893        my $package = (caller)[0];
1894
1895        $package_fields{$package} = \%fields;
1896        $constructor_fields{$package} = $constructor_ref;
1897
1898        unless ($package->can('DESTROY')) {
1899            my $destroy_name = "${package}::DESTROY";
1900            no strict "refs";
1901
1902            # Use typeglob to give the anonymous subroutine the name we want
1903            *$destroy_name = sub {
1904                my $self = shift;
1905                my $addr = do { no overloading; pack 'J', $self; };
1906
1907                $self->$destroy_callback if $destroy_callback;
1908                foreach my $field (keys %{$package_fields{$package}}) {
1909                    #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1910                    delete $package_fields{$package}{$field}{$addr};
1911                }
1912                return;
1913            }
1914        }
1915
1916        unless ($package->can('dump')) {
1917            my $dump_name = "${package}::dump";
1918            no strict "refs";
1919            *$dump_name = sub {
1920                my $self = shift;
1921                return dump_inside_out($self, $package_fields{$package}, @_);
1922            }
1923        }
1924        return;
1925    }
1926
1927    sub set_access {
1928        # Arrange for the input field to be garbage collected when no longer
1929        # needed.  Also, creates standard accessor functions for the field
1930        # based on the optional parameters-- none if none of these parameters:
1931        #   'addable'    creates an 'add_NAME()' accessor function.
1932        #   'readable' or 'readable_array'   creates a 'NAME()' accessor
1933        #                function.
1934        #   'settable'   creates a 'set_NAME()' accessor function.
1935        #   'constructor' doesn't create an accessor function, but adds the
1936        #                field to the hash that was previously passed to
1937        #                setup_package();
1938        # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1939        # 'add' etc. all mean 'addable'.
1940        # The read accessor function will work on both array and scalar
1941        # values.  If another accessor in the parameter list is 'a', the read
1942        # access assumes an array.  You can also force it to be array access
1943        # by specifying 'readable_array' instead of 'readable'
1944        #
1945        # A sort-of 'protected' access can be set-up by preceding the addable,
1946        # readable or settable with some initial portion of 'protected_' (but,
1947        # the underscore is required), like 'p_a', 'pro_set', etc.  The
1948        # "protection" is only by convention.  All that happens is that the
1949        # accessor functions' names begin with an underscore.  So instead of
1950        # calling set_foo, the call is _set_foo.  (Real protection could be
1951        # accomplished by having a new subroutine, end_package, called at the
1952        # end of each package, and then storing the __LINE__ ranges and
1953        # checking them on every accessor.  But that is way overkill.)
1954
1955        # We create anonymous subroutines as the accessors and then use
1956        # typeglobs to assign them to the proper package and name
1957
1958        my $name = shift;   # Name of the field
1959        my $field = shift;  # Reference to the inside-out hash containing the
1960                            # field
1961
1962        my $package = (caller)[0];
1963
1964        if (! exists $package_fields{$package}) {
1965            croak "$0: Must call 'setup_package' before 'set_access'";
1966        }
1967
1968        # Stash the field so DESTROY can get it.
1969        $package_fields{$package}{$name} = $field;
1970
1971        # Remaining arguments are the accessors.  For each...
1972        foreach my $access (@_) {
1973            my $access = lc $access;
1974
1975            my $protected = "";
1976
1977            # Match the input as far as it goes.
1978            if ($access =~ /^(p[^_]*)_/) {
1979                $protected = $1;
1980                if (substr('protected_', 0, length $protected)
1981                    eq $protected)
1982                {
1983
1984                    # Add 1 for the underscore not included in $protected
1985                    $access = substr($access, length($protected) + 1);
1986                    $protected = '_';
1987                }
1988                else {
1989                    $protected = "";
1990                }
1991            }
1992
1993            if (substr('addable', 0, length $access) eq $access) {
1994                my $subname = "${package}::${protected}add_$name";
1995                no strict "refs";
1996
1997                # add_ accessor.  Don't add if already there, which we
1998                # determine using 'eq' for scalars and '==' otherwise.
1999                *$subname = sub {
2000                    use strict "refs";
2001                    return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
2002                    my $self = shift;
2003                    my $value = shift;
2004                    my $addr = do { no overloading; pack 'J', $self; };
2005                    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2006                    if (ref $value) {
2007                        return if grep { $value == $_ } @{$field->{$addr}};
2008                    }
2009                    else {
2010                        return if grep { $value eq $_ } @{$field->{$addr}};
2011                    }
2012                    push @{$field->{$addr}}, $value;
2013                    return;
2014                }
2015            }
2016            elsif (substr('constructor', 0, length $access) eq $access) {
2017                if ($protected) {
2018                    Carp::my_carp_bug("Can't set-up 'protected' constructors")
2019                }
2020                else {
2021                    $constructor_fields{$package}{$name} = $field;
2022                }
2023            }
2024            elsif (substr('readable_array', 0, length $access) eq $access) {
2025
2026                # Here has read access.  If one of the other parameters for
2027                # access is array, or this one specifies array (by being more
2028                # than just 'readable_'), then create a subroutine that
2029                # assumes the data is an array.  Otherwise just a scalar
2030                my $subname = "${package}::${protected}$name";
2031                if (grep { /^a/i } @_
2032                    or length($access) > length('readable_'))
2033                {
2034                    no strict "refs";
2035                    *$subname = sub {
2036                        use strict "refs";
2037                        Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
2038                        my $addr = do { no overloading; pack 'J', $_[0]; };
2039                        if (ref $field->{$addr} ne 'ARRAY') {
2040                            my $type = ref $field->{$addr};
2041                            $type = 'scalar' unless $type;
2042                            Carp::my_carp_bug("Trying to read $name as an array when it is a $type.  Big problems.");
2043                            return;
2044                        }
2045                        return scalar @{$field->{$addr}} unless wantarray;
2046
2047                        # Make a copy; had problems with caller modifying the
2048                        # original otherwise
2049                        my @return = @{$field->{$addr}};
2050                        return @return;
2051                    }
2052                }
2053                else {
2054
2055                    # Here not an array value, a simpler function.
2056                    no strict "refs";
2057                    *$subname = sub {
2058                        use strict "refs";
2059                        Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
2060                        no overloading;
2061                        return $field->{pack 'J', $_[0]};
2062                    }
2063                }
2064            }
2065            elsif (substr('settable', 0, length $access) eq $access) {
2066                my $subname = "${package}::${protected}set_$name";
2067                no strict "refs";
2068                *$subname = sub {
2069                    use strict "refs";
2070                    if (main::DEBUG) {
2071                        return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
2072                        Carp::carp_extra_args(\@_) if @_ > 2;
2073                    }
2074                    # $self is $_[0]; $value is $_[1]
2075                    no overloading;
2076                    $field->{pack 'J', $_[0]} = $_[1];
2077                    return;
2078                }
2079            }
2080            else {
2081                Carp::my_carp_bug("Unknown accessor type $access.  No accessor set.");
2082            }
2083        }
2084        return;
2085    }
2086}
2087
2088package Input_file;
2089
2090# All input files use this object, which stores various attributes about them,
2091# and provides for convenient, uniform handling.  The run method wraps the
2092# processing.  It handles all the bookkeeping of opening, reading, and closing
2093# the file, returning only significant input lines.
2094#
2095# Each object gets a handler which processes the body of the file, and is
2096# called by run().  All character property files must use the generic,
2097# default handler, which has code scrubbed to handle things you might not
2098# expect, including automatic EBCDIC handling.  For files that don't deal with
2099# mapping code points to a property value, such as test files,
2100# PropertyAliases, PropValueAliases, and named sequences, you can override the
2101# handler to be a custom one.  Such a handler should basically be a
2102# while(next_line()) {...} loop.
2103#
2104# You can also set up handlers to
2105#   0) call during object construction time, after everything else is done
2106#   1) call before the first line is read, for pre processing
2107#   2) call to adjust each line of the input before the main handler gets
2108#      them.  This can be automatically generated, if appropriately simple
2109#      enough, by specifying a Properties parameter in the constructor.
2110#   3) call upon EOF before the main handler exits its loop
2111#   4) call at the end, for post processing
2112#
2113# $_ is used to store the input line, and is to be filtered by the
2114# each_line_handler()s.  So, if the format of the line is not in the desired
2115# format for the main handler, these are used to do that adjusting.  They can
2116# be stacked (by enclosing them in an [ anonymous array ] in the constructor,
2117# so the $_ output of one is used as the input to the next.  The EOF handler
2118# is also stackable, but none of the others are, but could easily be changed
2119# to be so.
2120#
2121# Some properties are used by the Perl core but aren't defined until later
2122# Unicode releases.  The perl interpreter would have problems working when
2123# compiled with an earlier Unicode version that doesn't have them, so we need
2124# to define them somehow for those releases.  The 'Early' constructor
2125# parameter can be used to automatically handle this.  It is essentially
2126# ignored if the Unicode version being compiled has a data file for this
2127# property.  Either code to execute or a file to read can be specified.
2128# Details are at the %early definition.
2129#
2130# Most of the handlers can call insert_lines() or insert_adjusted_lines()
2131# which insert the parameters as lines to be processed before the next input
2132# file line is read.  This allows the EOF handler(s) to flush buffers, for
2133# example.  The difference between the two routines is that the lines inserted
2134# by insert_lines() are subjected to the each_line_handler()s.  (So if you
2135# called it from such a handler, you would get infinite recursion without some
2136# mechanism to prevent that.)  Lines inserted by insert_adjusted_lines() go
2137# directly to the main handler without any adjustments.  If the
2138# post-processing handler calls any of these, there will be no effect.  Some
2139# error checking for these conditions could be added, but it hasn't been done.
2140#
2141# carp_bad_line() should be called to warn of bad input lines, which clears $_
2142# to prevent further processing of the line.  This routine will output the
2143# message as a warning once, and then keep a count of the lines that have the
2144# same message, and output that count at the end of the file's processing.
2145# This keeps the number of messages down to a manageable amount.
2146#
2147# get_missings() should be called to retrieve any @missing input lines.
2148# Messages will be raised if this isn't done if the options aren't to ignore
2149# missings.
2150
2151sub trace { return main::trace(@_); }
2152
2153{ # Closure
2154    # Keep track of fields that are to be put into the constructor.
2155    my %constructor_fields;
2156
2157    main::setup_package(Constructor_Fields => \%constructor_fields);
2158
2159    my %file; # Input file name, required
2160    main::set_access('file', \%file, qw{ c r });
2161
2162    my %first_released; # Unicode version file was first released in, required
2163    main::set_access('first_released', \%first_released, qw{ c r });
2164
2165    my %handler;    # Subroutine to process the input file, defaults to
2166                    # 'process_generic_property_file'
2167    main::set_access('handler', \%handler, qw{ c });
2168
2169    my %property;
2170    # name of property this file is for.  defaults to none, meaning not
2171    # applicable, or is otherwise determinable, for example, from each line.
2172    main::set_access('property', \%property, qw{ c r });
2173
2174    my %optional;
2175    # This is either an unsigned number, or a list of property names.  In the
2176    # former case, if it is non-zero, it means the file is optional, so if the
2177    # file is absent, no warning about that is output.  In the latter case, it
2178    # is a list of properties that the file (exclusively) defines.  If the
2179    # file is present, tables for those properties will be produced; if
2180    # absent, none will, even if they are listed elsewhere (namely
2181    # PropertyAliases.txt and PropValueAliases.txt) as being in this release,
2182    # and no warnings will be raised about them not being available.  (And no
2183    # warning about the file itself will be raised.)
2184    main::set_access('optional', \%optional, qw{ c readable_array } );
2185
2186    my %non_skip;
2187    # This is used for debugging, to skip processing of all but a few input
2188    # files.  Add 'non_skip => 1' to the constructor for those files you want
2189    # processed when you set the $debug_skip global.
2190    main::set_access('non_skip', \%non_skip, 'c');
2191
2192    my %skip;
2193    # This is used to skip processing of this input file (semi-) permanently.
2194    # The value should be the reason the file is being skipped.  It is used
2195    # for files that we aren't planning to process anytime soon, but want to
2196    # allow to be in the directory and be checked for their names not
2197    # conflicting with any other files on a DOS 8.3 name filesystem, but to
2198    # not otherwise be processed, and to not raise a warning about not being
2199    # handled.  In the constructor call, any value that evaluates to a numeric
2200    # 0 or undef means don't skip.  Any other value is a string giving the
2201    # reason it is being skipped, and this will appear in generated pod.
2202    # However, an empty string reason will suppress the pod entry.
2203    # Internally, calls that evaluate to numeric 0 are changed into undef to
2204    # distinguish them from an empty string call.
2205    main::set_access('skip', \%skip, 'c', 'r');
2206
2207    my %each_line_handler;
2208    # list of subroutines to look at and filter each non-comment line in the
2209    # file.  defaults to none.  The subroutines are called in order, each is
2210    # to adjust $_ for the next one, and the final one adjusts it for
2211    # 'handler'
2212    main::set_access('each_line_handler', \%each_line_handler, 'c');
2213
2214    my %retain_trailing_comments;
2215    # This is used to not discard the comments that end data lines.  This
2216    # would be used only for files with non-typical syntax, and most code here
2217    # assumes that comments have been stripped, so special handlers would have
2218    # to be written.  It is assumed that the code will use these in
2219    # single-quoted contexts, and so any "'" marks in the comment will be
2220    # prefixed by a backslash.
2221    main::set_access('retain_trailing_comments', \%retain_trailing_comments, 'c');
2222
2223    my %properties; # Optional ordered list of the properties that occur in each
2224    # meaningful line of the input file.  If present, an appropriate
2225    # each_line_handler() is automatically generated and pushed onto the stack
2226    # of such handlers.  This is useful when a file contains multiple
2227    # properties per line, but no other special considerations are necessary.
2228    # The special value "<ignored>" means to discard the corresponding input
2229    # field.
2230    # Any @missing lines in the file should also match this syntax; no such
2231    # files exist as of 6.3.  But if it happens in a future release, the code
2232    # could be expanded to properly parse them.
2233    main::set_access('properties', \%properties, qw{ c r });
2234
2235    my %has_missings_defaults;
2236    # ? Are there lines in the file giving default values for code points
2237    # missing from it?.  Defaults to NO_DEFAULTS.  Otherwise NOT_IGNORED is
2238    # the norm, but IGNORED means it has such lines, but the handler doesn't
2239    # use them.  Having these three states allows us to catch changes to the
2240    # UCD that this program should track.  XXX This could be expanded to
2241    # specify the syntax for such lines, like %properties above.
2242    main::set_access('has_missings_defaults',
2243                                        \%has_missings_defaults, qw{ c r });
2244
2245    my %construction_time_handler;
2246    # Subroutine to call at the end of the new method.  If undef, no such
2247    # handler is called.
2248    main::set_access('construction_time_handler',
2249                                        \%construction_time_handler, qw{ c });
2250
2251    my %pre_handler;
2252    # Subroutine to call before doing anything else in the file.  If undef, no
2253    # such handler is called.
2254    main::set_access('pre_handler', \%pre_handler, qw{ c });
2255
2256    my %eof_handler;
2257    # Subroutines to call upon getting an EOF on the input file, but before
2258    # that is returned to the main handler.  This is to allow buffers to be
2259    # flushed.  The handler is expected to call insert_lines() or
2260    # insert_adjusted() with the buffered material
2261    main::set_access('eof_handler', \%eof_handler, qw{ c });
2262
2263    my %post_handler;
2264    # Subroutine to call after all the lines of the file are read in and
2265    # processed.  If undef, no such handler is called.  Note that this cannot
2266    # add lines to be processed; instead use eof_handler
2267    main::set_access('post_handler', \%post_handler, qw{ c });
2268
2269    my %progress_message;
2270    # Message to print to display progress in lieu of the standard one
2271    main::set_access('progress_message', \%progress_message, qw{ c });
2272
2273    my %handle;
2274    # cache open file handle, internal.  Is undef if file hasn't been
2275    # processed at all, empty if has;
2276    main::set_access('handle', \%handle);
2277
2278    my %added_lines;
2279    # cache of lines added virtually to the file, internal
2280    main::set_access('added_lines', \%added_lines);
2281
2282    my %remapped_lines;
2283    # cache of lines added virtually to the file, internal
2284    main::set_access('remapped_lines', \%remapped_lines);
2285
2286    my %errors;
2287    # cache of errors found, internal
2288    main::set_access('errors', \%errors);
2289
2290    my %missings;
2291    # storage of '@missing' defaults lines
2292    main::set_access('missings', \%missings);
2293
2294    my %early;
2295    # Used for properties that must be defined (for Perl's purposes) on
2296    # versions of Unicode earlier than Unicode itself defines them.  The
2297    # parameter is an array (it would be better to be a hash, but not worth
2298    # bothering about due to its rare use).
2299    #
2300    # The first element is either a code reference to call when in a release
2301    # earlier than the Unicode file is available in, or it is an alternate
2302    # file to use instead of the non-existent one.  This file must have been
2303    # plunked down in the same directory as mktables.  Should you be compiling
2304    # on a release that needs such a file, mktables will abort the
2305    # compilation, and tell you where to get the necessary file(s), and what
2306    # name(s) to use to store them as.
2307    # In the case of specifying an alternate file, the array must contain two
2308    # further elements:
2309    #
2310    # [1] is the name of the property that will be generated by this file.
2311    # The class automatically takes the input file and excludes any code
2312    # points in it that were not assigned in the Unicode version being
2313    # compiled.  It then uses this result to define the property in the given
2314    # version.  Since the property doesn't actually exist in the Unicode
2315    # version being compiled, this should be a name accessible only by core
2316    # perl.  If it is the same name as the regular property, the constructor
2317    # will mark the output table as a $PLACEHOLDER so that it doesn't actually
2318    # get output, and so will be unusable by non-core code.  Otherwise it gets
2319    # marked as $INTERNAL_ONLY.
2320    #
2321    # [2] is a property value to assign (only when compiling Unicode 1.1.5) to
2322    # the Hangul syllables in that release (which were ripped out in version
2323    # 2) for the given property .  (Hence it is ignored except when compiling
2324    # version 1.  You only get one value that applies to all of them, which
2325    # may not be the actual reality, but probably nobody cares anyway for
2326    # these obsolete characters.)
2327    #
2328    # [3] if present is the default value for the property to assign for code
2329    # points not given in the input.  If not present, the default from the
2330    # normal property is used
2331    #
2332    # [-1] If there is an extra final element that is the string 'ONLY_EARLY'.
2333    # it means to not add the name in [1] as an alias to the property name
2334    # used for these.  Normally, when compiling Unicode versions that don't
2335    # invoke the early handling, the name is added as a synonym.
2336    #
2337    # Not all files can be handled in the above way, and so the code ref
2338    # alternative is available.  It can do whatever it needs to.  The other
2339    # array elements are optional in this case, and the code is free to use or
2340    # ignore them if they are present.
2341    #
2342    # Internally, the constructor unshifts a 0 or 1 onto this array to
2343    # indicate if an early alternative is actually being used or not.  This
2344    # makes for easier testing later on.
2345    main::set_access('early', \%early, 'c');
2346
2347    my %only_early;
2348    main::set_access('only_early', \%only_early, 'c');
2349
2350    my %required_even_in_debug_skip;
2351    # debug_skip is used to speed up compilation during debugging by skipping
2352    # processing files that are not needed for the task at hand.  However,
2353    # some files pretty much can never be skipped, and this is used to specify
2354    # that this is one of them.  In order to skip this file, the call to the
2355    # constructor must be edited to comment out this parameter.
2356    main::set_access('required_even_in_debug_skip',
2357                     \%required_even_in_debug_skip, 'c');
2358
2359    my %withdrawn;
2360    # Some files get removed from the Unicode DB.  This is a version object
2361    # giving the first release without this file.
2362    main::set_access('withdrawn', \%withdrawn, 'c');
2363
2364    my %in_this_release;
2365    # Calculated value from %first_released and %withdrawn.  Are we compiling
2366    # a Unicode release which includes this file?
2367    main::set_access('in_this_release', \%in_this_release);
2368
2369    sub _next_line;
2370    sub _next_line_with_remapped_range;
2371
2372    sub new {
2373        my $class = shift;
2374
2375        my $self = bless \do{ my $anonymous_scalar }, $class;
2376        my $addr = do { no overloading; pack 'J', $self; };
2377
2378        # Set defaults
2379        $handler{$addr} = \&main::process_generic_property_file;
2380        $retain_trailing_comments{$addr} = 0;
2381        $non_skip{$addr} = 0;
2382        $skip{$addr} = undef;
2383        $has_missings_defaults{$addr} = $NO_DEFAULTS;
2384        $handle{$addr} = undef;
2385        $added_lines{$addr} = [ ];
2386        $remapped_lines{$addr} = [ ];
2387        $each_line_handler{$addr} = [ ];
2388        $eof_handler{$addr} = [ ];
2389        $errors{$addr} = { };
2390        $missings{$addr} = [ ];
2391        $early{$addr} = [ ];
2392        $optional{$addr} = [ ];
2393
2394        # Two positional parameters.
2395        return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
2396        $file{$addr} = main::internal_file_to_platform(shift);
2397        $first_released{$addr} = shift;
2398
2399        # The rest of the arguments are key => value pairs
2400        # %constructor_fields has been set up earlier to list all possible
2401        # ones.  Either set or push, depending on how the default has been set
2402        # up just above.
2403        my %args = @_;
2404        foreach my $key (keys %args) {
2405            my $argument = $args{$key};
2406
2407            # Note that the fields are the lower case of the constructor keys
2408            my $hash = $constructor_fields{lc $key};
2409            if (! defined $hash) {
2410                Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self.  Skipped");
2411                next;
2412            }
2413            if (ref $hash->{$addr} eq 'ARRAY') {
2414                if (ref $argument eq 'ARRAY') {
2415                    foreach my $argument (@{$argument}) {
2416                        next if ! defined $argument;
2417                        push @{$hash->{$addr}}, $argument;
2418                    }
2419                }
2420                else {
2421                    push @{$hash->{$addr}}, $argument if defined $argument;
2422                }
2423            }
2424            else {
2425                $hash->{$addr} = $argument;
2426            }
2427            delete $args{$key};
2428        };
2429
2430        $non_skip{$addr} = 1 if $required_even_in_debug_skip{$addr};
2431
2432        # Convert 0 (meaning don't skip) to undef
2433        undef $skip{$addr} unless $skip{$addr};
2434
2435        # Handle the case where this file is optional
2436        my $pod_message_for_non_existent_optional = "";
2437        if ($optional{$addr}->@*) {
2438
2439            # First element is the pod message
2440            $pod_message_for_non_existent_optional
2441                                                = shift $optional{$addr}->@*;
2442            # Convert a 0 'Optional' argument to an empty list to make later
2443            # code more concise.
2444            if (   $optional{$addr}->@*
2445                && $optional{$addr}->@* == 1
2446                && $optional{$addr}[0] ne ""
2447                && $optional{$addr}[0] !~ /\D/
2448                && $optional{$addr}[0] == 0)
2449            {
2450                $optional{$addr} = [ ];
2451            }
2452            else {  # But if the only element doesn't evaluate to 0, make sure
2453                    # that this file is indeed considered optional below.
2454                unshift $optional{$addr}->@*, 1;
2455            }
2456        }
2457
2458        my $progress;
2459        my $function_instead_of_file = 0;
2460
2461        if ($early{$addr}->@* && $early{$addr}[-1] eq 'ONLY_EARLY') {
2462            $only_early{$addr} = 1;
2463            pop $early{$addr}->@*;
2464        }
2465
2466        # If we are compiling a Unicode release earlier than the file became
2467        # available, the constructor may have supplied a substitute
2468        if ($first_released{$addr} gt $v_version && $early{$addr}->@*) {
2469
2470            # Yes, we have a substitute, that we will use; mark it so
2471            unshift $early{$addr}->@*, 1;
2472
2473            # See the definition of %early for what the array elements mean.
2474            # Note that we have just unshifted onto the array, so the numbers
2475            # below are +1 of those in the %early description.
2476            # If we have a property this defines, create a table and default
2477            # map for it now (at essentially compile time), so that it will be
2478            # available for the whole of run time.  (We will want to add this
2479            # name as an alias when we are using the official property name;
2480            # but this must be deferred until run(), because at construction
2481            # time the official names have yet to be defined.)
2482            if ($early{$addr}[2]) {
2483                my $fate = ($property{$addr}
2484                            && $property{$addr} eq $early{$addr}[2])
2485                          ? $PLACEHOLDER
2486                          : $INTERNAL_ONLY;
2487                my $prop_object = Property->new($early{$addr}[2],
2488                                                Fate => $fate,
2489                                                Perl_Extension => 1,
2490                                                );
2491
2492                # If not specified by the constructor, use the default mapping
2493                # for the regular property for this substitute one.
2494                if ($early{$addr}[4]) {
2495                    $prop_object->set_default_map($early{$addr}[4]);
2496                }
2497                elsif (    defined $property{$addr}
2498                       &&  defined $default_mapping{$property{$addr}})
2499                {
2500                    $prop_object
2501                        ->set_default_map($default_mapping{$property{$addr}});
2502                }
2503            }
2504
2505            if (ref $early{$addr}[1] eq 'CODE') {
2506                $function_instead_of_file = 1;
2507
2508                # If the first element of the array is a code ref, the others
2509                # are optional.
2510                $handler{$addr} = $early{$addr}[1];
2511                $property{$addr} = $early{$addr}[2]
2512                                                if defined $early{$addr}[2];
2513                $progress = "substitute $file{$addr}";
2514
2515                undef $file{$addr};
2516            }
2517            else {  # Specifying a substitute file
2518
2519                if (! main::file_exists($early{$addr}[1])) {
2520
2521                    # If we don't see the substitute file, generate an error
2522                    # message giving the needed things, and add it to the list
2523                    # of such to output before actual processing happens
2524                    # (hence the user finds out all of them in one run).
2525                    # Instead of creating a general method for NameAliases,
2526                    # hard-code it here, as there is unlikely to ever be a
2527                    # second one which needs special handling.
2528                    my $string_version = ($file{$addr} eq "NameAliases.txt")
2529                                    ? 'at least 6.1 (the later, the better)'
2530                                    : sprintf "%vd", $first_released{$addr};
2531                    push @missing_early_files, <<END;
2532'$file{$addr}' version $string_version should be copied to '$early{$addr}[1]'.
2533END
2534                    ;
2535                    return;
2536                }
2537                $progress = $early{$addr}[1];
2538                $progress .= ", substituting for $file{$addr}" if $file{$addr};
2539                $file{$addr} = $early{$addr}[1];
2540                $property{$addr} = $early{$addr}[2];
2541
2542                # Ignore code points not in the version being compiled
2543                push $each_line_handler{$addr}->@*, \&_exclude_unassigned;
2544
2545                if (   $v_version lt v2.0        # Hanguls in this release ...
2546                    && defined $early{$addr}[3]) # ... need special treatment
2547                {
2548                    push $eof_handler{$addr}->@*, \&_fixup_obsolete_hanguls;
2549                }
2550            }
2551
2552            # And this substitute is valid for all releases.
2553            $first_released{$addr} = v0;
2554        }
2555        else {  # Normal behavior
2556            $progress = $file{$addr};
2557            unshift $early{$addr}->@*, 0; # No substitute
2558        }
2559
2560        my $file = $file{$addr};
2561        $progress_message{$addr} = "Processing $progress"
2562                                            unless $progress_message{$addr};
2563
2564        # A file should be there if it is within the window of versions for
2565        # which Unicode supplies it
2566        if ($withdrawn{$addr} && $withdrawn{$addr} le $v_version) {
2567            $in_this_release{$addr} = 0;
2568            $skip{$addr} = "";
2569        }
2570        else {
2571            $in_this_release{$addr} = $first_released{$addr} le $v_version;
2572
2573            # Check that the file for this object (possibly using a substitute
2574            # for early releases) exists or we have a function alternative
2575            if (   ! $function_instead_of_file
2576                && ! main::file_exists($file))
2577            {
2578                # Here there is nothing available for this release.  This is
2579                # fine if we aren't expecting anything in this release.
2580                if (! $in_this_release{$addr}) {
2581                    $skip{$addr} = "";  # Don't remark since we expected
2582                                        # nothing and got nothing
2583                }
2584                elsif ($optional{$addr}->@*) {
2585
2586                    # Here the file is optional in this release; Use the
2587                    # passed in text to document this case in the pod.
2588                    $skip{$addr} = $pod_message_for_non_existent_optional;
2589                }
2590                elsif (   $in_this_release{$addr}
2591                       && ! defined $skip{$addr}
2592                       && defined $file)
2593                { # Doesn't exist but should.
2594                    $skip{$addr} = "'$file' not found.  Possibly Big problems";
2595                    Carp::my_carp($skip{$addr});
2596                }
2597            }
2598            elsif ($debug_skip && ! defined $skip{$addr} && ! $non_skip{$addr})
2599            {
2600
2601                # The file exists; if not skipped for another reason, and we are
2602                # skipping most everything during debugging builds, use that as
2603                # the skip reason.
2604                $skip{$addr} = '$debug_skip is on'
2605            }
2606        }
2607
2608        if (   ! $debug_skip
2609            && $non_skip{$addr}
2610            && ! $required_even_in_debug_skip{$addr}
2611            && $verbosity)
2612        {
2613            print "Warning: " . __PACKAGE__ . " constructor for $file has useless 'non_skip' in it\n";
2614        }
2615
2616        # Here, we have figured out if we will be skipping this file or not.
2617        # If so, we add any single property it defines to any passed in
2618        # optional property list.  These will be dealt with at run time.
2619        if (defined $skip{$addr}) {
2620            if ($property{$addr}) {
2621                push $optional{$addr}->@*, $property{$addr};
2622            }
2623        } # Otherwise, are going to process the file.
2624        elsif ($property{$addr}) {
2625
2626            # If the file has a property defined in the constructor for it, it
2627            # means that the property is not listed in the file's entries.  So
2628            # add a handler (to the list of line handlers) to insert the
2629            # property name into the lines, to provide a uniform interface to
2630            # the final processing subroutine.
2631            push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2632        }
2633        elsif ($properties{$addr}) {
2634
2635            # Similarly, there may be more than one property represented on
2636            # each line, with no clue but the constructor input what those
2637            # might be.  Add a handler for each line in the input so that it
2638            # creates a separate input line for each property in those input
2639            # lines, thus making them suitable to handle generically.
2640
2641            push @{$each_line_handler{$addr}},
2642                 sub {
2643                    my $file = shift;
2644                    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2645
2646                    my @fields = split /\s*;\s*/, $_, -1;
2647
2648                    if (@fields - 1 > @{$properties{$addr}}) {
2649                        $file->carp_bad_line('Extra fields');
2650                        $_ = "";
2651                        return;
2652                    }
2653                    my $range = shift @fields;  # 0th element is always the
2654                                                # range
2655
2656                    # The next fields in the input line correspond
2657                    # respectively to the stored properties.
2658                    for my $i (0 ..  @{$properties{$addr}} - 1) {
2659                        my $property_name = $properties{$addr}[$i];
2660                        next if $property_name eq '<ignored>';
2661                        $file->insert_adjusted_lines(
2662                              "$range; $property_name; $fields[$i]");
2663                    }
2664                    $_ = "";
2665
2666                    return;
2667                };
2668        }
2669
2670        {   # On non-ascii platforms, we use a special pre-handler
2671            no strict;
2672            no warnings 'once';
2673            *next_line = (main::NON_ASCII_PLATFORM)
2674                         ? *_next_line_with_remapped_range
2675                         : *_next_line;
2676        }
2677
2678        &{$construction_time_handler{$addr}}($self)
2679                                        if $construction_time_handler{$addr};
2680
2681        return $self;
2682    }
2683
2684
2685    use overload
2686        fallback => 0,
2687        qw("") => "_operator_stringify",
2688        "." => \&main::_operator_dot,
2689        ".=" => \&main::_operator_dot_equal,
2690    ;
2691
2692    sub _operator_stringify {
2693        my $self = shift;
2694
2695        return __PACKAGE__ . " object for " . $self->file;
2696    }
2697
2698    sub run {
2699        # Process the input object $self.  This opens and closes the file and
2700        # calls all the handlers for it.  Currently,  this can only be called
2701        # once per file, as it destroy's the EOF handlers
2702
2703        # flag to make sure extracted files are processed early
2704        state $seen_non_extracted = 0;
2705
2706        my $self = shift;
2707        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2708
2709        my $addr = do { no overloading; pack 'J', $self; };
2710
2711        my $file = $file{$addr};
2712
2713        if (! $file) {
2714            $handle{$addr} = 'pretend_is_open';
2715        }
2716        else {
2717            if ($seen_non_extracted) {
2718                if ($file =~ /$EXTRACTED/i) # Some platforms may change the
2719                                            # case of the file's name
2720                {
2721                    Carp::my_carp_bug(main::join_lines(<<END
2722$file should be processed just after the 'Prop...Alias' files, and before
2723anything not in the $EXTRACTED_DIR directory.  Proceeding, but the results may
2724have subtle problems
2725END
2726                    ));
2727                }
2728            }
2729            elsif ($EXTRACTED_DIR
2730
2731                    # We only do this check for generic property files
2732                    && $handler{$addr} == \&main::process_generic_property_file
2733
2734                    && $file !~ /$EXTRACTED/i)
2735            {
2736                # We don't set this (by the 'if' above) if we have no
2737                # extracted directory, so if running on an early version,
2738                # this test won't work.  Not worth worrying about.
2739                $seen_non_extracted = 1;
2740            }
2741
2742            # Mark the file as having being processed, and warn if it
2743            # isn't a file we are expecting.  As we process the files,
2744            # they are deleted from the hash, so any that remain at the
2745            # end of the program are files that we didn't process.
2746            my $fkey = File::Spec->rel2abs($file);
2747            my $exists = delete $potential_files{lc($fkey)};
2748
2749            Carp::my_carp("Was not expecting '$file'.")
2750                                    if $exists && ! $in_this_release{$addr};
2751
2752            # If there is special handling for compiling Unicode releases
2753            # earlier than the first one in which Unicode defines this
2754            # property ...
2755            if ($early{$addr}->@* > 1) {
2756
2757                # Mark as processed any substitute file that would be used in
2758                # such a release
2759                $fkey = File::Spec->rel2abs($early{$addr}[1]);
2760                delete $potential_files{lc($fkey)};
2761
2762                # As commented in the constructor code, when using the
2763                # official property, we still have to allow the publicly
2764                # inaccessible early name so that the core code which uses it
2765                # will work regardless.
2766                if (   ! $only_early{$addr}
2767                    && ! $early{$addr}[0]
2768                    && $early{$addr}->@* > 2)
2769                {
2770                    my $early_property_name = $early{$addr}[2];
2771                    if ($property{$addr} ne $early_property_name) {
2772                        main::property_ref($property{$addr})
2773                                            ->add_alias($early_property_name);
2774                    }
2775                }
2776            }
2777
2778            # We may be skipping this file ...
2779            if (defined $skip{$addr}) {
2780
2781                # If the file isn't supposed to be in this release, there is
2782                # nothing to do
2783                if ($in_this_release{$addr}) {
2784
2785                    # But otherwise, we may print a message
2786                    if ($debug_skip) {
2787                        print STDERR "Skipping input file '$file'",
2788                                     " because '$skip{$addr}'\n";
2789                    }
2790
2791                    # And add it to the list of skipped files, which is later
2792                    # used to make the pod
2793                    $skipped_files{$file} = $skip{$addr};
2794
2795                    # The 'optional' list contains properties that are also to
2796                    # be skipped along with the file.  (There may also be
2797                    # digits which are just placeholders to make sure it isn't
2798                    # an empty list
2799                    foreach my $property ($optional{$addr}->@*) {
2800                        next unless $property =~ /\D/;
2801                        my $prop_object = main::property_ref($property);
2802                        next unless defined $prop_object;
2803                        $prop_object->set_fate($SUPPRESSED, $skip{$addr});
2804                    }
2805                }
2806
2807                return;
2808            }
2809
2810            # Here, we are going to process the file.  Open it, converting the
2811            # slashes used in this program into the proper form for the OS
2812            my $file_handle;
2813            if (not open $file_handle, "<", $file) {
2814                Carp::my_carp("Can't open $file.  Skipping: $!");
2815                return;
2816            }
2817            $handle{$addr} = $file_handle; # Cache the open file handle
2818
2819            # If possible, make sure that the file is the correct version.
2820            # (This data isn't available on early Unicode releases or in
2821            # UnicodeData.txt.)  We don't do this check if we are using a
2822            # substitute file instead of the official one (though the code
2823            # could be extended to do so).
2824            if ($in_this_release{$addr}
2825                && ! $early{$addr}[0]
2826                && lc($file) ne 'unicodedata.txt')
2827            {
2828                if ($file !~ /^Unihan/i) {
2829
2830                    # The non-Unihan files started getting version numbers in
2831                    # 3.2, but some files in 4.0 are unchanged from 3.2, and
2832                    # marked as 3.2.  4.0.1 is the first version where there
2833                    # are no files marked as being from less than 4.0, though
2834                    # some are marked as 4.0.  In versions after that, the
2835                    # numbers are correct.
2836                    if ($v_version ge v4.0.1) {
2837                        $_ = <$file_handle>;    # The version number is in the
2838                                                # very first line
2839                        if ($_ !~ / - $string_version \. /x) {
2840                            chomp;
2841                            $_ =~ s/^#\s*//;
2842
2843                            # 4.0.1 had some valid files that weren't updated.
2844                            if (! ($v_version eq v4.0.1 && $_ =~ /4\.0\.0/)) {
2845                                die Carp::my_carp("File '$file' is version "
2846                                                . "'$_'.  It should be "
2847                                                . "version $string_version");
2848                            }
2849                        }
2850                    }
2851                }
2852                elsif ($v_version ge v6.0.0) { # Unihan
2853
2854                    # Unihan files didn't get accurate version numbers until
2855                    # 6.0.  The version is somewhere in the first comment
2856                    # block
2857                    while (<$file_handle>) {
2858                        if ($_ !~ /^#/) {
2859                            Carp::my_carp_bug("Could not find the expected "
2860                                            . "version info in file '$file'");
2861                            last;
2862                        }
2863                        chomp;
2864                        $_ =~ s/^#\s*//;
2865                        next if $_ !~ / version: /x;
2866                        last if $_ =~ /$string_version/;
2867                        die Carp::my_carp("File '$file' is version "
2868                                        . "'$_'.  It should be "
2869                                        . "version $string_version");
2870                    }
2871                }
2872            }
2873        }
2874
2875        print "$progress_message{$addr}\n" if $verbosity >= $PROGRESS;
2876
2877        # Call any special handler for before the file.
2878        &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2879
2880        # Then the main handler
2881        &{$handler{$addr}}($self);
2882
2883        # Then any special post-file handler.
2884        &{$post_handler{$addr}}($self) if $post_handler{$addr};
2885
2886        # If any errors have been accumulated, output the counts (as the first
2887        # error message in each class was output when it was encountered).
2888        if ($errors{$addr}) {
2889            my $total = 0;
2890            my $types = 0;
2891            foreach my $error (keys %{$errors{$addr}}) {
2892                $total += $errors{$addr}->{$error};
2893                delete $errors{$addr}->{$error};
2894                $types++;
2895            }
2896            if ($total > 1) {
2897                my $message
2898                        = "A total of $total lines had errors in $file.  ";
2899
2900                $message .= ($types == 1)
2901                            ? '(Only the first one was displayed.)'
2902                            : '(Only the first of each type was displayed.)';
2903                Carp::my_carp($message);
2904            }
2905        }
2906
2907        if (@{$missings{$addr}}) {
2908            Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines.  Generated tables likely are wrong");
2909        }
2910
2911        # If a real file handle, close it.
2912        close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2913                                                        ref $handle{$addr};
2914        $handle{$addr} = "";   # Uses empty to indicate that has already seen
2915                               # the file, as opposed to undef
2916        return;
2917    }
2918
2919    sub _next_line {
2920        # Sets $_ to be the next logical input line, if any.  Returns non-zero
2921        # if such a line exists.  'logical' means that any lines that have
2922        # been added via insert_lines() will be returned in $_ before the file
2923        # is read again.
2924
2925        my $self = shift;
2926        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2927
2928        my $addr = do { no overloading; pack 'J', $self; };
2929
2930        # Here the file is open (or if the handle is not a ref, is an open
2931        # 'virtual' file).  Get the next line; any inserted lines get priority
2932        # over the file itself.
2933        my $adjusted;
2934
2935        LINE:
2936        while (1) { # Loop until find non-comment, non-empty line
2937            #local $to_trace = 1 if main::DEBUG;
2938            my $inserted_ref = shift @{$added_lines{$addr}};
2939            if (defined $inserted_ref) {
2940                ($adjusted, $_) = @{$inserted_ref};
2941                trace $adjusted, $_ if main::DEBUG && $to_trace;
2942                return 1 if $adjusted;
2943            }
2944            else {
2945                last if ! ref $handle{$addr}; # Don't read unless is real file
2946                last if ! defined ($_ = readline $handle{$addr});
2947            }
2948            chomp;
2949            trace $_ if main::DEBUG && $to_trace;
2950
2951            # See if this line is the comment line that defines what property
2952            # value that code points that are not listed in the file should
2953            # have.  The format or existence of these lines is not guaranteed
2954            # by Unicode since they are comments, but the documentation says
2955            # that this was added for machine-readability, so probably won't
2956            # change.  This works starting in Unicode Version 5.0.  They look
2957            # like:
2958            #
2959            # @missing: 0000..10FFFF; Not_Reordered
2960            # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2961            # @missing: 0000..10FFFF; ; NaN
2962            #
2963            # Save the line for a later get_missings() call.
2964            if (/$missing_defaults_prefix/) {
2965                if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2966                    $self->carp_bad_line("Unexpected \@missing line.  Assuming no missing entries");
2967                }
2968                elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2969                    my @defaults = split /\s* ; \s*/x, $_;
2970
2971                    # The first field is the @missing, which ends in a
2972                    # semi-colon, so can safely shift.
2973                    shift @defaults;
2974
2975                    # Some of these lines may have empty field placeholders
2976                    # which get in the way.  An example is:
2977                    # @missing: 0000..10FFFF; ; NaN
2978                    # Remove them.  Process starting from the top so the
2979                    # splice doesn't affect things still to be looked at.
2980                    for (my $i = @defaults - 1; $i >= 0; $i--) {
2981                        next if $defaults[$i] ne "";
2982                        splice @defaults, $i, 1;
2983                    }
2984
2985                    # What's left should be just the property (maybe) and the
2986                    # default.  Having only one element means it doesn't have
2987                    # the property.
2988                    my $default;
2989                    my $property;
2990                    if (@defaults >= 1) {
2991                        if (@defaults == 1) {
2992                            $default = $defaults[0];
2993                        }
2994                        else {
2995                            $property = $defaults[0];
2996                            $default = $defaults[1];
2997                        }
2998                    }
2999
3000                    if (@defaults < 1
3001                        || @defaults > 2
3002                        || ($default =~ /^</
3003                            && $default !~ /^<code *point>$/i
3004                            && $default !~ /^<none>$/i
3005                            && $default !~ /^<script>$/i))
3006                    {
3007                        $self->carp_bad_line("Unrecognized \@missing line: $_.  Assuming no missing entries");
3008                    }
3009                    else {
3010
3011                        # If the property is missing from the line, it should
3012                        # be the one for the whole file
3013                        $property = $property{$addr} if ! defined $property;
3014
3015                        # Change <none> to the null string, which is what it
3016                        # really means.  If the default is the code point
3017                        # itself, set it to <code point>, which is what
3018                        # Unicode uses (but sometimes they've forgotten the
3019                        # space)
3020                        if ($default =~ /^<none>$/i) {
3021                            $default = "";
3022                        }
3023                        elsif ($default =~ /^<code *point>$/i) {
3024                            $default = $CODE_POINT;
3025                        }
3026                        elsif ($default =~ /^<script>$/i) {
3027
3028                            # Special case this one.  Currently is from
3029                            # ScriptExtensions.txt, and means for all unlisted
3030                            # code points, use their Script property values.
3031                            # For the code points not listed in that file, the
3032                            # default value is 'Unknown'.
3033                            $default = "Unknown";
3034                        }
3035
3036                        # Store them as a sub-arrays with both components.
3037                        push @{$missings{$addr}}, [ $default, $property ];
3038                    }
3039                }
3040
3041                # There is nothing for the caller to process on this comment
3042                # line.
3043                next;
3044            }
3045
3046            # Unless to keep, remove comments.  If to keep, ignore
3047            # comment-only lines
3048            if ($retain_trailing_comments{$addr}) {
3049                next if / ^ \s* \# /x;
3050
3051                # But escape any single quotes (done in both the comment and
3052                # non-comment portion; this could be a bug someday, but not
3053                # likely)
3054                s/'/\\'/g;
3055            }
3056            else {
3057                s/#.*//;
3058            }
3059
3060            # Remove trailing space, and skip this line if the result is empty
3061            s/\s+$//;
3062            next if /^$/;
3063
3064            # Call any handlers for this line, and skip further processing of
3065            # the line if the handler sets the line to null.
3066            foreach my $sub_ref (@{$each_line_handler{$addr}}) {
3067                &{$sub_ref}($self);
3068                next LINE if /^$/;
3069            }
3070
3071            # Here the line is ok.  return success.
3072            return 1;
3073        } # End of looping through lines.
3074
3075        # If there are EOF handlers, call each (only once) and if it generates
3076        # more lines to process go back in the loop to handle them.
3077        while ($eof_handler{$addr}->@*) {
3078            &{$eof_handler{$addr}[0]}($self);
3079            shift $eof_handler{$addr}->@*;   # Currently only get one shot at it.
3080            goto LINE if $added_lines{$addr};
3081        }
3082
3083        # Return failure -- no more lines.
3084        return 0;
3085
3086    }
3087
3088    sub _next_line_with_remapped_range {
3089        my $self = shift;
3090        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3091
3092        # like _next_line(), but for use on non-ASCII platforms.  It sets $_
3093        # to be the next logical input line, if any.  Returns non-zero if such
3094        # a line exists.  'logical' means that any lines that have been added
3095        # via insert_lines() will be returned in $_ before the file is read
3096        # again.
3097        #
3098        # The difference from _next_line() is that this remaps the Unicode
3099        # code points in the input to those of the native platform.  Each
3100        # input line contains a single code point, or a single contiguous
3101        # range of them  This routine splits each range into its individual
3102        # code points and caches them.  It returns the cached values,
3103        # translated into their native equivalents, one at a time, for each
3104        # call, before reading the next line.  Since native values can only be
3105        # a single byte wide, no translation is needed for code points above
3106        # 0xFF, and ranges that are entirely above that number are not split.
3107        # If an input line contains the range 254-1000, it would be split into
3108        # three elements: 254, 255, and 256-1000.  (The downstream table
3109        # insertion code will sort and coalesce the individual code points
3110        # into appropriate ranges.)
3111
3112        my $addr = do { no overloading; pack 'J', $self; };
3113
3114        while (1) {
3115
3116            # Look in cache before reading the next line.  Return any cached
3117            # value, translated
3118            my $inserted = shift @{$remapped_lines{$addr}};
3119            if (defined $inserted) {
3120                trace $inserted if main::DEBUG && $to_trace;
3121                $_ = $inserted =~ s/^ ( \d+ ) /sprintf("%04X", utf8::unicode_to_native($1))/xer;
3122                trace $_ if main::DEBUG && $to_trace;
3123                return 1;
3124            }
3125
3126            # Get the next line.
3127            return 0 unless _next_line($self);
3128
3129            # If there is a special handler for it, return the line,
3130            # untranslated.  This should happen only for files that are
3131            # special, not being code-point related, such as property names.
3132            return 1 if $handler{$addr}
3133                                    != \&main::process_generic_property_file;
3134
3135            my ($range, $property_name, $map, @remainder)
3136                = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
3137
3138            if (@remainder
3139                || ! defined $property_name
3140                || $range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
3141            {
3142                Carp::my_carp_bug("Unrecognized input line '$_'.  Ignored");
3143            }
3144
3145            my $low = hex $1;
3146            my $high = (defined $2) ? hex $2 : $low;
3147
3148            # If the input maps the range to another code point, remap the
3149            # target if it is between 0 and 255.
3150            my $tail;
3151            if (defined $map) {
3152                $map =~ s/\b 00 ( [0-9A-F]{2} ) \b/sprintf("%04X", utf8::unicode_to_native(hex $1))/gxe;
3153                $tail = "$property_name; $map";
3154                $_ = "$range; $tail";
3155            }
3156            else {
3157                $tail = $property_name;
3158            }
3159
3160            # If entire range is above 255, just return it, unchanged (except
3161            # any mapped-to code point, already changed above)
3162            return 1 if $low > 255;
3163
3164            # Cache an entry for every code point < 255.  For those in the
3165            # range above 255, return a dummy entry for just that portion of
3166            # the range.  Note that this will be out-of-order, but that is not
3167            # a problem.
3168            foreach my $code_point ($low .. $high) {
3169                if ($code_point > 255) {
3170                    $_ = sprintf "%04X..%04X; $tail", $code_point, $high;
3171                    return 1;
3172                }
3173                push @{$remapped_lines{$addr}}, "$code_point; $tail";
3174            }
3175        } # End of looping through lines.
3176
3177        # NOTREACHED
3178    }
3179
3180#   Not currently used, not fully tested.
3181#    sub peek {
3182#        # Non-destructive lookahead one non-adjusted, non-comment, non-blank
3183#        # record.  Not callable from an each_line_handler(), nor does it call
3184#        # an each_line_handler() on the line.
3185#
3186#        my $self = shift;
3187#        my $addr = do { no overloading; pack 'J', $self; };
3188#
3189#        foreach my $inserted_ref (@{$added_lines{$addr}}) {
3190#            my ($adjusted, $line) = @{$inserted_ref};
3191#            next if $adjusted;
3192#
3193#            # Remove comments and trailing space, and return a non-empty
3194#            # resulting line
3195#            $line =~ s/#.*//;
3196#            $line =~ s/\s+$//;
3197#            return $line if $line ne "";
3198#        }
3199#
3200#        return if ! ref $handle{$addr}; # Don't read unless is real file
3201#        while (1) { # Loop until find non-comment, non-empty line
3202#            local $to_trace = 1 if main::DEBUG;
3203#            trace $_ if main::DEBUG && $to_trace;
3204#            return if ! defined (my $line = readline $handle{$addr});
3205#            chomp $line;
3206#            push @{$added_lines{$addr}}, [ 0, $line ];
3207#
3208#            $line =~ s/#.*//;
3209#            $line =~ s/\s+$//;
3210#            return $line if $line ne "";
3211#        }
3212#
3213#        return;
3214#    }
3215
3216
3217    sub insert_lines {
3218        # Lines can be inserted so that it looks like they were in the input
3219        # file at the place it was when this routine is called.  See also
3220        # insert_adjusted_lines().  Lines inserted via this routine go through
3221        # any each_line_handler()
3222
3223        my $self = shift;
3224
3225        # Each inserted line is an array, with the first element being 0 to
3226        # indicate that this line hasn't been adjusted, and needs to be
3227        # processed.
3228        no overloading;
3229        push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
3230        return;
3231    }
3232
3233    sub insert_adjusted_lines {
3234        # Lines can be inserted so that it looks like they were in the input
3235        # file at the place it was when this routine is called.  See also
3236        # insert_lines().  Lines inserted via this routine are already fully
3237        # adjusted, ready to be processed; each_line_handler()s handlers will
3238        # not be called.  This means this is not a completely general
3239        # facility, as only the last each_line_handler on the stack should
3240        # call this.  It could be made more general, by passing to each of the
3241        # line_handlers their position on the stack, which they would pass on
3242        # to this routine, and that would replace the boolean first element in
3243        # the anonymous array pushed here, so that the next_line routine could
3244        # use that to call only those handlers whose index is after it on the
3245        # stack.  But this is overkill for what is needed now.
3246
3247        my $self = shift;
3248        trace $_[0] if main::DEBUG && $to_trace;
3249
3250        # Each inserted line is an array, with the first element being 1 to
3251        # indicate that this line has been adjusted
3252        no overloading;
3253        push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
3254        return;
3255    }
3256
3257    sub get_missings {
3258        # Returns the stored up @missings lines' values, and clears the list.
3259        # The values are in an array, consisting of the default in the first
3260        # element, and the property in the 2nd.  However, since these lines
3261        # can be stacked up, the return is an array of all these arrays.
3262
3263        my $self = shift;
3264        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3265
3266        my $addr = do { no overloading; pack 'J', $self; };
3267
3268        # If not accepting a list return, just return the first one.
3269        return shift @{$missings{$addr}} unless wantarray;
3270
3271        my @return = @{$missings{$addr}};
3272        undef @{$missings{$addr}};
3273        return @return;
3274    }
3275
3276    sub _exclude_unassigned {
3277
3278        # Takes the range in $_ and excludes code points that aren't assigned
3279        # in this release
3280
3281        state $skip_inserted_count = 0;
3282
3283        # Ignore recursive calls.
3284        if ($skip_inserted_count) {
3285            $skip_inserted_count--;
3286            return;
3287        }
3288
3289        # Find what code points are assigned in this release
3290        main::calculate_Assigned() if ! defined $Assigned;
3291
3292        my $self = shift;
3293        my $addr = do { no overloading; pack 'J', $self; };
3294        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3295
3296        my ($range, @remainder)
3297            = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
3298
3299        # Examine the range.
3300        if ($range =~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
3301        {
3302            my $low = hex $1;
3303            my $high = (defined $2) ? hex $2 : $low;
3304
3305            # Split the range into subranges of just those code points in it
3306            # that are assigned.
3307            my @ranges = (Range_List->new(Initialize
3308                              => Range->new($low, $high)) & $Assigned)->ranges;
3309
3310            # Do nothing if nothing in the original range is assigned in this
3311            # release; handle normally if everything is in this release.
3312            if (! @ranges) {
3313                $_ = "";
3314            }
3315            elsif (@ranges != 1) {
3316
3317                # Here, some code points in the original range aren't in this
3318                # release; @ranges gives the ones that are.  Create fake input
3319                # lines for each of the ranges, and set things up so that when
3320                # this routine is called on that fake input, it will do
3321                # nothing.
3322                $skip_inserted_count = @ranges;
3323                my $remainder = join ";", @remainder;
3324                for my $range (@ranges) {
3325                    $self->insert_lines(sprintf("%04X..%04X;%s",
3326                                    $range->start, $range->end, $remainder));
3327                }
3328                $_ = "";    # The original range is now defunct.
3329            }
3330        }
3331
3332        return;
3333    }
3334
3335    sub _fixup_obsolete_hanguls {
3336
3337        # This is called only when compiling Unicode version 1.  All Unicode
3338        # data for subsequent releases assumes that the code points that were
3339        # Hangul syllables in this release only are something else, so if
3340        # using such data, we have to override it
3341
3342        my $self = shift;
3343        my $addr = do { no overloading; pack 'J', $self; };
3344        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3345
3346        my $object = main::property_ref($property{$addr});
3347        $object->add_map($FIRST_REMOVED_HANGUL_SYLLABLE,
3348                         $FINAL_REMOVED_HANGUL_SYLLABLE,
3349                         $early{$addr}[3],  # Passed-in value for these
3350                         Replace => $UNCONDITIONALLY);
3351    }
3352
3353    sub _insert_property_into_line {
3354        # Add a property field to $_, if this file requires it.
3355
3356        my $self = shift;
3357        my $addr = do { no overloading; pack 'J', $self; };
3358        my $property = $property{$addr};
3359        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3360
3361        $_ =~ s/(;|$)/; $property$1/;
3362        return;
3363    }
3364
3365    sub carp_bad_line {
3366        # Output consistent error messages, using either a generic one, or the
3367        # one given by the optional parameter.  To avoid gazillions of the
3368        # same message in case the syntax of a  file is way off, this routine
3369        # only outputs the first instance of each message, incrementing a
3370        # count so the totals can be output at the end of the file.
3371
3372        my $self = shift;
3373        my $message = shift;
3374        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3375
3376        my $addr = do { no overloading; pack 'J', $self; };
3377
3378        $message = 'Unexpected line' unless $message;
3379
3380        # No trailing punctuation so as to fit with our addenda.
3381        $message =~ s/[.:;,]$//;
3382
3383        # If haven't seen this exact message before, output it now.  Otherwise
3384        # increment the count of how many times it has occurred
3385        unless ($errors{$addr}->{$message}) {
3386            Carp::my_carp("$message in '$_' in "
3387                            . $file{$addr}
3388                            . " at line $..  Skipping this line;");
3389            $errors{$addr}->{$message} = 1;
3390        }
3391        else {
3392            $errors{$addr}->{$message}++;
3393        }
3394
3395        # Clear the line to prevent any further (meaningful) processing of it.
3396        $_ = "";
3397
3398        return;
3399    }
3400} # End closure
3401
3402package Multi_Default;
3403
3404# Certain properties in early versions of Unicode had more than one possible
3405# default for code points missing from the files.  In these cases, one
3406# default applies to everything left over after all the others are applied,
3407# and for each of the others, there is a description of which class of code
3408# points applies to it.  This object helps implement this by storing the
3409# defaults, and for all but that final default, an eval string that generates
3410# the class that it applies to.
3411
3412
3413{   # Closure
3414
3415    main::setup_package();
3416
3417    my %class_defaults;
3418    # The defaults structure for the classes
3419    main::set_access('class_defaults', \%class_defaults);
3420
3421    my %other_default;
3422    # The default that applies to everything left over.
3423    main::set_access('other_default', \%other_default, 'r');
3424
3425
3426    sub new {
3427        # The constructor is called with default => eval pairs, terminated by
3428        # the left-over default. e.g.
3429        # Multi_Default->new(
3430        #        'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
3431        #               -  0x200D',
3432        #        'R' => 'some other expression that evaluates to code points',
3433        #        .
3434        #        .
3435        #        .
3436        #        'U'));
3437        # It is best to leave the final value be the one that matches the
3438        # above-Unicode code points.
3439
3440        my $class = shift;
3441
3442        my $self = bless \do{my $anonymous_scalar}, $class;
3443        my $addr = do { no overloading; pack 'J', $self; };
3444
3445        while (@_ > 1) {
3446            my $default = shift;
3447            my $eval = shift;
3448            $class_defaults{$addr}->{$default} = $eval;
3449        }
3450
3451        $other_default{$addr} = shift;
3452
3453        return $self;
3454    }
3455
3456    sub get_next_defaults {
3457        # Iterates and returns the next class of defaults.
3458        my $self = shift;
3459        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3460
3461        my $addr = do { no overloading; pack 'J', $self; };
3462
3463        return each %{$class_defaults{$addr}};
3464    }
3465}
3466
3467package Alias;
3468
3469# An alias is one of the names that a table goes by.  This class defines them
3470# including some attributes.  Everything is currently setup in the
3471# constructor.
3472
3473
3474{   # Closure
3475
3476    main::setup_package();
3477
3478    my %name;
3479    main::set_access('name', \%name, 'r');
3480
3481    my %loose_match;
3482    # Should this name match loosely or not.
3483    main::set_access('loose_match', \%loose_match, 'r');
3484
3485    my %make_re_pod_entry;
3486    # Some aliases should not get their own entries in the re section of the
3487    # pod, because they are covered by a wild-card, and some we want to
3488    # discourage use of.  Binary
3489    main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's');
3490
3491    my %ucd;
3492    # Is this documented to be accessible via Unicode::UCD
3493    main::set_access('ucd', \%ucd, 'r', 's');
3494
3495    my %status;
3496    # Aliases have a status, like deprecated, or even suppressed (which means
3497    # they don't appear in documentation).  Enum
3498    main::set_access('status', \%status, 'r');
3499
3500    my %ok_as_filename;
3501    # Similarly, some aliases should not be considered as usable ones for
3502    # external use, such as file names, or we don't want documentation to
3503    # recommend them.  Boolean
3504    main::set_access('ok_as_filename', \%ok_as_filename, 'r');
3505
3506    sub new {
3507        my $class = shift;
3508
3509        my $self = bless \do { my $anonymous_scalar }, $class;
3510        my $addr = do { no overloading; pack 'J', $self; };
3511
3512        $name{$addr} = shift;
3513        $loose_match{$addr} = shift;
3514        $make_re_pod_entry{$addr} = shift;
3515        $ok_as_filename{$addr} = shift;
3516        $status{$addr} = shift;
3517        $ucd{$addr} = shift;
3518
3519        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3520
3521        # Null names are never ok externally
3522        $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
3523
3524        return $self;
3525    }
3526}
3527
3528package Range;
3529
3530# A range is the basic unit for storing code points, and is described in the
3531# comments at the beginning of the program.  Each range has a starting code
3532# point; an ending code point (not less than the starting one); a value
3533# that applies to every code point in between the two end-points, inclusive;
3534# and an enum type that applies to the value.  The type is for the user's
3535# convenience, and has no meaning here, except that a non-zero type is
3536# considered to not obey the normal Unicode rules for having standard forms.
3537#
3538# The same structure is used for both map and match tables, even though in the
3539# latter, the value (and hence type) is irrelevant and could be used as a
3540# comment.  In map tables, the value is what all the code points in the range
3541# map to.  Type 0 values have the standardized version of the value stored as
3542# well, so as to not have to recalculate it a lot.
3543
3544sub trace { return main::trace(@_); }
3545
3546{   # Closure
3547
3548    main::setup_package();
3549
3550    my %start;
3551    main::set_access('start', \%start, 'r', 's');
3552
3553    my %end;
3554    main::set_access('end', \%end, 'r', 's');
3555
3556    my %value;
3557    main::set_access('value', \%value, 'r', 's');
3558
3559    my %type;
3560    main::set_access('type', \%type, 'r');
3561
3562    my %standard_form;
3563    # The value in internal standard form.  Defined only if the type is 0.
3564    main::set_access('standard_form', \%standard_form);
3565
3566    # Note that if these fields change, the dump() method should as well
3567
3568    sub new {
3569        return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
3570        my $class = shift;
3571
3572        my $self = bless \do { my $anonymous_scalar }, $class;
3573        my $addr = do { no overloading; pack 'J', $self; };
3574
3575        $start{$addr} = shift;
3576        $end{$addr} = shift;
3577
3578        my %args = @_;
3579
3580        my $value = delete $args{'Value'};  # Can be 0
3581        $value = "" unless defined $value;
3582        $value{$addr} = $value;
3583
3584        $type{$addr} = delete $args{'Type'} || 0;
3585
3586        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3587
3588        return $self;
3589    }
3590
3591    use overload
3592        fallback => 0,
3593        qw("") => "_operator_stringify",
3594        "." => \&main::_operator_dot,
3595        ".=" => \&main::_operator_dot_equal,
3596    ;
3597
3598    sub _operator_stringify {
3599        my $self = shift;
3600        my $addr = do { no overloading; pack 'J', $self; };
3601
3602        # Output it like '0041..0065 (value)'
3603        my $return = sprintf("%04X", $start{$addr})
3604                        .  '..'
3605                        . sprintf("%04X", $end{$addr});
3606        my $value = $value{$addr};
3607        my $type = $type{$addr};
3608        $return .= ' (';
3609        $return .= "$value";
3610        $return .= ", Type=$type" if $type != 0;
3611        $return .= ')';
3612
3613        return $return;
3614    }
3615
3616    sub standard_form {
3617        # Calculate the standard form only if needed, and cache the result.
3618        # The standard form is the value itself if the type is special.
3619        # This represents a considerable CPU and memory saving - at the time
3620        # of writing there are 368676 non-special objects, but the standard
3621        # form is only requested for 22047 of them - ie about 6%.
3622
3623        my $self = shift;
3624        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3625
3626        my $addr = do { no overloading; pack 'J', $self; };
3627
3628        return $standard_form{$addr} if defined $standard_form{$addr};
3629
3630        my $value = $value{$addr};
3631        return $value if $type{$addr};
3632        return $standard_form{$addr} = main::standardize($value);
3633    }
3634
3635    sub dump {
3636        # Human, not machine readable.  For machine readable, comment out this
3637        # entire routine and let the standard one take effect.
3638        my $self = shift;
3639        my $indent = shift;
3640        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3641
3642        my $addr = do { no overloading; pack 'J', $self; };
3643
3644        my $return = $indent
3645                    . sprintf("%04X", $start{$addr})
3646                    . '..'
3647                    . sprintf("%04X", $end{$addr})
3648                    . " '$value{$addr}';";
3649        if (! defined $standard_form{$addr}) {
3650            $return .= "(type=$type{$addr})";
3651        }
3652        elsif ($standard_form{$addr} ne $value{$addr}) {
3653            $return .= "(standard '$standard_form{$addr}')";
3654        }
3655        return $return;
3656    }
3657} # End closure
3658
3659package _Range_List_Base;
3660
3661# Base class for range lists.  A range list is simply an ordered list of
3662# ranges, so that the ranges with the lowest starting numbers are first in it.
3663#
3664# When a new range is added that is adjacent to an existing range that has the
3665# same value and type, it merges with it to form a larger range.
3666#
3667# Ranges generally do not overlap, except that there can be multiple entries
3668# of single code point ranges.  This is because of NameAliases.txt.
3669#
3670# In this program, there is a standard value such that if two different
3671# values, have the same standard value, they are considered equivalent.  This
3672# value was chosen so that it gives correct results on Unicode data
3673
3674# There are a number of methods to manipulate range lists, and some operators
3675# are overloaded to handle them.
3676
3677sub trace { return main::trace(@_); }
3678
3679{ # Closure
3680
3681    our $addr;
3682
3683    # Max is initialized to a negative value that isn't adjacent to 0, for
3684    # simpler tests
3685    my $max_init = -2;
3686
3687    main::setup_package();
3688
3689    my %ranges;
3690    # The list of ranges
3691    main::set_access('ranges', \%ranges, 'readable_array');
3692
3693    my %max;
3694    # The highest code point in the list.  This was originally a method, but
3695    # actual measurements said it was used a lot.
3696    main::set_access('max', \%max, 'r');
3697
3698    my %each_range_iterator;
3699    # Iterator position for each_range()
3700    main::set_access('each_range_iterator', \%each_range_iterator);
3701
3702    my %owner_name_of;
3703    # Name of parent this is attached to, if any.  Solely for better error
3704    # messages.
3705    main::set_access('owner_name_of', \%owner_name_of, 'p_r');
3706
3707    my %_search_ranges_cache;
3708    # A cache of the previous result from _search_ranges(), for better
3709    # performance
3710    main::set_access('_search_ranges_cache', \%_search_ranges_cache);
3711
3712    sub new {
3713        my $class = shift;
3714        my %args = @_;
3715
3716        # Optional initialization data for the range list.
3717        my $initialize = delete $args{'Initialize'};
3718
3719        my $self;
3720
3721        # Use _union() to initialize.  _union() returns an object of this
3722        # class, which means that it will call this constructor recursively.
3723        # But it won't have this $initialize parameter so that it won't
3724        # infinitely loop on this.
3725        return _union($class, $initialize, %args) if defined $initialize;
3726
3727        $self = bless \do { my $anonymous_scalar }, $class;
3728        my $addr = do { no overloading; pack 'J', $self; };
3729
3730        # Optional parent object, only for debug info.
3731        $owner_name_of{$addr} = delete $args{'Owner'};
3732        $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
3733
3734        # Stringify, in case it is an object.
3735        $owner_name_of{$addr} = "$owner_name_of{$addr}";
3736
3737        # This is used only for error messages, and so a colon is added
3738        $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
3739
3740        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3741
3742        $max{$addr} = $max_init;
3743
3744        $_search_ranges_cache{$addr} = 0;
3745        $ranges{$addr} = [];
3746
3747        return $self;
3748    }
3749
3750    use overload
3751        fallback => 0,
3752        qw("") => "_operator_stringify",
3753        "." => \&main::_operator_dot,
3754        ".=" => \&main::_operator_dot_equal,
3755    ;
3756
3757    sub _operator_stringify {
3758        my $self = shift;
3759        my $addr = do { no overloading; pack 'J', $self; };
3760
3761        return "Range_List attached to '$owner_name_of{$addr}'"
3762                                                if $owner_name_of{$addr};
3763        return "anonymous Range_List " . \$self;
3764    }
3765
3766    sub _union {
3767        # Returns the union of the input code points.  It can be called as
3768        # either a constructor or a method.  If called as a method, the result
3769        # will be a new() instance of the calling object, containing the union
3770        # of that object with the other parameter's code points;  if called as
3771        # a constructor, the first parameter gives the class that the new object
3772        # should be, and the second parameter gives the code points to go into
3773        # it.
3774        # In either case, there are two parameters looked at by this routine;
3775        # any additional parameters are passed to the new() constructor.
3776        #
3777        # The code points can come in the form of some object that contains
3778        # ranges, and has a conventionally named method to access them; or
3779        # they can be an array of individual code points (as integers); or
3780        # just a single code point.
3781        #
3782        # If they are ranges, this routine doesn't make any effort to preserve
3783        # the range values and types of one input over the other.  Therefore
3784        # this base class should not allow _union to be called from other than
3785        # initialization code, so as to prevent two tables from being added
3786        # together where the range values matter.  The general form of this
3787        # routine therefore belongs in a derived class, but it was moved here
3788        # to avoid duplication of code.  The failure to overload this in this
3789        # class keeps it safe.
3790        #
3791        # It does make the effort during initialization to accept tables with
3792        # multiple values for the same code point, and to preserve the order
3793        # of these.  If there is only one input range or range set, it doesn't
3794        # sort (as it should already be sorted to the desired order), and will
3795        # accept multiple values per code point.  Otherwise it will merge
3796        # multiple values into a single one.
3797
3798        my $self;
3799        my @args;   # Arguments to pass to the constructor
3800
3801        my $class = shift;
3802
3803        # If a method call, will start the union with the object itself, and
3804        # the class of the new object will be the same as self.
3805        if (ref $class) {
3806            $self = $class;
3807            $class = ref $self;
3808            push @args, $self;
3809        }
3810
3811        # Add the other required parameter.
3812        push @args, shift;
3813        # Rest of parameters are passed on to the constructor
3814
3815        # Accumulate all records from both lists.
3816        my @records;
3817        my $input_count = 0;
3818        for my $arg (@args) {
3819            #local $to_trace = 0 if main::DEBUG;
3820            trace "argument = $arg" if main::DEBUG && $to_trace;
3821            if (! defined $arg) {
3822                my $message = "";
3823                if (defined $self) {
3824                    no overloading;
3825                    $message .= $owner_name_of{pack 'J', $self};
3826                }
3827                Carp::my_carp_bug($message . "Undefined argument to _union.  No union done.");
3828                return;
3829            }
3830
3831            $arg = [ $arg ] if ! ref $arg;
3832            my $type = ref $arg;
3833            if ($type eq 'ARRAY') {
3834                foreach my $element (@$arg) {
3835                    push @records, Range->new($element, $element);
3836                    $input_count++;
3837                }
3838            }
3839            elsif ($arg->isa('Range')) {
3840                push @records, $arg;
3841                $input_count++;
3842            }
3843            elsif ($arg->can('ranges')) {
3844                push @records, $arg->ranges;
3845                $input_count++;
3846            }
3847            else {
3848                my $message = "";
3849                if (defined $self) {
3850                    no overloading;
3851                    $message .= $owner_name_of{pack 'J', $self};
3852                }
3853                Carp::my_carp_bug($message . "Cannot take the union of a $type.  No union done.");
3854                return;
3855            }
3856        }
3857
3858        # Sort with the range containing the lowest ordinal first, but if
3859        # two ranges start at the same code point, sort with the bigger range
3860        # of the two first, because it takes fewer cycles.
3861        if ($input_count > 1) {
3862            @records = sort { ($a->start <=> $b->start)
3863                                      or
3864                                    # if b is shorter than a, b->end will be
3865                                    # less than a->end, and we want to select
3866                                    # a, so want to return -1
3867                                    ($b->end <=> $a->end)
3868                                   } @records;
3869        }
3870
3871        my $new = $class->new(@_);
3872
3873        # Fold in records so long as they add new information.
3874        for my $set (@records) {
3875            my $start = $set->start;
3876            my $end   = $set->end;
3877            my $value = $set->value;
3878            my $type  = $set->type;
3879            if ($start > $new->max) {
3880                $new->_add_delete('+', $start, $end, $value, Type => $type);
3881            }
3882            elsif ($end > $new->max) {
3883                $new->_add_delete('+', $new->max +1, $end, $value,
3884                                                                Type => $type);
3885            }
3886            elsif ($input_count == 1) {
3887                # Here, overlaps existing range, but is from a single input,
3888                # so preserve the multiple values from that input.
3889                $new->_add_delete('+', $start, $end, $value, Type => $type,
3890                                                Replace => $MULTIPLE_AFTER);
3891            }
3892        }
3893
3894        return $new;
3895    }
3896
3897    sub range_count {        # Return the number of ranges in the range list
3898        my $self = shift;
3899        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3900
3901        no overloading;
3902        return scalar @{$ranges{pack 'J', $self}};
3903    }
3904
3905    sub min {
3906        # Returns the minimum code point currently in the range list, or if
3907        # the range list is empty, 2 beyond the max possible.  This is a
3908        # method because used so rarely, that not worth saving between calls,
3909        # and having to worry about changing it as ranges are added and
3910        # deleted.
3911
3912        my $self = shift;
3913        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3914
3915        my $addr = do { no overloading; pack 'J', $self; };
3916
3917        # If the range list is empty, return a large value that isn't adjacent
3918        # to any that could be in the range list, for simpler tests
3919        return $MAX_WORKING_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
3920        return $ranges{$addr}->[0]->start;
3921    }
3922
3923    sub contains {
3924        # Boolean: Is argument in the range list?  If so returns $i such that:
3925        #   range[$i]->end < $codepoint <= range[$i+1]->end
3926        # which is one beyond what you want; this is so that the 0th range
3927        # doesn't return false
3928        my $self = shift;
3929        my $codepoint = shift;
3930        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3931
3932        my $i = $self->_search_ranges($codepoint);
3933        return 0 unless defined $i;
3934
3935        # The search returns $i, such that
3936        #   range[$i-1]->end < $codepoint <= range[$i]->end
3937        # So is in the table if and only iff it is at least the start position
3938        # of range $i.
3939        no overloading;
3940        return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
3941        return $i + 1;
3942    }
3943
3944    sub containing_range {
3945        # Returns the range object that contains the code point, undef if none
3946
3947        my $self = shift;
3948        my $codepoint = shift;
3949        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3950
3951        my $i = $self->contains($codepoint);
3952        return unless $i;
3953
3954        # contains() returns 1 beyond where we should look
3955        no overloading;
3956        return $ranges{pack 'J', $self}->[$i-1];
3957    }
3958
3959    sub value_of {
3960        # Returns the value associated with the code point, undef if none
3961
3962        my $self = shift;
3963        my $codepoint = shift;
3964        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3965
3966        my $range = $self->containing_range($codepoint);
3967        return unless defined $range;
3968
3969        return $range->value;
3970    }
3971
3972    sub type_of {
3973        # Returns the type of the range containing the code point, undef if
3974        # the code point is not in the table
3975
3976        my $self = shift;
3977        my $codepoint = shift;
3978        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3979
3980        my $range = $self->containing_range($codepoint);
3981        return unless defined $range;
3982
3983        return $range->type;
3984    }
3985
3986    sub _search_ranges {
3987        # Find the range in the list which contains a code point, or where it
3988        # should go if were to add it.  That is, it returns $i, such that:
3989        #   range[$i-1]->end < $codepoint <= range[$i]->end
3990        # Returns undef if no such $i is possible (e.g. at end of table), or
3991        # if there is an error.
3992
3993        my $self = shift;
3994        my $code_point = shift;
3995        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3996
3997        my $addr = do { no overloading; pack 'J', $self; };
3998
3999        return if $code_point > $max{$addr};
4000        my $r = $ranges{$addr};                # The current list of ranges
4001        my $range_list_size = scalar @$r;
4002        my $i;
4003
4004        use integer;        # want integer division
4005
4006        # Use the cached result as the starting guess for this one, because,
4007        # an experiment on 5.1 showed that 90% of the time the cache was the
4008        # same as the result on the next call (and 7% it was one less).
4009        $i = $_search_ranges_cache{$addr};
4010        $i = 0 if $i >= $range_list_size;   # Reset if no longer valid (prob.
4011                                            # from an intervening deletion
4012        #local $to_trace = 1 if main::DEBUG;
4013        trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
4014        return $i if $code_point <= $r->[$i]->end
4015                     && ($i == 0 || $r->[$i-1]->end < $code_point);
4016
4017        # Here the cache doesn't yield the correct $i.  Try adding 1.
4018        if ($i < $range_list_size - 1
4019            && $r->[$i]->end < $code_point &&
4020            $code_point <= $r->[$i+1]->end)
4021        {
4022            $i++;
4023            trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
4024            $_search_ranges_cache{$addr} = $i;
4025            return $i;
4026        }
4027
4028        # Here, adding 1 also didn't work.  We do a binary search to
4029        # find the correct position, starting with current $i
4030        my $lower = 0;
4031        my $upper = $range_list_size - 1;
4032        while (1) {
4033            trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
4034
4035            if ($code_point <= $r->[$i]->end) {
4036
4037                # Here we have met the upper constraint.  We can quit if we
4038                # also meet the lower one.
4039                last if $i == 0 || $r->[$i-1]->end < $code_point;
4040
4041                $upper = $i;        # Still too high.
4042
4043            }
4044            else {
4045
4046                # Here, $r[$i]->end < $code_point, so look higher up.
4047                $lower = $i;
4048            }
4049
4050            # Split search domain in half to try again.
4051            my $temp = ($upper + $lower) / 2;
4052
4053            # No point in continuing unless $i changes for next time
4054            # in the loop.
4055            if ($temp == $i) {
4056
4057                # We can't reach the highest element because of the averaging.
4058                # So if one below the upper edge, force it there and try one
4059                # more time.
4060                if ($i == $range_list_size - 2) {
4061
4062                    trace "Forcing to upper edge" if main::DEBUG && $to_trace;
4063                    $i = $range_list_size - 1;
4064
4065                    # Change $lower as well so if fails next time through,
4066                    # taking the average will yield the same $i, and we will
4067                    # quit with the error message just below.
4068                    $lower = $i;
4069                    next;
4070                }
4071                Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go.  No action taken.");
4072                return;
4073            }
4074            $i = $temp;
4075        } # End of while loop
4076
4077        if (main::DEBUG && $to_trace) {
4078            trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
4079            trace "i=  [ $i ]", $r->[$i];
4080            trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
4081        }
4082
4083        # Here we have found the offset.  Cache it as a starting point for the
4084        # next call.
4085        $_search_ranges_cache{$addr} = $i;
4086        return $i;
4087    }
4088
4089    sub _add_delete {
4090        # Add, replace or delete ranges to or from a list.  The $type
4091        # parameter gives which:
4092        #   '+' => insert or replace a range, returning a list of any changed
4093        #          ranges.
4094        #   '-' => delete a range, returning a list of any deleted ranges.
4095        #
4096        # The next three parameters give respectively the start, end, and
4097        # value associated with the range.  'value' should be null unless the
4098        # operation is '+';
4099        #
4100        # The range list is kept sorted so that the range with the lowest
4101        # starting position is first in the list, and generally, adjacent
4102        # ranges with the same values are merged into a single larger one (see
4103        # exceptions below).
4104        #
4105        # There are more parameters; all are key => value pairs:
4106        #   Type    gives the type of the value.  It is only valid for '+'.
4107        #           All ranges have types; if this parameter is omitted, 0 is
4108        #           assumed.  Ranges with type 0 are assumed to obey the
4109        #           Unicode rules for casing, etc; ranges with other types are
4110        #           not.  Otherwise, the type is arbitrary, for the caller's
4111        #           convenience, and looked at only by this routine to keep
4112        #           adjacent ranges of different types from being merged into
4113        #           a single larger range, and when Replace =>
4114        #           $IF_NOT_EQUIVALENT is specified (see just below).
4115        #   Replace  determines what to do if the range list already contains
4116        #            ranges which coincide with all or portions of the input
4117        #            range.  It is only valid for '+':
4118        #       => $NO            means that the new value is not to replace
4119        #                         any existing ones, but any empty gaps of the
4120        #                         range list coinciding with the input range
4121        #                         will be filled in with the new value.
4122        #       => $UNCONDITIONALLY  means to replace the existing values with
4123        #                         this one unconditionally.  However, if the
4124        #                         new and old values are identical, the
4125        #                         replacement is skipped to save cycles
4126        #       => $IF_NOT_EQUIVALENT means to replace the existing values
4127        #          (the default)  with this one if they are not equivalent.
4128        #                         Ranges are equivalent if their types are the
4129        #                         same, and they are the same string; or if
4130        #                         both are type 0 ranges, if their Unicode
4131        #                         standard forms are identical.  In this last
4132        #                         case, the routine chooses the more "modern"
4133        #                         one to use.  This is because some of the
4134        #                         older files are formatted with values that
4135        #                         are, for example, ALL CAPs, whereas the
4136        #                         derived files have a more modern style,
4137        #                         which looks better.  By looking for this
4138        #                         style when the pre-existing and replacement
4139        #                         standard forms are the same, we can move to
4140        #                         the modern style
4141        #       => $MULTIPLE_BEFORE means that if this range duplicates an
4142        #                         existing one, but has a different value,
4143        #                         don't replace the existing one, but insert
4144        #                         this one so that the same range can occur
4145        #                         multiple times.  They are stored LIFO, so
4146        #                         that the final one inserted is the first one
4147        #                         returned in an ordered search of the table.
4148        #                         If this is an exact duplicate, including the
4149        #                         value, the original will be moved to be
4150        #                         first, before any other duplicate ranges
4151        #                         with different values.
4152        #       => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
4153        #                         FIFO, so that this one is inserted after all
4154        #                         others that currently exist.  If this is an
4155        #                         exact duplicate, including value, of an
4156        #                         existing range, this one is discarded
4157        #                         (leaving the existing one in its original,
4158        #                         higher priority position
4159        #       => $CROAK         Die with an error if is already there
4160        #       => anything else  is the same as => $IF_NOT_EQUIVALENT
4161        #
4162        # "same value" means identical for non-type-0 ranges, and it means
4163        # having the same standard forms for type-0 ranges.
4164
4165        return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
4166
4167        my $self = shift;
4168        my $operation = shift;   # '+' for add/replace; '-' for delete;
4169        my $start = shift;
4170        my $end   = shift;
4171        my $value = shift;
4172
4173        my %args = @_;
4174
4175        $value = "" if not defined $value;        # warning: $value can be "0"
4176
4177        my $replace = delete $args{'Replace'};
4178        $replace = $IF_NOT_EQUIVALENT unless defined $replace;
4179
4180        my $type = delete $args{'Type'};
4181        $type = 0 unless defined $type;
4182
4183        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4184
4185        my $addr = do { no overloading; pack 'J', $self; };
4186
4187        if ($operation ne '+' && $operation ne '-') {
4188            Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'.  No action taken.");
4189            return;
4190        }
4191        unless (defined $start && defined $end) {
4192            Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete.  No action taken.");
4193            return;
4194        }
4195        unless ($end >= $start) {
4196            Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . ").  No action taken.");
4197            return;
4198        }
4199        #local $to_trace = 1 if main::DEBUG;
4200
4201        if ($operation eq '-') {
4202            if ($replace != $IF_NOT_EQUIVALENT) {
4203                Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list.  Assuming Replace => \$IF_NOT_EQUIVALENT.");
4204                $replace = $IF_NOT_EQUIVALENT;
4205            }
4206            if ($type) {
4207                Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list.  Assuming Type => 0.");
4208                $type = 0;
4209            }
4210            if ($value ne "") {
4211                Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list.  Assuming Value => \"\".");
4212                $value = "";
4213            }
4214        }
4215
4216        my $r = $ranges{$addr};               # The current list of ranges
4217        my $range_list_size = scalar @$r;     # And its size
4218        my $max = $max{$addr};                # The current high code point in
4219                                              # the list of ranges
4220
4221        # Do a special case requiring fewer machine cycles when the new range
4222        # starts after the current highest point.  The Unicode input data is
4223        # structured so this is common.
4224        if ($start > $max) {
4225
4226            trace "$owner_name_of{$addr} $operation", sprintf("%04X..%04X (%s) type=%d; prev max=%04X", $start, $end, $value, $type, $max) if main::DEBUG && $to_trace;
4227            return if $operation eq '-'; # Deleting a non-existing range is a
4228                                         # no-op
4229
4230            # If the new range doesn't logically extend the current final one
4231            # in the range list, create a new range at the end of the range
4232            # list.  (max cleverly is initialized to a negative number not
4233            # adjacent to 0 if the range list is empty, so even adding a range
4234            # to an empty range list starting at 0 will have this 'if'
4235            # succeed.)
4236            if ($start > $max + 1        # non-adjacent means can't extend.
4237                || @{$r}[-1]->value ne $value # values differ, can't extend.
4238                || @{$r}[-1]->type != $type # types differ, can't extend.
4239            ) {
4240                push @$r, Range->new($start, $end,
4241                                     Value => $value,
4242                                     Type => $type);
4243            }
4244            else {
4245
4246                # Here, the new range starts just after the current highest in
4247                # the range list, and they have the same type and value.
4248                # Extend the existing range to incorporate the new one.
4249                @{$r}[-1]->set_end($end);
4250            }
4251
4252            # This becomes the new maximum.
4253            $max{$addr} = $end;
4254
4255            return;
4256        }
4257        #local $to_trace = 0 if main::DEBUG;
4258
4259        trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
4260
4261        # Here, the input range isn't after the whole rest of the range list.
4262        # Most likely 'splice' will be needed.  The rest of the routine finds
4263        # the needed splice parameters, and if necessary, does the splice.
4264        # First, find the offset parameter needed by the splice function for
4265        # the input range.  Note that the input range may span multiple
4266        # existing ones, but we'll worry about that later.  For now, just find
4267        # the beginning.  If the input range is to be inserted starting in a
4268        # position not currently in the range list, it must (obviously) come
4269        # just after the range below it, and just before the range above it.
4270        # Slightly less obviously, it will occupy the position currently
4271        # occupied by the range that is to come after it.  More formally, we
4272        # are looking for the position, $i, in the array of ranges, such that:
4273        #
4274        # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
4275        #
4276        # (The ordered relationships within existing ranges are also shown in
4277        # the equation above).  However, if the start of the input range is
4278        # within an existing range, the splice offset should point to that
4279        # existing range's position in the list; that is $i satisfies a
4280        # somewhat different equation, namely:
4281        #
4282        #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
4283        #
4284        # More briefly, $start can come before or after r[$i]->start, and at
4285        # this point, we don't know which it will be.  However, these
4286        # two equations share these constraints:
4287        #
4288        #   r[$i-1]->end < $start <= r[$i]->end
4289        #
4290        # And that is good enough to find $i.
4291
4292        my $i = $self->_search_ranges($start);
4293        if (! defined $i) {
4294            Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined.  Operation '$operation' not performed");
4295            return;
4296        }
4297
4298        # The search function returns $i such that:
4299        #
4300        # r[$i-1]->end < $start <= r[$i]->end
4301        #
4302        # That means that $i points to the first range in the range list
4303        # that could possibly be affected by this operation.  We still don't
4304        # know if the start of the input range is within r[$i], or if it
4305        # points to empty space between r[$i-1] and r[$i].
4306        trace "[$i] is the beginning splice point.  Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
4307
4308        # Special case the insertion of data that is not to replace any
4309        # existing data.
4310        if ($replace == $NO) {  # If $NO, has to be operation '+'
4311            #local $to_trace = 1 if main::DEBUG;
4312            trace "Doesn't replace" if main::DEBUG && $to_trace;
4313
4314            # Here, the new range is to take effect only on those code points
4315            # that aren't already in an existing range.  This can be done by
4316            # looking through the existing range list and finding the gaps in
4317            # the ranges that this new range affects, and then calling this
4318            # function recursively on each of those gaps, leaving untouched
4319            # anything already in the list.  Gather up a list of the changed
4320            # gaps first so that changes to the internal state as new ranges
4321            # are added won't be a problem.
4322            my @gap_list;
4323
4324            # First, if the starting point of the input range is outside an
4325            # existing one, there is a gap from there to the beginning of the
4326            # existing range -- add a span to fill the part that this new
4327            # range occupies
4328            if ($start < $r->[$i]->start) {
4329                push @gap_list, Range->new($start,
4330                                           main::min($end,
4331                                                     $r->[$i]->start - 1),
4332                                           Type => $type);
4333                trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
4334            }
4335
4336            # Then look through the range list for other gaps until we reach
4337            # the highest range affected by the input one.
4338            my $j;
4339            for ($j = $i+1; $j < $range_list_size; $j++) {
4340                trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
4341                last if $end < $r->[$j]->start;
4342
4343                # If there is a gap between when this range starts and the
4344                # previous one ends, add a span to fill it.  Note that just
4345                # because there are two ranges doesn't mean there is a
4346                # non-zero gap between them.  It could be that they have
4347                # different values or types
4348                if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
4349                    push @gap_list,
4350                        Range->new($r->[$j-1]->end + 1,
4351                                   $r->[$j]->start - 1,
4352                                   Type => $type);
4353                    trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
4354                }
4355            }
4356
4357            # Here, we have either found an existing range in the range list,
4358            # beyond the area affected by the input one, or we fell off the
4359            # end of the loop because the input range affects the whole rest
4360            # of the range list.  In either case, $j is 1 higher than the
4361            # highest affected range.  If $j == $i, it means that there are no
4362            # affected ranges, that the entire insertion is in the gap between
4363            # r[$i-1], and r[$i], which we already have taken care of before
4364            # the loop.
4365            # On the other hand, if there are affected ranges, it might be
4366            # that there is a gap that needs filling after the final such
4367            # range to the end of the input range
4368            if ($r->[$j-1]->end < $end) {
4369                    push @gap_list, Range->new(main::max($start,
4370                                                         $r->[$j-1]->end + 1),
4371                                               $end,
4372                                               Type => $type);
4373                    trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
4374            }
4375
4376            # Call recursively to fill in all the gaps.
4377            foreach my $gap (@gap_list) {
4378                $self->_add_delete($operation,
4379                                   $gap->start,
4380                                   $gap->end,
4381                                   $value,
4382                                   Type => $type);
4383            }
4384
4385            return;
4386        }
4387
4388        # Here, we have taken care of the case where $replace is $NO.
4389        # Remember that here, r[$i-1]->end < $start <= r[$i]->end
4390        # If inserting a multiple record, this is where it goes, before the
4391        # first (if any) existing one if inserting LIFO.  (If this is to go
4392        # afterwards, FIFO, we below move the pointer to there.)  These imply
4393        # an insertion, and no change to any existing ranges.  Note that $i
4394        # can be -1 if this new range doesn't actually duplicate any existing,
4395        # and comes at the beginning of the list.
4396        if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) {
4397
4398            if ($start != $end) {
4399                Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point.  No action taken.");
4400                return;
4401            }
4402
4403            # If the new code point is within a current range ...
4404            if ($end >= $r->[$i]->start) {
4405
4406                # Don't add an exact duplicate, as it isn't really a multiple
4407                my $existing_value = $r->[$i]->value;
4408                my $existing_type = $r->[$i]->type;
4409                return if $value eq $existing_value && $type eq $existing_type;
4410
4411                # If the multiple value is part of an existing range, we want
4412                # to split up that range, so that only the single code point
4413                # is affected.  To do this, we first call ourselves
4414                # recursively to delete that code point from the table, having
4415                # preserved its current data above.  Then we call ourselves
4416                # recursively again to add the new multiple, which we know by
4417                # the test just above is different than the current code
4418                # point's value, so it will become a range containing a single
4419                # code point: just itself.  Finally, we add back in the
4420                # pre-existing code point, which will again be a single code
4421                # point range.  Because 'i' likely will have changed as a
4422                # result of these operations, we can't just continue on, but
4423                # do this operation recursively as well.  If we are inserting
4424                # LIFO, the pre-existing code point needs to go after the new
4425                # one, so use MULTIPLE_AFTER; and vice versa.
4426                if ($r->[$i]->start != $r->[$i]->end) {
4427                    $self->_add_delete('-', $start, $end, "");
4428                    $self->_add_delete('+', $start, $end, $value, Type => $type);
4429                    return $self->_add_delete('+',
4430                            $start, $end,
4431                            $existing_value,
4432                            Type => $existing_type,
4433                            Replace => ($replace == $MULTIPLE_BEFORE)
4434                                       ? $MULTIPLE_AFTER
4435                                       : $MULTIPLE_BEFORE);
4436                }
4437            }
4438
4439            # If to place this new record after, move to beyond all existing
4440            # ones; but don't add this one if identical to any of them, as it
4441            # isn't really a multiple.  This leaves the original order, so
4442            # that the current request is ignored.  The reasoning is that the
4443            # previous request that wanted this record to have high priority
4444            # should have precedence.
4445            if ($replace == $MULTIPLE_AFTER) {
4446                while ($i < @$r && $r->[$i]->start == $start) {
4447                    return if $value eq $r->[$i]->value
4448                              && $type eq $r->[$i]->type;
4449                    $i++;
4450                }
4451            }
4452            else {
4453                # If instead we are to place this new record before any
4454                # existing ones, remove any identical ones that come after it.
4455                # This changes the existing order so that the new one is
4456                # first, as is being requested.
4457                for (my $j = $i + 1;
4458                     $j < @$r && $r->[$j]->start == $start;
4459                     $j++)
4460                {
4461                    if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) {
4462                        splice @$r, $j, 1;
4463                        last;   # There should only be one instance, so no
4464                                # need to keep looking
4465                    }
4466                }
4467            }
4468
4469            trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
4470            my @return = splice @$r,
4471                                $i,
4472                                0,
4473                                Range->new($start,
4474                                           $end,
4475                                           Value => $value,
4476                                           Type => $type);
4477            if (main::DEBUG && $to_trace) {
4478                trace "After splice:";
4479                trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4480                trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4481                trace "i  =[", $i, "]", $r->[$i] if $i >= 0;
4482                trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4483                trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4484                trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
4485            }
4486            return @return;
4487        }
4488
4489        # Here, we have taken care of $NO and $MULTIPLE_foo replaces.  This
4490        # leaves delete, insert, and replace either unconditionally or if not
4491        # equivalent.  $i still points to the first potential affected range.
4492        # Now find the highest range affected, which will determine the length
4493        # parameter to splice.  (The input range can span multiple existing
4494        # ones.)  If this isn't a deletion, while we are looking through the
4495        # range list, see also if this is a replacement rather than a clean
4496        # insertion; that is if it will change the values of at least one
4497        # existing range.  Start off assuming it is an insert, until find it
4498        # isn't.
4499        my $clean_insert = $operation eq '+';
4500        my $j;        # This will point to the highest affected range
4501
4502        # For non-zero types, the standard form is the value itself;
4503        my $standard_form = ($type) ? $value : main::standardize($value);
4504
4505        for ($j = $i; $j < $range_list_size; $j++) {
4506            trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
4507
4508            # If find a range that it doesn't overlap into, we can stop
4509            # searching
4510            last if $end < $r->[$j]->start;
4511
4512            # Here, overlaps the range at $j.  If the values don't match,
4513            # and so far we think this is a clean insertion, it becomes a
4514            # non-clean insertion, i.e., a 'change' or 'replace' instead.
4515            if ($clean_insert) {
4516                if ($r->[$j]->standard_form ne $standard_form) {
4517                    $clean_insert = 0;
4518                    if ($replace == $CROAK) {
4519                        main::croak("The range to add "
4520                        . sprintf("%04X", $start)
4521                        . '-'
4522                        . sprintf("%04X", $end)
4523                        . " with value '$value' overlaps an existing range $r->[$j]");
4524                    }
4525                }
4526                else {
4527
4528                    # Here, the two values are essentially the same.  If the
4529                    # two are actually identical, replacing wouldn't change
4530                    # anything so skip it.
4531                    my $pre_existing = $r->[$j]->value;
4532                    if ($pre_existing ne $value) {
4533
4534                        # Here the new and old standardized values are the
4535                        # same, but the non-standardized values aren't.  If
4536                        # replacing unconditionally, then replace
4537                        if( $replace == $UNCONDITIONALLY) {
4538                            $clean_insert = 0;
4539                        }
4540                        else {
4541
4542                            # Here, are replacing conditionally.  Decide to
4543                            # replace or not based on which appears to look
4544                            # the "nicest".  If one is mixed case and the
4545                            # other isn't, choose the mixed case one.
4546                            my $new_mixed = $value =~ /[A-Z]/
4547                                            && $value =~ /[a-z]/;
4548                            my $old_mixed = $pre_existing =~ /[A-Z]/
4549                                            && $pre_existing =~ /[a-z]/;
4550
4551                            if ($old_mixed != $new_mixed) {
4552                                $clean_insert = 0 if $new_mixed;
4553                                if (main::DEBUG && $to_trace) {
4554                                    if ($clean_insert) {
4555                                        trace "Retaining $pre_existing over $value";
4556                                    }
4557                                    else {
4558                                        trace "Replacing $pre_existing with $value";
4559                                    }
4560                                }
4561                            }
4562                            else {
4563
4564                                # Here casing wasn't different between the two.
4565                                # If one has hyphens or underscores and the
4566                                # other doesn't, choose the one with the
4567                                # punctuation.
4568                                my $new_punct = $value =~ /[-_]/;
4569                                my $old_punct = $pre_existing =~ /[-_]/;
4570
4571                                if ($old_punct != $new_punct) {
4572                                    $clean_insert = 0 if $new_punct;
4573                                    if (main::DEBUG && $to_trace) {
4574                                        if ($clean_insert) {
4575                                            trace "Retaining $pre_existing over $value";
4576                                        }
4577                                        else {
4578                                            trace "Replacing $pre_existing with $value";
4579                                        }
4580                                    }
4581                                }   # else existing one is just as "good";
4582                                    # retain it to save cycles.
4583                            }
4584                        }
4585                    }
4586                }
4587            }
4588        } # End of loop looking for highest affected range.
4589
4590        # Here, $j points to one beyond the highest range that this insertion
4591        # affects (hence to beyond the range list if that range is the final
4592        # one in the range list).
4593
4594        # The splice length is all the affected ranges.  Get it before
4595        # subtracting, for efficiency, so we don't have to later add 1.
4596        my $length = $j - $i;
4597
4598        $j--;        # $j now points to the highest affected range.
4599        trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
4600
4601        # Here, have taken care of $NO and $MULTIPLE_foo replaces.
4602        # $j points to the highest affected range.  But it can be < $i or even
4603        # -1.  These happen only if the insertion is entirely in the gap
4604        # between r[$i-1] and r[$i].  Here's why: j < i means that the j loop
4605        # above exited first time through with $end < $r->[$i]->start.  (And
4606        # then we subtracted one from j)  This implies also that $start <
4607        # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
4608        # $start, so the entire input range is in the gap.
4609        if ($j < $i) {
4610
4611            # Here the entire input range is in the gap before $i.
4612
4613            if (main::DEBUG && $to_trace) {
4614                if ($i) {
4615                    trace "Entire range is between $r->[$i-1] and $r->[$i]";
4616                }
4617                else {
4618                    trace "Entire range is before $r->[$i]";
4619                }
4620            }
4621            return if $operation ne '+'; # Deletion of a non-existent range is
4622                                         # a no-op
4623        }
4624        else {
4625
4626            # Here part of the input range is not in the gap before $i.  Thus,
4627            # there is at least one affected one, and $j points to the highest
4628            # such one.
4629
4630            # At this point, here is the situation:
4631            # This is not an insertion of a multiple, nor of tentative ($NO)
4632            # data.
4633            #   $i  points to the first element in the current range list that
4634            #            may be affected by this operation.  In fact, we know
4635            #            that the range at $i is affected because we are in
4636            #            the else branch of this 'if'
4637            #   $j  points to the highest affected range.
4638            # In other words,
4639            #   r[$i-1]->end < $start <= r[$i]->end
4640            # And:
4641            #   r[$i-1]->end < $start <= $end < r[$j+1]->start
4642            #
4643            # Also:
4644            #   $clean_insert is a boolean which is set true if and only if
4645            #        this is a "clean insertion", i.e., not a change nor a
4646            #        deletion (multiple was handled above).
4647
4648            # We now have enough information to decide if this call is a no-op
4649            # or not.  It is a no-op if this is an insertion of already
4650            # existing data.  To be so, it must be contained entirely in one
4651            # range.
4652
4653            if (main::DEBUG && $to_trace && $clean_insert
4654                                         && $start >= $r->[$i]->start
4655                                         && $end   <= $r->[$i]->end)
4656            {
4657                    trace "no-op";
4658            }
4659            return if $clean_insert
4660                      && $start >= $r->[$i]->start
4661                      && $end   <= $r->[$i]->end;
4662        }
4663
4664        # Here, we know that some action will have to be taken.  We have
4665        # calculated the offset and length (though adjustments may be needed)
4666        # for the splice.  Now start constructing the replacement list.
4667        my @replacement;
4668        my $splice_start = $i;
4669
4670        my $extends_below;
4671        my $extends_above;
4672
4673        # See if should extend any adjacent ranges.
4674        if ($operation eq '-') { # Don't extend deletions
4675            $extends_below = $extends_above = 0;
4676        }
4677        else {  # Here, should extend any adjacent ranges.  See if there are
4678                # any.
4679            $extends_below = ($i > 0
4680                            # can't extend unless adjacent
4681                            && $r->[$i-1]->end == $start -1
4682                            # can't extend unless are same standard value
4683                            && $r->[$i-1]->standard_form eq $standard_form
4684                            # can't extend unless share type
4685                            && $r->[$i-1]->type == $type);
4686            $extends_above = ($j+1 < $range_list_size
4687                            && $r->[$j+1]->start == $end +1
4688                            && $r->[$j+1]->standard_form eq $standard_form
4689                            && $r->[$j+1]->type == $type);
4690        }
4691        if ($extends_below && $extends_above) { # Adds to both
4692            $splice_start--;     # start replace at element below
4693            $length += 2;        # will replace on both sides
4694            trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
4695
4696            # The result will fill in any gap, replacing both sides, and
4697            # create one large range.
4698            @replacement = Range->new($r->[$i-1]->start,
4699                                      $r->[$j+1]->end,
4700                                      Value => $value,
4701                                      Type => $type);
4702        }
4703        else {
4704
4705            # Here we know that the result won't just be the conglomeration of
4706            # a new range with both its adjacent neighbors.  But it could
4707            # extend one of them.
4708
4709            if ($extends_below) {
4710
4711                # Here the new element adds to the one below, but not to the
4712                # one above.  If inserting, and only to that one range,  can
4713                # just change its ending to include the new one.
4714                if ($length == 0 && $clean_insert) {
4715                    $r->[$i-1]->set_end($end);
4716                    trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
4717                    return;
4718                }
4719                else {
4720                    trace "Changing inserted range to start at ", sprintf("%04X",  $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
4721                    $splice_start--;        # start replace at element below
4722                    $length++;              # will replace the element below
4723                    $start = $r->[$i-1]->start;
4724                }
4725            }
4726            elsif ($extends_above) {
4727
4728                # Here the new element adds to the one above, but not below.
4729                # Mirror the code above
4730                if ($length == 0 && $clean_insert) {
4731                    $r->[$j+1]->set_start($start);
4732                    trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
4733                    return;
4734                }
4735                else {
4736                    trace "Changing inserted range to end at ", sprintf("%04X",  $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
4737                    $length++;        # will replace the element above
4738                    $end = $r->[$j+1]->end;
4739                }
4740            }
4741
4742            trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
4743
4744            # Finally, here we know there will have to be a splice.
4745            # If the change or delete affects only the highest portion of the
4746            # first affected range, the range will have to be split.  The
4747            # splice will remove the whole range, but will replace it by a new
4748            # range containing just the unaffected part.  So, in this case,
4749            # add to the replacement list just this unaffected portion.
4750            if (! $extends_below
4751                && $start > $r->[$i]->start && $start <= $r->[$i]->end)
4752            {
4753                push @replacement,
4754                    Range->new($r->[$i]->start,
4755                               $start - 1,
4756                               Value => $r->[$i]->value,
4757                               Type => $r->[$i]->type);
4758            }
4759
4760            # In the case of an insert or change, but not a delete, we have to
4761            # put in the new stuff;  this comes next.
4762            if ($operation eq '+') {
4763                push @replacement, Range->new($start,
4764                                              $end,
4765                                              Value => $value,
4766                                              Type => $type);
4767            }
4768
4769            trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
4770            #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
4771
4772            # And finally, if we're changing or deleting only a portion of the
4773            # highest affected range, it must be split, as the lowest one was.
4774            if (! $extends_above
4775                && $j >= 0  # Remember that j can be -1 if before first
4776                            # current element
4777                && $end >= $r->[$j]->start
4778                && $end < $r->[$j]->end)
4779            {
4780                push @replacement,
4781                    Range->new($end + 1,
4782                               $r->[$j]->end,
4783                               Value => $r->[$j]->value,
4784                               Type => $r->[$j]->type);
4785            }
4786        }
4787
4788        # And do the splice, as calculated above
4789        if (main::DEBUG && $to_trace) {
4790            trace "replacing $length element(s) at $i with ";
4791            foreach my $replacement (@replacement) {
4792                trace "    $replacement";
4793            }
4794            trace "Before splice:";
4795            trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4796            trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4797            trace "i  =[", $i, "]", $r->[$i];
4798            trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4799            trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4800        }
4801
4802        my @return = splice @$r, $splice_start, $length, @replacement;
4803
4804        if (main::DEBUG && $to_trace) {
4805            trace "After splice:";
4806            trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4807            trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4808            trace "i  =[", $i, "]", $r->[$i];
4809            trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4810            trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4811            trace "removed ", @return if @return;
4812        }
4813
4814        # An actual deletion could have changed the maximum in the list.
4815        # There was no deletion if the splice didn't return something, but
4816        # otherwise recalculate it.  This is done too rarely to worry about
4817        # performance.
4818        if ($operation eq '-' && @return) {
4819            if (@$r) {
4820                $max{$addr} = $r->[-1]->end;
4821            }
4822            else {  # Now empty
4823                $max{$addr} = $max_init;
4824            }
4825        }
4826        return @return;
4827    }
4828
4829    sub reset_each_range {  # reset the iterator for each_range();
4830        my $self = shift;
4831        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4832
4833        no overloading;
4834        undef $each_range_iterator{pack 'J', $self};
4835        return;
4836    }
4837
4838    sub each_range {
4839        # Iterate over each range in a range list.  Results are undefined if
4840        # the range list is changed during the iteration.
4841
4842        my $self = shift;
4843        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4844
4845        my $addr = do { no overloading; pack 'J', $self; };
4846
4847        return if $self->is_empty;
4848
4849        $each_range_iterator{$addr} = -1
4850                                if ! defined $each_range_iterator{$addr};
4851        $each_range_iterator{$addr}++;
4852        return $ranges{$addr}->[$each_range_iterator{$addr}]
4853                        if $each_range_iterator{$addr} < @{$ranges{$addr}};
4854        undef $each_range_iterator{$addr};
4855        return;
4856    }
4857
4858    sub count {        # Returns count of code points in range list
4859        my $self = shift;
4860        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4861
4862        my $addr = do { no overloading; pack 'J', $self; };
4863
4864        my $count = 0;
4865        foreach my $range (@{$ranges{$addr}}) {
4866            $count += $range->end - $range->start + 1;
4867        }
4868        return $count;
4869    }
4870
4871    sub delete_range {    # Delete a range
4872        my $self = shift;
4873        my $start = shift;
4874        my $end = shift;
4875
4876        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4877
4878        return $self->_add_delete('-', $start, $end, "");
4879    }
4880
4881    sub is_empty { # Returns boolean as to if a range list is empty
4882        my $self = shift;
4883        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4884
4885        no overloading;
4886        return scalar @{$ranges{pack 'J', $self}} == 0;
4887    }
4888
4889    sub hash {
4890        # Quickly returns a scalar suitable for separating tables into
4891        # buckets, i.e. it is a hash function of the contents of a table, so
4892        # there are relatively few conflicts.
4893
4894        my $self = shift;
4895        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4896
4897        my $addr = do { no overloading; pack 'J', $self; };
4898
4899        # These are quickly computable.  Return looks like 'min..max;count'
4900        return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
4901    }
4902} # End closure for _Range_List_Base
4903
4904package Range_List;
4905use parent '-norequire', '_Range_List_Base';
4906
4907# A Range_List is a range list for match tables; i.e. the range values are
4908# not significant.  Thus a number of operations can be safely added to it,
4909# such as inversion, intersection.  Note that union is also an unsafe
4910# operation when range values are cared about, and that method is in the base
4911# class, not here.  But things are set up so that that method is callable only
4912# during initialization.  Only in this derived class, is there an operation
4913# that combines two tables.  A Range_Map can thus be used to initialize a
4914# Range_List, and its mappings will be in the list, but are not significant to
4915# this class.
4916
4917sub trace { return main::trace(@_); }
4918
4919{ # Closure
4920
4921    use overload
4922        fallback => 0,
4923        '+' => sub { my $self = shift;
4924                    my $other = shift;
4925
4926                    return $self->_union($other)
4927                },
4928        '+=' => sub { my $self = shift;
4929                    my $other = shift;
4930                    my $reversed = shift;
4931
4932                    if ($reversed) {
4933                        Carp::my_carp_bug("Bad news.  Can't cope with '"
4934                        . ref($other)
4935                        . ' += '
4936                        . ref($self)
4937                        . "'.  undef returned.");
4938                        return;
4939                    }
4940
4941                    return $self->_union($other)
4942                },
4943        '&' => sub { my $self = shift;
4944                    my $other = shift;
4945
4946                    return $self->_intersect($other, 0);
4947                },
4948        '&=' => sub { my $self = shift;
4949                    my $other = shift;
4950                    my $reversed = shift;
4951
4952                    if ($reversed) {
4953                        Carp::my_carp_bug("Bad news.  Can't cope with '"
4954                        . ref($other)
4955                        . ' &= '
4956                        . ref($self)
4957                        . "'.  undef returned.");
4958                        return;
4959                    }
4960
4961                    return $self->_intersect($other, 0);
4962                },
4963        '~' => "_invert",
4964        '-' => "_subtract",
4965    ;
4966
4967    sub _invert {
4968        # Returns a new Range_List that gives all code points not in $self.
4969
4970        my $self = shift;
4971
4972        my $new = Range_List->new;
4973
4974        # Go through each range in the table, finding the gaps between them
4975        my $max = -1;   # Set so no gap before range beginning at 0
4976        for my $range ($self->ranges) {
4977            my $start = $range->start;
4978            my $end   = $range->end;
4979
4980            # If there is a gap before this range, the inverse will contain
4981            # that gap.
4982            if ($start > $max + 1) {
4983                $new->add_range($max + 1, $start - 1);
4984            }
4985            $max = $end;
4986        }
4987
4988        # And finally, add the gap from the end of the table to the max
4989        # possible code point
4990        if ($max < $MAX_WORKING_CODEPOINT) {
4991            $new->add_range($max + 1, $MAX_WORKING_CODEPOINT);
4992        }
4993        return $new;
4994    }
4995
4996    sub _subtract {
4997        # Returns a new Range_List with the argument deleted from it.  The
4998        # argument can be a single code point, a range, or something that has
4999        # a range, with the _range_list() method on it returning them
5000
5001        my $self = shift;
5002        my $other = shift;
5003        my $reversed = shift;
5004        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5005
5006        if ($reversed) {
5007            Carp::my_carp_bug("Bad news.  Can't cope with '"
5008            . ref($other)
5009            . ' - '
5010            . ref($self)
5011            . "'.  undef returned.");
5012            return;
5013        }
5014
5015        my $new = Range_List->new(Initialize => $self);
5016
5017        if (! ref $other) { # Single code point
5018            $new->delete_range($other, $other);
5019        }
5020        elsif ($other->isa('Range')) {
5021            $new->delete_range($other->start, $other->end);
5022        }
5023        elsif ($other->can('_range_list')) {
5024            foreach my $range ($other->_range_list->ranges) {
5025                $new->delete_range($range->start, $range->end);
5026            }
5027        }
5028        else {
5029            Carp::my_carp_bug("Can't cope with a "
5030                        . ref($other)
5031                        . " argument to '-'.  Subtraction ignored."
5032                        );
5033            return $self;
5034        }
5035
5036        return $new;
5037    }
5038
5039    sub _intersect {
5040        # Returns either a boolean giving whether the two inputs' range lists
5041        # intersect (overlap), or a new Range_List containing the intersection
5042        # of the two lists.  The optional final parameter being true indicates
5043        # to do the check instead of the intersection.
5044
5045        my $a_object = shift;
5046        my $b_object = shift;
5047        my $check_if_overlapping = shift;
5048        $check_if_overlapping = 0 unless defined $check_if_overlapping;
5049        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5050
5051        if (! defined $b_object) {
5052            my $message = "";
5053            $message .= $a_object->_owner_name_of if defined $a_object;
5054            Carp::my_carp_bug($message .= "Called with undefined value.  Intersection not done.");
5055            return;
5056        }
5057
5058        # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
5059        # Thus the intersection could be much more simply be written:
5060        #   return ~(~$a_object + ~$b_object);
5061        # But, this is slower, and when taking the inverse of a large
5062        # range_size_1 table, back when such tables were always stored that
5063        # way, it became prohibitively slow, hence the code was changed to the
5064        # below
5065
5066        if ($b_object->isa('Range')) {
5067            $b_object = Range_List->new(Initialize => $b_object,
5068                                        Owner => $a_object->_owner_name_of);
5069        }
5070        $b_object = $b_object->_range_list if $b_object->can('_range_list');
5071
5072        my @a_ranges = $a_object->ranges;
5073        my @b_ranges = $b_object->ranges;
5074
5075        #local $to_trace = 1 if main::DEBUG;
5076        trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
5077
5078        # Start with the first range in each list
5079        my $a_i = 0;
5080        my $range_a = $a_ranges[$a_i];
5081        my $b_i = 0;
5082        my $range_b = $b_ranges[$b_i];
5083
5084        my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
5085                                                if ! $check_if_overlapping;
5086
5087        # If either list is empty, there is no intersection and no overlap
5088        if (! defined $range_a || ! defined $range_b) {
5089            return $check_if_overlapping ? 0 : $new;
5090        }
5091        trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
5092
5093        # Otherwise, must calculate the intersection/overlap.  Start with the
5094        # very first code point in each list
5095        my $a = $range_a->start;
5096        my $b = $range_b->start;
5097
5098        # Loop through all the ranges of each list; in each iteration, $a and
5099        # $b are the current code points in their respective lists
5100        while (1) {
5101
5102            # If $a and $b are the same code point, ...
5103            if ($a == $b) {
5104
5105                # it means the lists overlap.  If just checking for overlap
5106                # know the answer now,
5107                return 1 if $check_if_overlapping;
5108
5109                # The intersection includes this code point plus anything else
5110                # common to both current ranges.
5111                my $start = $a;
5112                my $end = main::min($range_a->end, $range_b->end);
5113                if (! $check_if_overlapping) {
5114                    trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
5115                    $new->add_range($start, $end);
5116                }
5117
5118                # Skip ahead to the end of the current intersect
5119                $a = $b = $end;
5120
5121                # If the current intersect ends at the end of either range (as
5122                # it must for at least one of them), the next possible one
5123                # will be the beginning code point in it's list's next range.
5124                if ($a == $range_a->end) {
5125                    $range_a = $a_ranges[++$a_i];
5126                    last unless defined $range_a;
5127                    $a = $range_a->start;
5128                }
5129                if ($b == $range_b->end) {
5130                    $range_b = $b_ranges[++$b_i];
5131                    last unless defined $range_b;
5132                    $b = $range_b->start;
5133                }
5134
5135                trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
5136            }
5137            elsif ($a < $b) {
5138
5139                # Not equal, but if the range containing $a encompasses $b,
5140                # change $a to be the middle of the range where it does equal
5141                # $b, so the next iteration will get the intersection
5142                if ($range_a->end >= $b) {
5143                    $a = $b;
5144                }
5145                else {
5146
5147                    # Here, the current range containing $a is entirely below
5148                    # $b.  Go try to find a range that could contain $b.
5149                    $a_i = $a_object->_search_ranges($b);
5150
5151                    # If no range found, quit.
5152                    last unless defined $a_i;
5153
5154                    # The search returns $a_i, such that
5155                    #   range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
5156                    # Set $a to the beginning of this new range, and repeat.
5157                    $range_a = $a_ranges[$a_i];
5158                    $a = $range_a->start;
5159                }
5160            }
5161            else { # Here, $b < $a.
5162
5163                # Mirror image code to the leg just above
5164                if ($range_b->end >= $a) {
5165                    $b = $a;
5166                }
5167                else {
5168                    $b_i = $b_object->_search_ranges($a);
5169                    last unless defined $b_i;
5170                    $range_b = $b_ranges[$b_i];
5171                    $b = $range_b->start;
5172                }
5173            }
5174        } # End of looping through ranges.
5175
5176        # Intersection fully computed, or now know that there is no overlap
5177        return $check_if_overlapping ? 0 : $new;
5178    }
5179
5180    sub overlaps {
5181        # Returns boolean giving whether the two arguments overlap somewhere
5182
5183        my $self = shift;
5184        my $other = shift;
5185        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5186
5187        return $self->_intersect($other, 1);
5188    }
5189
5190    sub add_range {
5191        # Add a range to the list.
5192
5193        my $self = shift;
5194        my $start = shift;
5195        my $end = shift;
5196        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5197
5198        return $self->_add_delete('+', $start, $end, "");
5199    }
5200
5201    sub matches_identically_to {
5202        # Return a boolean as to whether or not two Range_Lists match identical
5203        # sets of code points.
5204
5205        my $self = shift;
5206        my $other = shift;
5207        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5208
5209        # These are ordered in increasing real time to figure out (at least
5210        # until a patch changes that and doesn't change this)
5211        return 0 if $self->max != $other->max;
5212        return 0 if $self->min != $other->min;
5213        return 0 if $self->range_count != $other->range_count;
5214        return 0 if $self->count != $other->count;
5215
5216        # Here they could be identical because all the tests above passed.
5217        # The loop below is somewhat simpler since we know they have the same
5218        # number of elements.  Compare range by range, until reach the end or
5219        # find something that differs.
5220        my @a_ranges = $self->ranges;
5221        my @b_ranges = $other->ranges;
5222        for my $i (0 .. @a_ranges - 1) {
5223            my $a = $a_ranges[$i];
5224            my $b = $b_ranges[$i];
5225            trace "self $a; other $b" if main::DEBUG && $to_trace;
5226            return 0 if ! defined $b
5227                        || $a->start != $b->start
5228                        || $a->end != $b->end;
5229        }
5230        return 1;
5231    }
5232
5233    sub is_code_point_usable {
5234        # This used only for making the test script.  See if the input
5235        # proposed trial code point is one that Perl will handle.  If second
5236        # parameter is 0, it won't select some code points for various
5237        # reasons, noted below.
5238
5239        my $code = shift;
5240        my $try_hard = shift;
5241        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5242
5243        return 0 if $code < 0;                # Never use a negative
5244
5245        # shun null.  I'm (khw) not sure why this was done, but NULL would be
5246        # the character very frequently used.
5247        return $try_hard if $code == 0x0000;
5248
5249        # shun non-character code points.
5250        return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
5251        return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
5252
5253        return $try_hard if $code > $MAX_UNICODE_CODEPOINT;   # keep in range
5254        return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
5255
5256        return 1;
5257    }
5258
5259    sub get_valid_code_point {
5260        # Return a code point that's part of the range list.  Returns nothing
5261        # if the table is empty or we can't find a suitable code point.  This
5262        # used only for making the test script.
5263
5264        my $self = shift;
5265        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5266
5267        my $addr = do { no overloading; pack 'J', $self; };
5268
5269        # On first pass, don't choose less desirable code points; if no good
5270        # one is found, repeat, allowing a less desirable one to be selected.
5271        for my $try_hard (0, 1) {
5272
5273            # Look through all the ranges for a usable code point.
5274            for my $set (reverse $self->ranges) {
5275
5276                # Try the edge cases first, starting with the end point of the
5277                # range.
5278                my $end = $set->end;
5279                return $end if is_code_point_usable($end, $try_hard);
5280                $end = $MAX_UNICODE_CODEPOINT + 1 if $end > $MAX_UNICODE_CODEPOINT;
5281
5282                # End point didn't, work.  Start at the beginning and try
5283                # every one until find one that does work.
5284                for my $trial ($set->start .. $end - 1) {
5285                    return $trial if is_code_point_usable($trial, $try_hard);
5286                }
5287            }
5288        }
5289        return ();  # If none found, give up.
5290    }
5291
5292    sub get_invalid_code_point {
5293        # Return a code point that's not part of the table.  Returns nothing
5294        # if the table covers all code points or a suitable code point can't
5295        # be found.  This used only for making the test script.
5296
5297        my $self = shift;
5298        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5299
5300        # Just find a valid code point of the inverse, if any.
5301        return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
5302    }
5303} # end closure for Range_List
5304
5305package Range_Map;
5306use parent '-norequire', '_Range_List_Base';
5307
5308# A Range_Map is a range list in which the range values (called maps) are
5309# significant, and hence shouldn't be manipulated by our other code, which
5310# could be ambiguous or lose things.  For example, in taking the union of two
5311# lists, which share code points, but which have differing values, which one
5312# has precedence in the union?
5313# It turns out that these operations aren't really necessary for map tables,
5314# and so this class was created to make sure they aren't accidentally
5315# applied to them.
5316
5317{ # Closure
5318
5319    sub add_map {
5320        # Add a range containing a mapping value to the list
5321
5322        my $self = shift;
5323        # Rest of parameters passed on
5324
5325        return $self->_add_delete('+', @_);
5326    }
5327
5328    sub replace_map {
5329        # Replace a range
5330
5331        my $self = shift;
5332
5333        return $self->_add_delete('+', @_, Replace => $UNCONDITIONALLY);
5334    }
5335
5336    sub add_duplicate {
5337        # Adds entry to a range list which can duplicate an existing entry
5338
5339        my $self = shift;
5340        my $code_point = shift;
5341        my $value = shift;
5342        my %args = @_;
5343        my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE;
5344        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5345
5346        return $self->add_map($code_point, $code_point,
5347                                $value, Replace => $replace);
5348    }
5349} # End of closure for package Range_Map
5350
5351package _Base_Table;
5352
5353# A table is the basic data structure that gets written out into a file for
5354# use by the Perl core.  This is the abstract base class implementing the
5355# common elements from the derived ones.  A list of the methods to be
5356# furnished by an implementing class is just after the constructor.
5357
5358sub standardize { return main::standardize($_[0]); }
5359sub trace { return main::trace(@_); }
5360
5361{ # Closure
5362
5363    main::setup_package();
5364
5365    my %range_list;
5366    # Object containing the ranges of the table.
5367    main::set_access('range_list', \%range_list, 'p_r', 'p_s');
5368
5369    my %full_name;
5370    # The full table name.
5371    main::set_access('full_name', \%full_name, 'r');
5372
5373    my %name;
5374    # The table name, almost always shorter
5375    main::set_access('name', \%name, 'r');
5376
5377    my %short_name;
5378    # The shortest of all the aliases for this table, with underscores removed
5379    main::set_access('short_name', \%short_name);
5380
5381    my %nominal_short_name_length;
5382    # The length of short_name before removing underscores
5383    main::set_access('nominal_short_name_length',
5384                    \%nominal_short_name_length);
5385
5386    my %complete_name;
5387    # The complete name, including property.
5388    main::set_access('complete_name', \%complete_name, 'r');
5389
5390    my %property;
5391    # Parent property this table is attached to.
5392    main::set_access('property', \%property, 'r');
5393
5394    my %aliases;
5395    # Ordered list of alias objects of the table's name.  The first ones in
5396    # the list are output first in comments
5397    main::set_access('aliases', \%aliases, 'readable_array');
5398
5399    my %comment;
5400    # A comment associated with the table for human readers of the files
5401    main::set_access('comment', \%comment, 's');
5402
5403    my %description;
5404    # A comment giving a short description of the table's meaning for human
5405    # readers of the files.
5406    main::set_access('description', \%description, 'readable_array');
5407
5408    my %note;
5409    # A comment giving a short note about the table for human readers of the
5410    # files.
5411    main::set_access('note', \%note, 'readable_array');
5412
5413    my %fate;
5414    # Enum; there are a number of possibilities for what happens to this
5415    # table: it could be normal, or suppressed, or not for external use.  See
5416    # values at definition for $SUPPRESSED.
5417    main::set_access('fate', \%fate, 'r');
5418
5419    my %find_table_from_alias;
5420    # The parent property passes this pointer to a hash which this class adds
5421    # all its aliases to, so that the parent can quickly take an alias and
5422    # find this table.
5423    main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
5424
5425    my %locked;
5426    # After this table is made equivalent to another one; we shouldn't go
5427    # changing the contents because that could mean it's no longer equivalent
5428    main::set_access('locked', \%locked, 'r');
5429
5430    my %file_path;
5431    # This gives the final path to the file containing the table.  Each
5432    # directory in the path is an element in the array
5433    main::set_access('file_path', \%file_path, 'readable_array');
5434
5435    my %status;
5436    # What is the table's status, normal, $OBSOLETE, etc.  Enum
5437    main::set_access('status', \%status, 'r');
5438
5439    my %status_info;
5440    # A comment about its being obsolete, or whatever non normal status it has
5441    main::set_access('status_info', \%status_info, 'r');
5442
5443    my %caseless_equivalent;
5444    # The table this is equivalent to under /i matching, if any.
5445    main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
5446
5447    my %range_size_1;
5448    # Is the table to be output with each range only a single code point?
5449    # This is done to avoid breaking existing code that may have come to rely
5450    # on this behavior in previous versions of this program.)
5451    main::set_access('range_size_1', \%range_size_1, 'r', 's');
5452
5453    my %perl_extension;
5454    # A boolean set iff this table is a Perl extension to the Unicode
5455    # standard.
5456    main::set_access('perl_extension', \%perl_extension, 'r');
5457
5458    my %output_range_counts;
5459    # A boolean set iff this table is to have comments written in the
5460    # output file that contain the number of code points in the range.
5461    # The constructor can override the global flag of the same name.
5462    main::set_access('output_range_counts', \%output_range_counts, 'r');
5463
5464    my %write_as_invlist;
5465    # A boolean set iff the output file for this table is to be in the form of
5466    # an inversion list/map.
5467    main::set_access('write_as_invlist', \%write_as_invlist, 'r');
5468
5469    my %format;
5470    # The format of the entries of the table.  This is calculated from the
5471    # data in the table (or passed in the constructor).  This is an enum e.g.,
5472    # $STRING_FORMAT.  It is marked protected as it should not be generally
5473    # used to override calculations.
5474    main::set_access('format', \%format, 'r', 'p_s');
5475
5476    my %has_dependency;
5477    # A boolean that gives whether some other table in this property is
5478    # defined as the complement of this table.  This is a crude, but currently
5479    # sufficient, mechanism to make this table not get destroyed before what
5480    # is dependent on it is.  Other dependencies could be added, so the name
5481    # was chosen to reflect a more general situation than actually is
5482    # currently the case.
5483    main::set_access('has_dependency', \%has_dependency, 'r', 's');
5484
5485    sub new {
5486        # All arguments are key => value pairs, which you can see below, most
5487        # of which match fields documented above.  Otherwise: Re_Pod_Entry,
5488        # OK_as_Filename, and Fuzzy apply to the names of the table, and are
5489        # documented in the Alias package
5490
5491        return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
5492
5493        my $class = shift;
5494
5495        my $self = bless \do { my $anonymous_scalar }, $class;
5496        my $addr = do { no overloading; pack 'J', $self; };
5497
5498        my %args = @_;
5499
5500        $name{$addr} = delete $args{'Name'};
5501        $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
5502        $full_name{$addr} = delete $args{'Full_Name'};
5503        my $complete_name = $complete_name{$addr}
5504                          = delete $args{'Complete_Name'};
5505        $format{$addr} = delete $args{'Format'};
5506        $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
5507        $property{$addr} = delete $args{'_Property'};
5508        $range_list{$addr} = delete $args{'_Range_List'};
5509        $status{$addr} = delete $args{'Status'} || $NORMAL;
5510        $status_info{$addr} = delete $args{'_Status_Info'} || "";
5511        $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
5512        $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
5513        $fate{$addr} = delete $args{'Fate'} || $ORDINARY;
5514        $write_as_invlist{$addr} = delete $args{'Write_As_Invlist'};# No default
5515        my $ucd = delete $args{'UCD'};
5516
5517        my $description = delete $args{'Description'};
5518        my $ok_as_filename = delete $args{'OK_as_Filename'};
5519        my $loose_match = delete $args{'Fuzzy'};
5520        my $note = delete $args{'Note'};
5521        my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
5522        my $perl_extension = delete $args{'Perl_Extension'};
5523        my $suppression_reason = delete $args{'Suppression_Reason'};
5524
5525        # Shouldn't have any left over
5526        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5527
5528        # Can't use || above because conceivably the name could be 0, and
5529        # can't use // operator in case this program gets used in Perl 5.8
5530        $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
5531        $output_range_counts{$addr} = $output_range_counts if
5532                                        ! defined $output_range_counts{$addr};
5533
5534        $aliases{$addr} = [ ];
5535        $comment{$addr} = [ ];
5536        $description{$addr} = [ ];
5537        $note{$addr} = [ ];
5538        $file_path{$addr} = [ ];
5539        $locked{$addr} = "";
5540        $has_dependency{$addr} = 0;
5541
5542        push @{$description{$addr}}, $description if $description;
5543        push @{$note{$addr}}, $note if $note;
5544
5545        if ($fate{$addr} == $PLACEHOLDER) {
5546
5547            # A placeholder table doesn't get documented, is a perl extension,
5548            # and quite likely will be empty
5549            $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
5550            $perl_extension = 1 if ! defined $perl_extension;
5551            $ucd = 0 if ! defined $ucd;
5552            push @tables_that_may_be_empty, $complete_name{$addr};
5553            $self->add_comment(<<END);
5554This is a placeholder because it is not in Version $string_version of Unicode,
5555but is needed by the Perl core to work gracefully.  Because it is not in this
5556version of Unicode, it will not be listed in $pod_file.pod
5557END
5558        }
5559        elsif (exists $why_suppressed{$complete_name}
5560                # Don't suppress if overridden
5561                && ! grep { $_ eq $complete_name{$addr} }
5562                                                    @output_mapped_properties)
5563        {
5564            $fate{$addr} = $SUPPRESSED;
5565        }
5566        elsif ($fate{$addr} == $SUPPRESSED) {
5567            Carp::my_carp_bug("Need reason for suppressing") unless $suppression_reason;
5568            # Though currently unused
5569        }
5570        elsif ($suppression_reason) {
5571            Carp::my_carp_bug("A reason was given for suppressing, but not suppressed");
5572        }
5573
5574        # If hasn't set its status already, see if it is on one of the
5575        # lists of properties or tables that have particular statuses; if
5576        # not, is normal.  The lists are prioritized so the most serious
5577        # ones are checked first
5578        if (! $status{$addr}) {
5579            if (exists $why_deprecated{$complete_name}) {
5580                $status{$addr} = $DEPRECATED;
5581            }
5582            elsif (exists $why_stabilized{$complete_name}) {
5583                $status{$addr} = $STABILIZED;
5584            }
5585            elsif (exists $why_obsolete{$complete_name}) {
5586                $status{$addr} = $OBSOLETE;
5587            }
5588
5589            # Existence above doesn't necessarily mean there is a message
5590            # associated with it.  Use the most serious message.
5591            if ($status{$addr}) {
5592                if ($why_deprecated{$complete_name}) {
5593                    $status_info{$addr}
5594                                = $why_deprecated{$complete_name};
5595                }
5596                elsif ($why_stabilized{$complete_name}) {
5597                    $status_info{$addr}
5598                                = $why_stabilized{$complete_name};
5599                }
5600                elsif ($why_obsolete{$complete_name}) {
5601                    $status_info{$addr}
5602                                = $why_obsolete{$complete_name};
5603                }
5604            }
5605        }
5606
5607        $perl_extension{$addr} = $perl_extension || 0;
5608
5609        # Don't list a property by default that is internal only
5610        if ($fate{$addr} > $MAP_PROXIED) {
5611            $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
5612            $ucd = 0 if ! defined $ucd;
5613        }
5614        else {
5615            $ucd = 1 if ! defined $ucd;
5616        }
5617
5618        # By convention what typically gets printed only or first is what's
5619        # first in the list, so put the full name there for good output
5620        # clarity.  Other routines rely on the full name being first on the
5621        # list
5622        $self->add_alias($full_name{$addr},
5623                            OK_as_Filename => $ok_as_filename,
5624                            Fuzzy => $loose_match,
5625                            Re_Pod_Entry => $make_re_pod_entry,
5626                            Status => $status{$addr},
5627                            UCD => $ucd,
5628                            );
5629
5630        # Then comes the other name, if meaningfully different.
5631        if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
5632            $self->add_alias($name{$addr},
5633                            OK_as_Filename => $ok_as_filename,
5634                            Fuzzy => $loose_match,
5635                            Re_Pod_Entry => $make_re_pod_entry,
5636                            Status => $status{$addr},
5637                            UCD => $ucd,
5638                            );
5639        }
5640
5641        return $self;
5642    }
5643
5644    # Here are the methods that are required to be defined by any derived
5645    # class
5646    for my $sub (qw(
5647                    handle_special_range
5648                    append_to_body
5649                    pre_body
5650                ))
5651                # write() knows how to write out normal ranges, but it calls
5652                # handle_special_range() when it encounters a non-normal one.
5653                # append_to_body() is called by it after it has handled all
5654                # ranges to add anything after the main portion of the table.
5655                # And finally, pre_body() is called after all this to build up
5656                # anything that should appear before the main portion of the
5657                # table.  Doing it this way allows things in the middle to
5658                # affect what should appear before the main portion of the
5659                # table.
5660    {
5661        no strict "refs";
5662        *$sub = sub {
5663            Carp::my_carp_bug( __LINE__
5664                              . ": Must create method '$sub()' for "
5665                              . ref shift);
5666            return;
5667        }
5668    }
5669
5670    use overload
5671        fallback => 0,
5672        "." => \&main::_operator_dot,
5673        ".=" => \&main::_operator_dot_equal,
5674        '!=' => \&main::_operator_not_equal,
5675        '==' => \&main::_operator_equal,
5676    ;
5677
5678    sub ranges {
5679        # Returns the array of ranges associated with this table.
5680
5681        no overloading;
5682        return $range_list{pack 'J', shift}->ranges;
5683    }
5684
5685    sub add_alias {
5686        # Add a synonym for this table.
5687
5688        return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
5689
5690        my $self = shift;
5691        my $name = shift;       # The name to add.
5692        my $pointer = shift;    # What the alias hash should point to.  For
5693                                # map tables, this is the parent property;
5694                                # for match tables, it is the table itself.
5695
5696        my %args = @_;
5697        my $loose_match = delete $args{'Fuzzy'};
5698
5699        my $ok_as_filename = delete $args{'OK_as_Filename'};
5700        $ok_as_filename = 1 unless defined $ok_as_filename;
5701
5702        # An internal name does not get documented, unless overridden by the
5703        # input; same for making tests for it.
5704        my $status = delete $args{'Status'} || (($name =~ /^_/)
5705                                                ? $INTERNAL_ALIAS
5706                                                : $NORMAL);
5707        my $make_re_pod_entry = delete $args{'Re_Pod_Entry'}
5708                                            // (($status ne $INTERNAL_ALIAS)
5709                                               ? (($name =~ /^_/) ? $NO : $YES)
5710                                               : $NO);
5711        my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1);
5712
5713        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5714
5715        # Capitalize the first letter of the alias unless it is one of the CJK
5716        # ones which specifically begins with a lower 'k'.  Do this because
5717        # Unicode has varied whether they capitalize first letters or not, and
5718        # have later changed their minds and capitalized them, but not the
5719        # other way around.  So do it always and avoid changes from release to
5720        # release
5721        $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
5722
5723        my $addr = do { no overloading; pack 'J', $self; };
5724
5725        # Figure out if should be loosely matched if not already specified.
5726        if (! defined $loose_match) {
5727
5728            # Is a loose_match if isn't null, and doesn't begin with an
5729            # underscore and isn't just a number
5730            if ($name ne ""
5731                && substr($name, 0, 1) ne '_'
5732                && $name !~ qr{^[0-9_.+-/]+$})
5733            {
5734                $loose_match = 1;
5735            }
5736            else {
5737                $loose_match = 0;
5738            }
5739        }
5740
5741        # If this alias has already been defined, do nothing.
5742        return if defined $find_table_from_alias{$addr}->{$name};
5743
5744        # That includes if it is standardly equivalent to an existing alias,
5745        # in which case, add this name to the list, so won't have to search
5746        # for it again.
5747        my $standard_name = main::standardize($name);
5748        if (defined $find_table_from_alias{$addr}->{$standard_name}) {
5749            $find_table_from_alias{$addr}->{$name}
5750                        = $find_table_from_alias{$addr}->{$standard_name};
5751            return;
5752        }
5753
5754        # Set the index hash for this alias for future quick reference.
5755        $find_table_from_alias{$addr}->{$name} = $pointer;
5756        $find_table_from_alias{$addr}->{$standard_name} = $pointer;
5757        local $to_trace = 0 if main::DEBUG;
5758        trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
5759        trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
5760
5761
5762        # Put the new alias at the end of the list of aliases unless the final
5763        # element begins with an underscore (meaning it is for internal perl
5764        # use) or is all numeric, in which case, put the new one before that
5765        # one.  This floats any all-numeric or underscore-beginning aliases to
5766        # the end.  This is done so that they are listed last in output lists,
5767        # to encourage the user to use a better name (either more descriptive
5768        # or not an internal-only one) instead.  This ordering is relied on
5769        # implicitly elsewhere in this program, like in short_name()
5770        my $list = $aliases{$addr};
5771        my $insert_position = (@$list == 0
5772                                || (substr($list->[-1]->name, 0, 1) ne '_'
5773                                    && $list->[-1]->name =~ /\D/))
5774                            ? @$list
5775                            : @$list - 1;
5776        splice @$list,
5777                $insert_position,
5778                0,
5779                Alias->new($name, $loose_match, $make_re_pod_entry,
5780                           $ok_as_filename, $status, $ucd);
5781
5782        # This name may be shorter than any existing ones, so clear the cache
5783        # of the shortest, so will have to be recalculated.
5784        no overloading;
5785        undef $short_name{pack 'J', $self};
5786        return;
5787    }
5788
5789    sub short_name {
5790        # Returns a name suitable for use as the base part of a file name.
5791        # That is, shorter wins.  It can return undef if there is no suitable
5792        # name.  The name has all non-essential underscores removed.
5793
5794        # The optional second parameter is a reference to a scalar in which
5795        # this routine will store the length the returned name had before the
5796        # underscores were removed, or undef if the return is undef.
5797
5798        # The shortest name can change if new aliases are added.  So using
5799        # this should be deferred until after all these are added.  The code
5800        # that does that should clear this one's cache.
5801        # Any name with alphabetics is preferred over an all numeric one, even
5802        # if longer.
5803
5804        my $self = shift;
5805        my $nominal_length_ptr = shift;
5806        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5807
5808        my $addr = do { no overloading; pack 'J', $self; };
5809
5810        # For efficiency, don't recalculate, but this means that adding new
5811        # aliases could change what the shortest is, so the code that does
5812        # that needs to undef this.
5813        if (defined $short_name{$addr}) {
5814            if ($nominal_length_ptr) {
5815                $$nominal_length_ptr = $nominal_short_name_length{$addr};
5816            }
5817            return $short_name{$addr};
5818        }
5819
5820        # Look at each alias
5821        my $is_last_resort = 0;
5822        my $deprecated_or_discouraged
5823                                = qr/ ^ (?: $DEPRECATED | $DISCOURAGED ) $/x;
5824        foreach my $alias ($self->aliases()) {
5825
5826            # Don't use an alias that isn't ok to use for an external name.
5827            next if ! $alias->ok_as_filename;
5828
5829            my $name = main::Standardize($alias->name);
5830            trace $self, $name if main::DEBUG && $to_trace;
5831
5832            # Take the first one, or any non-deprecated non-discouraged one
5833            # over one that is, or a shorter one that isn't numeric.  This
5834            # relies on numeric aliases always being last in the array
5835            # returned by aliases().  Any alpha one will have precedence.
5836            if (   ! defined $short_name{$addr}
5837                || (   $is_last_resort
5838                    && $alias->status !~ $deprecated_or_discouraged)
5839                || ($name =~ /\D/
5840                    && length($name) < length($short_name{$addr})))
5841            {
5842                # Remove interior underscores.
5843                ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
5844
5845                $nominal_short_name_length{$addr} = length $name;
5846                $is_last_resort = $alias->status =~ $deprecated_or_discouraged;
5847            }
5848        }
5849
5850        # If the short name isn't a nice one, perhaps an equivalent table has
5851        # a better one.
5852        if (   $self->can('children')
5853            && (   ! defined $short_name{$addr}
5854                || $short_name{$addr} eq ""
5855                || $short_name{$addr} eq "_"))
5856        {
5857            my $return;
5858            foreach my $follower ($self->children) {    # All equivalents
5859                my $follower_name = $follower->short_name;
5860                next unless defined $follower_name;
5861
5862                # Anything (except undefined) is better than underscore or
5863                # empty
5864                if (! defined $return || $return eq "_") {
5865                    $return = $follower_name;
5866                    next;
5867                }
5868
5869                # If the new follower name isn't "_" and is shorter than the
5870                # current best one, prefer the new one.
5871                next if $follower_name eq "_";
5872                next if length $follower_name > length $return;
5873                $return = $follower_name;
5874            }
5875            $short_name{$addr} = $return if defined $return;
5876        }
5877
5878        # If no suitable external name return undef
5879        if (! defined $short_name{$addr}) {
5880            $$nominal_length_ptr = undef if $nominal_length_ptr;
5881            return;
5882        }
5883
5884        # Don't allow a null short name.
5885        if ($short_name{$addr} eq "") {
5886            $short_name{$addr} = '_';
5887            $nominal_short_name_length{$addr} = 1;
5888        }
5889
5890        trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
5891
5892        if ($nominal_length_ptr) {
5893            $$nominal_length_ptr = $nominal_short_name_length{$addr};
5894        }
5895        return $short_name{$addr};
5896    }
5897
5898    sub external_name {
5899        # Returns the external name that this table should be known by.  This
5900        # is usually the short_name, but not if the short_name is undefined,
5901        # in which case the external_name is arbitrarily set to the
5902        # underscore.
5903
5904        my $self = shift;
5905        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5906
5907        my $short = $self->short_name;
5908        return $short if defined $short;
5909
5910        return '_';
5911    }
5912
5913    sub add_description { # Adds the parameter as a short description.
5914
5915        my $self = shift;
5916        my $description = shift;
5917        chomp $description;
5918        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5919
5920        no overloading;
5921        push @{$description{pack 'J', $self}}, $description;
5922
5923        return;
5924    }
5925
5926    sub add_note { # Adds the parameter as a short note.
5927
5928        my $self = shift;
5929        my $note = shift;
5930        chomp $note;
5931        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5932
5933        no overloading;
5934        push @{$note{pack 'J', $self}}, $note;
5935
5936        return;
5937    }
5938
5939    sub add_comment { # Adds the parameter as a comment.
5940
5941        return unless $debugging_build;
5942
5943        my $self = shift;
5944        my $comment = shift;
5945        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5946
5947        chomp $comment;
5948
5949        no overloading;
5950        push @{$comment{pack 'J', $self}}, $comment;
5951
5952        return;
5953    }
5954
5955    sub comment {
5956        # Return the current comment for this table.  If called in list
5957        # context, returns the array of comments.  In scalar, returns a string
5958        # of each element joined together with a period ending each.
5959
5960        my $self = shift;
5961        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5962
5963        my $addr = do { no overloading; pack 'J', $self; };
5964        my @list = @{$comment{$addr}};
5965        return @list if wantarray;
5966        my $return = "";
5967        foreach my $sentence (@list) {
5968            $return .= '.  ' if $return;
5969            $return .= $sentence;
5970            $return =~ s/\.$//;
5971        }
5972        $return .= '.' if $return;
5973        return $return;
5974    }
5975
5976    sub initialize {
5977        # Initialize the table with the argument which is any valid
5978        # initialization for range lists.
5979
5980        my $self = shift;
5981        my $addr = do { no overloading; pack 'J', $self; };
5982        my $initialization = shift;
5983        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5984
5985        # Replace the current range list with a new one of the same exact
5986        # type.
5987        my $class = ref $range_list{$addr};
5988        $range_list{$addr} = $class->new(Owner => $self,
5989                                        Initialize => $initialization);
5990        return;
5991
5992    }
5993
5994    sub header {
5995        # The header that is output for the table in the file it is written
5996        # in.
5997
5998        my $self = shift;
5999        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6000
6001        my $return = "";
6002        $return .= $DEVELOPMENT_ONLY if $compare_versions;
6003        $return .= $HEADER;
6004        return $return;
6005    }
6006
6007    sub merge_single_annotation_line ($$$) {
6008        my ($output, $annotation, $annotation_column) = @_;
6009
6010        # This appends an annotation comment, $annotation, to $output,
6011        # starting in or after column $annotation_column, removing any
6012        # pre-existing comment from $output.
6013
6014        $annotation =~ s/^ \s* \# \  //x;
6015        $output =~ s/ \s* ( \# \N* )? \n //x;
6016        $output = Text::Tabs::expand($output);
6017
6018        my $spaces = $annotation_column - length $output;
6019        $spaces = 2 if $spaces < 0;  # Have 2 blanks before the comment
6020
6021        $output = sprintf "%s%*s# %s",
6022                            $output,
6023                            $spaces,
6024                            " ",
6025                            $annotation;
6026        return Text::Tabs::unexpand $output;
6027    }
6028
6029    sub write {
6030        # Write a representation of the table to its file.  It calls several
6031        # functions furnished by sub-classes of this abstract base class to
6032        # handle non-normal ranges, to add stuff before the table, and at its
6033        # end.  If the table is to be written so that adjustments are
6034        # required, this does that conversion.
6035
6036        my $self = shift;
6037        my $use_adjustments = shift; # ? output in adjusted format or not
6038        my $suppress_value = shift;  # Optional, if the value associated with
6039                                     # a range equals this one, don't write
6040                                     # the range
6041        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6042
6043        my $addr = do { no overloading; pack 'J', $self; };
6044        my $write_as_invlist = $write_as_invlist{$addr};
6045
6046        # Start with the header
6047        my @HEADER = $self->header;
6048
6049        # Then the comments
6050        push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
6051                                                        if $comment{$addr};
6052
6053        # Things discovered processing the main body of the document may
6054        # affect what gets output before it, therefore pre_body() isn't called
6055        # until after all other processing of the table is done.
6056
6057        # The main body looks like a 'here' document.  If there are comments,
6058        # get rid of them when processing it.
6059        my @OUT;
6060        if ($annotate || $output_range_counts) {
6061            # Use the line below in Perls that don't have /r
6062            #push @OUT, 'return join "\n",  map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
6063            push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
6064        } else {
6065            push @OUT, "return <<'END';\n";
6066        }
6067
6068        if ($range_list{$addr}->is_empty) {
6069
6070            # This is a kludge for empty tables to silence a warning in
6071            # utf8.c, which can't really deal with empty tables, but it can
6072            # deal with a table that matches nothing, as the inverse of 'All'
6073            # does.
6074            push @OUT, "!utf8::All\n";
6075        }
6076        elsif ($self->name eq 'N'
6077
6078               # To save disk space and table cache space, avoid putting out
6079               # binary N tables, but instead create a file which just inverts
6080               # the Y table.  Since the file will still exist and occupy a
6081               # certain number of blocks, might as well output the whole
6082               # thing if it all will fit in one block.   The number of
6083               # ranges below is an approximate number for that.
6084               && ($self->property->type == $BINARY
6085                   || $self->property->type == $FORCED_BINARY)
6086               # && $self->property->tables == 2  Can't do this because the
6087               #        non-binary properties, like NFDQC aren't specifiable
6088               #        by the notation
6089               && $range_list{$addr}->ranges > 15
6090               && ! $annotate)  # Under --annotate, want to see everything
6091        {
6092            push @OUT, "!utf8::" . $self->property->name . "\n";
6093        }
6094        else {
6095            my $range_size_1 = $range_size_1{$addr};
6096
6097            # To make it more readable, use a minimum indentation
6098            my $comment_indent;
6099
6100            # These are used only in $annotate option
6101            my $format;         # e.g. $HEX_ADJUST_FORMAT
6102            my $include_name;   # ? Include the character's name in the
6103                                # annotation?
6104            my $include_cp;     # ? Include its code point
6105
6106            if (! $annotate) {
6107                $comment_indent = ($self->isa('Map_Table'))
6108                                  ? 24
6109                                  : ($write_as_invlist)
6110                                    ? 8
6111                                    : 16;
6112            }
6113            else {
6114                $format = $self->format;
6115
6116                # The name of the character is output only for tables that
6117                # don't already include the name in the output.
6118                my $property = $self->property;
6119                $include_name =
6120                    !  ($property == $perl_charname
6121                        || $property == main::property_ref('Unicode_1_Name')
6122                        || $property == main::property_ref('Name')
6123                        || $property == main::property_ref('Name_Alias')
6124                       );
6125
6126                # Don't include the code point in the annotation where all
6127                # lines are a single code point, so it can be easily found in
6128                # the first column
6129                $include_cp = ! $range_size_1;
6130
6131                if (! $self->isa('Map_Table')) {
6132                    $comment_indent = ($write_as_invlist) ? 8 : 16;
6133                }
6134                else {
6135                    $comment_indent = 16;
6136
6137                    # There are just a few short ranges in this table, so no
6138                    # need to include the code point in the annotation.
6139                    $include_cp = 0 if $format eq $DECOMP_STRING_FORMAT;
6140
6141                    # We're trying to get this to look good, as the whole
6142                    # point is to make human-readable tables.  It is easier to
6143                    # read if almost all the annotation comments begin in the
6144                    # same column.  Map tables have varying width maps, so can
6145                    # create a jagged comment appearance.  This code does a
6146                    # preliminary pass through these tables looking for the
6147                    # maximum width map in each, and causing the comments to
6148                    # begin just to the right of that.  However, if the
6149                    # comments begin too far to the right of most lines, it's
6150                    # hard to line them up horizontally with their real data.
6151                    # Therefore we ignore the longest outliers
6152                    my $ignore_longest_X_percent = 2;  # Discard longest X%
6153
6154                    # Each key in this hash is a width of at least one of the
6155                    # maps in the table.  Its value is how many lines have
6156                    # that width.
6157                    my %widths;
6158
6159                    # We won't space things further left than one tab stop
6160                    # after the rest of the line; initializing it to that
6161                    # number saves some work.
6162                    my $max_map_width = 8;
6163
6164                    # Fill in the %widths hash
6165                    my $total = 0;
6166                    for my $set ($range_list{$addr}->ranges) {
6167                        my $value = $set->value;
6168
6169                        # These range types don't appear in the main table
6170                        next if $set->type == 0
6171                                && defined $suppress_value
6172                                && $value eq $suppress_value;
6173                        next if $set->type == $MULTI_CP
6174                                || $set->type == $NULL;
6175
6176                        # Include 2 spaces before the beginning of the
6177                        # comment
6178                        my $this_width = length($value) + 2;
6179
6180                        # Ranges of the remaining non-zero types usually
6181                        # occupy just one line (maybe occasionally two, but
6182                        # this doesn't have to be dead accurate).  This is
6183                        # because these ranges are like "unassigned code
6184                        # points"
6185                        my $count = ($set->type != 0)
6186                                    ? 1
6187                                    : $set->end - $set->start + 1;
6188                        $widths{$this_width} += $count;
6189                        $total += $count;
6190                        $max_map_width = $this_width
6191                                            if $max_map_width < $this_width;
6192                    }
6193
6194                    # If the widest map gives us less than two tab stops
6195                    # worth, just take it as-is.
6196                    if ($max_map_width > 16) {
6197
6198                        # Otherwise go through %widths until we have included
6199                        # the desired percentage of lines in the whole table.
6200                        my $running_total = 0;
6201                        foreach my $width (sort { $a <=> $b } keys %widths)
6202                        {
6203                            $running_total += $widths{$width};
6204                            use integer;
6205                            if ($running_total * 100 / $total
6206                                            >= 100 - $ignore_longest_X_percent)
6207                            {
6208                                $max_map_width = $width;
6209                                last;
6210                            }
6211                        }
6212                    }
6213                    $comment_indent += $max_map_width;
6214                }
6215            }
6216
6217            # Values for previous time through the loop.  Initialize to
6218            # something that won't be adjacent to the first iteration;
6219            # only $previous_end matters for that.
6220            my $previous_start;
6221            my $previous_end = -2;
6222            my $previous_value;
6223
6224            # Values for next time through the portion of the loop that splits
6225            # the range.  0 in $next_start means there is no remaining portion
6226            # to deal with.
6227            my $next_start = 0;
6228            my $next_end;
6229            my $next_value;
6230            my $offset = 0;
6231            my $invlist_count = 0;
6232
6233            my $output_value_in_hex = $self->isa('Map_Table')
6234                                && ($self->format eq $HEX_ADJUST_FORMAT
6235                                    || $self->to_output_map == $EXTERNAL_MAP);
6236            # Use leading zeroes just for files whose format should not be
6237            # changed from what it has been.  Otherwise, they just take up
6238            # space and time to process.
6239            my $hex_format = ($self->isa('Map_Table')
6240                              && $self->to_output_map == $EXTERNAL_MAP)
6241                             ? "%04X"
6242                             : "%X";
6243
6244            # The values for some of these tables are stored in mktables as
6245            # hex strings.  Normally, these are just output as strings without
6246            # change, but when we are doing adjustments, we have to operate on
6247            # these numerically, so we convert those to decimal to do that,
6248            # and back to hex for output
6249            my $convert_map_to_from_hex = 0;
6250            my $output_map_in_hex = 0;
6251            if ($self->isa('Map_Table')) {
6252                $convert_map_to_from_hex
6253                   = ($use_adjustments && $self->format eq $HEX_ADJUST_FORMAT)
6254                      || ($annotate && $self->format eq $HEX_FORMAT);
6255                $output_map_in_hex = $convert_map_to_from_hex
6256                                 || $self->format eq $HEX_FORMAT;
6257            }
6258
6259            # To store any annotations about the characters.
6260            my @annotation;
6261
6262            # Output each range as part of the here document.
6263            RANGE:
6264            for my $set ($range_list{$addr}->ranges) {
6265                if ($set->type != 0) {
6266                    $self->handle_special_range($set);
6267                    next RANGE;
6268                }
6269                my $start = $set->start;
6270                my $end   = $set->end;
6271                my $value  = $set->value;
6272
6273                # Don't output ranges whose value is the one to suppress
6274                next RANGE if defined $suppress_value
6275                              && $value eq $suppress_value;
6276
6277                $value = CORE::hex $value if $convert_map_to_from_hex;
6278
6279
6280                {   # This bare block encloses the scope where we may need to
6281                    # 'redo' to.  Consider a table that is to be written out
6282                    # using single item ranges.  This is given in the
6283                    # $range_size_1 boolean.  To accomplish this, we split the
6284                    # range each time through the loop into two portions, the
6285                    # first item, and the rest.  We handle that first item
6286                    # this time in the loop, and 'redo' to repeat the process
6287                    # for the rest of the range.
6288                    #
6289                    # We may also have to do it, with other special handling,
6290                    # if the table has adjustments.  Consider the table that
6291                    # contains the lowercasing maps.  mktables stores the
6292                    # ASCII range ones as 26 ranges:
6293                    #       ord('A') => ord('a'), .. ord('Z') => ord('z')
6294                    # For compactness, the table that gets written has this as
6295                    # just one range
6296                    #       ( ord('A') .. ord('Z') ) => ord('a')
6297                    # and the software that reads the tables is smart enough
6298                    # to "connect the dots".  This change is accomplished in
6299                    # this loop by looking to see if the current iteration
6300                    # fits the paradigm of the previous iteration, and if so,
6301                    # we merge them by replacing the final output item with
6302                    # the merged data.  Repeated 25 times, this gets A-Z.  But
6303                    # we also have to make sure we don't screw up cases where
6304                    # we have internally stored
6305                    #       ( 0x1C4 .. 0x1C6 ) => 0x1C5
6306                    # This single internal range has to be output as 3 ranges,
6307                    # which is done by splitting, like we do for $range_size_1
6308                    # tables.  (There are very few of such ranges that need to
6309                    # be split, so the gain of doing the combining of other
6310                    # ranges far outweighs the splitting of these.)  The
6311                    # values to use for the redo at the end of this block are
6312                    # set up just below in the scalars whose names begin with
6313                    # '$next_'.
6314
6315                    if (($use_adjustments || $range_size_1) && $end != $start)
6316                    {
6317                        $next_start = $start + 1;
6318                        $next_end = $end;
6319                        $next_value = $value;
6320                        $end = $start;
6321                    }
6322
6323                    if ($use_adjustments && ! $range_size_1) {
6324
6325                        # If this range is adjacent to the previous one, and
6326                        # the values in each are integers that are also
6327                        # adjacent (differ by 1), then this range really
6328                        # extends the previous one that is already in element
6329                        # $OUT[-1].  So we pop that element, and pretend that
6330                        # the range starts with whatever it started with.
6331                        # $offset is incremented by 1 each time so that it
6332                        # gives the current offset from the first element in
6333                        # the accumulating range, and we keep in $value the
6334                        # value of that first element.
6335                        if ($start == $previous_end + 1
6336                            && $value =~ /^ -? \d+ $/xa
6337                            && $previous_value =~ /^ -? \d+ $/xa
6338                            && ($value == ($previous_value + ++$offset)))
6339                        {
6340                            pop @OUT;
6341                            $start = $previous_start;
6342                            $value = $previous_value;
6343                        }
6344                        else {
6345                            $offset = 0;
6346                            if (@annotation == 1) {
6347                                $OUT[-1] = merge_single_annotation_line(
6348                                    $OUT[-1], $annotation[0], $comment_indent);
6349                            }
6350                            else {
6351                                push @OUT, @annotation;
6352                            }
6353                        }
6354                        undef @annotation;
6355
6356                        # Save the current values for the next time through
6357                        # the loop.
6358                        $previous_start = $start;
6359                        $previous_end = $end;
6360                        $previous_value = $value;
6361                    }
6362
6363                    if ($write_as_invlist) {
6364                        if (   $previous_end > 0
6365                            && $output_range_counts{$addr})
6366                        {
6367                            my $complement_count = $start - $previous_end - 1;
6368                            if ($complement_count > 1) {
6369                                $OUT[-1] = merge_single_annotation_line(
6370                                    $OUT[-1],
6371                                       "#"
6372                                     . (" " x 17)
6373                                     . "["
6374                                     .  main::clarify_code_point_count(
6375                                                            $complement_count)
6376                                      . "] in complement\n",
6377                                    $comment_indent);
6378                            }
6379                        }
6380
6381                        # Inversion list format has a single number per line,
6382                        # the starting code point of a range that matches the
6383                        # property
6384                        push @OUT, $start, "\n";
6385                        $invlist_count++;
6386
6387                        # Add a comment with the size of the range, if
6388                        # requested.
6389                        if ($output_range_counts{$addr}) {
6390                            $OUT[-1] = merge_single_annotation_line(
6391                                    $OUT[-1],
6392                                    "# ["
6393                                      . main::clarify_code_point_count($end - $start + 1)
6394                                      . "]\n",
6395                                    $comment_indent);
6396                        }
6397                    }
6398                    elsif ($start != $end) { # If there is a range
6399                        if ($end == $MAX_WORKING_CODEPOINT) {
6400                            push @OUT, sprintf "$hex_format\t$hex_format",
6401                                                $start,
6402                                                $MAX_PLATFORM_CODEPOINT;
6403                        }
6404                        else {
6405                            push @OUT, sprintf "$hex_format\t$hex_format",
6406                                                $start,       $end;
6407                        }
6408                        if (length $value) {
6409                            if ($convert_map_to_from_hex) {
6410                                $OUT[-1] .= sprintf "\t$hex_format\n", $value;
6411                            }
6412                            else {
6413                                $OUT[-1] .= "\t$value\n";
6414                            }
6415                        }
6416
6417                        # Add a comment with the size of the range, if
6418                        # requested.
6419                        if ($output_range_counts{$addr}) {
6420                            $OUT[-1] = merge_single_annotation_line(
6421                                    $OUT[-1],
6422                                    "# ["
6423                                      . main::clarify_code_point_count($end - $start + 1)
6424                                      . "]\n",
6425                                    $comment_indent);
6426                        }
6427                    }
6428                    else { # Here to output a single code point per line.
6429
6430                        # Use any passed in subroutine to output.
6431                        if (ref $range_size_1 eq 'CODE') {
6432                            for my $i ($start .. $end) {
6433                                push @OUT, &{$range_size_1}($i, $value);
6434                            }
6435                        }
6436                        else {
6437
6438                            # Here, caller is ok with default output.
6439                            for (my $i = $start; $i <= $end; $i++) {
6440                                if ($convert_map_to_from_hex) {
6441                                    push @OUT,
6442                                        sprintf "$hex_format\t\t$hex_format\n",
6443                                                 $i,            $value;
6444                                }
6445                                else {
6446                                    push @OUT, sprintf $hex_format, $i;
6447                                    $OUT[-1] .= "\t\t$value" if $value ne "";
6448                                    $OUT[-1] .= "\n";
6449                                }
6450                            }
6451                        }
6452                    }
6453
6454                    if ($annotate) {
6455                        for (my $i = $start; $i <= $end; $i++) {
6456                            my $annotation = "";
6457
6458                            # Get character information if don't have it already
6459                            main::populate_char_info($i)
6460                                                     if ! defined $viacode[$i];
6461                            my $type = $annotate_char_type[$i];
6462
6463                            # Figure out if should output the next code points
6464                            # as part of a range or not.  If this is not in an
6465                            # annotation range, then won't output as a range,
6466                            # so returns $i.  Otherwise use the end of the
6467                            # annotation range, but no further than the
6468                            # maximum possible end point of the loop.
6469                            my $range_end =
6470                                        $range_size_1
6471                                        ? $start
6472                                        : main::min(
6473                                          $annotate_ranges->value_of($i) || $i,
6474                                          $end);
6475
6476                            # Use a range if it is a range, and either is one
6477                            # of the special annotation ranges, or the range
6478                            # is at most 3 long.  This last case causes the
6479                            # algorithmically named code points to be output
6480                            # individually in spans of at most 3, as they are
6481                            # the ones whose $type is > 0.
6482                            if ($range_end != $i
6483                                && ( $type < 0 || $range_end - $i > 2))
6484                            {
6485                                # Here is to output a range.  We don't allow a
6486                                # caller-specified output format--just use the
6487                                # standard one.
6488                                my $range_name = $viacode[$i];
6489
6490                                # For the code points which end in their hex
6491                                # value, we eliminate that from the output
6492                                # annotation, and capitalize only the first
6493                                # letter of each word.
6494                                if ($type == $CP_IN_NAME) {
6495                                    my $hex = sprintf $hex_format, $i;
6496                                    $range_name =~ s/-$hex$//;
6497                                    my @words = split " ", $range_name;
6498                                    for my $word (@words) {
6499                                        $word =
6500                                          ucfirst(lc($word)) if $word ne 'CJK';
6501                                    }
6502                                    $range_name = join " ", @words;
6503                                }
6504                                elsif ($type == $HANGUL_SYLLABLE) {
6505                                    $range_name = "Hangul Syllable";
6506                                }
6507
6508                                # If the annotation would just repeat what's
6509                                # already being output as the range, skip it.
6510                                # (When an inversion list is being written, it
6511                                # isn't a repeat, as that always is in
6512                                # decimal)
6513                                if (   $write_as_invlist
6514                                    || $i != $start
6515                                    || $range_end < $end)
6516                                {
6517                                    if ($range_end < $MAX_WORKING_CODEPOINT)
6518                                    {
6519                                        $annotation = sprintf "%04X..%04X",
6520                                                              $i,   $range_end;
6521                                    }
6522                                    else {
6523                                        $annotation = sprintf "%04X..INFINITY",
6524                                                               $i;
6525                                    }
6526                                }
6527                                else { # Indent if not displaying code points
6528                                    $annotation = " " x 4;
6529                                }
6530
6531                                if ($range_name) {
6532                                    $annotation .= " $age[$i]" if $age[$i];
6533                                    $annotation .= " $range_name";
6534                                }
6535
6536                                # Include the number of code points in the
6537                                # range
6538                                my $count =
6539                                    main::clarify_code_point_count($range_end - $i + 1);
6540                                $annotation .= " [$count]\n";
6541
6542                                # Skip to the end of the range
6543                                $i = $range_end;
6544                            }
6545                            else { # Not in a range.
6546                                my $comment = "";
6547
6548                                # When outputting the names of each character,
6549                                # use the character itself if printable
6550                                $comment .= "'" . main::display_chr($i) . "' "
6551                                                            if $printable[$i];
6552
6553                                my $output_value = $value;
6554
6555                                # Determine the annotation
6556                                if ($format eq $DECOMP_STRING_FORMAT) {
6557
6558                                    # This is very specialized, with the type
6559                                    # of decomposition beginning the line
6560                                    # enclosed in <...>, and the code points
6561                                    # that the code point decomposes to
6562                                    # separated by blanks.  Create two
6563                                    # strings, one of the printable
6564                                    # characters, and one of their official
6565                                    # names.
6566                                    (my $map = $output_value)
6567                                                    =~ s/ \ * < .*? > \ +//x;
6568                                    my $tostr = "";
6569                                    my $to_name = "";
6570                                    my $to_chr = "";
6571                                    foreach my $to (split " ", $map) {
6572                                        $to = CORE::hex $to;
6573                                        $to_name .= " + " if $to_name;
6574                                        $to_chr .= main::display_chr($to);
6575                                        main::populate_char_info($to)
6576                                                    if ! defined $viacode[$to];
6577                                        $to_name .=  $viacode[$to];
6578                                    }
6579
6580                                    $comment .=
6581                                    "=> '$to_chr'; $viacode[$i] => $to_name";
6582                                }
6583                                else {
6584                                    $output_value += $i - $start
6585                                                   if $use_adjustments
6586                                                      # Don't try to adjust a
6587                                                      # non-integer
6588                                                   && $output_value !~ /[-\D]/;
6589
6590                                    if ($output_map_in_hex) {
6591                                        main::populate_char_info($output_value)
6592                                          if ! defined $viacode[$output_value];
6593                                        $comment .= " => '"
6594                                        . main::display_chr($output_value)
6595                                        . "'; " if $printable[$output_value];
6596                                    }
6597                                    if ($include_name && $viacode[$i]) {
6598                                        $comment .= " " if $comment;
6599                                        $comment .= $viacode[$i];
6600                                    }
6601                                    if ($output_map_in_hex) {
6602                                        $comment .=
6603                                                " => $viacode[$output_value]"
6604                                                    if $viacode[$output_value];
6605                                        $output_value = sprintf($hex_format,
6606                                                                $output_value);
6607                                    }
6608                                }
6609
6610                                if ($include_cp) {
6611                                    $annotation = sprintf "%04X %s", $i, $age[$i];
6612                                    if ($use_adjustments) {
6613                                        $annotation .= " => $output_value";
6614                                    }
6615                                }
6616
6617                                if ($comment ne "") {
6618                                    $annotation .= " " if $annotation ne "";
6619                                    $annotation .= $comment;
6620                                }
6621                                $annotation .= "\n" if $annotation ne "";
6622                            }
6623
6624                            if ($annotation ne "") {
6625                                push @annotation, (" " x $comment_indent)
6626                                                  .  "# $annotation";
6627                            }
6628                        }
6629
6630                        # If not adjusting, we don't have to go through the
6631                        # loop again to know that the annotation comes next
6632                        # in the output.
6633                        if (! $use_adjustments) {
6634                            if (@annotation == 1) {
6635                                $OUT[-1] = merge_single_annotation_line(
6636                                    $OUT[-1], $annotation[0], $comment_indent);
6637                            }
6638                            else {
6639                                push @OUT, map { Text::Tabs::unexpand $_ }
6640                                               @annotation;
6641                            }
6642                            undef @annotation;
6643                        }
6644                    }
6645
6646                    # Add the beginning of the range that doesn't match the
6647                    # property, except if the just added match range extends
6648                    # to infinity.  We do this after any annotations for the
6649                    # match range.
6650                    if ($write_as_invlist && $end < $MAX_WORKING_CODEPOINT) {
6651                        push @OUT, $end + 1, "\n";
6652                        $invlist_count++;
6653                    }
6654
6655                    # If we split the range, set up so the next time through
6656                    # we get the remainder, and redo.
6657                    if ($next_start) {
6658                        $start = $next_start;
6659                        $end = $next_end;
6660                        $value = $next_value;
6661                        $next_start = 0;
6662                        redo;
6663                    }
6664                }
6665            } # End of loop through all the table's ranges
6666
6667            push @OUT, @annotation; # Add orphaned annotation, if any
6668
6669            splice @OUT, 1, 0, "V$invlist_count\n" if $invlist_count;
6670        }
6671
6672        # Add anything that goes after the main body, but within the here
6673        # document,
6674        my $append_to_body = $self->append_to_body;
6675        push @OUT, $append_to_body if $append_to_body;
6676
6677        # And finish the here document.
6678        push @OUT, "END\n";
6679
6680        # Done with the main portion of the body.  Can now figure out what
6681        # should appear before it in the file.
6682        my $pre_body = $self->pre_body;
6683        push @HEADER, $pre_body, "\n" if $pre_body;
6684
6685        # All these files should have a .pl suffix added to them.
6686        my @file_with_pl = @{$file_path{$addr}};
6687        $file_with_pl[-1] .= '.pl';
6688
6689        main::write(\@file_with_pl,
6690                    $annotate,      # utf8 iff annotating
6691                    \@HEADER,
6692                    \@OUT);
6693        return;
6694    }
6695
6696    sub set_status {    # Set the table's status
6697        my $self = shift;
6698        my $status = shift; # The status enum value
6699        my $info = shift;   # Any message associated with it.
6700        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6701
6702        my $addr = do { no overloading; pack 'J', $self; };
6703
6704        $status{$addr} = $status;
6705        $status_info{$addr} = $info;
6706        return;
6707    }
6708
6709    sub set_fate {  # Set the fate of a table
6710        my $self = shift;
6711        my $fate = shift;
6712        my $reason = shift;
6713        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6714
6715        my $addr = do { no overloading; pack 'J', $self; };
6716
6717        return if $fate{$addr} == $fate;    # If no-op
6718
6719        # Can only change the ordinary fate, except if going to $MAP_PROXIED
6720        return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED;
6721
6722        $fate{$addr} = $fate;
6723
6724        # Don't document anything to do with a non-normal fated table
6725        if ($fate != $ORDINARY) {
6726            my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0;
6727            foreach my $alias ($self->aliases) {
6728                $alias->set_ucd($put_in_pod);
6729
6730                # MAP_PROXIED doesn't affect the match tables
6731                next if $fate == $MAP_PROXIED;
6732                $alias->set_make_re_pod_entry($put_in_pod);
6733            }
6734        }
6735
6736        # Save the reason for suppression for output
6737        if ($fate >= $SUPPRESSED) {
6738            $reason = "" unless defined $reason;
6739            $why_suppressed{$complete_name{$addr}} = $reason;
6740        }
6741
6742        return;
6743    }
6744
6745    sub lock {
6746        # Don't allow changes to the table from now on.  This stores a stack
6747        # trace of where it was called, so that later attempts to modify it
6748        # can immediately show where it got locked.
6749
6750        my $self = shift;
6751        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6752
6753        my $addr = do { no overloading; pack 'J', $self; };
6754
6755        $locked{$addr} = "";
6756
6757        my $line = (caller(0))[2];
6758        my $i = 1;
6759
6760        # Accumulate the stack trace
6761        while (1) {
6762            my ($pkg, $file, $caller_line, $caller) = caller $i++;
6763
6764            last unless defined $caller;
6765
6766            $locked{$addr} .= "    called from $caller() at line $line\n";
6767            $line = $caller_line;
6768        }
6769        $locked{$addr} .= "    called from main at line $line\n";
6770
6771        return;
6772    }
6773
6774    sub carp_if_locked {
6775        # Return whether a table is locked or not, and, by the way, complain
6776        # if is locked
6777
6778        my $self = shift;
6779        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6780
6781        my $addr = do { no overloading; pack 'J', $self; };
6782
6783        return 0 if ! $locked{$addr};
6784        Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
6785        return 1;
6786    }
6787
6788    sub set_file_path { # Set the final directory path for this table
6789        my $self = shift;
6790        # Rest of parameters passed on
6791
6792        no overloading;
6793        @{$file_path{pack 'J', $self}} = @_;
6794        return
6795    }
6796
6797    # Accessors for the range list stored in this table.  First for
6798    # unconditional
6799    for my $sub (qw(
6800                    containing_range
6801                    contains
6802                    count
6803                    each_range
6804                    hash
6805                    is_empty
6806                    matches_identically_to
6807                    max
6808                    min
6809                    range_count
6810                    reset_each_range
6811                    type_of
6812                    value_of
6813                ))
6814    {
6815        no strict "refs";
6816        *$sub = sub {
6817            use strict "refs";
6818            my $self = shift;
6819            return $self->_range_list->$sub(@_);
6820        }
6821    }
6822
6823    # Then for ones that should fail if locked
6824    for my $sub (qw(
6825                    delete_range
6826                ))
6827    {
6828        no strict "refs";
6829        *$sub = sub {
6830            use strict "refs";
6831            my $self = shift;
6832
6833            return if $self->carp_if_locked;
6834            no overloading;
6835            return $self->_range_list->$sub(@_);
6836        }
6837    }
6838
6839} # End closure
6840
6841package Map_Table;
6842use parent '-norequire', '_Base_Table';
6843
6844# A Map Table is a table that contains the mappings from code points to
6845# values.  There are two weird cases:
6846# 1) Anomalous entries are ones that aren't maps of ranges of code points, but
6847#    are written in the table's file at the end of the table nonetheless.  It
6848#    requires specially constructed code to handle these; utf8.c can not read
6849#    these in, so they should not go in $map_directory.  As of this writing,
6850#    the only case that these happen is for named sequences used in
6851#    charnames.pm.   But this code doesn't enforce any syntax on these, so
6852#    something else could come along that uses it.
6853# 2) Specials are anything that doesn't fit syntactically into the body of the
6854#    table.  The ranges for these have a map type of non-zero.  The code below
6855#    knows about and handles each possible type.   In most cases, these are
6856#    written as part of the header.
6857#
6858# A map table deliberately can't be manipulated at will unlike match tables.
6859# This is because of the ambiguities having to do with what to do with
6860# overlapping code points.  And there just isn't a need for those things;
6861# what one wants to do is just query, add, replace, or delete mappings, plus
6862# write the final result.
6863# However, there is a method to get the list of possible ranges that aren't in
6864# this table to use for defaulting missing code point mappings.  And,
6865# map_add_or_replace_non_nulls() does allow one to add another table to this
6866# one, but it is clearly very specialized, and defined that the other's
6867# non-null values replace this one's if there is any overlap.
6868
6869sub trace { return main::trace(@_); }
6870
6871{ # Closure
6872
6873    main::setup_package();
6874
6875    my %default_map;
6876    # Many input files omit some entries; this gives what the mapping for the
6877    # missing entries should be
6878    main::set_access('default_map', \%default_map, 'r');
6879
6880    my %anomalous_entries;
6881    # Things that go in the body of the table which don't fit the normal
6882    # scheme of things, like having a range.  Not much can be done with these
6883    # once there except to output them.  This was created to handle named
6884    # sequences.
6885    main::set_access('anomalous_entry', \%anomalous_entries, 'a');
6886    main::set_access('anomalous_entries',       # Append singular, read plural
6887                    \%anomalous_entries,
6888                    'readable_array');
6889
6890    my %replacement_property;
6891    # Certain files are unused by Perl itself, and are kept only for backwards
6892    # compatibility for programs that used them before Unicode::UCD existed.
6893    # These are termed legacy properties.  At some point they may be removed,
6894    # but for now mark them as legacy.  If non empty, this is the name of the
6895    # property to use instead (i.e., the modern equivalent).
6896    main::set_access('replacement_property', \%replacement_property, 'r');
6897
6898    my %to_output_map;
6899    # Enum as to whether or not to write out this map table, and how:
6900    #   0               don't output
6901    #   $EXTERNAL_MAP   means its existence is noted in the documentation, and
6902    #                   it should not be removed nor its format changed.  This
6903    #                   is done for those files that have traditionally been
6904    #                   output.  Maps of legacy-only properties default to
6905    #                   this.
6906    #   $INTERNAL_MAP   means Perl reserves the right to do anything it wants
6907    #                   with this file
6908    #   $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of
6909    #                   outputting the actual mappings as-is, we adjust things
6910    #                   to create a much more compact table. Only those few
6911    #                   tables where the mapping is convertible at least to an
6912    #                   integer and compacting makes a big difference should
6913    #                   have this.  Hence, the default is to not do this
6914    #                   unless the table's default mapping is to $CODE_POINT,
6915    #                   and the range size is not 1.
6916    main::set_access('to_output_map', \%to_output_map, 's');
6917
6918    sub new {
6919        my $class = shift;
6920        my $name = shift;
6921
6922        my %args = @_;
6923
6924        # Optional initialization data for the table.
6925        my $initialize = delete $args{'Initialize'};
6926
6927        my $default_map = delete $args{'Default_Map'};
6928        my $property = delete $args{'_Property'};
6929        my $full_name = delete $args{'Full_Name'};
6930        my $replacement_property = delete $args{'Replacement_Property'} // "";
6931        my $to_output_map = delete $args{'To_Output_Map'};
6932
6933        # Rest of parameters passed on; legacy properties have several common
6934        # other attributes
6935        if ($replacement_property) {
6936            $args{"Fate"} = $LEGACY_ONLY;
6937            $args{"Range_Size_1"} = 1;
6938            $args{"Perl_Extension"} = 1;
6939            $args{"UCD"} = 0;
6940        }
6941
6942        my $range_list = Range_Map->new(Owner => $property);
6943
6944        my $self = $class->SUPER::new(
6945                                    Name => $name,
6946                                    Complete_Name =>  $full_name,
6947                                    Full_Name => $full_name,
6948                                    _Property => $property,
6949                                    _Range_List => $range_list,
6950                                    Write_As_Invlist => 0,
6951                                    %args);
6952
6953        my $addr = do { no overloading; pack 'J', $self; };
6954
6955        $anomalous_entries{$addr} = [];
6956        $default_map{$addr} = $default_map;
6957        $replacement_property{$addr} = $replacement_property;
6958        $to_output_map = $EXTERNAL_MAP if ! defined $to_output_map
6959                                          && $replacement_property;
6960        $to_output_map{$addr} = $to_output_map;
6961
6962        $self->initialize($initialize) if defined $initialize;
6963
6964        return $self;
6965    }
6966
6967    use overload
6968        fallback => 0,
6969        qw("") => "_operator_stringify",
6970    ;
6971
6972    sub _operator_stringify {
6973        my $self = shift;
6974
6975        my $name = $self->property->full_name;
6976        $name = '""' if $name eq "";
6977        return "Map table for Property '$name'";
6978    }
6979
6980    sub add_alias {
6981        # Add a synonym for this table (which means the property itself)
6982        my $self = shift;
6983        my $name = shift;
6984        # Rest of parameters passed on.
6985
6986        $self->SUPER::add_alias($name, $self->property, @_);
6987        return;
6988    }
6989
6990    sub add_map {
6991        # Add a range of code points to the list of specially-handled code
6992        # points.  $MULTI_CP is assumed if the type of special is not passed
6993        # in.
6994
6995        my $self = shift;
6996        my $lower = shift;
6997        my $upper = shift;
6998        my $string = shift;
6999        my %args = @_;
7000
7001        my $type = delete $args{'Type'} || 0;
7002        # Rest of parameters passed on
7003
7004        # Can't change the table if locked.
7005        return if $self->carp_if_locked;
7006
7007        my $addr = do { no overloading; pack 'J', $self; };
7008
7009        $self->_range_list->add_map($lower, $upper,
7010                                    $string,
7011                                    @_,
7012                                    Type => $type);
7013        return;
7014    }
7015
7016    sub append_to_body {
7017        # Adds to the written HERE document of the table's body any anomalous
7018        # entries in the table..
7019
7020        my $self = shift;
7021        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7022
7023        my $addr = do { no overloading; pack 'J', $self; };
7024
7025        return "" unless @{$anomalous_entries{$addr}};
7026        return join("\n", @{$anomalous_entries{$addr}}) . "\n";
7027    }
7028
7029    sub map_add_or_replace_non_nulls {
7030        # This adds the mappings in the table $other to $self.  Non-null
7031        # mappings from $other override those in $self.  It essentially merges
7032        # the two tables, with the second having priority except for null
7033        # mappings.
7034
7035        my $self = shift;
7036        my $other = shift;
7037        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7038
7039        return if $self->carp_if_locked;
7040
7041        if (! $other->isa(__PACKAGE__)) {
7042            Carp::my_carp_bug("$other should be a "
7043                        . __PACKAGE__
7044                        . ".  Not a '"
7045                        . ref($other)
7046                        . "'.  Not added;");
7047            return;
7048        }
7049
7050        my $addr = do { no overloading; pack 'J', $self; };
7051        my $other_addr = do { no overloading; pack 'J', $other; };
7052
7053        local $to_trace = 0 if main::DEBUG;
7054
7055        my $self_range_list = $self->_range_list;
7056        my $other_range_list = $other->_range_list;
7057        foreach my $range ($other_range_list->ranges) {
7058            my $value = $range->value;
7059            next if $value eq "";
7060            $self_range_list->_add_delete('+',
7061                                          $range->start,
7062                                          $range->end,
7063                                          $value,
7064                                          Type => $range->type,
7065                                          Replace => $UNCONDITIONALLY);
7066        }
7067
7068        return;
7069    }
7070
7071    sub set_default_map {
7072        # Define what code points that are missing from the input files should
7073        # map to.  The optional second parameter 'full_name' indicates to
7074        # force using the full name of the map instead of its standard name.
7075
7076        my $self = shift;
7077        my $map = shift;
7078        my $use_full_name = shift // 0;
7079        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7080
7081        if ($use_full_name && $use_full_name ne 'full_name') {
7082            Carp::my_carp_bug("Second parameter to set_default_map() if"
7083                            . " present, must be 'full_name'");
7084        }
7085
7086        my $addr = do { no overloading; pack 'J', $self; };
7087
7088        # Convert the input to the standard equivalent, if any (won't have any
7089        # for $STRING properties)
7090        my $standard = $self->property->table($map);
7091        if (defined $standard) {
7092            $map = ($use_full_name)
7093                   ? $standard->full_name
7094                   : $standard->name;
7095        }
7096
7097        # Warn if there already is a non-equivalent default map for this
7098        # property.  Note that a default map can be a ref, which means that
7099        # what it actually means is delayed until later in the program, and it
7100        # IS permissible to override it here without a message.
7101        my $default_map = $default_map{$addr};
7102        if (defined $default_map
7103            && ! ref($default_map)
7104            && $default_map ne $map
7105            && main::Standardize($map) ne $default_map)
7106        {
7107            my $property = $self->property;
7108            my $map_table = $property->table($map);
7109            my $default_table = $property->table($default_map);
7110            if (defined $map_table
7111                && defined $default_table
7112                && $map_table != $default_table)
7113            {
7114                Carp::my_carp("Changing the default mapping for "
7115                            . $property
7116                            . " from $default_map to $map'");
7117            }
7118        }
7119
7120        $default_map{$addr} = $map;
7121
7122        # Don't also create any missing table for this map at this point,
7123        # because if we did, it could get done before the main table add is
7124        # done for PropValueAliases.txt; instead the caller will have to make
7125        # sure it exists, if desired.
7126        return;
7127    }
7128
7129    sub to_output_map {
7130        # Returns boolean: should we write this map table?
7131
7132        my $self = shift;
7133        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7134
7135        my $addr = do { no overloading; pack 'J', $self; };
7136
7137        # If overridden, use that
7138        return $to_output_map{$addr} if defined $to_output_map{$addr};
7139
7140        my $full_name = $self->full_name;
7141        return $global_to_output_map{$full_name}
7142                                if defined $global_to_output_map{$full_name};
7143
7144        # If table says to output, do so; if says to suppress it, do so.
7145        my $fate = $self->fate;
7146        return $INTERNAL_MAP if $fate == $INTERNAL_ONLY;
7147        return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
7148        return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED;
7149
7150        my $type = $self->property->type;
7151
7152        # Don't want to output binary map tables even for debugging.
7153        return 0 if $type == $BINARY;
7154
7155        # But do want to output string ones.  All the ones that remain to
7156        # be dealt with (i.e. which haven't explicitly been set to external)
7157        # are for internal Perl use only.  The default for those that map to
7158        # $CODE_POINT and haven't been restricted to a single element range
7159        # is to use the adjusted form.
7160        if ($type == $STRING) {
7161            return $INTERNAL_MAP if $self->range_size_1
7162                                    || $default_map{$addr} ne $CODE_POINT;
7163            return $OUTPUT_ADJUSTED;
7164        }
7165
7166        # Otherwise is an $ENUM, do output it, for Perl's purposes
7167        return $INTERNAL_MAP;
7168    }
7169
7170    sub inverse_list {
7171        # Returns a Range_List that is gaps of the current table.  That is,
7172        # the inversion
7173
7174        my $self = shift;
7175        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7176
7177        my $current = Range_List->new(Initialize => $self->_range_list,
7178                                Owner => $self->property);
7179        return ~ $current;
7180    }
7181
7182    sub header {
7183        my $self = shift;
7184        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7185
7186        my $return = $self->SUPER::header();
7187
7188        if ($self->to_output_map >= $INTERNAL_MAP) {
7189            $return .= $INTERNAL_ONLY_HEADER;
7190        }
7191        else {
7192            my $property_name = $self->property->replacement_property;
7193
7194            # The legacy-only properties were gotten above; but there are some
7195            # other properties whose files are in current use that have fixed
7196            # formats.
7197            $property_name = $self->property->full_name unless $property_name;
7198
7199            $return .= <<END;
7200
7201# !!!!!!!   IT IS DEPRECATED TO USE THIS FILE   !!!!!!!
7202
7203# This file is for internal use by core Perl only.  It is retained for
7204# backwards compatibility with applications that may have come to rely on it,
7205# but its format and even its name or existence are subject to change without
7206# notice in a future Perl version.  Don't use it directly.  Instead, its
7207# contents are now retrievable through a stable API in the Unicode::UCD
7208# module: Unicode::UCD::prop_invmap('$property_name') (Values for individual
7209# code points can be retrieved via Unicode::UCD::charprop());
7210END
7211        }
7212        return $return;
7213    }
7214
7215    sub set_final_comment {
7216        # Just before output, create the comment that heads the file
7217        # containing this table.
7218
7219        return unless $debugging_build;
7220
7221        my $self = shift;
7222        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7223
7224        # No sense generating a comment if aren't going to write it out.
7225        return if ! $self->to_output_map;
7226
7227        my $addr = do { no overloading; pack 'J', $self; };
7228
7229        my $property = $self->property;
7230
7231        # Get all the possible names for this property.  Don't use any that
7232        # aren't ok for use in a file name, etc.  This is perhaps causing that
7233        # flag to do double duty, and may have to be changed in the future to
7234        # have our own flag for just this purpose; but it works now to exclude
7235        # Perl generated synonyms from the lists for properties, where the
7236        # name is always the proper Unicode one.
7237        my @property_aliases = grep { $_->ok_as_filename } $self->aliases;
7238
7239        my $count = $self->count;
7240        my $default_map = $default_map{$addr};
7241
7242        # The ranges that map to the default aren't output, so subtract that
7243        # to get those actually output.  A property with matching tables
7244        # already has the information calculated.
7245        if ($property->type != $STRING && $property->type != $FORCED_BINARY) {
7246            $count -= $property->table($default_map)->count;
7247        }
7248        elsif (defined $default_map) {
7249
7250            # But for $STRING properties, must calculate now.  Subtract the
7251            # count from each range that maps to the default.
7252            foreach my $range ($self->_range_list->ranges) {
7253                if ($range->value eq $default_map) {
7254                    $count -= $range->end +1 - $range->start;
7255                }
7256            }
7257
7258        }
7259
7260        # Get a  string version of $count with underscores in large numbers,
7261        # for clarity.
7262        my $string_count = main::clarify_code_point_count($count);
7263
7264        my $code_points = ($count == 1)
7265                        ? 'single code point'
7266                        : "$string_count code points";
7267
7268        my $mapping;
7269        my $these_mappings;
7270        my $are;
7271        if (@property_aliases <= 1) {
7272            $mapping = 'mapping';
7273            $these_mappings = 'this mapping';
7274            $are = 'is'
7275        }
7276        else {
7277            $mapping = 'synonymous mappings';
7278            $these_mappings = 'these mappings';
7279            $are = 'are'
7280        }
7281        my $cp;
7282        if ($count >= $MAX_UNICODE_CODEPOINTS) {
7283            $cp = "any code point in Unicode Version $string_version";
7284        }
7285        else {
7286            my $map_to;
7287            if ($default_map eq "") {
7288                $map_to = 'the null string';
7289            }
7290            elsif ($default_map eq $CODE_POINT) {
7291                $map_to = "itself";
7292            }
7293            else {
7294                $map_to = "'$default_map'";
7295            }
7296            if ($count == 1) {
7297                $cp = "the single code point";
7298            }
7299            else {
7300                $cp = "one of the $code_points";
7301            }
7302            $cp .= " in Unicode Version $unicode_version for which the mapping is not to $map_to";
7303        }
7304
7305        my $comment = "";
7306
7307        my $status = $self->status;
7308        if ($status ne $NORMAL) {
7309            my $warn = uc $status_past_participles{$status};
7310            $comment .= <<END;
7311
7312!!!!!!!   $warn !!!!!!!!!!!!!!!!!!!
7313 All property or property=value combinations contained in this file are $warn.
7314 See $unicode_reference_url for what this means.
7315
7316END
7317        }
7318        $comment .= "This file returns the $mapping:\n";
7319
7320        my $ucd_accessible_name = "";
7321        my $has_underscore_name = 0;
7322        my $full_name = $self->property->full_name;
7323        for my $i (0 .. @property_aliases - 1) {
7324            my $name = $property_aliases[$i]->name;
7325            $has_underscore_name = 1 if $name =~ /^_/;
7326            $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)');
7327            if ($property_aliases[$i]->ucd) {
7328                if ($name eq $full_name) {
7329                    $ucd_accessible_name = $full_name;
7330                }
7331                elsif (! $ucd_accessible_name) {
7332                    $ucd_accessible_name = $name;
7333                }
7334            }
7335        }
7336        $comment .= "\nwhere 'cp' is $cp.";
7337        if ($ucd_accessible_name) {
7338            $comment .= "  Note that $these_mappings";
7339            if ($has_underscore_name) {
7340                $comment .= " (except for the one(s) that begin with an underscore)";
7341            }
7342            $comment .= " $are accessible via the functions prop_invmap('$full_name') or charprop() in Unicode::UCD";
7343
7344        }
7345
7346        # And append any commentary already set from the actual property.
7347        $comment .= "\n\n" . $self->comment if $self->comment;
7348        if ($self->description) {
7349            $comment .= "\n\n" . join " ", $self->description;
7350        }
7351        if ($self->note) {
7352            $comment .= "\n\n" . join " ", $self->note;
7353        }
7354        $comment .= "\n";
7355
7356        if (! $self->perl_extension) {
7357            $comment .= <<END;
7358
7359For information about what this property really means, see:
7360$unicode_reference_url
7361END
7362        }
7363
7364        if ($count) {        # Format differs for empty table
7365                $comment.= "\nThe format of the ";
7366            if ($self->range_size_1) {
7367                $comment.= <<END;
7368main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
7369is in hex; MAPPING is what CODE_POINT maps to.
7370END
7371            }
7372            else {
7373
7374                # There are tables which end up only having one element per
7375                # range, but it is not worth keeping track of for making just
7376                # this comment a little better.
7377                $comment .= <<END;
7378non-comment portions of the main body of lines of this file is:
7379START\\tSTOP\\tMAPPING where START is the starting code point of the
7380range, in hex; STOP is the ending point, or if omitted, the range has just one
7381code point; MAPPING is what each code point between START and STOP maps to.
7382END
7383                if ($self->output_range_counts) {
7384                    $comment .= <<END;
7385Numbers in comments in [brackets] indicate how many code points are in the
7386range (omitted when the range is a single code point or if the mapping is to
7387the null string).
7388END
7389                }
7390            }
7391        }
7392        $self->set_comment(main::join_lines($comment));
7393        return;
7394    }
7395
7396    my %swash_keys; # Makes sure don't duplicate swash names.
7397
7398    # The remaining variables are temporaries used while writing each table,
7399    # to output special ranges.
7400    my @multi_code_point_maps;  # Map is to more than one code point.
7401
7402    sub handle_special_range {
7403        # Called in the middle of write when it finds a range it doesn't know
7404        # how to handle.
7405
7406        my $self = shift;
7407        my $range = shift;
7408        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7409
7410        my $addr = do { no overloading; pack 'J', $self; };
7411
7412        my $type = $range->type;
7413
7414        my $low = $range->start;
7415        my $high = $range->end;
7416        my $map = $range->value;
7417
7418        # No need to output the range if it maps to the default.
7419        return if $map eq $default_map{$addr};
7420
7421        my $property = $self->property;
7422
7423        # Switch based on the map type...
7424        if ($type == $HANGUL_SYLLABLE) {
7425
7426            # These are entirely algorithmically determinable based on
7427            # some constants furnished by Unicode; for now, just set a
7428            # flag to indicate that have them.  After everything is figured
7429            # out, we will output the code that does the algorithm.  (Don't
7430            # output them if not needed because we are suppressing this
7431            # property.)
7432            $has_hangul_syllables = 1 if $property->to_output_map;
7433        }
7434        elsif ($type == $CP_IN_NAME) {
7435
7436            # Code points whose name ends in their code point are also
7437            # algorithmically determinable, but need information about the map
7438            # to do so.  Both the map and its inverse are stored in data
7439            # structures output in the file.  They are stored in the mean time
7440            # in global lists The lists will be written out later into Name.pm,
7441            # which is created only if needed.  In order to prevent duplicates
7442            # in the list, only add to them for one property, should multiple
7443            # ones need them.
7444            if ($needing_code_points_ending_in_code_point == 0) {
7445                $needing_code_points_ending_in_code_point = $property;
7446            }
7447            if ($property == $needing_code_points_ending_in_code_point) {
7448                push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
7449                push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
7450
7451                my $squeezed = $map =~ s/[-\s]+//gr;
7452                push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}},
7453                                                                          $low;
7454                push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}},
7455                                                                         $high;
7456
7457                push @code_points_ending_in_code_point, { low => $low,
7458                                                        high => $high,
7459                                                        name => $map
7460                                                        };
7461            }
7462        }
7463        elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
7464
7465            # Multi-code point maps and null string maps have an entry
7466            # for each code point in the range.  They use the same
7467            # output format.
7468            for my $code_point ($low .. $high) {
7469
7470                # The pack() below can't cope with surrogates.  XXX This may
7471                # no longer be true
7472                if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
7473                    Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self.  No map created");
7474                    next;
7475                }
7476
7477                # Generate the hash entries for these in the form that
7478                # utf8.c understands.
7479                my $tostr = "";
7480                my $to_name = "";
7481                my $to_chr = "";
7482                foreach my $to (split " ", $map) {
7483                    if ($to !~ /^$code_point_re$/) {
7484                        Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self.  No map created");
7485                        next;
7486                    }
7487                    $tostr .= sprintf "\\x{%s}", $to;
7488                    $to = CORE::hex $to;
7489                    if ($annotate) {
7490                        $to_name .= " + " if $to_name;
7491                        $to_chr .= main::display_chr($to);
7492                        main::populate_char_info($to)
7493                                            if ! defined $viacode[$to];
7494                        $to_name .=  $viacode[$to];
7495                    }
7496                }
7497
7498                # The unpack yields a list of the bytes that comprise the
7499                # UTF-8 of $code_point, which are each placed in \xZZ format
7500                # and output in the %s to map to $tostr, so the result looks
7501                # like:
7502                # "\xC4\xB0" => "\x{0069}\x{0307}",
7503                my $utf8 = sprintf(qq["%s" => "$tostr",],
7504                        join("", map { sprintf "\\x%02X", $_ }
7505                            unpack("U0C*", chr $code_point)));
7506
7507                # Add a comment so that a human reader can more easily
7508                # see what's going on.
7509                push @multi_code_point_maps,
7510                        sprintf("%-45s # U+%04X", $utf8, $code_point);
7511                if (! $annotate) {
7512                    $multi_code_point_maps[-1] .= " => $map";
7513                }
7514                else {
7515                    main::populate_char_info($code_point)
7516                                    if ! defined $viacode[$code_point];
7517                    $multi_code_point_maps[-1] .= " '"
7518                        . main::display_chr($code_point)
7519                        . "' => '$to_chr'; $viacode[$code_point] => $to_name";
7520                }
7521            }
7522        }
7523        else {
7524            Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self.  Not written");
7525        }
7526
7527        return;
7528    }
7529
7530    sub pre_body {
7531        # Returns the string that should be output in the file before the main
7532        # body of this table.  It isn't called until the main body is
7533        # calculated, saving a pass.  The string includes some hash entries
7534        # identifying the format of the body, and what the single value should
7535        # be for all ranges missing from it.  It also includes any code points
7536        # which have map_types that don't go in the main table.
7537
7538        my $self = shift;
7539        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7540
7541        my $addr = do { no overloading; pack 'J', $self; };
7542
7543        my $name = $self->property->swash_name;
7544
7545        # Currently there is nothing in the pre_body unless a swash is being
7546        # generated.
7547        return unless defined $name;
7548
7549        if (defined $swash_keys{$name}) {
7550            Carp::my_carp(main::join_lines(<<END
7551Already created a swash name '$name' for $swash_keys{$name}.  This means that
7552the same name desired for $self shouldn't be used.  Bad News.  This must be
7553fixed before production use, but proceeding anyway
7554END
7555            ));
7556        }
7557        $swash_keys{$name} = "$self";
7558
7559        my $pre_body = "";
7560
7561        # Here we assume we were called after have gone through the whole
7562        # file.  If we actually generated anything for each map type, add its
7563        # respective header and trailer
7564        my $specials_name = "";
7565        if (@multi_code_point_maps) {
7566            $specials_name = "utf8::ToSpec$name";
7567            $pre_body .= <<END;
7568
7569# Some code points require special handling because their mappings are each to
7570# multiple code points.  These do not appear in the main body, but are defined
7571# in the hash below.
7572
7573# Each key is the string of N bytes that together make up the UTF-8 encoding
7574# for the code point.  (i.e. the same as looking at the code point's UTF-8
7575# under "use bytes").  Each value is the UTF-8 of the translation, for speed.
7576\%$specials_name = (
7577END
7578            $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
7579        }
7580
7581        my $format = $self->format;
7582
7583        my $return = "";
7584
7585        my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
7586        if ($output_adjusted) {
7587            if ($specials_name) {
7588                $return .= <<END;
7589# The mappings in the non-hash portion of this file must be modified to get the
7590# correct values by adding the code point ordinal number to each one that is
7591# numeric.
7592END
7593            }
7594            else {
7595                $return .= <<END;
7596# The mappings must be modified to get the correct values by adding the code
7597# point ordinal number to each one that is numeric.
7598END
7599            }
7600        }
7601
7602        $return .= <<END;
7603
7604# The name this swash is to be known by, with the format of the mappings in
7605# the main body of the table, and what all code points missing from this file
7606# map to.
7607\$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
7608END
7609        if ($specials_name) {
7610            $return .= <<END;
7611\$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
7612END
7613        }
7614        my $default_map = $default_map{$addr};
7615
7616        # For $CODE_POINT default maps and using adjustments, instead the default
7617        # becomes zero.
7618        $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '"
7619                .  (($output_adjusted && $default_map eq $CODE_POINT)
7620                   ? "0"
7621                   : $default_map)
7622                . "';";
7623
7624        if ($default_map eq $CODE_POINT) {
7625            $return .= ' # code point maps to itself';
7626        }
7627        elsif ($default_map eq "") {
7628            $return .= ' # code point maps to the null string';
7629        }
7630        $return .= "\n";
7631
7632        $return .= $pre_body;
7633
7634        return $return;
7635    }
7636
7637    sub write {
7638        # Write the table to the file.
7639
7640        my $self = shift;
7641        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7642
7643        my $addr = do { no overloading; pack 'J', $self; };
7644
7645        # Clear the temporaries
7646        undef @multi_code_point_maps;
7647
7648        # Calculate the format of the table if not already done.
7649        my $format = $self->format;
7650        my $type = $self->property->type;
7651        my $default_map = $self->default_map;
7652        if (! defined $format) {
7653            if ($type == $BINARY) {
7654
7655                # Don't bother checking the values, because we elsewhere
7656                # verify that a binary table has only 2 values.
7657                $format = $BINARY_FORMAT;
7658            }
7659            else {
7660                my @ranges = $self->_range_list->ranges;
7661
7662                # default an empty table based on its type and default map
7663                if (! @ranges) {
7664
7665                    # But it turns out that the only one we can say is a
7666                    # non-string (besides binary, handled above) is when the
7667                    # table is a string and the default map is to a code point
7668                    if ($type == $STRING && $default_map eq $CODE_POINT) {
7669                        $format = $HEX_FORMAT;
7670                    }
7671                    else {
7672                        $format = $STRING_FORMAT;
7673                    }
7674                }
7675                else {
7676
7677                    # Start with the most restrictive format, and as we find
7678                    # something that doesn't fit with that, change to the next
7679                    # most restrictive, and so on.
7680                    $format = $DECIMAL_FORMAT;
7681                    foreach my $range (@ranges) {
7682                        next if $range->type != 0;  # Non-normal ranges don't
7683                                                    # affect the main body
7684                        my $map = $range->value;
7685                        if ($map ne $default_map) {
7686                            last if $format eq $STRING_FORMAT;  # already at
7687                                                                # least
7688                                                                # restrictive
7689                            $format = $INTEGER_FORMAT
7690                                                if $format eq $DECIMAL_FORMAT
7691                                                    && $map !~ / ^ [0-9] $ /x;
7692                            $format = $FLOAT_FORMAT
7693                                            if $format eq $INTEGER_FORMAT
7694                                                && $map !~ / ^ -? [0-9]+ $ /x;
7695                            $format = $RATIONAL_FORMAT
7696                                if $format eq $FLOAT_FORMAT
7697                                    && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
7698                            $format = $HEX_FORMAT
7699                                if ($format eq $RATIONAL_FORMAT
7700                                       && $map !~
7701                                           m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x)
7702                                        # Assume a leading zero means hex,
7703                                        # even if all digits are 0-9
7704                                    || ($format eq $INTEGER_FORMAT
7705                                        && $map =~ /^0[0-9A-F]/);
7706                            $format = $STRING_FORMAT if $format eq $HEX_FORMAT
7707                                                       && $map =~ /[^0-9A-F]/;
7708                        }
7709                    }
7710                }
7711            }
7712        } # end of calculating format
7713
7714        if ($default_map eq $CODE_POINT
7715            && $format ne $HEX_FORMAT
7716            && ! defined $self->format)    # manual settings are always
7717                                           # considered ok
7718        {
7719            Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
7720        }
7721
7722        # If the output is to be adjusted, the format of the table that gets
7723        # output is actually 'a' or 'ax' instead of whatever it is stored
7724        # internally as.
7725        my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
7726        if ($output_adjusted) {
7727            if ($default_map eq $CODE_POINT) {
7728                $format = $HEX_ADJUST_FORMAT;
7729            }
7730            else {
7731                $format = $ADJUST_FORMAT;
7732            }
7733        }
7734
7735        $self->_set_format($format);
7736
7737        return $self->SUPER::write(
7738            $output_adjusted,
7739            $default_map);   # don't write defaulteds
7740    }
7741
7742    # Accessors for the underlying list that should fail if locked.
7743    for my $sub (qw(
7744                    add_duplicate
7745                    replace_map
7746                ))
7747    {
7748        no strict "refs";
7749        *$sub = sub {
7750            use strict "refs";
7751            my $self = shift;
7752
7753            return if $self->carp_if_locked;
7754            return $self->_range_list->$sub(@_);
7755        }
7756    }
7757} # End closure for Map_Table
7758
7759package Match_Table;
7760use parent '-norequire', '_Base_Table';
7761
7762# A Match table is one which is a list of all the code points that have
7763# the same property and property value, for use in \p{property=value}
7764# constructs in regular expressions.  It adds very little data to the base
7765# structure, but many methods, as these lists can be combined in many ways to
7766# form new ones.
7767# There are only a few concepts added:
7768# 1) Equivalents and Relatedness.
7769#    Two tables can match the identical code points, but have different names.
7770#    This always happens when there is a perl single form extension
7771#    \p{IsProperty} for the Unicode compound form \P{Property=True}.  The two
7772#    tables are set to be related, with the Perl extension being a child, and
7773#    the Unicode property being the parent.
7774#
7775#    It may be that two tables match the identical code points and we don't
7776#    know if they are related or not.  This happens most frequently when the
7777#    Block and Script properties have the exact range.  But note that a
7778#    revision to Unicode could add new code points to the script, which would
7779#    now have to be in a different block (as the block was filled, or there
7780#    would have been 'Unknown' script code points in it and they wouldn't have
7781#    been identical).  So we can't rely on any two properties from Unicode
7782#    always matching the same code points from release to release, and thus
7783#    these tables are considered coincidentally equivalent--not related.  When
7784#    two tables are unrelated but equivalent, one is arbitrarily chosen as the
7785#    'leader', and the others are 'equivalents'.  This concept is useful
7786#    to minimize the number of tables written out.  Only one file is used for
7787#    any identical set of code points, with entries in Heavy.pl mapping all
7788#    the involved tables to it.
7789#
7790#    Related tables will always be identical; we set them up to be so.  Thus
7791#    if the Unicode one is deprecated, the Perl one will be too.  Not so for
7792#    unrelated tables.  Relatedness makes generating the documentation easier.
7793#
7794# 2) Complement.
7795#    Like equivalents, two tables may be the inverses of each other, the
7796#    intersection between them is null, and the union is every Unicode code
7797#    point.  The two tables that occupy a binary property are necessarily like
7798#    this.  By specifying one table as the complement of another, we can avoid
7799#    storing it on disk (using the other table and performing a fast
7800#    transform), and some memory and calculations.
7801#
7802# 3) Conflicting.  It may be that there will eventually be name clashes, with
7803#    the same name meaning different things.  For a while, there actually were
7804#    conflicts, but they have so far been resolved by changing Perl's or
7805#    Unicode's definitions to match the other, but when this code was written,
7806#    it wasn't clear that that was what was going to happen.  (Unicode changed
7807#    because of protests during their beta period.)  Name clashes are warned
7808#    about during compilation, and the documentation.  The generated tables
7809#    are sane, free of name clashes, because the code suppresses the Perl
7810#    version.  But manual intervention to decide what the actual behavior
7811#    should be may be required should this happen.  The introductory comments
7812#    have more to say about this.
7813#
7814# 4) Definition.  This is a string for human consumption that specifies the
7815#    code points that this table matches.  This is used only for the generated
7816#    pod file.  It may be specified explicitly, or automatically computed.
7817#    Only the first portion of complicated definitions is computed and
7818#    displayed.
7819
7820sub standardize { return main::standardize($_[0]); }
7821sub trace { return main::trace(@_); }
7822
7823
7824{ # Closure
7825
7826    main::setup_package();
7827
7828    my %leader;
7829    # The leader table of this one; initially $self.
7830    main::set_access('leader', \%leader, 'r');
7831
7832    my %equivalents;
7833    # An array of any tables that have this one as their leader
7834    main::set_access('equivalents', \%equivalents, 'readable_array');
7835
7836    my %parent;
7837    # The parent table to this one, initially $self.  This allows us to
7838    # distinguish between equivalent tables that are related (for which this
7839    # is set to), and those which may not be, but share the same output file
7840    # because they match the exact same set of code points in the current
7841    # Unicode release.
7842    main::set_access('parent', \%parent, 'r');
7843
7844    my %children;
7845    # An array of any tables that have this one as their parent
7846    main::set_access('children', \%children, 'readable_array');
7847
7848    my %conflicting;
7849    # Array of any tables that would have the same name as this one with
7850    # a different meaning.  This is used for the generated documentation.
7851    main::set_access('conflicting', \%conflicting, 'readable_array');
7852
7853    my %matches_all;
7854    # Set in the constructor for tables that are expected to match all code
7855    # points.
7856    main::set_access('matches_all', \%matches_all, 'r');
7857
7858    my %complement;
7859    # Points to the complement that this table is expressed in terms of; 0 if
7860    # none.
7861    main::set_access('complement', \%complement, 'r');
7862
7863    my %definition;
7864    # Human readable string of the first few ranges of code points matched by
7865    # this table
7866    main::set_access('definition', \%definition, 'r', 's');
7867
7868    sub new {
7869        my $class = shift;
7870
7871        my %args = @_;
7872
7873        # The property for which this table is a listing of property values.
7874        my $property = delete $args{'_Property'};
7875
7876        my $name = delete $args{'Name'};
7877        my $full_name = delete $args{'Full_Name'};
7878        $full_name = $name if ! defined $full_name;
7879
7880        # Optional
7881        my $initialize = delete $args{'Initialize'};
7882        my $matches_all = delete $args{'Matches_All'} || 0;
7883        my $format = delete $args{'Format'};
7884        my $definition = delete $args{'Definition'} // "";
7885        # Rest of parameters passed on.
7886
7887        my $range_list = Range_List->new(Initialize => $initialize,
7888                                         Owner => $property);
7889
7890        my $complete = $full_name;
7891        $complete = '""' if $complete eq "";  # A null name shouldn't happen,
7892                                              # but this helps debug if it
7893                                              # does
7894        # The complete name for a match table includes it's property in a
7895        # compound form 'property=table', except if the property is the
7896        # pseudo-property, perl, in which case it is just the single form,
7897        # 'table' (If you change the '=' must also change the ':' in lots of
7898        # places in this program that assume an equal sign)
7899        $complete = $property->full_name . "=$complete" if $property != $perl;
7900
7901        my $self = $class->SUPER::new(%args,
7902                                      Name => $name,
7903                                      Complete_Name => $complete,
7904                                      Full_Name => $full_name,
7905                                      _Property => $property,
7906                                      _Range_List => $range_list,
7907                                      Format => $EMPTY_FORMAT,
7908                                      Write_As_Invlist => 1,
7909                                      );
7910        my $addr = do { no overloading; pack 'J', $self; };
7911
7912        $conflicting{$addr} = [ ];
7913        $equivalents{$addr} = [ ];
7914        $children{$addr} = [ ];
7915        $matches_all{$addr} = $matches_all;
7916        $leader{$addr} = $self;
7917        $parent{$addr} = $self;
7918        $complement{$addr} = 0;
7919        $definition{$addr} = $definition;
7920
7921        if (defined $format && $format ne $EMPTY_FORMAT) {
7922            Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'.  Using '$EMPTY_FORMAT'");
7923        }
7924
7925        return $self;
7926    }
7927
7928    # See this program's beginning comment block about overloading these.
7929    use overload
7930        fallback => 0,
7931        qw("") => "_operator_stringify",
7932        '=' => sub {
7933                    my $self = shift;
7934
7935                    return if $self->carp_if_locked;
7936                    return $self;
7937                },
7938
7939        '+' => sub {
7940                        my $self = shift;
7941                        my $other = shift;
7942
7943                        return $self->_range_list + $other;
7944                    },
7945        '&' => sub {
7946                        my $self = shift;
7947                        my $other = shift;
7948
7949                        return $self->_range_list & $other;
7950                    },
7951        '+=' => sub {
7952                        my $self = shift;
7953                        my $other = shift;
7954                        my $reversed = shift;
7955
7956                        if ($reversed) {
7957                            Carp::my_carp_bug("Bad news.  Can't cope with '"
7958                            . ref($other)
7959                            . ' += '
7960                            . ref($self)
7961                            . "'.  undef returned.");
7962                            return;
7963                        }
7964
7965                        return if $self->carp_if_locked;
7966
7967                        my $addr = do { no overloading; pack 'J', $self; };
7968
7969                        if (ref $other) {
7970
7971                            # Change the range list of this table to be the
7972                            # union of the two.
7973                            $self->_set_range_list($self->_range_list
7974                                                    + $other);
7975                        }
7976                        else {    # $other is just a simple value
7977                            $self->add_range($other, $other);
7978                        }
7979                        return $self;
7980                    },
7981        '&=' => sub {
7982                        my $self = shift;
7983                        my $other = shift;
7984                        my $reversed = shift;
7985
7986                        if ($reversed) {
7987                            Carp::my_carp_bug("Bad news.  Can't cope with '"
7988                            . ref($other)
7989                            . ' &= '
7990                            . ref($self)
7991                            . "'.  undef returned.");
7992                            return;
7993                        }
7994
7995                        return if $self->carp_if_locked;
7996                        $self->_set_range_list($self->_range_list & $other);
7997                        return $self;
7998                    },
7999        '-' => sub { my $self = shift;
8000                    my $other = shift;
8001                    my $reversed = shift;
8002                    if ($reversed) {
8003                        Carp::my_carp_bug("Bad news.  Can't cope with '"
8004                        . ref($other)
8005                        . ' - '
8006                        . ref($self)
8007                        . "'.  undef returned.");
8008                        return;
8009                    }
8010
8011                    return $self->_range_list - $other;
8012                },
8013        '~' => sub { my $self = shift;
8014                    return ~ $self->_range_list;
8015                },
8016    ;
8017
8018    sub _operator_stringify {
8019        my $self = shift;
8020
8021        my $name = $self->complete_name;
8022        return "Table '$name'";
8023    }
8024
8025    sub _range_list {
8026        # Returns the range list associated with this table, which will be the
8027        # complement's if it has one.
8028
8029        my $self = shift;
8030        my $complement = $self->complement;
8031
8032        # In order to avoid re-complementing on each access, only do the
8033        # complement the first time, and store the result in this table's
8034        # range list to use henceforth.  However, this wouldn't work if the
8035        # controlling (complement) table changed after we do this, so lock it.
8036        # Currently, the value of the complement isn't needed until after it
8037        # is fully constructed, so this works.  If this were to change, the
8038        # each_range iteration functionality would no longer work on this
8039        # complement.
8040        if ($complement != 0 && $self->SUPER::_range_list->count == 0) {
8041            $self->_set_range_list($self->SUPER::_range_list
8042                                + ~ $complement->_range_list);
8043            $complement->lock;
8044        }
8045
8046        return $self->SUPER::_range_list;
8047    }
8048
8049    sub add_alias {
8050        # Add a synonym for this table.  See the comments in the base class
8051
8052        my $self = shift;
8053        my $name = shift;
8054        # Rest of parameters passed on.
8055
8056        $self->SUPER::add_alias($name, $self, @_);
8057        return;
8058    }
8059
8060    sub add_conflicting {
8061        # Add the name of some other object to the list of ones that name
8062        # clash with this match table.
8063
8064        my $self = shift;
8065        my $conflicting_name = shift;   # The name of the conflicting object
8066        my $p = shift || 'p';           # Optional, is this a \p{} or \P{} ?
8067        my $conflicting_object = shift; # Optional, the conflicting object
8068                                        # itself.  This is used to
8069                                        # disambiguate the text if the input
8070                                        # name is identical to any of the
8071                                        # aliases $self is known by.
8072                                        # Sometimes the conflicting object is
8073                                        # merely hypothetical, so this has to
8074                                        # be an optional parameter.
8075        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8076
8077        my $addr = do { no overloading; pack 'J', $self; };
8078
8079        # Check if the conflicting name is exactly the same as any existing
8080        # alias in this table (as long as there is a real object there to
8081        # disambiguate with).
8082        if (defined $conflicting_object) {
8083            foreach my $alias ($self->aliases) {
8084                if (standardize($alias->name) eq standardize($conflicting_name)) {
8085
8086                    # Here, there is an exact match.  This results in
8087                    # ambiguous comments, so disambiguate by changing the
8088                    # conflicting name to its object's complete equivalent.
8089                    $conflicting_name = $conflicting_object->complete_name;
8090                    last;
8091                }
8092            }
8093        }
8094
8095        # Convert to the \p{...} final name
8096        $conflicting_name = "\\$p" . "{$conflicting_name}";
8097
8098        # Only add once
8099        return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
8100
8101        push @{$conflicting{$addr}}, $conflicting_name;
8102
8103        return;
8104    }
8105
8106    sub is_set_equivalent_to {
8107        # Return boolean of whether or not the other object is a table of this
8108        # type and has been marked equivalent to this one.
8109
8110        my $self = shift;
8111        my $other = shift;
8112        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8113
8114        return 0 if ! defined $other; # Can happen for incomplete early
8115                                      # releases
8116        unless ($other->isa(__PACKAGE__)) {
8117            my $ref_other = ref $other;
8118            my $ref_self = ref $self;
8119            Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'.  $other not set equivalent to $self.");
8120            return 0;
8121        }
8122
8123        # Two tables are equivalent if they have the same leader.
8124        no overloading;
8125        return $leader{pack 'J', $self} == $leader{pack 'J', $other};
8126        return;
8127    }
8128
8129    sub set_equivalent_to {
8130        # Set $self equivalent to the parameter table.
8131        # The required Related => 'x' parameter is a boolean indicating
8132        # whether these tables are related or not.  If related, $other becomes
8133        # the 'parent' of $self; if unrelated it becomes the 'leader'
8134        #
8135        # Related tables share all characteristics except names; equivalents
8136        # not quite so many.
8137        # If they are related, one must be a perl extension.  This is because
8138        # we can't guarantee that Unicode won't change one or the other in a
8139        # later release even if they are identical now.
8140
8141        my $self = shift;
8142        my $other = shift;
8143
8144        my %args = @_;
8145        my $related = delete $args{'Related'};
8146
8147        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
8148
8149        return if ! defined $other;     # Keep on going; happens in some early
8150                                        # Unicode releases.
8151
8152        if (! defined $related) {
8153            Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter.  Assuming $self is not related to $other");
8154            $related = 0;
8155        }
8156
8157        # If already are equivalent, no need to re-do it;  if subroutine
8158        # returns null, it found an error, also do nothing
8159        my $are_equivalent = $self->is_set_equivalent_to($other);
8160        return if ! defined $are_equivalent || $are_equivalent;
8161
8162        my $addr = do { no overloading; pack 'J', $self; };
8163        my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
8164
8165        if ($related) {
8166            if ($current_leader->perl_extension) {
8167                if ($other->perl_extension) {
8168                    Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
8169                    return;
8170                }
8171            } elsif ($self->property != $other->property    # Depending on
8172                                                            # situation, might
8173                                                            # be better to use
8174                                                            # add_alias()
8175                                                            # instead for same
8176                                                            # property
8177                     && ! $other->perl_extension
8178
8179                         # We allow the sc and scx properties to be marked as
8180                         # related.  They are in fact related, and this allows
8181                         # the pod to show that better.  This test isn't valid
8182                         # if this is an early Unicode release without the scx
8183                         # property (having that also implies the sc property
8184                         # exists, so don't have to test for no 'sc')
8185                     && (   ! defined $scx
8186                         && ! (   (   $self->property == $script
8187                                   || $self->property == $scx)
8188                               && (   $self->property == $script
8189                                   || $self->property == $scx))))
8190            {
8191                Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties.  Assuming $self is not related to $other");
8192                $related = 0;
8193            }
8194        }
8195
8196        if (! $self->is_empty && ! $self->matches_identically_to($other)) {
8197            Carp::my_carp_bug("$self should be empty or match identically to $other.  Not setting equivalent");
8198            return;
8199        }
8200
8201        my $leader = do { no overloading; pack 'J', $current_leader; };
8202        my $other_addr = do { no overloading; pack 'J', $other; };
8203
8204        # Any tables that are equivalent to or children of this table must now
8205        # instead be equivalent to or (children) to the new leader (parent),
8206        # still equivalent.  The equivalency includes their matches_all info,
8207        # and for related tables, their fate and status.
8208        # All related tables are of necessity equivalent, but the converse
8209        # isn't necessarily true
8210        my $status = $other->status;
8211        my $status_info = $other->status_info;
8212        my $fate = $other->fate;
8213        my $matches_all = $matches_all{other_addr};
8214        my $caseless_equivalent = $other->caseless_equivalent;
8215        foreach my $table ($current_leader, @{$equivalents{$leader}}) {
8216            next if $table == $other;
8217            trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
8218
8219            my $table_addr = do { no overloading; pack 'J', $table; };
8220            $leader{$table_addr} = $other;
8221            $matches_all{$table_addr} = $matches_all;
8222            $self->_set_range_list($other->_range_list);
8223            push @{$equivalents{$other_addr}}, $table;
8224            if ($related) {
8225                $parent{$table_addr} = $other;
8226                push @{$children{$other_addr}}, $table;
8227                $table->set_status($status, $status_info);
8228
8229                # This reason currently doesn't get exposed outside; otherwise
8230                # would have to look up the parent's reason and use it instead.
8231                $table->set_fate($fate, "Parent's fate");
8232
8233                $self->set_caseless_equivalent($caseless_equivalent);
8234            }
8235        }
8236
8237        # Now that we've declared these to be equivalent, any changes to one
8238        # of the tables would invalidate that equivalency.
8239        $self->lock;
8240        $other->lock;
8241        return;
8242    }
8243
8244    sub set_complement {
8245        # Set $self to be the complement of the parameter table.  $self is
8246        # locked, as what it contains should all come from the other table.
8247
8248        my $self = shift;
8249        my $other = shift;
8250
8251        my %args = @_;
8252        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
8253
8254        if ($other->complement != 0) {
8255            Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement);
8256            return;
8257        }
8258        my $addr = do { no overloading; pack 'J', $self; };
8259        $complement{$addr} = $other;
8260
8261        # Be sure the other property knows we are depending on them; or the
8262        # other table if it is one in the current property.
8263        if ($self->property != $other->property) {
8264            $other->property->set_has_dependency(1);
8265        }
8266        else {
8267            $other->set_has_dependency(1);
8268        }
8269        $self->lock;
8270        return;
8271    }
8272
8273    sub add_range { # Add a range to the list for this table.
8274        my $self = shift;
8275        # Rest of parameters passed on
8276
8277        return if $self->carp_if_locked;
8278        return $self->_range_list->add_range(@_);
8279    }
8280
8281    sub header {
8282        my $self = shift;
8283        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8284
8285        # All match tables are to be used only by the Perl core.
8286        return $self->SUPER::header() . $INTERNAL_ONLY_HEADER;
8287    }
8288
8289    sub pre_body {  # Does nothing for match tables.
8290        return
8291    }
8292
8293    sub append_to_body {  # Does nothing for match tables.
8294        return
8295    }
8296
8297    sub set_fate {
8298        my $self = shift;
8299        my $fate = shift;
8300        my $reason = shift;
8301        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8302
8303        $self->SUPER::set_fate($fate, $reason);
8304
8305        # All children share this fate
8306        foreach my $child ($self->children) {
8307            $child->set_fate($fate, $reason);
8308        }
8309        return;
8310    }
8311
8312    sub calculate_table_definition
8313    {
8314        # Returns a human-readable string showing some or all of the code
8315        # points matched by this table.  The string will include a
8316        # bracketed-character class for all characters matched in the 00-FF
8317        # range, and the first few ranges matched beyond that.
8318        my $max_ranges = 6;
8319
8320        my $self = shift;
8321        my $definition = $self->definition || "";
8322
8323        # Skip this if already have a definition.
8324        return $definition if $definition;
8325
8326        my $lows_string = "";   # The string representation of the 0-FF
8327                                # characters
8328        my $string_range = "";  # The string rep. of the above FF ranges
8329        my $range_count = 0;    # How many ranges in $string_rage
8330
8331        my @lows_invlist;       # The inversion list of the 0-FF code points
8332        my $first_non_control = ord(" ");   # Everything below this is a
8333                                            # control, on ASCII or EBCDIC
8334        my $max_table_code_point = $self->max;
8335
8336        # On ASCII platforms, the range 80-FF contains no printables.
8337        my $highest_printable = ((main::NON_ASCII_PLATFORM) ? 255 : 126);
8338
8339
8340        # Look through the first few ranges matched by this table.
8341        $self->reset_each_range;    # Defensive programming
8342        while (defined (my $range = $self->each_range())) {
8343            my $start = $range->start;
8344            my $end = $range->end;
8345
8346            # Accumulate an inversion list of the 00-FF code points
8347            if ($start < 256 && ($start > 0 || $end < 256)) {
8348                push @lows_invlist, $start;
8349                push @lows_invlist, 1 + (($end < 256) ? $end : 255);
8350
8351                # Get next range if there are more ranges below 256
8352                next if $end < 256 && $end < $max_table_code_point;
8353
8354                # If the range straddles the 255/256 boundary, we split it
8355                # there.  We already added above the low portion to the
8356                # inversion list
8357                $start = 256 if $end > 256;
8358            }
8359
8360            # Here, @lows_invlist contains the code points below 256, and
8361            # there is no other range, or the current one starts at or above
8362            # 256.  Generate the [char class] for the 0-255 ones.
8363            while (@lows_invlist) {
8364
8365                # If this range (necessarily the first one, by the way) starts
8366                # at 0 ...
8367                if ($lows_invlist[0] == 0) {
8368
8369                    # If it ends within the block of controls, that means that
8370                    # some controls are in it and some aren't.  Since Unicode
8371                    # properties pretty much only know about a few of the
8372                    # controls, like \n, \t, this means that its one of them
8373                    # that isn't in the range.  Complement the inversion list
8374                    # which will likely cause these to be output using their
8375                    # mnemonics, hence being clearer.
8376                    if ($lows_invlist[1] < $first_non_control) {
8377                        $lows_string .= '^';
8378                        shift @lows_invlist;
8379                        push @lows_invlist, 256;
8380                    }
8381                    elsif ($lows_invlist[1] <= $highest_printable) {
8382
8383                        # Here, it extends into the printables block.  Split
8384                        # into two ranges so that the controls are separate.
8385                        $lows_string .= sprintf "\\x00-\\x%02x",
8386                                                    $first_non_control - 1;
8387                        $lows_invlist[0] = $first_non_control;
8388                    }
8389                }
8390
8391                # If the range completely contains the printables, don't
8392                # individually spell out the printables.
8393                if (    $lows_invlist[0] <= $first_non_control
8394                    && $lows_invlist[1] > $highest_printable)
8395                {
8396                    $lows_string .= sprintf "\\x%02x-\\x%02x",
8397                                        $lows_invlist[0], $lows_invlist[1] - 1;
8398                    shift @lows_invlist;
8399                    shift @lows_invlist;
8400                    next;
8401                }
8402
8403                # Here, the range may include some but not all printables.
8404                # Look at each one individually
8405                foreach my $ord (shift @lows_invlist .. shift(@lows_invlist) - 1) {
8406                    my $char = chr $ord;
8407
8408                    # If there is already something in the list, an
8409                    # alphanumeric char could be the next in sequence.  If so,
8410                    # we start or extend a range.  That is, we could have so
8411                    # far something like 'a-c', and the next char is a 'd', so
8412                    # we change it to 'a-d'.  We use native_to_unicode()
8413                    # because a-z on EBCDIC means 26 chars, and excludes the
8414                    # gap ones.
8415                    if ($lows_string ne "" && $char =~ /[[:alnum:]]/) {
8416                        my $prev = substr($lows_string, -1);
8417                        if (   $prev !~ /[[:alnum:]]/
8418                            ||   utf8::native_to_unicode(ord $prev) + 1
8419                              != utf8::native_to_unicode(ord $char))
8420                        {
8421                            # Not extending the range
8422                            $lows_string .= $char;
8423                        }
8424                        elsif (   length $lows_string > 1
8425                               && substr($lows_string, -2, 1) eq '-')
8426                        {
8427                            # We had a sequence like '-c' and the current
8428                            # character is 'd'.  Extend the range.
8429                            substr($lows_string, -1, 1) = $char;
8430                        }
8431                        else {
8432                            # We had something like 'd' and this is 'e'.
8433                            # Start a range.
8434                            $lows_string .= "-$char";
8435                        }
8436                    }
8437                    elsif ($char =~ /[[:graph:]]/) {
8438
8439                        # We output a graphic char as-is, preceded by a
8440                        # backslash if it is a metacharacter
8441                        $lows_string .= '\\'
8442                                if $char =~ /[\\\^\$\@\%\|()\[\]\{\}\-\/"']/;
8443                        $lows_string .= $char;
8444                    } # Otherwise use mnemonic for any that have them
8445                    elsif ($char =~ /[\a]/) {
8446                        $lows_string .= '\a';
8447                    }
8448                    elsif ($char =~ /[\b]/) {
8449                        $lows_string .= '\b';
8450                    }
8451                    elsif ($char eq "\e") {
8452                        $lows_string .= '\e';
8453                    }
8454                    elsif ($char eq "\f") {
8455                        $lows_string .= '\f';
8456                    }
8457                    elsif ($char eq "\cK") {
8458                        $lows_string .= '\cK';
8459                    }
8460                    elsif ($char eq "\n") {
8461                        $lows_string .= '\n';
8462                    }
8463                    elsif ($char eq "\r") {
8464                        $lows_string .= '\r';
8465                    }
8466                    elsif ($char eq "\t") {
8467                        $lows_string .= '\t';
8468                    }
8469                    else {
8470
8471                        # Here is a non-graphic without a mnemonic.  We use \x
8472                        # notation.  But if the ordinal of this is one above
8473                        # the previous, create or extend the range
8474                        my $hex_representation = sprintf("%02x", ord $char);
8475                        if (   length $lows_string >= 4
8476                            && substr($lows_string, -4, 2) eq '\\x'
8477                            && hex(substr($lows_string, -2)) + 1 == ord $char)
8478                        {
8479                            if (       length $lows_string >= 5
8480                                &&     substr($lows_string, -5, 1) eq '-'
8481                                && (   length $lows_string == 5
8482                                    || substr($lows_string, -6, 1) ne '\\'))
8483                            {
8484                                substr($lows_string, -2) = $hex_representation;
8485                            }
8486                            else {
8487                                $lows_string .= '-\\x' . $hex_representation;
8488                            }
8489                        }
8490                        else {
8491                            $lows_string .= '\\x' . $hex_representation;
8492                        }
8493                    }
8494                }
8495            }
8496
8497            # Done with assembling the string of all lows.  If there are only
8498            # lows in the property, are completely done.
8499            if ($max_table_code_point < 256) {
8500                $self->reset_each_range;
8501                last;
8502            }
8503
8504            # Otherwise, quit if reached max number of non-lows ranges.  If
8505            # there are lows, count them as one unit towards the maximum.
8506            $range_count++;
8507            if ($range_count > (($lows_string eq "") ? $max_ranges : $max_ranges - 1)) {
8508                $string_range .= " ...";
8509                $self->reset_each_range;
8510                last;
8511            }
8512
8513            # Otherwise add this range.
8514            $string_range .= ", " if $string_range ne "";
8515            if ($start == $end) {
8516                $string_range .= sprintf("U+%04X", $start);
8517            }
8518            elsif ($end >= $MAX_WORKING_CODEPOINT)  {
8519                $string_range .= sprintf("U+%04X..infinity", $start);
8520            }
8521            else  {
8522                $string_range .= sprintf("U+%04X..%04X",
8523                                        $start, $end);
8524            }
8525        }
8526
8527        # Done with all the ranges we're going to look at.  Assemble the
8528        # definition from the lows + non-lows.
8529
8530        if ($lows_string ne "" || $string_range ne "") {
8531            if ($lows_string ne "") {
8532                $definition .= "[$lows_string]";
8533                $definition .= ", " if $string_range;
8534            }
8535            $definition .= $string_range;
8536        }
8537
8538        return $definition;
8539    }
8540
8541    sub write {
8542        my $self = shift;
8543        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8544
8545        return $self->SUPER::write(0); # No adjustments
8546    }
8547
8548    sub set_final_comment {
8549        # This creates a comment for the file that is to hold the match table
8550        # $self.  It is somewhat convoluted to make the English read nicely,
8551        # but, heh, it's just a comment.
8552        # This should be called only with the leader match table of all the
8553        # ones that share the same file.  It lists all such tables, ordered so
8554        # that related ones are together.
8555
8556        return unless $debugging_build;
8557
8558        my $leader = shift;   # Should only be called on the leader table of
8559                              # an equivalent group
8560        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8561
8562        my $addr = do { no overloading; pack 'J', $leader; };
8563
8564        if ($leader{$addr} != $leader) {
8565            Carp::my_carp_bug(<<END
8566set_final_comment() must be called on a leader table, which $leader is not.
8567It is equivalent to $leader{$addr}.  No comment created
8568END
8569            );
8570            return;
8571        }
8572
8573        # Get the number of code points matched by each of the tables in this
8574        # file, and add underscores for clarity.
8575        my $count = $leader->count;
8576        my $unicode_count;
8577        my $non_unicode_string;
8578        if ($count > $MAX_UNICODE_CODEPOINTS) {
8579            $unicode_count = $count - ($MAX_WORKING_CODEPOINT
8580                                       - $MAX_UNICODE_CODEPOINT);
8581            $non_unicode_string = "All above-Unicode code points match as well, and are also returned";
8582        }
8583        else {
8584            $unicode_count = $count;
8585            $non_unicode_string = "";
8586        }
8587        my $string_count = main::clarify_code_point_count($unicode_count);
8588
8589        my $loose_count = 0;        # how many aliases loosely matched
8590        my $compound_name = "";     # ? Are any names compound?, and if so, an
8591                                    # example
8592        my $properties_with_compound_names = 0;    # count of these
8593
8594
8595        my %flags;              # The status flags used in the file
8596        my $total_entries = 0;  # number of entries written in the comment
8597        my $matches_comment = ""; # The portion of the comment about the
8598                                  # \p{}'s
8599        my @global_comments;    # List of all the tables' comments that are
8600                                # there before this routine was called.
8601        my $has_ucd_alias = 0;  # If there is an alias that is accessible via
8602                                # Unicode::UCD.  If not, then don't say it is
8603                                # in the comment
8604
8605        # Get list of all the parent tables that are equivalent to this one
8606        # (including itself).
8607        my @parents = grep { $parent{main::objaddr $_} == $_ }
8608                            main::uniques($leader, @{$equivalents{$addr}});
8609        my $has_unrelated = (@parents >= 2);  # boolean, ? are there unrelated
8610                                              # tables
8611        for my $parent (@parents) {
8612
8613            my $property = $parent->property;
8614
8615            # Special case 'N' tables in properties with two match tables when
8616            # the other is a 'Y' one.  These are likely to be binary tables,
8617            # but not necessarily.  In either case, \P{} will match the
8618            # complement of \p{}, and so if something is a synonym of \p, the
8619            # complement of that something will be the synonym of \P.  This
8620            # would be true of any property with just two match tables, not
8621            # just those whose values are Y and N; but that would require a
8622            # little extra work, and there are none such so far in Unicode.
8623            my $perl_p = 'p';        # which is it?  \p{} or \P{}
8624            my @yes_perl_synonyms;   # list of any synonyms for the 'Y' table
8625
8626            if (scalar $property->tables == 2
8627                && $parent == $property->table('N')
8628                && defined (my $yes = $property->table('Y')))
8629            {
8630                my $yes_addr = do { no overloading; pack 'J', $yes; };
8631                @yes_perl_synonyms
8632                    = grep { $_->property == $perl }
8633                                    main::uniques($yes,
8634                                                $parent{$yes_addr},
8635                                                $parent{$yes_addr}->children);
8636
8637                # But these synonyms are \P{} ,not \p{}
8638                $perl_p = 'P';
8639            }
8640
8641            my @description;        # Will hold the table description
8642            my @note;               # Will hold the table notes.
8643            my @conflicting;        # Will hold the table conflicts.
8644
8645            # Look at the parent, any yes synonyms, and all the children
8646            my $parent_addr = do { no overloading; pack 'J', $parent; };
8647            for my $table ($parent,
8648                           @yes_perl_synonyms,
8649                           @{$children{$parent_addr}})
8650            {
8651                my $table_addr = do { no overloading; pack 'J', $table; };
8652                my $table_property = $table->property;
8653
8654                # Tables are separated by a blank line to create a grouping.
8655                $matches_comment .= "\n" if $matches_comment;
8656
8657                # The table is named based on the property and value
8658                # combination it is for, like script=greek.  But there may be
8659                # a number of synonyms for each side, like 'sc' for 'script',
8660                # and 'grek' for 'greek'.  Any combination of these is a valid
8661                # name for this table.  In this case, there are three more,
8662                # 'sc=grek', 'sc=greek', and 'script='grek'.  Rather than
8663                # listing all possible combinations in the comment, we make
8664                # sure that each synonym occurs at least once, and add
8665                # commentary that the other combinations are possible.
8666                # Because regular expressions don't recognize things like
8667                # \p{jsn=}, only look at non-null right-hand-sides
8668                my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table_property->aliases;
8669                my @table_aliases = grep { $_->name ne "" } $table->aliases;
8670
8671                # The alias lists above are already ordered in the order we
8672                # want to output them.  To ensure that each synonym is listed,
8673                # we must use the max of the two numbers.  But if there are no
8674                # legal synonyms (nothing in @table_aliases), then we don't
8675                # list anything.
8676                my $listed_combos = (@table_aliases)
8677                                    ?  main::max(scalar @table_aliases,
8678                                                 scalar @property_aliases)
8679                                    : 0;
8680                trace "$listed_combos, tables=", scalar @table_aliases, "; property names=", scalar @property_aliases if main::DEBUG;
8681
8682                my $property_had_compound_name = 0;
8683
8684                for my $i (0 .. $listed_combos - 1) {
8685                    $total_entries++;
8686
8687                    # The current alias for the property is the next one on
8688                    # the list, or if beyond the end, start over.  Similarly
8689                    # for the table (\p{prop=table})
8690                    my $property_alias = $property_aliases
8691                                            [$i % @property_aliases]->name;
8692                    my $table_alias_object = $table_aliases
8693                                                        [$i % @table_aliases];
8694                    my $table_alias = $table_alias_object->name;
8695                    my $loose_match = $table_alias_object->loose_match;
8696                    $has_ucd_alias |= $table_alias_object->ucd;
8697
8698                    if ($table_alias !~ /\D/) { # Clarify large numbers.
8699                        $table_alias = main::clarify_number($table_alias)
8700                    }
8701
8702                    # Add a comment for this alias combination
8703                    my $current_match_comment;
8704                    if ($table_property == $perl) {
8705                        $current_match_comment = "\\$perl_p"
8706                                                    . "{$table_alias}";
8707                    }
8708                    else {
8709                        $current_match_comment
8710                                        = "\\p{$property_alias=$table_alias}";
8711                        $property_had_compound_name = 1;
8712                    }
8713
8714                    # Flag any abnormal status for this table.
8715                    my $flag = $property->status
8716                                || $table->status
8717                                || $table_alias_object->status;
8718                    if ($flag && $flag ne $PLACEHOLDER) {
8719                        $flags{$flag} = $status_past_participles{$flag};
8720                    }
8721
8722                    $loose_count++;
8723
8724                    # Pretty up the comment.  Note the \b; it says don't make
8725                    # this line a continuation.
8726                    $matches_comment .= sprintf("\b%-1s%-s%s\n",
8727                                        $flag,
8728                                        " " x 7,
8729                                        $current_match_comment);
8730                } # End of generating the entries for this table.
8731
8732                # Save these for output after this group of related tables.
8733                push @description, $table->description;
8734                push @note, $table->note;
8735                push @conflicting, $table->conflicting;
8736
8737                # And this for output after all the tables.
8738                push @global_comments, $table->comment;
8739
8740                # Compute an alternate compound name using the final property
8741                # synonym and the first table synonym with a colon instead of
8742                # the equal sign used elsewhere.
8743                if ($property_had_compound_name) {
8744                    $properties_with_compound_names ++;
8745                    if (! $compound_name || @property_aliases > 1) {
8746                        $compound_name = $property_aliases[-1]->name
8747                                        . ': '
8748                                        . $table_aliases[0]->name;
8749                    }
8750                }
8751            } # End of looping through all children of this table
8752
8753            # Here have assembled in $matches_comment all the related tables
8754            # to the current parent (preceded by the same info for all the
8755            # previous parents).  Put out information that applies to all of
8756            # the current family.
8757            if (@conflicting) {
8758
8759                # But output the conflicting information now, as it applies to
8760                # just this table.
8761                my $conflicting = join ", ", @conflicting;
8762                if ($conflicting) {
8763                    $matches_comment .= <<END;
8764
8765    Note that contrary to what you might expect, the above is NOT the same as
8766END
8767                    $matches_comment .= "any of: " if @conflicting > 1;
8768                    $matches_comment .= "$conflicting\n";
8769                }
8770            }
8771            if (@description) {
8772                $matches_comment .= "\n    Meaning: "
8773                                    . join('; ', @description)
8774                                    . "\n";
8775            }
8776            if (@note) {
8777                $matches_comment .= "\n    Note: "
8778                                    . join("\n    ", @note)
8779                                    . "\n";
8780            }
8781        } # End of looping through all tables
8782
8783        $matches_comment .= "\n$non_unicode_string\n" if $non_unicode_string;
8784
8785
8786        my $code_points;
8787        my $match;
8788        my $any_of_these;
8789        if ($unicode_count == 1) {
8790            $match = 'matches';
8791            $code_points = 'single code point';
8792        }
8793        else {
8794            $match = 'match';
8795            $code_points = "$string_count code points";
8796        }
8797
8798        my $synonyms;
8799        my $entries;
8800        if ($total_entries == 1) {
8801            $synonyms = "";
8802            $entries = 'entry';
8803            $any_of_these = 'this'
8804        }
8805        else {
8806            $synonyms = " any of the following regular expression constructs";
8807            $entries = 'entries';
8808            $any_of_these = 'any of these'
8809        }
8810
8811        my $comment = "";
8812        if ($has_ucd_alias) {
8813            $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
8814        }
8815        if ($has_unrelated) {
8816            $comment .= <<END;
8817This file is for tables that are not necessarily related:  To conserve
8818resources, every table that matches the identical set of code points in this
8819version of Unicode uses this file.  Each one is listed in a separate group
8820below.  It could be that the tables will match the same set of code points in
8821other Unicode releases, or it could be purely coincidence that they happen to
8822be the same in Unicode $unicode_version, and hence may not in other versions.
8823
8824END
8825        }
8826
8827        if (%flags) {
8828            foreach my $flag (sort keys %flags) {
8829                $comment .= <<END;
8830'$flag' below means that this form is $flags{$flag}.
8831END
8832                if ($flag eq $INTERNAL_ALIAS) {
8833                    $comment .= "DO NOT USE!!!";
8834                }
8835                else {
8836                    $comment .= "Consult $pod_file.pod";
8837                }
8838                $comment .= "\n";
8839            }
8840            $comment .= "\n";
8841        }
8842
8843        if ($total_entries == 0) {
8844            Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string.  Creating file anyway.");
8845            $comment .= <<END;
8846This file returns the $code_points in Unicode Version
8847$unicode_version for
8848$leader, but it is inaccessible through Perl regular expressions, as
8849"\\p{prop=}" is not recognized.
8850END
8851
8852        } else {
8853            $comment .= <<END;
8854This file returns the $code_points in Unicode Version
8855$unicode_version that
8856$match$synonyms:
8857
8858$matches_comment
8859$pod_file.pod should be consulted for the syntax rules for $any_of_these,
8860including if adding or subtracting white space, underscore, and hyphen
8861characters matters or doesn't matter, and other permissible syntactic
8862variants.  Upper/lower case distinctions never matter.
8863END
8864
8865        }
8866        if ($compound_name) {
8867            $comment .= <<END;
8868
8869A colon can be substituted for the equals sign, and
8870END
8871            if ($properties_with_compound_names > 1) {
8872                $comment .= <<END;
8873within each group above,
8874END
8875            }
8876            $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
8877
8878            # Note the \b below, it says don't make that line a continuation.
8879            $comment .= <<END;
8880anything to the left of the equals (or colon) can be combined with anything to
8881the right.  Thus, for example,
8882$compound_name
8883\bis also valid.
8884END
8885        }
8886
8887        # And append any comment(s) from the actual tables.  They are all
8888        # gathered here, so may not read all that well.
8889        if (@global_comments) {
8890            $comment .= "\n" . join("\n\n", @global_comments) . "\n";
8891        }
8892
8893        if ($count) {   # The format differs if no code points, and needs no
8894                        # explanation in that case
8895            if ($leader->write_as_invlist) {
8896                $comment.= <<END;
8897
8898The first data line of this file begins with the letter V to indicate it is in
8899inversion list format.  The number following the V gives the number of lines
8900remaining.  Each of those remaining lines is a single number representing the
8901starting code point of a range which goes up to but not including the number
8902on the next line; The 0th, 2nd, 4th... ranges are for code points that match
8903the property; the 1st, 3rd, 5th... are ranges of code points that don't match
8904the property.  The final line's range extends to the platform's infinity.
8905END
8906            }
8907            else {
8908                $comment.= <<END;
8909The format of the lines of this file is:
8910START\\tSTOP\\twhere START is the starting code point of the range, in hex;
8911STOP is the ending point, or if omitted, the range has just one code point.
8912END
8913            }
8914            if ($leader->output_range_counts) {
8915                $comment .= <<END;
8916Numbers in comments in [brackets] indicate how many code points are in the
8917range.
8918END
8919            }
8920        }
8921
8922        $leader->set_comment(main::join_lines($comment));
8923        return;
8924    }
8925
8926    # Accessors for the underlying list
8927    for my $sub (qw(
8928                    get_valid_code_point
8929                    get_invalid_code_point
8930                ))
8931    {
8932        no strict "refs";
8933        *$sub = sub {
8934            use strict "refs";
8935            my $self = shift;
8936
8937            return $self->_range_list->$sub(@_);
8938        }
8939    }
8940} # End closure for Match_Table
8941
8942package Property;
8943
8944# The Property class represents a Unicode property, or the $perl
8945# pseudo-property.  It contains a map table initialized empty at construction
8946# time, and for properties accessible through regular expressions, various
8947# match tables, created through the add_match_table() method, and referenced
8948# by the table('NAME') or tables() methods, the latter returning a list of all
8949# of the match tables.  Otherwise table operations implicitly are for the map
8950# table.
8951#
8952# Most of the data in the property is actually about its map table, so it
8953# mostly just uses that table's accessors for most methods.  The two could
8954# have been combined into one object, but for clarity because of their
8955# differing semantics, they have been kept separate.  It could be argued that
8956# the 'file' and 'directory' fields should be kept with the map table.
8957#
8958# Each property has a type.  This can be set in the constructor, or in the
8959# set_type accessor, but mostly it is figured out by the data.  Every property
8960# starts with unknown type, overridden by a parameter to the constructor, or
8961# as match tables are added, or ranges added to the map table, the data is
8962# inspected, and the type changed.  After the table is mostly or entirely
8963# filled, compute_type() should be called to finalize they analysis.
8964#
8965# There are very few operations defined.  One can safely remove a range from
8966# the map table, and property_add_or_replace_non_nulls() adds the maps from another
8967# table to this one, replacing any in the intersection of the two.
8968
8969sub standardize { return main::standardize($_[0]); }
8970sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
8971
8972{   # Closure
8973
8974    # This hash will contain as keys, all the aliases of all properties, and
8975    # as values, pointers to their respective property objects.  This allows
8976    # quick look-up of a property from any of its names.
8977    my %alias_to_property_of;
8978
8979    sub dump_alias_to_property_of {
8980        # For debugging
8981
8982        print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
8983        return;
8984    }
8985
8986    sub property_ref {
8987        # This is a package subroutine, not called as a method.
8988        # If the single parameter is a literal '*' it returns a list of all
8989        # defined properties.
8990        # Otherwise, the single parameter is a name, and it returns a pointer
8991        # to the corresponding property object, or undef if none.
8992        #
8993        # Properties can have several different names.  The 'standard' form of
8994        # each of them is stored in %alias_to_property_of as they are defined.
8995        # But it's possible that this subroutine will be called with some
8996        # variant, so if the initial lookup fails, it is repeated with the
8997        # standardized form of the input name.  If found, besides returning the
8998        # result, the input name is added to the list so future calls won't
8999        # have to do the conversion again.
9000
9001        my $name = shift;
9002
9003        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9004
9005        if (! defined $name) {
9006            Carp::my_carp_bug("Undefined input property.  No action taken.");
9007            return;
9008        }
9009
9010        return main::uniques(values %alias_to_property_of) if $name eq '*';
9011
9012        # Return cached result if have it.
9013        my $result = $alias_to_property_of{$name};
9014        return $result if defined $result;
9015
9016        # Convert the input to standard form.
9017        my $standard_name = standardize($name);
9018
9019        $result = $alias_to_property_of{$standard_name};
9020        return unless defined $result;        # Don't cache undefs
9021
9022        # Cache the result before returning it.
9023        $alias_to_property_of{$name} = $result;
9024        return $result;
9025    }
9026
9027
9028    main::setup_package();
9029
9030    my %map;
9031    # A pointer to the map table object for this property
9032    main::set_access('map', \%map);
9033
9034    my %full_name;
9035    # The property's full name.  This is a duplicate of the copy kept in the
9036    # map table, but is needed because stringify needs it during
9037    # construction of the map table, and then would have a chicken before egg
9038    # problem.
9039    main::set_access('full_name', \%full_name, 'r');
9040
9041    my %table_ref;
9042    # This hash will contain as keys, all the aliases of any match tables
9043    # attached to this property, and as values, the pointers to their
9044    # respective tables.  This allows quick look-up of a table from any of its
9045    # names.
9046    main::set_access('table_ref', \%table_ref);
9047
9048    my %type;
9049    # The type of the property, $ENUM, $BINARY, etc
9050    main::set_access('type', \%type, 'r');
9051
9052    my %file;
9053    # The filename where the map table will go (if actually written).
9054    # Normally defaulted, but can be overridden.
9055    main::set_access('file', \%file, 'r', 's');
9056
9057    my %directory;
9058    # The directory where the map table will go (if actually written).
9059    # Normally defaulted, but can be overridden.
9060    main::set_access('directory', \%directory, 's');
9061
9062    my %pseudo_map_type;
9063    # This is used to affect the calculation of the map types for all the
9064    # ranges in the table.  It should be set to one of the values that signify
9065    # to alter the calculation.
9066    main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
9067
9068    my %has_only_code_point_maps;
9069    # A boolean used to help in computing the type of data in the map table.
9070    main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
9071
9072    my %unique_maps;
9073    # A list of the first few distinct mappings this property has.  This is
9074    # used to disambiguate between binary and enum property types, so don't
9075    # have to keep more than three.
9076    main::set_access('unique_maps', \%unique_maps);
9077
9078    my %pre_declared_maps;
9079    # A boolean that gives whether the input data should declare all the
9080    # tables used, or not.  If the former, unknown ones raise a warning.
9081    main::set_access('pre_declared_maps',
9082                                    \%pre_declared_maps, 'r', 's');
9083
9084    my %has_dependency;
9085    # A boolean that gives whether some table somewhere is defined as the
9086    # complement of a table in this property.  This is a crude, but currently
9087    # sufficient, mechanism to make this property not get destroyed before
9088    # what is dependent on it is.  Other dependencies could be added, so the
9089    # name was chosen to reflect a more general situation than actually is
9090    # currently the case.
9091    main::set_access('has_dependency', \%has_dependency, 'r', 's');
9092
9093    sub new {
9094        # The only required parameter is the positionally first, name.  All
9095        # other parameters are key => value pairs.  See the documentation just
9096        # above for the meanings of the ones not passed directly on to the map
9097        # table constructor.
9098
9099        my $class = shift;
9100        my $name = shift || "";
9101
9102        my $self = property_ref($name);
9103        if (defined $self) {
9104            my $options_string = join ", ", @_;
9105            $options_string = ".  Ignoring options $options_string" if $options_string;
9106            Carp::my_carp("$self is already in use.  Using existing one$options_string;");
9107            return $self;
9108        }
9109
9110        my %args = @_;
9111
9112        $self = bless \do { my $anonymous_scalar }, $class;
9113        my $addr = do { no overloading; pack 'J', $self; };
9114
9115        $directory{$addr} = delete $args{'Directory'};
9116        $file{$addr} = delete $args{'File'};
9117        $full_name{$addr} = delete $args{'Full_Name'} || $name;
9118        $type{$addr} = delete $args{'Type'} || $UNKNOWN;
9119        $pseudo_map_type{$addr} = delete $args{'Map_Type'};
9120        $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
9121                                    # Starting in this release, property
9122                                    # values should be defined for all
9123                                    # properties, except those overriding this
9124                                    // $v_version ge v5.1.0;
9125
9126        # Rest of parameters passed on.
9127
9128        $has_only_code_point_maps{$addr} = 1;
9129        $table_ref{$addr} = { };
9130        $unique_maps{$addr} = { };
9131        $has_dependency{$addr} = 0;
9132
9133        $map{$addr} = Map_Table->new($name,
9134                                    Full_Name => $full_name{$addr},
9135                                    _Alias_Hash => \%alias_to_property_of,
9136                                    _Property => $self,
9137                                    %args);
9138        return $self;
9139    }
9140
9141    # See this program's beginning comment block about overloading the copy
9142    # constructor.  Few operations are defined on properties, but a couple are
9143    # useful.  It is safe to take the inverse of a property, and to remove a
9144    # single code point from it.
9145    use overload
9146        fallback => 0,
9147        qw("") => "_operator_stringify",
9148        "." => \&main::_operator_dot,
9149        ".=" => \&main::_operator_dot_equal,
9150        '==' => \&main::_operator_equal,
9151        '!=' => \&main::_operator_not_equal,
9152        '=' => sub { return shift },
9153        '-=' => "_minus_and_equal",
9154    ;
9155
9156    sub _operator_stringify {
9157        return "Property '" .  shift->full_name . "'";
9158    }
9159
9160    sub _minus_and_equal {
9161        # Remove a single code point from the map table of a property.
9162
9163        my $self = shift;
9164        my $other = shift;
9165        my $reversed = shift;
9166        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9167
9168        if (ref $other) {
9169            Carp::my_carp_bug("Bad news.  Can't cope with a "
9170                        . ref($other)
9171                        . " argument to '-='.  Subtraction ignored.");
9172            return $self;
9173        }
9174        elsif ($reversed) {   # Shouldn't happen in a -=, but just in case
9175            Carp::my_carp_bug("Bad news.  Can't cope with subtracting a "
9176            . ref $self
9177            . " from a non-object.  undef returned.");
9178            return;
9179        }
9180        else {
9181            no overloading;
9182            $map{pack 'J', $self}->delete_range($other, $other);
9183        }
9184        return $self;
9185    }
9186
9187    sub add_match_table {
9188        # Add a new match table for this property, with name given by the
9189        # parameter.  It returns a pointer to the table.
9190
9191        my $self = shift;
9192        my $name = shift;
9193        my %args = @_;
9194
9195        my $addr = do { no overloading; pack 'J', $self; };
9196
9197        my $table = $table_ref{$addr}{$name};
9198        my $standard_name = main::standardize($name);
9199        if (defined $table
9200            || (defined ($table = $table_ref{$addr}{$standard_name})))
9201        {
9202            Carp::my_carp("Table '$name' in $self is already in use.  Using existing one");
9203            $table_ref{$addr}{$name} = $table;
9204            return $table;
9205        }
9206        else {
9207
9208            # See if this is a perl extension, if not passed in.
9209            my $perl_extension = delete $args{'Perl_Extension'};
9210            $perl_extension
9211                        = $self->perl_extension if ! defined $perl_extension;
9212
9213            my $fate;
9214            my $suppression_reason = "";
9215            if ($self->name =~ /^_/) {
9216                $fate = $SUPPRESSED;
9217                $suppression_reason = "Parent property is internal only";
9218            }
9219            elsif ($self->fate >= $SUPPRESSED) {
9220                $fate = $self->fate;
9221                $suppression_reason = $why_suppressed{$self->complete_name};
9222
9223            }
9224            elsif ($name =~ /^_/) {
9225                $fate = $INTERNAL_ONLY;
9226            }
9227            $table = Match_Table->new(
9228                                Name => $name,
9229                                Perl_Extension => $perl_extension,
9230                                _Alias_Hash => $table_ref{$addr},
9231                                _Property => $self,
9232                                Fate => $fate,
9233                                Suppression_Reason => $suppression_reason,
9234                                Status => $self->status,
9235                                _Status_Info => $self->status_info,
9236                                %args);
9237            return unless defined $table;
9238        }
9239
9240        # Save the names for quick look up
9241        $table_ref{$addr}{$standard_name} = $table;
9242        $table_ref{$addr}{$name} = $table;
9243
9244        # Perhaps we can figure out the type of this property based on the
9245        # fact of adding this match table.  First, string properties don't
9246        # have match tables; second, a binary property can't have 3 match
9247        # tables
9248        if ($type{$addr} == $UNKNOWN) {
9249            $type{$addr} = $NON_STRING;
9250        }
9251        elsif ($type{$addr} == $STRING) {
9252            Carp::my_carp("$self Added a match table '$name' to a string property '$self'.  Changed it to a non-string property.  Bad News.");
9253            $type{$addr} = $NON_STRING;
9254        }
9255        elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) {
9256            if (scalar main::uniques(values %{$table_ref{$addr}}) > 2) {
9257                if ($type{$addr} == $BINARY) {
9258                    Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary.  Changing its type to 'enum'.  Bad News.");
9259                }
9260                $type{$addr} = $ENUM;
9261            }
9262        }
9263
9264        return $table;
9265    }
9266
9267    sub delete_match_table {
9268        # Delete the table referred to by $2 from the property $1.
9269
9270        my $self = shift;
9271        my $table_to_remove = shift;
9272        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9273
9274        my $addr = do { no overloading; pack 'J', $self; };
9275
9276        # Remove all names that refer to it.
9277        foreach my $key (keys %{$table_ref{$addr}}) {
9278            delete $table_ref{$addr}{$key}
9279                                if $table_ref{$addr}{$key} == $table_to_remove;
9280        }
9281
9282        $table_to_remove->DESTROY;
9283        return;
9284    }
9285
9286    sub table {
9287        # Return a pointer to the match table (with name given by the
9288        # parameter) associated with this property; undef if none.
9289
9290        my $self = shift;
9291        my $name = shift;
9292        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9293
9294        my $addr = do { no overloading; pack 'J', $self; };
9295
9296        return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
9297
9298        # If quick look-up failed, try again using the standard form of the
9299        # input name.  If that succeeds, cache the result before returning so
9300        # won't have to standardize this input name again.
9301        my $standard_name = main::standardize($name);
9302        return unless defined $table_ref{$addr}{$standard_name};
9303
9304        $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
9305        return $table_ref{$addr}{$name};
9306    }
9307
9308    sub tables {
9309        # Return a list of pointers to all the match tables attached to this
9310        # property
9311
9312        no overloading;
9313        return main::uniques(values %{$table_ref{pack 'J', shift}});
9314    }
9315
9316    sub directory {
9317        # Returns the directory the map table for this property should be
9318        # output in.  If a specific directory has been specified, that has
9319        # priority;  'undef' is returned if the type isn't defined;
9320        # or $map_directory for everything else.
9321
9322        my $addr = do { no overloading; pack 'J', shift; };
9323
9324        return $directory{$addr} if defined $directory{$addr};
9325        return undef if $type{$addr} == $UNKNOWN;
9326        return $map_directory;
9327    }
9328
9329    sub swash_name {
9330        # Return the name that is used to both:
9331        #   1)  Name the file that the map table is written to.
9332        #   2)  The name of swash related stuff inside that file.
9333        # The reason for this is that the Perl core historically has used
9334        # certain names that aren't the same as the Unicode property names.
9335        # To continue using these, $file is hard-coded in this file for those,
9336        # but otherwise the standard name is used.  This is different from the
9337        # external_name, so that the rest of the files, like in lib can use
9338        # the standard name always, without regard to historical precedent.
9339
9340        my $self = shift;
9341        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9342
9343        my $addr = do { no overloading; pack 'J', $self; };
9344
9345        # Swash names are used only on either
9346        # 1) legacy-only properties, because the formats for these are
9347        #    unchangeable, and they have had these lines in them; or
9348        # 2) regular or internal-only map tables
9349        # 3) otherwise there should be no access to the
9350        #    property map table from other parts of Perl.
9351        return if $map{$addr}->fate != $ORDINARY
9352                  && $map{$addr}->fate != $LEGACY_ONLY
9353                  && ! ($map{$addr}->name =~ /^_/
9354                        && $map{$addr}->fate == $INTERNAL_ONLY);
9355
9356        return $file{$addr} if defined $file{$addr};
9357        return $map{$addr}->external_name;
9358    }
9359
9360    sub to_create_match_tables {
9361        # Returns a boolean as to whether or not match tables should be
9362        # created for this property.
9363
9364        my $self = shift;
9365        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9366
9367        # The whole point of this pseudo property is match tables.
9368        return 1 if $self == $perl;
9369
9370        my $addr = do { no overloading; pack 'J', $self; };
9371
9372        # Don't generate tables of code points that match the property values
9373        # of a string property.  Such a list would most likely have many
9374        # property values, each with just one or very few code points mapping
9375        # to it.
9376        return 0 if $type{$addr} == $STRING;
9377
9378        # Otherwise, do.
9379        return 1;
9380    }
9381
9382    sub property_add_or_replace_non_nulls {
9383        # This adds the mappings in the property $other to $self.  Non-null
9384        # mappings from $other override those in $self.  It essentially merges
9385        # the two properties, with the second having priority except for null
9386        # mappings.
9387
9388        my $self = shift;
9389        my $other = shift;
9390        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9391
9392        if (! $other->isa(__PACKAGE__)) {
9393            Carp::my_carp_bug("$other should be a "
9394                            . __PACKAGE__
9395                            . ".  Not a '"
9396                            . ref($other)
9397                            . "'.  Not added;");
9398            return;
9399        }
9400
9401        no overloading;
9402        return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
9403    }
9404
9405    sub set_proxy_for {
9406        # Certain tables are not generally written out to files, but
9407        # Unicode::UCD has the intelligence to know that the file for $self
9408        # can be used to reconstruct those tables.  This routine just changes
9409        # things so that UCD pod entries for those suppressed tables are
9410        # generated, so the fact that a proxy is used is invisible to the
9411        # user.
9412
9413        my $self = shift;
9414
9415        foreach my $property_name (@_) {
9416            my $ref = property_ref($property_name);
9417            next if $ref->to_output_map;
9418            $ref->set_fate($MAP_PROXIED);
9419        }
9420    }
9421
9422    sub set_type {
9423        # Set the type of the property.  Mostly this is figured out by the
9424        # data in the table.  But this is used to set it explicitly.  The
9425        # reason it is not a standard accessor is that when setting a binary
9426        # property, we need to make sure that all the true/false aliases are
9427        # present, as they were omitted in early Unicode releases.
9428
9429        my $self = shift;
9430        my $type = shift;
9431        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9432
9433        if ($type != $ENUM
9434            && $type != $BINARY
9435            && $type != $FORCED_BINARY
9436            && $type != $STRING)
9437        {
9438            Carp::my_carp("Unrecognized type '$type'.  Type not set");
9439            return;
9440        }
9441
9442        { no overloading; $type{pack 'J', $self} = $type; }
9443        return if $type != $BINARY && $type != $FORCED_BINARY;
9444
9445        my $yes = $self->table('Y');
9446        $yes = $self->table('Yes') if ! defined $yes;
9447        $yes = $self->add_match_table('Y', Full_Name => 'Yes')
9448                                                            if ! defined $yes;
9449
9450        # Add aliases in order wanted, duplicates will be ignored.  We use a
9451        # binary property present in all releases for its ordered lists of
9452        # true/false aliases.  Note, that could run into problems in
9453        # outputting things in that we don't distinguish between the name and
9454        # full name of these.  Hopefully, if the table was already created
9455        # before this code is executed, it was done with these set properly.
9456        my $bm = property_ref("Bidi_Mirrored");
9457        foreach my $alias ($bm->table("Y")->aliases) {
9458            $yes->add_alias($alias->name);
9459        }
9460        my $no = $self->table('N');
9461        $no = $self->table('No') if ! defined $no;
9462        $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no;
9463        foreach my $alias ($bm->table("N")->aliases) {
9464            $no->add_alias($alias->name);
9465        }
9466
9467        return;
9468    }
9469
9470    sub add_map {
9471        # Add a map to the property's map table.  This also keeps
9472        # track of the maps so that the property type can be determined from
9473        # its data.
9474
9475        my $self = shift;
9476        my $start = shift;  # First code point in range
9477        my $end = shift;    # Final code point in range
9478        my $map = shift;    # What the range maps to.
9479        # Rest of parameters passed on.
9480
9481        my $addr = do { no overloading; pack 'J', $self; };
9482
9483        # If haven't the type of the property, gather information to figure it
9484        # out.
9485        if ($type{$addr} == $UNKNOWN) {
9486
9487            # If the map contains an interior blank or dash, or most other
9488            # nonword characters, it will be a string property.  This
9489            # heuristic may actually miss some string properties.  If so, they
9490            # may need to have explicit set_types called for them.  This
9491            # happens in the Unihan properties.
9492            if ($map =~ / (?<= . ) [ -] (?= . ) /x
9493                || $map =~ / [^\w.\/\ -]  /x)
9494            {
9495                $self->set_type($STRING);
9496
9497                # $unique_maps is used for disambiguating between ENUM and
9498                # BINARY later; since we know the property is not going to be
9499                # one of those, no point in keeping the data around
9500                undef $unique_maps{$addr};
9501            }
9502            else {
9503
9504                # Not necessarily a string.  The final decision has to be
9505                # deferred until all the data are in.  We keep track of if all
9506                # the values are code points for that eventual decision.
9507                $has_only_code_point_maps{$addr} &=
9508                                            $map =~ / ^ $code_point_re $/x;
9509
9510                # For the purposes of disambiguating between binary and other
9511                # enumerations at the end, we keep track of the first three
9512                # distinct property values.  Once we get to three, we know
9513                # it's not going to be binary, so no need to track more.
9514                if (scalar keys %{$unique_maps{$addr}} < 3) {
9515                    $unique_maps{$addr}{main::standardize($map)} = 1;
9516                }
9517            }
9518        }
9519
9520        # Add the mapping by calling our map table's method
9521        return $map{$addr}->add_map($start, $end, $map, @_);
9522    }
9523
9524    sub compute_type {
9525        # Compute the type of the property: $ENUM, $STRING, or $BINARY.  This
9526        # should be called after the property is mostly filled with its maps.
9527        # We have been keeping track of what the property values have been,
9528        # and now have the necessary information to figure out the type.
9529
9530        my $self = shift;
9531        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9532
9533        my $addr = do { no overloading; pack 'J', $self; };
9534
9535        my $type = $type{$addr};
9536
9537        # If already have figured these out, no need to do so again, but we do
9538        # a double check on ENUMS to make sure that a string property hasn't
9539        # improperly been classified as an ENUM, so continue on with those.
9540        return if $type == $STRING
9541                  || $type == $BINARY
9542                  || $type == $FORCED_BINARY;
9543
9544        # If every map is to a code point, is a string property.
9545        if ($type == $UNKNOWN
9546            && ($has_only_code_point_maps{$addr}
9547                || (defined $map{$addr}->default_map
9548                    && $map{$addr}->default_map eq "")))
9549        {
9550            $self->set_type($STRING);
9551        }
9552        else {
9553
9554            # Otherwise, it is to some sort of enumeration.  (The case where
9555            # it is a Unicode miscellaneous property, and treated like a
9556            # string in this program is handled in add_map()).  Distinguish
9557            # between binary and some other enumeration type.  Of course, if
9558            # there are more than two values, it's not binary.  But more
9559            # subtle is the test that the default mapping is defined means it
9560            # isn't binary.  This in fact may change in the future if Unicode
9561            # changes the way its data is structured.  But so far, no binary
9562            # properties ever have @missing lines for them, so the default map
9563            # isn't defined for them.  The few properties that are two-valued
9564            # and aren't considered binary have the default map defined
9565            # starting in Unicode 5.0, when the @missing lines appeared; and
9566            # this program has special code to put in a default map for them
9567            # for earlier than 5.0 releases.
9568            if ($type == $ENUM
9569                || scalar keys %{$unique_maps{$addr}} > 2
9570                || defined $self->default_map)
9571            {
9572                my $tables = $self->tables;
9573                my $count = $self->count;
9574                if ($verbosity && $tables > 500 && $tables/$count > .1) {
9575                    Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $tables\n");
9576                }
9577                $self->set_type($ENUM);
9578            }
9579            else {
9580                $self->set_type($BINARY);
9581            }
9582        }
9583        undef $unique_maps{$addr};  # Garbage collect
9584        return;
9585    }
9586
9587    sub set_fate {
9588        my $self = shift;
9589        my $fate = shift;
9590        my $reason = shift;  # Ignored unless suppressing
9591        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9592
9593        my $addr = do { no overloading; pack 'J', $self; };
9594        if ($fate >= $SUPPRESSED) {
9595            $why_suppressed{$self->complete_name} = $reason;
9596        }
9597
9598        # Each table shares the property's fate, except that MAP_PROXIED
9599        # doesn't affect match tables
9600        $map{$addr}->set_fate($fate, $reason);
9601        if ($fate != $MAP_PROXIED) {
9602            foreach my $table ($map{$addr}, $self->tables) {
9603                $table->set_fate($fate, $reason);
9604            }
9605        }
9606        return;
9607    }
9608
9609
9610    # Most of the accessors for a property actually apply to its map table.
9611    # Setup up accessor functions for those, referring to %map
9612    for my $sub (qw(
9613                    add_alias
9614                    add_anomalous_entry
9615                    add_comment
9616                    add_conflicting
9617                    add_description
9618                    add_duplicate
9619                    add_note
9620                    aliases
9621                    comment
9622                    complete_name
9623                    containing_range
9624                    count
9625                    default_map
9626                    definition
9627                    delete_range
9628                    description
9629                    each_range
9630                    external_name
9631                    fate
9632                    file_path
9633                    format
9634                    initialize
9635                    inverse_list
9636                    is_empty
9637                    replacement_property
9638                    name
9639                    note
9640                    perl_extension
9641                    property
9642                    range_count
9643                    ranges
9644                    range_size_1
9645                    replace_map
9646                    reset_each_range
9647                    set_comment
9648                    set_default_map
9649                    set_file_path
9650                    set_final_comment
9651                    _set_format
9652                    set_range_size_1
9653                    set_status
9654                    set_to_output_map
9655                    short_name
9656                    status
9657                    status_info
9658                    to_output_map
9659                    type_of
9660                    value_of
9661                    write
9662                ))
9663                    # 'property' above is for symmetry, so that one can take
9664                    # the property of a property and get itself, and so don't
9665                    # have to distinguish between properties and tables in
9666                    # calling code
9667    {
9668        no strict "refs";
9669        *$sub = sub {
9670            use strict "refs";
9671            my $self = shift;
9672            no overloading;
9673            return $map{pack 'J', $self}->$sub(@_);
9674        }
9675    }
9676
9677
9678} # End closure
9679
9680package main;
9681
9682sub display_chr {
9683    # Converts an ordinal printable character value to a displayable string,
9684    # using a dotted circle to hold combining characters.
9685
9686    my $ord = shift;
9687    my $chr = chr $ord;
9688    return $chr if $ccc->table(0)->contains($ord);
9689    return "\x{25CC}$chr";
9690}
9691
9692sub join_lines($) {
9693    # Returns lines of the input joined together, so that they can be folded
9694    # properly.
9695    # This causes continuation lines to be joined together into one long line
9696    # for folding.  A continuation line is any line that doesn't begin with a
9697    # space or "\b" (the latter is stripped from the output).  This is so
9698    # lines can be be in a HERE document so as to fit nicely in the terminal
9699    # width, but be joined together in one long line, and then folded with
9700    # indents, '#' prefixes, etc, properly handled.
9701    # A blank separates the joined lines except if there is a break; an extra
9702    # blank is inserted after a period ending a line.
9703
9704    # Initialize the return with the first line.
9705    my ($return, @lines) = split "\n", shift;
9706
9707    # If the first line is null, it was an empty line, add the \n back in
9708    $return = "\n" if $return eq "";
9709
9710    # Now join the remainder of the physical lines.
9711    for my $line (@lines) {
9712
9713        # An empty line means wanted a blank line, so add two \n's to get that
9714        # effect, and go to the next line.
9715        if (length $line == 0) {
9716            $return .= "\n\n";
9717            next;
9718        }
9719
9720        # Look at the last character of what we have so far.
9721        my $previous_char = substr($return, -1, 1);
9722
9723        # And at the next char to be output.
9724        my $next_char = substr($line, 0, 1);
9725
9726        if ($previous_char ne "\n") {
9727
9728            # Here didn't end wth a nl.  If the next char a blank or \b, it
9729            # means that here there is a break anyway.  So add a nl to the
9730            # output.
9731            if ($next_char eq " " || $next_char eq "\b") {
9732                $previous_char = "\n";
9733                $return .= $previous_char;
9734            }
9735
9736            # Add an extra space after periods.
9737            $return .= " " if $previous_char eq '.';
9738        }
9739
9740        # Here $previous_char is still the latest character to be output.  If
9741        # it isn't a nl, it means that the next line is to be a continuation
9742        # line, with a blank inserted between them.
9743        $return .= " " if $previous_char ne "\n";
9744
9745        # Get rid of any \b
9746        substr($line, 0, 1) = "" if $next_char eq "\b";
9747
9748        # And append this next line.
9749        $return .= $line;
9750    }
9751
9752    return $return;
9753}
9754
9755sub simple_fold($;$$$) {
9756    # Returns a string of the input (string or an array of strings) folded
9757    # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
9758    # a \n
9759    # This is tailored for the kind of text written by this program,
9760    # especially the pod file, which can have very long names with
9761    # underscores in the middle, or words like AbcDefgHij....  We allow
9762    # breaking in the middle of such constructs if the line won't fit
9763    # otherwise.  The break in such cases will come either just after an
9764    # underscore, or just before one of the Capital letters.
9765
9766    local $to_trace = 0 if main::DEBUG;
9767
9768    my $line = shift;
9769    my $prefix = shift;     # Optional string to prepend to each output
9770                            # line
9771    $prefix = "" unless defined $prefix;
9772
9773    my $hanging_indent = shift; # Optional number of spaces to indent
9774                                # continuation lines
9775    $hanging_indent = 0 unless $hanging_indent;
9776
9777    my $right_margin = shift;   # Optional number of spaces to narrow the
9778                                # total width by.
9779    $right_margin = 0 unless defined $right_margin;
9780
9781    # Call carp with the 'nofold' option to avoid it from trying to call us
9782    # recursively
9783    Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
9784
9785    # The space available doesn't include what's automatically prepended
9786    # to each line, or what's reserved on the right.
9787    my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
9788    # XXX Instead of using the 'nofold' perhaps better to look up the stack
9789
9790    if (DEBUG && $hanging_indent >= $max) {
9791        Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max.  Using 0", 'nofold');
9792        $hanging_indent = 0;
9793    }
9794
9795    # First, split into the current physical lines.
9796    my @line;
9797    if (ref $line) {        # Better be an array, because not bothering to
9798                            # test
9799        foreach my $line (@{$line}) {
9800            push @line, split /\n/, $line;
9801        }
9802    }
9803    else {
9804        @line = split /\n/, $line;
9805    }
9806
9807    #local $to_trace = 1 if main::DEBUG;
9808    trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
9809
9810    # Look at each current physical line.
9811    for (my $i = 0; $i < @line; $i++) {
9812        Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
9813        #local $to_trace = 1 if main::DEBUG;
9814        trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
9815
9816        # Remove prefix, because will be added back anyway, don't want
9817        # doubled prefix
9818        $line[$i] =~ s/^$prefix//;
9819
9820        # Remove trailing space
9821        $line[$i] =~ s/\s+\Z//;
9822
9823        # If the line is too long, fold it.
9824        if (length $line[$i] > $max) {
9825            my $remainder;
9826
9827            # Here needs to fold.  Save the leading space in the line for
9828            # later.
9829            $line[$i] =~ /^ ( \s* )/x;
9830            my $leading_space = $1;
9831            trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
9832
9833            # If character at final permissible position is white space,
9834            # fold there, which will delete that white space
9835            if (substr($line[$i], $max - 1, 1) =~ /\s/) {
9836                $remainder = substr($line[$i], $max);
9837                $line[$i] = substr($line[$i], 0, $max - 1);
9838            }
9839            else {
9840
9841                # Otherwise fold at an acceptable break char closest to
9842                # the max length.  Look at just the maximal initial
9843                # segment of the line
9844                my $segment = substr($line[$i], 0, $max - 1);
9845                if ($segment =~
9846                    /^ ( .{$hanging_indent}   # Don't look before the
9847                                              #  indent.
9848                        \ *                   # Don't look in leading
9849                                              #  blanks past the indent
9850                            [^ ] .*           # Find the right-most
9851                        (?:                   #  acceptable break:
9852                            [ \s = ]          # space or equal
9853                            | - (?! [.0-9] )  # or non-unary minus.
9854                        )                     # $1 includes the character
9855                    )/x)
9856                {
9857                    # Split into the initial part that fits, and remaining
9858                    # part of the input
9859                    $remainder = substr($line[$i], length $1);
9860                    $line[$i] = $1;
9861                    trace $line[$i] if DEBUG && $to_trace;
9862                    trace $remainder if DEBUG && $to_trace;
9863                }
9864
9865                # If didn't find a good breaking spot, see if there is a
9866                # not-so-good breaking spot.  These are just after
9867                # underscores or where the case changes from lower to
9868                # upper.  Use \a as a soft hyphen, but give up
9869                # and don't break the line if there is actually a \a
9870                # already in the input.  We use an ascii character for the
9871                # soft-hyphen to avoid any attempt by miniperl to try to
9872                # access the files that this program is creating.
9873                elsif ($segment !~ /\a/
9874                       && ($segment =~ s/_/_\a/g
9875                       || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
9876                {
9877                    # Here were able to find at least one place to insert
9878                    # our substitute soft hyphen.  Find the right-most one
9879                    # and replace it by a real hyphen.
9880                    trace $segment if DEBUG && $to_trace;
9881                    substr($segment,
9882                            rindex($segment, "\a"),
9883                            1) = '-';
9884
9885                    # Then remove the soft hyphen substitutes.
9886                    $segment =~ s/\a//g;
9887                    trace $segment if DEBUG && $to_trace;
9888
9889                    # And split into the initial part that fits, and
9890                    # remainder of the line
9891                    my $pos = rindex($segment, '-');
9892                    $remainder = substr($line[$i], $pos);
9893                    trace $remainder if DEBUG && $to_trace;
9894                    $line[$i] = substr($segment, 0, $pos + 1);
9895                }
9896            }
9897
9898            # Here we know if we can fold or not.  If we can, $remainder
9899            # is what remains to be processed in the next iteration.
9900            if (defined $remainder) {
9901                trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
9902
9903                # Insert the folded remainder of the line as a new element
9904                # of the array.  (It may still be too long, but we will
9905                # deal with that next time through the loop.)  Omit any
9906                # leading space in the remainder.
9907                $remainder =~ s/^\s+//;
9908                trace "remainder='$remainder'" if main::DEBUG && $to_trace;
9909
9910                # But then indent by whichever is larger of:
9911                # 1) the leading space on the input line;
9912                # 2) the hanging indent.
9913                # This preserves indentation in the original line.
9914                my $lead = ($leading_space)
9915                            ? length $leading_space
9916                            : $hanging_indent;
9917                $lead = max($lead, $hanging_indent);
9918                splice @line, $i+1, 0, (" " x $lead) . $remainder;
9919            }
9920        }
9921
9922        # Ready to output the line. Get rid of any trailing space
9923        # And prefix by the required $prefix passed in.
9924        $line[$i] =~ s/\s+$//;
9925        $line[$i] = "$prefix$line[$i]\n";
9926    } # End of looping through all the lines.
9927
9928    return join "", @line;
9929}
9930
9931sub property_ref {  # Returns a reference to a property object.
9932    return Property::property_ref(@_);
9933}
9934
9935sub force_unlink ($) {
9936    my $filename = shift;
9937    return unless file_exists($filename);
9938    return if CORE::unlink($filename);
9939
9940    # We might need write permission
9941    chmod 0777, $filename;
9942    CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename.  Proceeding anyway: $!");
9943    return;
9944}
9945
9946sub write ($$@) {
9947    # Given a filename and references to arrays of lines, write the lines of
9948    # each array to the file
9949    # Filename can be given as an arrayref of directory names
9950
9951    return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
9952
9953    my $file  = shift;
9954    my $use_utf8 = shift;
9955
9956    # Get into a single string if an array, and get rid of, in Unix terms, any
9957    # leading '.'
9958    $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
9959    $file = File::Spec->canonpath($file);
9960
9961    # If has directories, make sure that they all exist
9962    (undef, my $directories, undef) = File::Spec->splitpath($file);
9963    File::Path::mkpath($directories) if $directories && ! -d $directories;
9964
9965    push @files_actually_output, $file;
9966
9967    force_unlink ($file);
9968
9969    my $OUT;
9970    if (not open $OUT, ">", $file) {
9971        Carp::my_carp("can't open $file for output.  Skipping this file: $!");
9972        return;
9973    }
9974
9975    binmode $OUT, ":utf8" if $use_utf8;
9976
9977    while (defined (my $lines_ref = shift)) {
9978        unless (@$lines_ref) {
9979            Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
9980        }
9981
9982        print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
9983    }
9984    close $OUT or die Carp::my_carp("close '$file' failed: $!");
9985
9986    print "$file written.\n" if $verbosity >= $VERBOSE;
9987
9988    return;
9989}
9990
9991
9992sub Standardize($) {
9993    # This converts the input name string into a standardized equivalent to
9994    # use internally.
9995
9996    my $name = shift;
9997    unless (defined $name) {
9998      Carp::my_carp_bug("Standardize() called with undef.  Returning undef.");
9999      return;
10000    }
10001
10002    # Remove any leading or trailing white space
10003    $name =~ s/^\s+//g;
10004    $name =~ s/\s+$//g;
10005
10006    # Convert interior white space and hyphens into underscores.
10007    $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
10008
10009    # Capitalize the letter following an underscore, and convert a sequence of
10010    # multiple underscores to a single one
10011    $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
10012
10013    # And capitalize the first letter, but not for the special cjk ones.
10014    $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
10015    return $name;
10016}
10017
10018sub standardize ($) {
10019    # Returns a lower-cased standardized name, without underscores.  This form
10020    # is chosen so that it can distinguish between any real versus superficial
10021    # Unicode name differences.  It relies on the fact that Unicode doesn't
10022    # have interior underscores, white space, nor dashes in any
10023    # stricter-matched name.  It should not be used on Unicode code point
10024    # names (the Name property), as they mostly, but not always follow these
10025    # rules.
10026
10027    my $name = Standardize(shift);
10028    return if !defined $name;
10029
10030    $name =~ s/ (?<= .) _ (?= . ) //xg;
10031    return lc $name;
10032}
10033
10034sub utf8_heavy_name ($$) {
10035    # Returns the name that utf8_heavy.pl will use to find a table.  XXX
10036    # perhaps this function should be placed somewhere, like Heavy.pl so that
10037    # utf8_heavy can use it directly without duplicating code that can get
10038    # out-of sync.
10039
10040    my $table = shift;
10041    my $alias = shift;
10042    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10043
10044    my $property = $table->property;
10045    $property = ($property == $perl)
10046                ? ""                # 'perl' is never explicitly stated
10047                : standardize($property->name) . '=';
10048    if ($alias->loose_match) {
10049        return $property . standardize($alias->name);
10050    }
10051    else {
10052        return lc ($property . $alias->name);
10053    }
10054
10055    return;
10056}
10057
10058{   # Closure
10059
10060    my $indent_increment = " " x (($debugging_build) ? 2 : 0);
10061    %main::already_output = ();
10062
10063    $main::simple_dumper_nesting = 0;
10064
10065    sub simple_dumper {
10066        # Like Simple Data::Dumper. Good enough for our needs. We can't use
10067        # the real thing as we have to run under miniperl.
10068
10069        # It is designed so that on input it is at the beginning of a line,
10070        # and the final thing output in any call is a trailing ",\n".
10071
10072        my $item = shift;
10073        my $indent = shift;
10074        $indent = "" if ! $debugging_build || ! defined $indent;
10075
10076        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10077
10078        # nesting level is localized, so that as the call stack pops, it goes
10079        # back to the prior value.
10080        local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
10081        local %main::already_output = %main::already_output;
10082        $main::simple_dumper_nesting++;
10083        #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
10084
10085        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10086
10087        # Determine the indent for recursive calls.
10088        my $next_indent = $indent . $indent_increment;
10089
10090        my $output;
10091        if (! ref $item) {
10092
10093            # Dump of scalar: just output it in quotes if not a number.  To do
10094            # so we must escape certain characters, and therefore need to
10095            # operate on a copy to avoid changing the original
10096            my $copy = $item;
10097            $copy = $UNDEF unless defined $copy;
10098
10099            # Quote non-integers (integers also have optional leading '-')
10100            if ($copy eq "" || $copy !~ /^ -? \d+ $/x) {
10101
10102                # Escape apostrophe and backslash
10103                $copy =~ s/ ( ['\\] ) /\\$1/xg;
10104                $copy = "'$copy'";
10105            }
10106            $output = "$indent$copy,\n";
10107        }
10108        else {
10109
10110            # Keep track of cycles in the input, and refuse to infinitely loop
10111            my $addr = do { no overloading; pack 'J', $item; };
10112            if (defined $main::already_output{$addr}) {
10113                return "${indent}ALREADY OUTPUT: $item\n";
10114            }
10115            $main::already_output{$addr} = $item;
10116
10117            if (ref $item eq 'ARRAY') {
10118                my $using_brackets;
10119                $output = $indent;
10120                if ($main::simple_dumper_nesting > 1) {
10121                    $output .= '[';
10122                    $using_brackets = 1;
10123                }
10124                else {
10125                    $using_brackets = 0;
10126                }
10127
10128                # If the array is empty, put the closing bracket on the same
10129                # line.  Otherwise, recursively add each array element
10130                if (@$item == 0) {
10131                    $output .= " ";
10132                }
10133                else {
10134                    $output .= "\n";
10135                    for (my $i = 0; $i < @$item; $i++) {
10136
10137                        # Indent array elements one level
10138                        $output .= &simple_dumper($item->[$i], $next_indent);
10139                        next if ! $debugging_build;
10140                        $output =~ s/\n$//;      # Remove any trailing nl so
10141                        $output .= " # [$i]\n";  # as to add a comment giving
10142                                                 # the array index
10143                    }
10144                    $output .= $indent;     # Indent closing ']' to orig level
10145                }
10146                $output .= ']' if $using_brackets;
10147                $output .= ",\n";
10148            }
10149            elsif (ref $item eq 'HASH') {
10150                my $is_first_line;
10151                my $using_braces;
10152                my $body_indent;
10153
10154                # No surrounding braces at top level
10155                $output .= $indent;
10156                if ($main::simple_dumper_nesting > 1) {
10157                    $output .= "{\n";
10158                    $is_first_line = 0;
10159                    $body_indent = $next_indent;
10160                    $next_indent .= $indent_increment;
10161                    $using_braces = 1;
10162                }
10163                else {
10164                    $is_first_line = 1;
10165                    $body_indent = $indent;
10166                    $using_braces = 0;
10167                }
10168
10169                # Output hashes sorted alphabetically instead of apparently
10170                # random.  Use caseless alphabetic sort
10171                foreach my $key (sort { lc $a cmp lc $b } keys %$item)
10172                {
10173                    if ($is_first_line) {
10174                        $is_first_line = 0;
10175                    }
10176                    else {
10177                        $output .= "$body_indent";
10178                    }
10179
10180                    # The key must be a scalar, but this recursive call quotes
10181                    # it
10182                    $output .= &simple_dumper($key);
10183
10184                    # And change the trailing comma and nl to the hash fat
10185                    # comma for clarity, and so the value can be on the same
10186                    # line
10187                    $output =~ s/,\n$/ => /;
10188
10189                    # Recursively call to get the value's dump.
10190                    my $next = &simple_dumper($item->{$key}, $next_indent);
10191
10192                    # If the value is all on one line, remove its indent, so
10193                    # will follow the => immediately.  If it takes more than
10194                    # one line, start it on a new line.
10195                    if ($next !~ /\n.*\n/) {
10196                        $next =~ s/^ *//;
10197                    }
10198                    else {
10199                        $output .= "\n";
10200                    }
10201                    $output .= $next;
10202                }
10203
10204                $output .= "$indent},\n" if $using_braces;
10205            }
10206            elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
10207                $output = $indent . ref($item) . "\n";
10208                # XXX see if blessed
10209            }
10210            elsif ($item->can('dump')) {
10211
10212                # By convention in this program, objects furnish a 'dump'
10213                # method.  Since not doing any output at this level, just pass
10214                # on the input indent
10215                $output = $item->dump($indent);
10216            }
10217            else {
10218                Carp::my_carp("Can't cope with dumping a " . ref($item) . ".  Skipping.");
10219            }
10220        }
10221        return $output;
10222    }
10223}
10224
10225sub dump_inside_out {
10226    # Dump inside-out hashes in an object's state by converting them to a
10227    # regular hash and then calling simple_dumper on that.
10228
10229    my $object = shift;
10230    my $fields_ref = shift;
10231
10232    my $addr = do { no overloading; pack 'J', $object; };
10233
10234    my %hash;
10235    foreach my $key (keys %$fields_ref) {
10236        $hash{$key} = $fields_ref->{$key}{$addr};
10237    }
10238
10239    return simple_dumper(\%hash, @_);
10240}
10241
10242sub _operator_dot {
10243    # Overloaded '.' method that is common to all packages.  It uses the
10244    # package's stringify method.
10245
10246    my $self = shift;
10247    my $other = shift;
10248    my $reversed = shift;
10249    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10250
10251    $other = "" unless defined $other;
10252
10253    foreach my $which (\$self, \$other) {
10254        next unless ref $$which;
10255        if ($$which->can('_operator_stringify')) {
10256            $$which = $$which->_operator_stringify;
10257        }
10258        else {
10259            my $ref = ref $$which;
10260            my $addr = do { no overloading; pack 'J', $$which; };
10261            $$which = "$ref ($addr)";
10262        }
10263    }
10264    return ($reversed)
10265            ? "$other$self"
10266            : "$self$other";
10267}
10268
10269sub _operator_dot_equal {
10270    # Overloaded '.=' method that is common to all packages.
10271
10272    my $self = shift;
10273    my $other = shift;
10274    my $reversed = shift;
10275    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10276
10277    $other = "" unless defined $other;
10278
10279    if ($reversed) {
10280        return $other .= "$self";
10281    }
10282    else {
10283        return "$self" . "$other";
10284    }
10285}
10286
10287sub _operator_equal {
10288    # Generic overloaded '==' routine.  To be equal, they must be the exact
10289    # same object
10290
10291    my $self = shift;
10292    my $other = shift;
10293
10294    return 0 unless defined $other;
10295    return 0 unless ref $other;
10296    no overloading;
10297    return $self == $other;
10298}
10299
10300sub _operator_not_equal {
10301    my $self = shift;
10302    my $other = shift;
10303
10304    return ! _operator_equal($self, $other);
10305}
10306
10307sub substitute_PropertyAliases($) {
10308    # Deal with early releases that don't have the crucial PropertyAliases.txt
10309    # file.
10310
10311    my $file_object = shift;
10312    $file_object->insert_lines(get_old_property_aliases());
10313
10314    process_PropertyAliases($file_object);
10315}
10316
10317
10318sub process_PropertyAliases($) {
10319    # This reads in the PropertyAliases.txt file, which contains almost all
10320    # the character properties in Unicode and their equivalent aliases:
10321    # scf       ; Simple_Case_Folding         ; sfc
10322    #
10323    # Field 0 is the preferred short name for the property.
10324    # Field 1 is the full name.
10325    # Any succeeding ones are other accepted names.
10326
10327    my $file= shift;
10328    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10329
10330    # Add any cjk properties that may have been defined.
10331    $file->insert_lines(@cjk_properties);
10332
10333    while ($file->next_line) {
10334
10335        my @data = split /\s*;\s*/;
10336
10337        my $full = $data[1];
10338
10339        # This line is defective in early Perls.  The property in Unihan.txt
10340        # is kRSUnicode.
10341        if ($full eq 'Unicode_Radical_Stroke' && @data < 3) {
10342            push @data, qw(cjkRSUnicode kRSUnicode);
10343        }
10344
10345        my $this = Property->new($data[0], Full_Name => $full);
10346
10347        $this->set_fate($SUPPRESSED, $why_suppressed{$full})
10348                                                    if $why_suppressed{$full};
10349
10350        # Start looking for more aliases after these two.
10351        for my $i (2 .. @data - 1) {
10352            $this->add_alias($data[$i]);
10353        }
10354
10355    }
10356
10357    my $scf = property_ref("Simple_Case_Folding");
10358    $scf->add_alias("scf");
10359    $scf->add_alias("sfc");
10360
10361    return;
10362}
10363
10364sub finish_property_setup {
10365    # Finishes setting up after PropertyAliases.
10366
10367    my $file = shift;
10368    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10369
10370    # This entry was missing from this file in earlier Unicode versions
10371    if (-e 'Jamo.txt' && ! defined property_ref('JSN')) {
10372        Property->new('JSN', Full_Name => 'Jamo_Short_Name');
10373    }
10374
10375    # These are used so much, that we set globals for them.
10376    $gc = property_ref('General_Category');
10377    $block = property_ref('Block');
10378    $script = property_ref('Script');
10379    $age = property_ref('Age');
10380
10381    # Perl adds this alias.
10382    $gc->add_alias('Category');
10383
10384    # Unicode::Normalize expects this file with this name and directory.
10385    $ccc = property_ref('Canonical_Combining_Class');
10386    if (defined $ccc) {
10387        $ccc->set_file('CombiningClass');
10388        $ccc->set_directory(File::Spec->curdir());
10389    }
10390
10391    # These two properties aren't actually used in the core, but unfortunately
10392    # the names just above that are in the core interfere with these, so
10393    # choose different names.  These aren't a problem unless the map tables
10394    # for these files get written out.
10395    my $lowercase = property_ref('Lowercase');
10396    $lowercase->set_file('IsLower') if defined $lowercase;
10397    my $uppercase = property_ref('Uppercase');
10398    $uppercase->set_file('IsUpper') if defined $uppercase;
10399
10400    # Set up the hard-coded default mappings, but only on properties defined
10401    # for this release
10402    foreach my $property (keys %default_mapping) {
10403        my $property_object = property_ref($property);
10404        next if ! defined $property_object;
10405        my $default_map = $default_mapping{$property};
10406        $property_object->set_default_map($default_map);
10407
10408        # A map of <code point> implies the property is string.
10409        if ($property_object->type == $UNKNOWN
10410            && $default_map eq $CODE_POINT)
10411        {
10412            $property_object->set_type($STRING);
10413        }
10414    }
10415
10416    # The following use the Multi_Default class to create objects for
10417    # defaults.
10418
10419    # Bidi class has a complicated default, but the derived file takes care of
10420    # the complications, leaving just 'L'.
10421    if (file_exists("${EXTRACTED}DBidiClass.txt")) {
10422        property_ref('Bidi_Class')->set_default_map('L');
10423    }
10424    else {
10425        my $default;
10426
10427        # The derived file was introduced in 3.1.1.  The values below are
10428        # taken from table 3-8, TUS 3.0
10429        my $default_R =
10430            'my $default = Range_List->new;
10431             $default->add_range(0x0590, 0x05FF);
10432             $default->add_range(0xFB1D, 0xFB4F);'
10433        ;
10434
10435        # The defaults apply only to unassigned characters
10436        $default_R .= '$gc->table("Unassigned") & $default;';
10437
10438        if ($v_version lt v3.0.0) {
10439            $default = Multi_Default->new(R => $default_R, 'L');
10440        }
10441        else {
10442
10443            # AL apparently not introduced until 3.0:  TUS 2.x references are
10444            # not on-line to check it out
10445            my $default_AL =
10446                'my $default = Range_List->new;
10447                 $default->add_range(0x0600, 0x07BF);
10448                 $default->add_range(0xFB50, 0xFDFF);
10449                 $default->add_range(0xFE70, 0xFEFF);'
10450            ;
10451
10452            # Non-character code points introduced in this release; aren't AL
10453            if ($v_version ge 3.1.0) {
10454                $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
10455            }
10456            $default_AL .= '$gc->table("Unassigned") & $default';
10457            $default = Multi_Default->new(AL => $default_AL,
10458                                          R => $default_R,
10459                                          'L');
10460        }
10461        property_ref('Bidi_Class')->set_default_map($default);
10462    }
10463
10464    # Joining type has a complicated default, but the derived file takes care
10465    # of the complications, leaving just 'U' (or Non_Joining), except the file
10466    # is bad in 3.1.0
10467    if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
10468        if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
10469            property_ref('Joining_Type')->set_default_map('Non_Joining');
10470        }
10471        else {
10472
10473            # Otherwise, there are not one, but two possibilities for the
10474            # missing defaults: T and U.
10475            # The missing defaults that evaluate to T are given by:
10476            # T = Mn + Cf - ZWNJ - ZWJ
10477            # where Mn and Cf are the general category values. In other words,
10478            # any non-spacing mark or any format control character, except
10479            # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
10480            # WIDTH JOINER (joining type C).
10481            my $default = Multi_Default->new(
10482               'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
10483               'Non_Joining');
10484            property_ref('Joining_Type')->set_default_map($default);
10485        }
10486    }
10487
10488    # Line break has a complicated default in early releases. It is 'Unknown'
10489    # for non-assigned code points; 'AL' for assigned.
10490    if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
10491        my $lb = property_ref('Line_Break');
10492        if (file_exists("${EXTRACTED}DLineBreak.txt")) {
10493            $lb->set_default_map('Unknown');
10494        }
10495        else {
10496            my $default = Multi_Default->new('AL' => '~ $gc->table("Cn")',
10497                                             'Unknown',
10498                                            );
10499            $lb->set_default_map($default);
10500        }
10501    }
10502
10503    # For backwards compatibility with applications that may read the mapping
10504    # file directly (it was documented in 5.12 and 5.14 as being thusly
10505    # usable), keep it from being adjusted.  (range_size_1 is
10506    # used to force the traditional format.)
10507    if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) {
10508        $nfkc_cf->set_to_output_map($EXTERNAL_MAP);
10509        $nfkc_cf->set_range_size_1(1);
10510    }
10511    if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) {
10512        $bmg->set_to_output_map($EXTERNAL_MAP);
10513        $bmg->set_range_size_1(1);
10514    }
10515
10516    property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED);
10517
10518    return;
10519}
10520
10521sub get_old_property_aliases() {
10522    # Returns what would be in PropertyAliases.txt if it existed in very old
10523    # versions of Unicode.  It was derived from the one in 3.2, and pared
10524    # down based on the data that was actually in the older releases.
10525    # An attempt was made to use the existence of files to mean inclusion or
10526    # not of various aliases, but if this was not sufficient, using version
10527    # numbers was resorted to.
10528
10529    my @return;
10530
10531    # These are to be used in all versions (though some are constructed by
10532    # this program if missing)
10533    push @return, split /\n/, <<'END';
10534bc        ; Bidi_Class
10535Bidi_M    ; Bidi_Mirrored
10536cf        ; Case_Folding
10537ccc       ; Canonical_Combining_Class
10538dm        ; Decomposition_Mapping
10539dt        ; Decomposition_Type
10540gc        ; General_Category
10541isc       ; ISO_Comment
10542lc        ; Lowercase_Mapping
10543na        ; Name
10544na1       ; Unicode_1_Name
10545nt        ; Numeric_Type
10546nv        ; Numeric_Value
10547scf       ; Simple_Case_Folding
10548slc       ; Simple_Lowercase_Mapping
10549stc       ; Simple_Titlecase_Mapping
10550suc       ; Simple_Uppercase_Mapping
10551tc        ; Titlecase_Mapping
10552uc        ; Uppercase_Mapping
10553END
10554
10555    if (-e 'Blocks.txt') {
10556        push @return, "blk       ; Block\n";
10557    }
10558    if (-e 'ArabicShaping.txt') {
10559        push @return, split /\n/, <<'END';
10560jg        ; Joining_Group
10561jt        ; Joining_Type
10562END
10563    }
10564    if (-e 'PropList.txt') {
10565
10566        # This first set is in the original old-style proplist.
10567        push @return, split /\n/, <<'END';
10568Bidi_C    ; Bidi_Control
10569Dash      ; Dash
10570Dia       ; Diacritic
10571Ext       ; Extender
10572Hex       ; Hex_Digit
10573Hyphen    ; Hyphen
10574IDC       ; ID_Continue
10575Ideo      ; Ideographic
10576Join_C    ; Join_Control
10577Math      ; Math
10578QMark     ; Quotation_Mark
10579Term      ; Terminal_Punctuation
10580WSpace    ; White_Space
10581END
10582        # The next sets were added later
10583        if ($v_version ge v3.0.0) {
10584            push @return, split /\n/, <<'END';
10585Upper     ; Uppercase
10586Lower     ; Lowercase
10587END
10588        }
10589        if ($v_version ge v3.0.1) {
10590            push @return, split /\n/, <<'END';
10591NChar     ; Noncharacter_Code_Point
10592END
10593        }
10594        # The next sets were added in the new-style
10595        if ($v_version ge v3.1.0) {
10596            push @return, split /\n/, <<'END';
10597OAlpha    ; Other_Alphabetic
10598OLower    ; Other_Lowercase
10599OMath     ; Other_Math
10600OUpper    ; Other_Uppercase
10601END
10602        }
10603        if ($v_version ge v3.1.1) {
10604            push @return, "AHex      ; ASCII_Hex_Digit\n";
10605        }
10606    }
10607    if (-e 'EastAsianWidth.txt') {
10608        push @return, "ea        ; East_Asian_Width\n";
10609    }
10610    if (-e 'CompositionExclusions.txt') {
10611        push @return, "CE        ; Composition_Exclusion\n";
10612    }
10613    if (-e 'LineBreak.txt') {
10614        push @return, "lb        ; Line_Break\n";
10615    }
10616    if (-e 'BidiMirroring.txt') {
10617        push @return, "bmg       ; Bidi_Mirroring_Glyph\n";
10618    }
10619    if (-e 'Scripts.txt') {
10620        push @return, "sc        ; Script\n";
10621    }
10622    if (-e 'DNormalizationProps.txt') {
10623        push @return, split /\n/, <<'END';
10624Comp_Ex   ; Full_Composition_Exclusion
10625FC_NFKC   ; FC_NFKC_Closure
10626NFC_QC    ; NFC_Quick_Check
10627NFD_QC    ; NFD_Quick_Check
10628NFKC_QC   ; NFKC_Quick_Check
10629NFKD_QC   ; NFKD_Quick_Check
10630XO_NFC    ; Expands_On_NFC
10631XO_NFD    ; Expands_On_NFD
10632XO_NFKC   ; Expands_On_NFKC
10633XO_NFKD   ; Expands_On_NFKD
10634END
10635    }
10636    if (-e 'DCoreProperties.txt') {
10637        push @return, split /\n/, <<'END';
10638Alpha     ; Alphabetic
10639IDS       ; ID_Start
10640XIDC      ; XID_Continue
10641XIDS      ; XID_Start
10642END
10643        # These can also appear in some versions of PropList.txt
10644        push @return, "Lower     ; Lowercase\n"
10645                                    unless grep { $_ =~ /^Lower\b/} @return;
10646        push @return, "Upper     ; Uppercase\n"
10647                                    unless grep { $_ =~ /^Upper\b/} @return;
10648    }
10649
10650    # This flag requires the DAge.txt file to be copied into the directory.
10651    if (DEBUG && $compare_versions) {
10652        push @return, 'age       ; Age';
10653    }
10654
10655    return @return;
10656}
10657
10658sub substitute_PropValueAliases($) {
10659    # Deal with early releases that don't have the crucial
10660    # PropValueAliases.txt file.
10661
10662    my $file_object = shift;
10663    $file_object->insert_lines(get_old_property_value_aliases());
10664
10665    process_PropValueAliases($file_object);
10666}
10667
10668sub process_PropValueAliases {
10669    # This file contains values that properties look like:
10670    # bc ; AL        ; Arabic_Letter
10671    # blk; n/a       ; Greek_And_Coptic                 ; Greek
10672    #
10673    # Field 0 is the property.
10674    # Field 1 is the short name of a property value or 'n/a' if no
10675    #                short name exists;
10676    # Field 2 is the full property value name;
10677    # Any other fields are more synonyms for the property value.
10678    # Purely numeric property values are omitted from the file; as are some
10679    # others, fewer and fewer in later releases
10680
10681    # Entries for the ccc property have an extra field before the
10682    # abbreviation:
10683    # ccc;   0; NR   ; Not_Reordered
10684    # It is the numeric value that the names are synonyms for.
10685
10686    # There are comment entries for values missing from this file:
10687    # # @missing: 0000..10FFFF; ISO_Comment; <none>
10688    # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
10689
10690    my $file= shift;
10691    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10692
10693    if ($v_version lt 4.0.0) {
10694        $file->insert_lines(split /\n/, <<'END'
10695Hangul_Syllable_Type; L                                ; Leading_Jamo
10696Hangul_Syllable_Type; LV                               ; LV_Syllable
10697Hangul_Syllable_Type; LVT                              ; LVT_Syllable
10698Hangul_Syllable_Type; NA                               ; Not_Applicable
10699Hangul_Syllable_Type; T                                ; Trailing_Jamo
10700Hangul_Syllable_Type; V                                ; Vowel_Jamo
10701END
10702        );
10703    }
10704    if ($v_version lt 4.1.0) {
10705        $file->insert_lines(split /\n/, <<'END'
10706_Perl_GCB; CN                               ; Control
10707_Perl_GCB; CR                               ; CR
10708_Perl_GCB; EX                               ; Extend
10709_Perl_GCB; L                                ; L
10710_Perl_GCB; LF                               ; LF
10711_Perl_GCB; LV                               ; LV
10712_Perl_GCB; LVT                              ; LVT
10713_Perl_GCB; T                                ; T
10714_Perl_GCB; V                                ; V
10715_Perl_GCB; XX                               ; Other
10716END
10717        );
10718    }
10719
10720
10721    # Add any explicit cjk values
10722    $file->insert_lines(@cjk_property_values);
10723
10724    # This line is used only for testing the code that checks for name
10725    # conflicts.  There is a script Inherited, and when this line is executed
10726    # it causes there to be a name conflict with the 'Inherited' that this
10727    # program generates for this block property value
10728    #$file->insert_lines('blk; n/a; Herited');
10729
10730    # Process each line of the file ...
10731    while ($file->next_line) {
10732
10733        # Fix typo in input file
10734        s/CCC133/CCC132/g if $v_version eq v6.1.0;
10735
10736        my ($property, @data) = split /\s*;\s*/;
10737
10738        # The ccc property has an extra field at the beginning, which is the
10739        # numeric value.  Move it to be after the other two, mnemonic, fields,
10740        # so that those will be used as the property value's names, and the
10741        # number will be an extra alias.  (Rightmost splice removes field 1-2,
10742        # returning them in a slice; left splice inserts that before anything,
10743        # thus shifting the former field 0 to after them.)
10744        splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
10745
10746        if ($v_version le v5.0.0 && $property eq 'blk' && $data[1] =~ /-/) {
10747            my $new_style = $data[1] =~ s/-/_/gr;
10748            splice @data, 1, 0, $new_style;
10749        }
10750
10751        # Field 0 is a short name unless "n/a"; field 1 is the full name.  If
10752        # there is no short name, use the full one in element 1
10753        if ($data[0] eq "n/a") {
10754            $data[0] = $data[1];
10755        }
10756        elsif ($data[0] ne $data[1]
10757               && standardize($data[0]) eq standardize($data[1])
10758               && $data[1] !~ /[[:upper:]]/)
10759        {
10760            # Also, there is a bug in the file in which "n/a" is omitted, and
10761            # the two fields are identical except for case, and the full name
10762            # is all lower case.  Copy the "short" name unto the full one to
10763            # give it some upper case.
10764
10765            $data[1] = $data[0];
10766        }
10767
10768        # Earlier releases had the pseudo property 'qc' that should expand to
10769        # the ones that replace it below.
10770        if ($property eq 'qc') {
10771            if (lc $data[0] eq 'y') {
10772                $file->insert_lines('NFC_QC; Y      ; Yes',
10773                                    'NFD_QC; Y      ; Yes',
10774                                    'NFKC_QC; Y     ; Yes',
10775                                    'NFKD_QC; Y     ; Yes',
10776                                    );
10777            }
10778            elsif (lc $data[0] eq 'n') {
10779                $file->insert_lines('NFC_QC; N      ; No',
10780                                    'NFD_QC; N      ; No',
10781                                    'NFKC_QC; N     ; No',
10782                                    'NFKD_QC; N     ; No',
10783                                    );
10784            }
10785            elsif (lc $data[0] eq 'm') {
10786                $file->insert_lines('NFC_QC; M      ; Maybe',
10787                                    'NFKC_QC; M     ; Maybe',
10788                                    );
10789            }
10790            else {
10791                $file->carp_bad_line("qc followed by unexpected '$data[0]");
10792            }
10793            next;
10794        }
10795
10796        # The first field is the short name, 2nd is the full one.
10797        my $property_object = property_ref($property);
10798        my $table = $property_object->add_match_table($data[0],
10799                                                Full_Name => $data[1]);
10800
10801        # Start looking for more aliases after these two.
10802        for my $i (2 .. @data - 1) {
10803            $table->add_alias($data[$i]);
10804        }
10805    } # End of looping through the file
10806
10807    # As noted in the comments early in the program, it generates tables for
10808    # the default values for all releases, even those for which the concept
10809    # didn't exist at the time.  Here we add those if missing.
10810    if (defined $age && ! defined $age->table('Unassigned')) {
10811        $age->add_match_table('Unassigned');
10812    }
10813    $block->add_match_table('No_Block') if -e 'Blocks.txt'
10814                                    && ! defined $block->table('No_Block');
10815
10816
10817    # Now set the default mappings of the properties from the file.  This is
10818    # done after the loop because a number of properties have only @missings
10819    # entries in the file, and may not show up until the end.
10820    my @defaults = $file->get_missings;
10821    foreach my $default_ref (@defaults) {
10822        my $default = $default_ref->[0];
10823        my $property = property_ref($default_ref->[1]);
10824        $property->set_default_map($default);
10825    }
10826    return;
10827}
10828
10829sub get_old_property_value_aliases () {
10830    # Returns what would be in PropValueAliases.txt if it existed in very old
10831    # versions of Unicode.  It was derived from the one in 3.2, and pared
10832    # down.  An attempt was made to use the existence of files to mean
10833    # inclusion or not of various aliases, but if this was not sufficient,
10834    # using version numbers was resorted to.
10835
10836    my @return = split /\n/, <<'END';
10837bc ; AN        ; Arabic_Number
10838bc ; B         ; Paragraph_Separator
10839bc ; CS        ; Common_Separator
10840bc ; EN        ; European_Number
10841bc ; ES        ; European_Separator
10842bc ; ET        ; European_Terminator
10843bc ; L         ; Left_To_Right
10844bc ; ON        ; Other_Neutral
10845bc ; R         ; Right_To_Left
10846bc ; WS        ; White_Space
10847
10848Bidi_M; N; No; F; False
10849Bidi_M; Y; Yes; T; True
10850
10851# The standard combining classes are very much different in v1, so only use
10852# ones that look right (not checked thoroughly)
10853ccc;   0; NR   ; Not_Reordered
10854ccc;   1; OV   ; Overlay
10855ccc;   7; NK   ; Nukta
10856ccc;   8; KV   ; Kana_Voicing
10857ccc;   9; VR   ; Virama
10858ccc; 202; ATBL ; Attached_Below_Left
10859ccc; 216; ATAR ; Attached_Above_Right
10860ccc; 218; BL   ; Below_Left
10861ccc; 220; B    ; Below
10862ccc; 222; BR   ; Below_Right
10863ccc; 224; L    ; Left
10864ccc; 228; AL   ; Above_Left
10865ccc; 230; A    ; Above
10866ccc; 232; AR   ; Above_Right
10867ccc; 234; DA   ; Double_Above
10868
10869dt ; can       ; canonical
10870dt ; enc       ; circle
10871dt ; fin       ; final
10872dt ; font      ; font
10873dt ; fra       ; fraction
10874dt ; init      ; initial
10875dt ; iso       ; isolated
10876dt ; med       ; medial
10877dt ; n/a       ; none
10878dt ; nb        ; noBreak
10879dt ; sqr       ; square
10880dt ; sub       ; sub
10881dt ; sup       ; super
10882
10883gc ; C         ; Other                            # Cc | Cf | Cn | Co | Cs
10884gc ; Cc        ; Control
10885gc ; Cn        ; Unassigned
10886gc ; Co        ; Private_Use
10887gc ; L         ; Letter                           # Ll | Lm | Lo | Lt | Lu
10888gc ; LC        ; Cased_Letter                     # Ll | Lt | Lu
10889gc ; Ll        ; Lowercase_Letter
10890gc ; Lm        ; Modifier_Letter
10891gc ; Lo        ; Other_Letter
10892gc ; Lu        ; Uppercase_Letter
10893gc ; M         ; Mark                             # Mc | Me | Mn
10894gc ; Mc        ; Spacing_Mark
10895gc ; Mn        ; Nonspacing_Mark
10896gc ; N         ; Number                           # Nd | Nl | No
10897gc ; Nd        ; Decimal_Number
10898gc ; No        ; Other_Number
10899gc ; P         ; Punctuation                      # Pc | Pd | Pe | Pf | Pi | Po | Ps
10900gc ; Pd        ; Dash_Punctuation
10901gc ; Pe        ; Close_Punctuation
10902gc ; Po        ; Other_Punctuation
10903gc ; Ps        ; Open_Punctuation
10904gc ; S         ; Symbol                           # Sc | Sk | Sm | So
10905gc ; Sc        ; Currency_Symbol
10906gc ; Sm        ; Math_Symbol
10907gc ; So        ; Other_Symbol
10908gc ; Z         ; Separator                        # Zl | Zp | Zs
10909gc ; Zl        ; Line_Separator
10910gc ; Zp        ; Paragraph_Separator
10911gc ; Zs        ; Space_Separator
10912
10913nt ; de        ; Decimal
10914nt ; di        ; Digit
10915nt ; n/a       ; None
10916nt ; nu        ; Numeric
10917END
10918
10919    if (-e 'ArabicShaping.txt') {
10920        push @return, split /\n/, <<'END';
10921jg ; n/a       ; AIN
10922jg ; n/a       ; ALEF
10923jg ; n/a       ; DAL
10924jg ; n/a       ; GAF
10925jg ; n/a       ; LAM
10926jg ; n/a       ; MEEM
10927jg ; n/a       ; NO_JOINING_GROUP
10928jg ; n/a       ; NOON
10929jg ; n/a       ; QAF
10930jg ; n/a       ; SAD
10931jg ; n/a       ; SEEN
10932jg ; n/a       ; TAH
10933jg ; n/a       ; WAW
10934
10935jt ; C         ; Join_Causing
10936jt ; D         ; Dual_Joining
10937jt ; L         ; Left_Joining
10938jt ; R         ; Right_Joining
10939jt ; U         ; Non_Joining
10940jt ; T         ; Transparent
10941END
10942        if ($v_version ge v3.0.0) {
10943            push @return, split /\n/, <<'END';
10944jg ; n/a       ; ALAPH
10945jg ; n/a       ; BEH
10946jg ; n/a       ; BETH
10947jg ; n/a       ; DALATH_RISH
10948jg ; n/a       ; E
10949jg ; n/a       ; FEH
10950jg ; n/a       ; FINAL_SEMKATH
10951jg ; n/a       ; GAMAL
10952jg ; n/a       ; HAH
10953jg ; n/a       ; HAMZA_ON_HEH_GOAL
10954jg ; n/a       ; HE
10955jg ; n/a       ; HEH
10956jg ; n/a       ; HEH_GOAL
10957jg ; n/a       ; HETH
10958jg ; n/a       ; KAF
10959jg ; n/a       ; KAPH
10960jg ; n/a       ; KNOTTED_HEH
10961jg ; n/a       ; LAMADH
10962jg ; n/a       ; MIM
10963jg ; n/a       ; NUN
10964jg ; n/a       ; PE
10965jg ; n/a       ; QAPH
10966jg ; n/a       ; REH
10967jg ; n/a       ; REVERSED_PE
10968jg ; n/a       ; SADHE
10969jg ; n/a       ; SEMKATH
10970jg ; n/a       ; SHIN
10971jg ; n/a       ; SWASH_KAF
10972jg ; n/a       ; TAW
10973jg ; n/a       ; TEH_MARBUTA
10974jg ; n/a       ; TETH
10975jg ; n/a       ; YEH
10976jg ; n/a       ; YEH_BARREE
10977jg ; n/a       ; YEH_WITH_TAIL
10978jg ; n/a       ; YUDH
10979jg ; n/a       ; YUDH_HE
10980jg ; n/a       ; ZAIN
10981END
10982        }
10983    }
10984
10985
10986    if (-e 'EastAsianWidth.txt') {
10987        push @return, split /\n/, <<'END';
10988ea ; A         ; Ambiguous
10989ea ; F         ; Fullwidth
10990ea ; H         ; Halfwidth
10991ea ; N         ; Neutral
10992ea ; Na        ; Narrow
10993ea ; W         ; Wide
10994END
10995    }
10996
10997    if (-e 'LineBreak.txt' || -e 'LBsubst.txt') {
10998        my @lb = split /\n/, <<'END';
10999lb ; AI        ; Ambiguous
11000lb ; AL        ; Alphabetic
11001lb ; B2        ; Break_Both
11002lb ; BA        ; Break_After
11003lb ; BB        ; Break_Before
11004lb ; BK        ; Mandatory_Break
11005lb ; CB        ; Contingent_Break
11006lb ; CL        ; Close_Punctuation
11007lb ; CM        ; Combining_Mark
11008lb ; CR        ; Carriage_Return
11009lb ; EX        ; Exclamation
11010lb ; GL        ; Glue
11011lb ; HY        ; Hyphen
11012lb ; ID        ; Ideographic
11013lb ; IN        ; Inseperable
11014lb ; IS        ; Infix_Numeric
11015lb ; LF        ; Line_Feed
11016lb ; NS        ; Nonstarter
11017lb ; NU        ; Numeric
11018lb ; OP        ; Open_Punctuation
11019lb ; PO        ; Postfix_Numeric
11020lb ; PR        ; Prefix_Numeric
11021lb ; QU        ; Quotation
11022lb ; SA        ; Complex_Context
11023lb ; SG        ; Surrogate
11024lb ; SP        ; Space
11025lb ; SY        ; Break_Symbols
11026lb ; XX        ; Unknown
11027lb ; ZW        ; ZWSpace
11028END
11029        # If this Unicode version predates the lb property, we use our
11030        # substitute one
11031        if (-e 'LBsubst.txt') {
11032            $_ = s/^lb/_Perl_LB/r for @lb;
11033        }
11034        push @return, @lb;
11035    }
11036
11037    if (-e 'DNormalizationProps.txt') {
11038        push @return, split /\n/, <<'END';
11039qc ; M         ; Maybe
11040qc ; N         ; No
11041qc ; Y         ; Yes
11042END
11043    }
11044
11045    if (-e 'Scripts.txt') {
11046        push @return, split /\n/, <<'END';
11047sc ; Arab      ; Arabic
11048sc ; Armn      ; Armenian
11049sc ; Beng      ; Bengali
11050sc ; Bopo      ; Bopomofo
11051sc ; Cans      ; Canadian_Aboriginal
11052sc ; Cher      ; Cherokee
11053sc ; Cyrl      ; Cyrillic
11054sc ; Deva      ; Devanagari
11055sc ; Dsrt      ; Deseret
11056sc ; Ethi      ; Ethiopic
11057sc ; Geor      ; Georgian
11058sc ; Goth      ; Gothic
11059sc ; Grek      ; Greek
11060sc ; Gujr      ; Gujarati
11061sc ; Guru      ; Gurmukhi
11062sc ; Hang      ; Hangul
11063sc ; Hani      ; Han
11064sc ; Hebr      ; Hebrew
11065sc ; Hira      ; Hiragana
11066sc ; Ital      ; Old_Italic
11067sc ; Kana      ; Katakana
11068sc ; Khmr      ; Khmer
11069sc ; Knda      ; Kannada
11070sc ; Laoo      ; Lao
11071sc ; Latn      ; Latin
11072sc ; Mlym      ; Malayalam
11073sc ; Mong      ; Mongolian
11074sc ; Mymr      ; Myanmar
11075sc ; Ogam      ; Ogham
11076sc ; Orya      ; Oriya
11077sc ; Qaai      ; Inherited
11078sc ; Runr      ; Runic
11079sc ; Sinh      ; Sinhala
11080sc ; Syrc      ; Syriac
11081sc ; Taml      ; Tamil
11082sc ; Telu      ; Telugu
11083sc ; Thaa      ; Thaana
11084sc ; Thai      ; Thai
11085sc ; Tibt      ; Tibetan
11086sc ; Yiii      ; Yi
11087sc ; Zyyy      ; Common
11088END
11089    }
11090
11091    if ($v_version ge v2.0.0) {
11092        push @return, split /\n/, <<'END';
11093dt ; com       ; compat
11094dt ; nar       ; narrow
11095dt ; sml       ; small
11096dt ; vert      ; vertical
11097dt ; wide      ; wide
11098
11099gc ; Cf        ; Format
11100gc ; Cs        ; Surrogate
11101gc ; Lt        ; Titlecase_Letter
11102gc ; Me        ; Enclosing_Mark
11103gc ; Nl        ; Letter_Number
11104gc ; Pc        ; Connector_Punctuation
11105gc ; Sk        ; Modifier_Symbol
11106END
11107    }
11108    if ($v_version ge v2.1.2) {
11109        push @return, "bc ; S         ; Segment_Separator\n";
11110    }
11111    if ($v_version ge v2.1.5) {
11112        push @return, split /\n/, <<'END';
11113gc ; Pf        ; Final_Punctuation
11114gc ; Pi        ; Initial_Punctuation
11115END
11116    }
11117    if ($v_version ge v2.1.8) {
11118        push @return, "ccc; 240; IS   ; Iota_Subscript\n";
11119    }
11120
11121    if ($v_version ge v3.0.0) {
11122        push @return, split /\n/, <<'END';
11123bc ; AL        ; Arabic_Letter
11124bc ; BN        ; Boundary_Neutral
11125bc ; LRE       ; Left_To_Right_Embedding
11126bc ; LRO       ; Left_To_Right_Override
11127bc ; NSM       ; Nonspacing_Mark
11128bc ; PDF       ; Pop_Directional_Format
11129bc ; RLE       ; Right_To_Left_Embedding
11130bc ; RLO       ; Right_To_Left_Override
11131
11132ccc; 233; DB   ; Double_Below
11133END
11134    }
11135
11136    if ($v_version ge v3.1.0) {
11137        push @return, "ccc; 226; R    ; Right\n";
11138    }
11139
11140    return @return;
11141}
11142
11143sub process_NormalizationsTest {
11144
11145    # Each line looks like:
11146    #      source code point; NFC; NFD; NFKC; NFKD
11147    # e.g.
11148    #       1E0A;1E0A;0044 0307;1E0A;0044 0307;
11149
11150    my $file= shift;
11151    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11152
11153    # Process each line of the file ...
11154    while ($file->next_line) {
11155
11156        next if /^@/;
11157
11158        my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/;
11159
11160        foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) {
11161            $$var = pack "U0U*", map { hex } split " ", $$var;
11162            $$var =~ s/(\\)/$1$1/g;
11163        }
11164
11165        push @normalization_tests,
11166                "Test_N(q$c1, q$c2, q$c3, q$c4, q$c5);\n";
11167    } # End of looping through the file
11168}
11169
11170sub output_perl_charnames_line ($$) {
11171
11172    # Output the entries in Perl_charnames specially, using 5 digits instead
11173    # of four.  This makes the entries a constant length, and simplifies
11174    # charnames.pm which this table is for.  Unicode can have 6 digit
11175    # ordinals, but they are all private use or noncharacters which do not
11176    # have names, so won't be in this table.
11177
11178    return sprintf "%05X\t%s\n", $_[0], $_[1];
11179}
11180
11181{ # Closure
11182
11183    # These are constants to the $property_info hash in this subroutine, to
11184    # avoid using a quoted-string which might have a typo.
11185    my $TYPE  = 'type';
11186    my $DEFAULT_MAP = 'default_map';
11187    my $DEFAULT_TABLE = 'default_table';
11188    my $PSEUDO_MAP_TYPE = 'pseudo_map_type';
11189    my $MISSINGS = 'missings';
11190
11191    sub process_generic_property_file {
11192        # This processes a file containing property mappings and puts them
11193        # into internal map tables.  It should be used to handle any property
11194        # files that have mappings from a code point or range thereof to
11195        # something else.  This means almost all the UCD .txt files.
11196        # each_line_handlers() should be set to adjust the lines of these
11197        # files, if necessary, to what this routine understands:
11198        #
11199        # 0374          ; NFD_QC; N
11200        # 003C..003E    ; Math
11201        #
11202        # the fields are: "codepoint-range ; property; map"
11203        #
11204        # meaning the codepoints in the range all have the value 'map' under
11205        # 'property'.
11206        # Beginning and trailing white space in each field are not significant.
11207        # Note there is not a trailing semi-colon in the above.  A trailing
11208        # semi-colon means the map is a null-string.  An omitted map, as
11209        # opposed to a null-string, is assumed to be 'Y', based on Unicode
11210        # table syntax.  (This could have been hidden from this routine by
11211        # doing it in the $file object, but that would require parsing of the
11212        # line there, so would have to parse it twice, or change the interface
11213        # to pass this an array.  So not done.)
11214        #
11215        # The map field may begin with a sequence of commands that apply to
11216        # this range.  Each such command begins and ends with $CMD_DELIM.
11217        # These are used to indicate, for example, that the mapping for a
11218        # range has a non-default type.
11219        #
11220        # This loops through the file, calling its next_line() method, and
11221        # then taking the map and adding it to the property's table.
11222        # Complications arise because any number of properties can be in the
11223        # file, in any order, interspersed in any way.  The first time a
11224        # property is seen, it gets information about that property and
11225        # caches it for quick retrieval later.  It also normalizes the maps
11226        # so that only one of many synonyms is stored.  The Unicode input
11227        # files do use some multiple synonyms.
11228
11229        my $file = shift;
11230        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11231
11232        my %property_info;               # To keep track of what properties
11233                                         # have already had entries in the
11234                                         # current file, and info about each,
11235                                         # so don't have to recompute.
11236        my $property_name;               # property currently being worked on
11237        my $property_type;               # and its type
11238        my $previous_property_name = ""; # name from last time through loop
11239        my $property_object;             # pointer to the current property's
11240                                         # object
11241        my $property_addr;               # the address of that object
11242        my $default_map;                 # the string that code points missing
11243                                         # from the file map to
11244        my $default_table;               # For non-string properties, a
11245                                         # reference to the match table that
11246                                         # will contain the list of code
11247                                         # points that map to $default_map.
11248
11249        # Get the next real non-comment line
11250        LINE:
11251        while ($file->next_line) {
11252
11253            # Default replacement type; means that if parts of the range have
11254            # already been stored in our tables, the new map overrides them if
11255            # they differ more than cosmetically
11256            my $replace = $IF_NOT_EQUIVALENT;
11257            my $map_type;            # Default type for the map of this range
11258
11259            #local $to_trace = 1 if main::DEBUG;
11260            trace $_ if main::DEBUG && $to_trace;
11261
11262            # Split the line into components
11263            my ($range, $property_name, $map, @remainder)
11264                = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
11265
11266            # If more or less on the line than we are expecting, warn and skip
11267            # the line
11268            if (@remainder) {
11269                $file->carp_bad_line('Extra fields');
11270                next LINE;
11271            }
11272            elsif ( ! defined $property_name) {
11273                $file->carp_bad_line('Missing property');
11274                next LINE;
11275            }
11276
11277            # Examine the range.
11278            if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
11279            {
11280                $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
11281                next LINE;
11282            }
11283            my $low = hex $1;
11284            my $high = (defined $2) ? hex $2 : $low;
11285
11286            # If changing to a new property, get the things constant per
11287            # property
11288            if ($previous_property_name ne $property_name) {
11289
11290                $property_object = property_ref($property_name);
11291                if (! defined $property_object) {
11292                    $file->carp_bad_line("Unexpected property '$property_name'.  Skipped");
11293                    next LINE;
11294                }
11295                { no overloading; $property_addr = pack 'J', $property_object; }
11296
11297                # Defer changing names until have a line that is acceptable
11298                # (the 'next' statement above means is unacceptable)
11299                $previous_property_name = $property_name;
11300
11301                # If not the first time for this property, retrieve info about
11302                # it from the cache
11303                if (defined ($property_info{$property_addr}{$TYPE})) {
11304                    $property_type = $property_info{$property_addr}{$TYPE};
11305                    $default_map = $property_info{$property_addr}{$DEFAULT_MAP};
11306                    $map_type
11307                        = $property_info{$property_addr}{$PSEUDO_MAP_TYPE};
11308                    $default_table
11309                            = $property_info{$property_addr}{$DEFAULT_TABLE};
11310                }
11311                else {
11312
11313                    # Here, is the first time for this property.  Set up the
11314                    # cache.
11315                    $property_type = $property_info{$property_addr}{$TYPE}
11316                                   = $property_object->type;
11317                    $map_type
11318                        = $property_info{$property_addr}{$PSEUDO_MAP_TYPE}
11319                        = $property_object->pseudo_map_type;
11320
11321                    # The Unicode files are set up so that if the map is not
11322                    # defined, it is a binary property
11323                    if (! defined $map && $property_type != $BINARY) {
11324                        if ($property_type != $UNKNOWN
11325                            && $property_type != $NON_STRING)
11326                        {
11327                            $file->carp_bad_line("No mapping defined on a non-binary property.  Using 'Y' for the map");
11328                        }
11329                        else {
11330                            $property_object->set_type($BINARY);
11331                            $property_type
11332                                = $property_info{$property_addr}{$TYPE}
11333                                = $BINARY;
11334                        }
11335                    }
11336
11337                    # Get any @missings default for this property.  This
11338                    # should precede the first entry for the property in the
11339                    # input file, and is located in a comment that has been
11340                    # stored by the Input_file class until we access it here.
11341                    # It's possible that there is more than one such line
11342                    # waiting for us; collect them all, and parse
11343                    my @missings_list = $file->get_missings
11344                                            if $file->has_missings_defaults;
11345                    foreach my $default_ref (@missings_list) {
11346                        my $default = $default_ref->[0];
11347                        my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
11348
11349                        # For string properties, the default is just what the
11350                        # file says, but non-string properties should already
11351                        # have set up a table for the default property value;
11352                        # use the table for these, so can resolve synonyms
11353                        # later to a single standard one.
11354                        if ($property_type == $STRING
11355                            || $property_type == $UNKNOWN)
11356                        {
11357                            $property_info{$addr}{$MISSINGS} = $default;
11358                        }
11359                        else {
11360                            $property_info{$addr}{$MISSINGS}
11361                                        = $property_object->table($default);
11362                        }
11363                    }
11364
11365                    # Finished storing all the @missings defaults in the input
11366                    # file so far.  Get the one for the current property.
11367                    my $missings = $property_info{$property_addr}{$MISSINGS};
11368
11369                    # But we likely have separately stored what the default
11370                    # should be.  (This is to accommodate versions of the
11371                    # standard where the @missings lines are absent or
11372                    # incomplete.)  Hopefully the two will match.  But check
11373                    # it out.
11374                    $default_map = $property_object->default_map;
11375
11376                    # If the map is a ref, it means that the default won't be
11377                    # processed until later, so undef it, so next few lines
11378                    # will redefine it to something that nothing will match
11379                    undef $default_map if ref $default_map;
11380
11381                    # Create a $default_map if don't have one; maybe a dummy
11382                    # that won't match anything.
11383                    if (! defined $default_map) {
11384
11385                        # Use any @missings line in the file.
11386                        if (defined $missings) {
11387                            if (ref $missings) {
11388                                $default_map = $missings->full_name;
11389                                $default_table = $missings;
11390                            }
11391                            else {
11392                                $default_map = $missings;
11393                            }
11394
11395                            # And store it with the property for outside use.
11396                            $property_object->set_default_map($default_map);
11397                        }
11398                        else {
11399
11400                            # Neither an @missings nor a default map.  Create
11401                            # a dummy one, so won't have to test definedness
11402                            # in the main loop.
11403                            $default_map = '_Perl This will never be in a file
11404                                            from Unicode';
11405                        }
11406                    }
11407
11408                    # Here, we have $default_map defined, possibly in terms of
11409                    # $missings, but maybe not, and possibly is a dummy one.
11410                    if (defined $missings) {
11411
11412                        # Make sure there is no conflict between the two.
11413                        # $missings has priority.
11414                        if (ref $missings) {
11415                            $default_table
11416                                        = $property_object->table($default_map);
11417                            if (! defined $default_table
11418                                || $default_table != $missings)
11419                            {
11420                                if (! defined $default_table) {
11421                                    $default_table = $UNDEF;
11422                                }
11423                                $file->carp_bad_line(<<END
11424The \@missings line for $property_name in $file says that missings default to
11425$missings, but we expect it to be $default_table.  $missings used.
11426END
11427                                );
11428                                $default_table = $missings;
11429                                $default_map = $missings->full_name;
11430                            }
11431                            $property_info{$property_addr}{$DEFAULT_TABLE}
11432                                                        = $default_table;
11433                        }
11434                        elsif ($default_map ne $missings) {
11435                            $file->carp_bad_line(<<END
11436The \@missings line for $property_name in $file says that missings default to
11437$missings, but we expect it to be $default_map.  $missings used.
11438END
11439                            );
11440                            $default_map = $missings;
11441                        }
11442                    }
11443
11444                    $property_info{$property_addr}{$DEFAULT_MAP}
11445                                                    = $default_map;
11446
11447                    # If haven't done so already, find the table corresponding
11448                    # to this map for non-string properties.
11449                    if (! defined $default_table
11450                        && $property_type != $STRING
11451                        && $property_type != $UNKNOWN)
11452                    {
11453                        $default_table = $property_info{$property_addr}
11454                                                        {$DEFAULT_TABLE}
11455                                    = $property_object->table($default_map);
11456                    }
11457                } # End of is first time for this property
11458            } # End of switching properties.
11459
11460            # Ready to process the line.
11461            # The Unicode files are set up so that if the map is not defined,
11462            # it is a binary property with value 'Y'
11463            if (! defined $map) {
11464                $map = 'Y';
11465            }
11466            else {
11467
11468                # If the map begins with a special command to us (enclosed in
11469                # delimiters), extract the command(s).
11470                while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
11471                    my $command = $1;
11472                    if ($command =~  / ^ $REPLACE_CMD= (.*) /x) {
11473                        $replace = $1;
11474                    }
11475                    elsif ($command =~  / ^ $MAP_TYPE_CMD= (.*) /x) {
11476                        $map_type = $1;
11477                    }
11478                    else {
11479                        $file->carp_bad_line("Unknown command line: '$1'");
11480                        next LINE;
11481                    }
11482                }
11483            }
11484
11485            if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
11486            {
11487
11488                # Here, we have a map to a particular code point, and the
11489                # default map is to a code point itself.  If the range
11490                # includes the particular code point, change that portion of
11491                # the range to the default.  This makes sure that in the final
11492                # table only the non-defaults are listed.
11493                my $decimal_map = hex $map;
11494                if ($low <= $decimal_map && $decimal_map <= $high) {
11495
11496                    # If the range includes stuff before or after the map
11497                    # we're changing, split it and process the split-off parts
11498                    # later.
11499                    if ($low < $decimal_map) {
11500                        $file->insert_adjusted_lines(
11501                                            sprintf("%04X..%04X; %s; %s",
11502                                                    $low,
11503                                                    $decimal_map - 1,
11504                                                    $property_name,
11505                                                    $map));
11506                    }
11507                    if ($high > $decimal_map) {
11508                        $file->insert_adjusted_lines(
11509                                            sprintf("%04X..%04X; %s; %s",
11510                                                    $decimal_map + 1,
11511                                                    $high,
11512                                                    $property_name,
11513                                                    $map));
11514                    }
11515                    $low = $high = $decimal_map;
11516                    $map = $CODE_POINT;
11517                }
11518            }
11519
11520            # If we can tell that this is a synonym for the default map, use
11521            # the default one instead.
11522            if ($property_type != $STRING
11523                && $property_type != $UNKNOWN)
11524            {
11525                my $table = $property_object->table($map);
11526                if (defined $table && $table == $default_table) {
11527                    $map = $default_map;
11528                }
11529            }
11530
11531            # And figure out the map type if not known.
11532            if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
11533                if ($map eq "") {   # Nulls are always $NULL map type
11534                    $map_type = $NULL;
11535                } # Otherwise, non-strings, and those that don't allow
11536                  # $MULTI_CP, and those that aren't multiple code points are
11537                  # 0
11538                elsif
11539                   (($property_type != $STRING && $property_type != $UNKNOWN)
11540                   || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
11541                   || $map !~ /^ $code_point_re ( \  $code_point_re )+ $ /x)
11542                {
11543                    $map_type = 0;
11544                }
11545                else {
11546                    $map_type = $MULTI_CP;
11547                }
11548            }
11549
11550            $property_object->add_map($low, $high,
11551                                        $map,
11552                                        Type => $map_type,
11553                                        Replace => $replace);
11554        } # End of loop through file's lines
11555
11556        return;
11557    }
11558}
11559
11560{ # Closure for UnicodeData.txt handling
11561
11562    # This file was the first one in the UCD; its design leads to some
11563    # awkwardness in processing.  Here is a sample line:
11564    # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
11565    # The fields in order are:
11566    my $i = 0;            # The code point is in field 0, and is shifted off.
11567    my $CHARNAME = $i++;  # character name (e.g. "LATIN CAPITAL LETTER A")
11568    my $CATEGORY = $i++;  # category (e.g. "Lu")
11569    my $CCC = $i++;       # Canonical combining class (e.g. "230")
11570    my $BIDI = $i++;      # directional class (e.g. "L")
11571    my $PERL_DECOMPOSITION = $i++;  # decomposition mapping
11572    my $PERL_DECIMAL_DIGIT = $i++;   # decimal digit value
11573    my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
11574                                         # Dual-use in this program; see below
11575    my $NUMERIC = $i++;   # numeric value
11576    my $MIRRORED = $i++;  # ? mirrored
11577    my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
11578    my $COMMENT = $i++;   # iso comment
11579    my $UPPER = $i++;     # simple uppercase mapping
11580    my $LOWER = $i++;     # simple lowercase mapping
11581    my $TITLE = $i++;     # simple titlecase mapping
11582    my $input_field_count = $i;
11583
11584    # This routine in addition outputs these extra fields:
11585
11586    my $DECOMP_TYPE = $i++; # Decomposition type
11587
11588    # These fields are modifications of ones above, and are usually
11589    # suppressed; they must come last, as for speed, the loop upper bound is
11590    # normally set to ignore them
11591    my $NAME = $i++;        # This is the strict name field, not the one that
11592                            # charnames uses.
11593    my $DECOMP_MAP = $i++;  # Strict decomposition mapping; not the one used
11594                            # by Unicode::Normalize
11595    my $last_field = $i - 1;
11596
11597    # All these are read into an array for each line, with the indices defined
11598    # above.  The empty fields in the example line above indicate that the
11599    # value is defaulted.  The handler called for each line of the input
11600    # changes these to their defaults.
11601
11602    # Here are the official names of the properties, in a parallel array:
11603    my @field_names;
11604    $field_names[$BIDI] = 'Bidi_Class';
11605    $field_names[$CATEGORY] = 'General_Category';
11606    $field_names[$CCC] = 'Canonical_Combining_Class';
11607    $field_names[$CHARNAME] = 'Perl_Charnames';
11608    $field_names[$COMMENT] = 'ISO_Comment';
11609    $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
11610    $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
11611    $field_names[$LOWER] = 'Lowercase_Mapping';
11612    $field_names[$MIRRORED] = 'Bidi_Mirrored';
11613    $field_names[$NAME] = 'Name';
11614    $field_names[$NUMERIC] = 'Numeric_Value';
11615    $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
11616    $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
11617    $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
11618    $field_names[$TITLE] = 'Titlecase_Mapping';
11619    $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
11620    $field_names[$UPPER] = 'Uppercase_Mapping';
11621
11622    # Some of these need a little more explanation:
11623    # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
11624    #   property, but is used in calculating the Numeric_Type.  Perl however,
11625    #   creates a file from this field, so a Perl property is created from it.
11626    # Similarly, the Other_Digit field is used only for calculating the
11627    #   Numeric_Type, and so it can be safely re-used as the place to store
11628    #   the value for Numeric_Type; hence it is referred to as
11629    #   $NUMERIC_TYPE_OTHER_DIGIT.
11630    # The input field named $PERL_DECOMPOSITION is a combination of both the
11631    #   decomposition mapping and its type.  Perl creates a file containing
11632    #   exactly this field, so it is used for that.  The two properties are
11633    #   separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
11634    #   $DECOMP_MAP is usually suppressed (unless the lists are changed to
11635    #   output it), as Perl doesn't use it directly.
11636    # The input field named here $CHARNAME is used to construct the
11637    #   Perl_Charnames property, which is a combination of the Name property
11638    #   (which the input field contains), and the Unicode_1_Name property, and
11639    #   others from other files.  Since, the strict Name property is not used
11640    #   by Perl, this field is used for the table that Perl does use.  The
11641    #   strict Name property table is usually suppressed (unless the lists are
11642    #   changed to output it), so it is accumulated in a separate field,
11643    #   $NAME, which to save time is discarded unless the table is actually to
11644    #   be output
11645
11646    # This file is processed like most in this program.  Control is passed to
11647    # process_generic_property_file() which calls filter_UnicodeData_line()
11648    # for each input line.  This filter converts the input into line(s) that
11649    # process_generic_property_file() understands.  There is also a setup
11650    # routine called before any of the file is processed, and a handler for
11651    # EOF processing, all in this closure.
11652
11653    # A huge speed-up occurred at the cost of some added complexity when these
11654    # routines were altered to buffer the outputs into ranges.  Almost all the
11655    # lines of the input file apply to just one code point, and for most
11656    # properties, the map for the next code point up is the same as the
11657    # current one.  So instead of creating a line for each property for each
11658    # input line, filter_UnicodeData_line() remembers what the previous map
11659    # of a property was, and doesn't generate a line to pass on until it has
11660    # to, as when the map changes; and that passed-on line encompasses the
11661    # whole contiguous range of code points that have the same map for that
11662    # property.  This means a slight amount of extra setup, and having to
11663    # flush these buffers on EOF, testing if the maps have changed, plus
11664    # remembering state information in the closure.  But it means a lot less
11665    # real time in not having to change the data base for each property on
11666    # each line.
11667
11668    # Another complication is that there are already a few ranges designated
11669    # in the input.  There are two lines for each, with the same maps except
11670    # the code point and name on each line.  This was actually the hardest
11671    # thing to design around.  The code points in those ranges may actually
11672    # have real maps not given by these two lines.  These maps will either
11673    # be algorithmically determinable, or be in the extracted files furnished
11674    # with the UCD.  In the event of conflicts between these extracted files,
11675    # and this one, Unicode says that this one prevails.  But it shouldn't
11676    # prevail for conflicts that occur in these ranges.  The data from the
11677    # extracted files prevails in those cases.  So, this program is structured
11678    # so that those files are processed first, storing maps.  Then the other
11679    # files are processed, generally overwriting what the extracted files
11680    # stored.  But just the range lines in this input file are processed
11681    # without overwriting.  This is accomplished by adding a special string to
11682    # the lines output to tell process_generic_property_file() to turn off the
11683    # overwriting for just this one line.
11684    # A similar mechanism is used to tell it that the map is of a non-default
11685    # type.
11686
11687    sub setup_UnicodeData { # Called before any lines of the input are read
11688        my $file = shift;
11689        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11690
11691        # Create a new property specially located that is a combination of
11692        # various Name properties: Name, Unicode_1_Name, Named Sequences, and
11693        # _Perl_Name_Alias properties.  (The final one duplicates elements of the
11694        # first, and starting in v6.1, is the same as the 'Name_Alias
11695        # property.)  A comment for the new property will later be constructed
11696        # based on the actual properties present and used
11697        $perl_charname = Property->new('Perl_Charnames',
11698                       Default_Map => "",
11699                       Directory => File::Spec->curdir(),
11700                       File => 'Name',
11701                       Fate => $INTERNAL_ONLY,
11702                       Perl_Extension => 1,
11703                       Range_Size_1 => \&output_perl_charnames_line,
11704                       Type => $STRING,
11705                       );
11706        $perl_charname->set_proxy_for('Name');
11707
11708        my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
11709                                        Directory => File::Spec->curdir(),
11710                                        File => 'Decomposition',
11711                                        Format => $DECOMP_STRING_FORMAT,
11712                                        Fate => $INTERNAL_ONLY,
11713                                        Perl_Extension => 1,
11714                                        Default_Map => $CODE_POINT,
11715
11716                                        # normalize.pm can't cope with these
11717                                        Output_Range_Counts => 0,
11718
11719                                        # This is a specially formatted table
11720                                        # explicitly for normalize.pm, which
11721                                        # is expecting a particular format,
11722                                        # which means that mappings containing
11723                                        # multiple code points are in the main
11724                                        # body of the table
11725                                        Map_Type => $COMPUTE_NO_MULTI_CP,
11726                                        Type => $STRING,
11727                                        To_Output_Map => $INTERNAL_MAP,
11728                                        );
11729        $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
11730        $Perl_decomp->add_comment(join_lines(<<END
11731This mapping is a combination of the Unicode 'Decomposition_Type' and
11732'Decomposition_Mapping' properties, formatted for use by normalize.pm.  It is
11733identical to the official Unicode 'Decomposition_Mapping' property except for
11734two things:
11735 1) It omits the algorithmically determinable Hangul syllable decompositions,
11736which normalize.pm handles algorithmically.
11737 2) It contains the decomposition type as well.  Non-canonical decompositions
11738begin with a word in angle brackets, like <super>, which denotes the
11739compatible decomposition type.  If the map does not begin with the <angle
11740brackets>, the decomposition is canonical.
11741END
11742        ));
11743
11744        my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
11745                                        Default_Map => "",
11746                                        Perl_Extension => 1,
11747                                        Directory => $map_directory,
11748                                        Type => $STRING,
11749                                        To_Output_Map => $OUTPUT_ADJUSTED,
11750                                        );
11751        $Decimal_Digit->add_comment(join_lines(<<END
11752This file gives the mapping of all code points which represent a single
11753decimal digit [0-9] to their respective digits, but it has ranges of 10 code
11754points, and the mapping of each non-initial element of each range is actually
11755not to "0", but to the offset that element has from its corresponding DIGIT 0.
11756These code points are those that have Numeric_Type=Decimal; not special
11757things, like subscripts nor Roman numerals.
11758END
11759        ));
11760
11761        # These properties are not used for generating anything else, and are
11762        # usually not output.  By making them last in the list, we can just
11763        # change the high end of the loop downwards to avoid the work of
11764        # generating a table(s) that is/are just going to get thrown away.
11765        if (! property_ref('Decomposition_Mapping')->to_output_map
11766            && ! property_ref('Name')->to_output_map)
11767        {
11768            $last_field = min($NAME, $DECOMP_MAP) - 1;
11769        } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
11770            $last_field = $DECOMP_MAP;
11771        } elsif (property_ref('Name')->to_output_map) {
11772            $last_field = $NAME;
11773        }
11774        return;
11775    }
11776
11777    my $first_time = 1;                 # ? Is this the first line of the file
11778    my $in_range = 0;                   # ? Are we in one of the file's ranges
11779    my $previous_cp;                    # hex code point of previous line
11780    my $decimal_previous_cp = -1;       # And its decimal equivalent
11781    my @start;                          # For each field, the current starting
11782                                        # code point in hex for the range
11783                                        # being accumulated.
11784    my @fields;                         # The input fields;
11785    my @previous_fields;                # And those from the previous call
11786
11787    sub filter_UnicodeData_line {
11788        # Handle a single input line from UnicodeData.txt; see comments above
11789        # Conceptually this takes a single line from the file containing N
11790        # properties, and converts it into N lines with one property per line,
11791        # which is what the final handler expects.  But there are
11792        # complications due to the quirkiness of the input file, and to save
11793        # time, it accumulates ranges where the property values don't change
11794        # and only emits lines when necessary.  This is about an order of
11795        # magnitude fewer lines emitted.
11796
11797        my $file = shift;
11798        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11799
11800        # $_ contains the input line.
11801        # -1 in split means retain trailing null fields
11802        (my $cp, @fields) = split /\s*;\s*/, $_, -1;
11803
11804        #local $to_trace = 1 if main::DEBUG;
11805        trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
11806        if (@fields > $input_field_count) {
11807            $file->carp_bad_line('Extra fields');
11808            $_ = "";
11809            return;
11810        }
11811
11812        my $decimal_cp = hex $cp;
11813
11814        # We have to output all the buffered ranges when the next code point
11815        # is not exactly one after the previous one, which means there is a
11816        # gap in the ranges.
11817        my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
11818
11819        # The decomposition mapping field requires special handling.  It looks
11820        # like either:
11821        #
11822        # <compat> 0032 0020
11823        # 0041 0300
11824        #
11825        # The decomposition type is enclosed in <brackets>; if missing, it
11826        # means the type is canonical.  There are two decomposition mapping
11827        # tables: the one for use by Perl's normalize.pm has a special format
11828        # which is this field intact; the other, for general use is of
11829        # standard format.  In either case we have to find the decomposition
11830        # type.  Empty fields have None as their type, and map to the code
11831        # point itself
11832        if ($fields[$PERL_DECOMPOSITION] eq "") {
11833            $fields[$DECOMP_TYPE] = 'None';
11834            $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
11835        }
11836        else {
11837            ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
11838                                            =~ / < ( .+? ) > \s* ( .+ ) /x;
11839            if (! defined $fields[$DECOMP_TYPE]) {
11840                $fields[$DECOMP_TYPE] = 'Canonical';
11841                $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
11842            }
11843            else {
11844                $fields[$DECOMP_MAP] = $map;
11845            }
11846        }
11847
11848        # The 3 numeric fields also require special handling.  The 2 digit
11849        # fields must be either empty or match the number field.  This means
11850        # that if it is empty, they must be as well, and the numeric type is
11851        # None, and the numeric value is 'Nan'.
11852        # The decimal digit field must be empty or match the other digit
11853        # field.  If the decimal digit field is non-empty, the code point is
11854        # a decimal digit, and the other two fields will have the same value.
11855        # If it is empty, but the other digit field is non-empty, the code
11856        # point is an 'other digit', and the number field will have the same
11857        # value as the other digit field.  If the other digit field is empty,
11858        # but the number field is non-empty, the code point is a generic
11859        # numeric type.
11860        if ($fields[$NUMERIC] eq "") {
11861            if ($fields[$PERL_DECIMAL_DIGIT] ne ""
11862                || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
11863            ) {
11864                $file->carp_bad_line("Numeric values inconsistent.  Trying to process anyway");
11865            }
11866            $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
11867            $fields[$NUMERIC] = 'NaN';
11868        }
11869        else {
11870            $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number.  Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
11871            if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
11872                $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
11873                $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'.  Processing as Decimal") if $fields[$CATEGORY] ne "Nd";
11874                $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
11875            }
11876            elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
11877                $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
11878                $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
11879            }
11880            else {
11881                $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
11882
11883                # Rationals require extra effort.
11884                if ($fields[$NUMERIC] =~ qr{/}) {
11885                    reduce_fraction(\$fields[$NUMERIC]);
11886                    register_fraction($fields[$NUMERIC])
11887                }
11888            }
11889        }
11890
11891        # For the properties that have empty fields in the file, and which
11892        # mean something different from empty, change them to that default.
11893        # Certain fields just haven't been empty so far in any Unicode
11894        # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
11895        # $CATEGORY.  This leaves just the two fields, and so we hard-code in
11896        # the defaults; which are very unlikely to ever change.
11897        $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
11898        $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
11899
11900        # UAX44 says that if title is empty, it is the same as whatever upper
11901        # is,
11902        $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
11903
11904        # There are a few pairs of lines like:
11905        #   AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
11906        #   D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
11907        # that define ranges.  These should be processed after the fields are
11908        # adjusted above, as they may override some of them; but mostly what
11909        # is left is to possibly adjust the $CHARNAME field.  The names of all the
11910        # paired lines start with a '<', but this is also true of '<control>,
11911        # which isn't one of these special ones.
11912        if ($fields[$CHARNAME] eq '<control>') {
11913
11914            # Some code points in this file have the pseudo-name
11915            # '<control>', but the official name for such ones is the null
11916            # string.
11917            $fields[$NAME] = $fields[$CHARNAME] = "";
11918
11919            # We had better not be in between range lines.
11920            if ($in_range) {
11921                $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
11922                $in_range = 0;
11923            }
11924        }
11925        elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
11926
11927            # Here is a non-range line.  We had better not be in between range
11928            # lines.
11929            if ($in_range) {
11930                $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
11931                $in_range = 0;
11932            }
11933            if ($fields[$CHARNAME] =~ s/- $cp $//x) {
11934
11935                # These are code points whose names end in their code points,
11936                # which means the names are algorithmically derivable from the
11937                # code points.  To shorten the output Name file, the algorithm
11938                # for deriving these is placed in the file instead of each
11939                # code point, so they have map type $CP_IN_NAME
11940                $fields[$CHARNAME] = $CMD_DELIM
11941                                 . $MAP_TYPE_CMD
11942                                 . '='
11943                                 . $CP_IN_NAME
11944                                 . $CMD_DELIM
11945                                 . $fields[$CHARNAME];
11946            }
11947            $fields[$NAME] = $fields[$CHARNAME];
11948        }
11949        elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
11950            $fields[$CHARNAME] = $fields[$NAME] = $1;
11951
11952            # Here we are at the beginning of a range pair.
11953            if ($in_range) {
11954                $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'.  Trying anyway");
11955            }
11956            $in_range = 1;
11957
11958            # Because the properties in the range do not overwrite any already
11959            # in the db, we must flush the buffers of what's already there, so
11960            # they get handled in the normal scheme.
11961            $force_output = 1;
11962
11963        }
11964        elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
11965            $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME].  Ignoring this line.");
11966            $_ = "";
11967            return;
11968        }
11969        else { # Here, we are at the last line of a range pair.
11970
11971            if (! $in_range) {
11972                $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one.  Ignoring this line.");
11973                $_ = "";
11974                return;
11975            }
11976            $in_range = 0;
11977
11978            $fields[$NAME] = $fields[$CHARNAME];
11979
11980            # Check that the input is valid: that the closing of the range is
11981            # the same as the beginning.
11982            foreach my $i (0 .. $last_field) {
11983                next if $fields[$i] eq $previous_fields[$i];
11984                $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'.  Bad News.  Trying anyway");
11985            }
11986
11987            # The processing differs depending on the type of range,
11988            # determined by its $CHARNAME
11989            if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
11990
11991                # Check that the data looks right.
11992                if ($decimal_previous_cp != $SBase) {
11993                    $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp.  Bad News.  Results will be wrong");
11994                }
11995                if ($decimal_cp != $SBase + $SCount - 1) {
11996                    $file->carp_bad_line("Unexpected Hangul syllable end = $cp.  Bad News.  Results will be wrong");
11997                }
11998
11999                # The Hangul syllable range has a somewhat complicated name
12000                # generation algorithm.  Each code point in it has a canonical
12001                # decomposition also computable by an algorithm.  The
12002                # perl decomposition map table built from these is used only
12003                # by normalize.pm, which has the algorithm built in it, so the
12004                # decomposition maps are not needed, and are large, so are
12005                # omitted from it.  If the full decomposition map table is to
12006                # be output, the decompositions are generated for it, in the
12007                # EOF handling code for this input file.
12008
12009                $previous_fields[$DECOMP_TYPE] = 'Canonical';
12010
12011                # This range is stored in our internal structure with its
12012                # own map type, different from all others.
12013                $previous_fields[$CHARNAME] = $previous_fields[$NAME]
12014                                        = $CMD_DELIM
12015                                          . $MAP_TYPE_CMD
12016                                          . '='
12017                                          . $HANGUL_SYLLABLE
12018                                          . $CMD_DELIM
12019                                          . $fields[$CHARNAME];
12020            }
12021            elsif ($fields[$CATEGORY] eq 'Lo') {    # Is a letter
12022
12023                # All the CJK ranges like this have the name given as a
12024                # special case in the next code line.  And for the others, we
12025                # hope that Unicode continues to use the correct name in
12026                # future releases, so we don't have to make further special
12027                # cases.
12028                my $name = ($fields[$CHARNAME] =~ /^CJK/)
12029                           ? 'CJK UNIFIED IDEOGRAPH'
12030                           : uc $fields[$CHARNAME];
12031
12032                # The name for these contains the code point itself, and all
12033                # are defined to have the same base name, regardless of what
12034                # is in the file.  They are stored in our internal structure
12035                # with a map type of $CP_IN_NAME
12036                $previous_fields[$CHARNAME] = $previous_fields[$NAME]
12037                                        = $CMD_DELIM
12038                                           . $MAP_TYPE_CMD
12039                                           . '='
12040                                           . $CP_IN_NAME
12041                                           . $CMD_DELIM
12042                                           . $name;
12043
12044            }
12045            elsif ($fields[$CATEGORY] eq 'Co'
12046                     || $fields[$CATEGORY] eq 'Cs')
12047            {
12048                # The names of all the code points in these ranges are set to
12049                # null, as there are no names for the private use and
12050                # surrogate code points.
12051
12052                $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
12053            }
12054            else {
12055                $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY].  Attempting to process it.");
12056            }
12057
12058            # The first line of the range caused everything else to be output,
12059            # and then its values were stored as the beginning values for the
12060            # next set of ranges, which this one ends.  Now, for each value,
12061            # add a command to tell the handler that these values should not
12062            # replace any existing ones in our database.
12063            foreach my $i (0 .. $last_field) {
12064                $previous_fields[$i] = $CMD_DELIM
12065                                        . $REPLACE_CMD
12066                                        . '='
12067                                        . $NO
12068                                        . $CMD_DELIM
12069                                        . $previous_fields[$i];
12070            }
12071
12072            # And change things so it looks like the entire range has been
12073            # gone through with this being the final part of it.  Adding the
12074            # command above to each field will cause this range to be flushed
12075            # during the next iteration, as it guaranteed that the stored
12076            # field won't match whatever value the next one has.
12077            $previous_cp = $cp;
12078            $decimal_previous_cp = $decimal_cp;
12079
12080            # We are now set up for the next iteration; so skip the remaining
12081            # code in this subroutine that does the same thing, but doesn't
12082            # know about these ranges.
12083            $_ = "";
12084
12085            return;
12086        }
12087
12088        # On the very first line, we fake it so the code below thinks there is
12089        # nothing to output, and initialize so that when it does get output it
12090        # uses the first line's values for the lowest part of the range.
12091        # (One could avoid this by using peek(), but then one would need to
12092        # know the adjustments done above and do the same ones in the setup
12093        # routine; not worth it)
12094        if ($first_time) {
12095            $first_time = 0;
12096            @previous_fields = @fields;
12097            @start = ($cp) x scalar @fields;
12098            $decimal_previous_cp = $decimal_cp - 1;
12099        }
12100
12101        # For each field, output the stored up ranges that this code point
12102        # doesn't fit in.  Earlier we figured out if all ranges should be
12103        # terminated because of changing the replace or map type styles, or if
12104        # there is a gap between this new code point and the previous one, and
12105        # that is stored in $force_output.  But even if those aren't true, we
12106        # need to output the range if this new code point's value for the
12107        # given property doesn't match the stored range's.
12108        #local $to_trace = 1 if main::DEBUG;
12109        foreach my $i (0 .. $last_field) {
12110            my $field = $fields[$i];
12111            if ($force_output || $field ne $previous_fields[$i]) {
12112
12113                # Flush the buffer of stored values.
12114                $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
12115
12116                # Start a new range with this code point and its value
12117                $start[$i] = $cp;
12118                $previous_fields[$i] = $field;
12119            }
12120        }
12121
12122        # Set the values for the next time.
12123        $previous_cp = $cp;
12124        $decimal_previous_cp = $decimal_cp;
12125
12126        # The input line has generated whatever adjusted lines are needed, and
12127        # should not be looked at further.
12128        $_ = "";
12129        return;
12130    }
12131
12132    sub EOF_UnicodeData {
12133        # Called upon EOF to flush the buffers, and create the Hangul
12134        # decomposition mappings if needed.
12135
12136        my $file = shift;
12137        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12138
12139        # Flush the buffers.
12140        foreach my $i (0 .. $last_field) {
12141            $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
12142        }
12143
12144        if (-e 'Jamo.txt') {
12145
12146            # The algorithm is published by Unicode, based on values in
12147            # Jamo.txt, (which should have been processed before this
12148            # subroutine), and the results left in %Jamo
12149            unless (%Jamo) {
12150                Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt.  Hangul syllables not generated.");
12151                return;
12152            }
12153
12154            # If the full decomposition map table is being output, insert
12155            # into it the Hangul syllable mappings.  This is to avoid having
12156            # to publish a subroutine in it to compute them.  (which would
12157            # essentially be this code.)  This uses the algorithm published by
12158            # Unicode.  (No hangul syllables in version 1)
12159            if ($v_version ge v2.0.0
12160                && property_ref('Decomposition_Mapping')->to_output_map) {
12161                for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
12162                    use integer;
12163                    my $SIndex = $S - $SBase;
12164                    my $L = $LBase + $SIndex / $NCount;
12165                    my $V = $VBase + ($SIndex % $NCount) / $TCount;
12166                    my $T = $TBase + $SIndex % $TCount;
12167
12168                    trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
12169                    my $decomposition = sprintf("%04X %04X", $L, $V);
12170                    $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
12171                    $file->insert_adjusted_lines(
12172                                sprintf("%04X; Decomposition_Mapping; %s",
12173                                        $S,
12174                                        $decomposition));
12175                }
12176            }
12177        }
12178
12179        return;
12180    }
12181
12182    sub filter_v1_ucd {
12183        # Fix UCD lines in version 1.  This is probably overkill, but this
12184        # fixes some glaring errors in Version 1 UnicodeData.txt.  That file:
12185        # 1)    had many Hangul (U+3400 - U+4DFF) code points that were later
12186        #       removed.  This program retains them
12187        # 2)    didn't include ranges, which it should have, and which are now
12188        #       added in @corrected_lines below.  It was hand populated by
12189        #       taking the data from Version 2, verified by analyzing
12190        #       DAge.txt.
12191        # 3)    There is a syntax error in the entry for U+09F8 which could
12192        #       cause problems for utf8_heavy, and so is changed.  It's
12193        #       numeric value was simply a minus sign, without any number.
12194        #       (Eventually Unicode changed the code point to non-numeric.)
12195        # 4)    The decomposition types often don't match later versions
12196        #       exactly, and the whole syntax of that field is different; so
12197        #       the syntax is changed as well as the types to their later
12198        #       terminology.  Otherwise normalize.pm would be very unhappy
12199        # 5)    Many ccc classes are different.  These are left intact.
12200        # 6)    U+FF10..U+FF19 are missing their numeric values in all three
12201        #       fields.  These are unchanged because it doesn't really cause
12202        #       problems for Perl.
12203        # 7)    A number of code points, such as controls, don't have their
12204        #       Unicode Version 1 Names in this file.  These are added.
12205        # 8)    A number of Symbols were marked as Lm.  This changes those in
12206        #       the Latin1 range, so that regexes work.
12207        # 9)    The odd characters U+03DB .. U+03E1 weren't encoded but are
12208        #       referred to by their lc equivalents.  Not fixed.
12209
12210        my @corrected_lines = split /\n/, <<'END';
122114E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
122129FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
12213E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
12214F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
12215F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
12216FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
12217END
12218
12219        my $file = shift;
12220        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12221
12222        #local $to_trace = 1 if main::DEBUG;
12223        trace $_ if main::DEBUG && $to_trace;
12224
12225        # -1 => retain trailing null fields
12226        my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12227
12228        # At the first place that is wrong in the input, insert all the
12229        # corrections, replacing the wrong line.
12230        if ($code_point eq '4E00') {
12231            my @copy = @corrected_lines;
12232            $_ = shift @copy;
12233            ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12234
12235            $file->insert_lines(@copy);
12236        }
12237        elsif ($code_point =~ /^00/ && $fields[$CATEGORY] eq 'Lm') {
12238
12239            # There are no Lm characters in Latin1; these should be 'Sk', but
12240            # there isn't that in V1.
12241            $fields[$CATEGORY] = 'So';
12242        }
12243
12244        if ($fields[$NUMERIC] eq '-') {
12245            $fields[$NUMERIC] = '-1';  # This is what 2.0 made it.
12246        }
12247
12248        if  ($fields[$PERL_DECOMPOSITION] ne "") {
12249
12250            # Several entries have this change to superscript 2 or 3 in the
12251            # middle.  Convert these to the modern version, which is to use
12252            # the actual U+00B2 and U+00B3 (the superscript forms) instead.
12253            # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
12254            # 'HHHH HHHH 00B3 HHHH'.
12255            # It turns out that all of these that don't have another
12256            # decomposition defined at the beginning of the line have the
12257            # <square> decomposition in later releases.
12258            if ($code_point ne '00B2' && $code_point ne '00B3') {
12259                if  ($fields[$PERL_DECOMPOSITION]
12260                                    =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
12261                {
12262                    if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
12263                        $fields[$PERL_DECOMPOSITION] = '<square> '
12264                        . $fields[$PERL_DECOMPOSITION];
12265                    }
12266                }
12267            }
12268
12269            # If is like '<+circled> 0052 <-circled>', convert to
12270            # '<circled> 0052'
12271            $fields[$PERL_DECOMPOSITION] =~
12272                            s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/xg;
12273
12274            # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
12275            $fields[$PERL_DECOMPOSITION] =~
12276                            s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
12277            or $fields[$PERL_DECOMPOSITION] =~
12278                            s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
12279            or $fields[$PERL_DECOMPOSITION] =~
12280                            s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
12281            or $fields[$PERL_DECOMPOSITION] =~
12282                        s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
12283
12284            # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
12285            $fields[$PERL_DECOMPOSITION] =~
12286                    s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
12287
12288            # Change names to modern form.
12289            $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
12290            $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
12291            $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
12292            $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
12293
12294            # One entry has weird braces
12295            $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
12296
12297            # One entry at U+2116 has an extra <sup>
12298            $fields[$PERL_DECOMPOSITION] =~ s/( < .*? > .* ) < .*? > \ * /$1/x;
12299        }
12300
12301        $_ = join ';', $code_point, @fields;
12302        trace $_ if main::DEBUG && $to_trace;
12303        return;
12304    }
12305
12306    sub filter_bad_Nd_ucd {
12307        # Early versions specified a value in the decimal digit field even
12308        # though the code point wasn't a decimal digit.  Clear the field in
12309        # that situation, so that the main code doesn't think it is a decimal
12310        # digit.
12311
12312        my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12313        if ($fields[$PERL_DECIMAL_DIGIT] ne "" && $fields[$CATEGORY] ne 'Nd') {
12314            $fields[$PERL_DECIMAL_DIGIT] = "";
12315            $_ = join ';', $code_point, @fields;
12316        }
12317        return;
12318    }
12319
12320    my @U1_control_names = split /\n/, <<'END';
12321NULL
12322START OF HEADING
12323START OF TEXT
12324END OF TEXT
12325END OF TRANSMISSION
12326ENQUIRY
12327ACKNOWLEDGE
12328BELL
12329BACKSPACE
12330HORIZONTAL TABULATION
12331LINE FEED
12332VERTICAL TABULATION
12333FORM FEED
12334CARRIAGE RETURN
12335SHIFT OUT
12336SHIFT IN
12337DATA LINK ESCAPE
12338DEVICE CONTROL ONE
12339DEVICE CONTROL TWO
12340DEVICE CONTROL THREE
12341DEVICE CONTROL FOUR
12342NEGATIVE ACKNOWLEDGE
12343SYNCHRONOUS IDLE
12344END OF TRANSMISSION BLOCK
12345CANCEL
12346END OF MEDIUM
12347SUBSTITUTE
12348ESCAPE
12349FILE SEPARATOR
12350GROUP SEPARATOR
12351RECORD SEPARATOR
12352UNIT SEPARATOR
12353DELETE
12354BREAK PERMITTED HERE
12355NO BREAK HERE
12356INDEX
12357NEXT LINE
12358START OF SELECTED AREA
12359END OF SELECTED AREA
12360CHARACTER TABULATION SET
12361CHARACTER TABULATION WITH JUSTIFICATION
12362LINE TABULATION SET
12363PARTIAL LINE DOWN
12364PARTIAL LINE UP
12365REVERSE LINE FEED
12366SINGLE SHIFT TWO
12367SINGLE SHIFT THREE
12368DEVICE CONTROL STRING
12369PRIVATE USE ONE
12370PRIVATE USE TWO
12371SET TRANSMIT STATE
12372CANCEL CHARACTER
12373MESSAGE WAITING
12374START OF GUARDED AREA
12375END OF GUARDED AREA
12376START OF STRING
12377SINGLE CHARACTER INTRODUCER
12378CONTROL SEQUENCE INTRODUCER
12379STRING TERMINATOR
12380OPERATING SYSTEM COMMAND
12381PRIVACY MESSAGE
12382APPLICATION PROGRAM COMMAND
12383END
12384
12385    sub filter_early_U1_names {
12386        # Very early versions did not have the Unicode_1_name field specified.
12387        # They differed in which ones were present; make sure a U1 name
12388        # exists, so that Unicode::UCD::charinfo will work
12389
12390        my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12391
12392
12393        # @U1_control names above are entirely positional, so we pull them out
12394        # in the exact order required, with gaps for the ones that don't have
12395        # names.
12396        if ($code_point =~ /^00[01]/
12397            || $code_point eq '007F'
12398            || $code_point =~ /^008[2-9A-F]/
12399            || $code_point =~ /^009[0-8A-F]/)
12400        {
12401            my $u1_name = shift @U1_control_names;
12402            $fields[$UNICODE_1_NAME] = $u1_name unless $fields[$UNICODE_1_NAME];
12403            $_ = join ';', $code_point, @fields;
12404        }
12405        return;
12406    }
12407
12408    sub filter_v2_1_5_ucd {
12409        # A dozen entries in this 2.1.5 file had the mirrored and numeric
12410        # columns swapped;  These all had mirrored be 'N'.  So if the numeric
12411        # column appears to be N, swap it back.
12412
12413        my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12414        if ($fields[$NUMERIC] eq 'N') {
12415            $fields[$NUMERIC] = $fields[$MIRRORED];
12416            $fields[$MIRRORED] = 'N';
12417            $_ = join ';', $code_point, @fields;
12418        }
12419        return;
12420    }
12421
12422    sub filter_v6_ucd {
12423
12424        # Unicode 6.0 co-opted the name BELL for U+1F514, but until 5.17,
12425        # it wasn't accepted, to allow for some deprecation cycles.  This
12426        # function is not called after 5.16
12427
12428        return if $_ !~ /^(?:0007|1F514|070F);/;
12429
12430        my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12431        if ($code_point eq '0007') {
12432            $fields[$CHARNAME] = "";
12433        }
12434        elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
12435                            # http://www.unicode.org/versions/corrigendum8.html
12436            $fields[$BIDI] = "AL";
12437        }
12438        elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name
12439            $fields[$CHARNAME] = "";
12440        }
12441
12442        $_ = join ';', $code_point, @fields;
12443
12444        return;
12445    }
12446} # End closure for UnicodeData
12447
12448sub process_GCB_test {
12449
12450    my $file = shift;
12451    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12452
12453    while ($file->next_line) {
12454        push @backslash_X_tests, $_;
12455    }
12456
12457    return;
12458}
12459
12460sub process_LB_test {
12461
12462    my $file = shift;
12463    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12464
12465    while ($file->next_line) {
12466        push @LB_tests, $_;
12467    }
12468
12469    return;
12470}
12471
12472sub process_SB_test {
12473
12474    my $file = shift;
12475    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12476
12477    while ($file->next_line) {
12478        push @SB_tests, $_;
12479    }
12480
12481    return;
12482}
12483
12484sub process_WB_test {
12485
12486    my $file = shift;
12487    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12488
12489    while ($file->next_line) {
12490        push @WB_tests, $_;
12491    }
12492
12493    return;
12494}
12495
12496sub process_NamedSequences {
12497    # NamedSequences.txt entries are just added to an array.  Because these
12498    # don't look like the other tables, they have their own handler.
12499    # An example:
12500    # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
12501    #
12502    # This just adds the sequence to an array for later handling
12503
12504    my $file = shift;
12505    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12506
12507    while ($file->next_line) {
12508        my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
12509        if (@remainder) {
12510            $file->carp_bad_line(
12511                "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
12512            next;
12513        }
12514
12515        # Note single \t in keeping with special output format of
12516        # Perl_charnames.  But it turns out that the code points don't have to
12517        # be 5 digits long, like the rest, based on the internal workings of
12518        # charnames.pm.  This could be easily changed for consistency.
12519        push @named_sequences, "$sequence\t$name";
12520    }
12521    return;
12522}
12523
12524{ # Closure
12525
12526    my $first_range;
12527
12528    sub  filter_early_ea_lb {
12529        # Fixes early EastAsianWidth.txt and LineBreak.txt files.  These had a
12530        # third field be the name of the code point, which can be ignored in
12531        # most cases.  But it can be meaningful if it marks a range:
12532        # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
12533        # 3400;W;<CJK Ideograph Extension A, First>
12534        #
12535        # We need to see the First in the example above to know it's a range.
12536        # They did not use the later range syntaxes.  This routine changes it
12537        # to use the modern syntax.
12538        # $1 is the Input_file object.
12539
12540        my @fields = split /\s*;\s*/;
12541        if ($fields[2] =~ /^<.*, First>/) {
12542            $first_range = $fields[0];
12543            $_ = "";
12544        }
12545        elsif ($fields[2] =~ /^<.*, Last>/) {
12546            $_ = $_ = "$first_range..$fields[0]; $fields[1]";
12547        }
12548        else {
12549            undef $first_range;
12550            $_ = "$fields[0]; $fields[1]";
12551        }
12552
12553        return;
12554    }
12555}
12556
12557sub filter_substitute_lb {
12558    # Used on Unicodes that predate the LB property, where there is a
12559    # substitute file.  This just does the regular ea_lb handling for such
12560    # files, and then substitutes the long property value name for the short
12561    # one that comes with the file.  (The other break files have the long
12562    # names in them, so this is the odd one out.)  The reason for doing this
12563    # kludge is that regen/mk_invlists.pl is expecting the long name.  This
12564    # also fixes the typo 'Inseperable' that leads to problems.
12565
12566    filter_early_ea_lb;
12567    return unless $_;
12568
12569    my @fields = split /\s*;\s*/;
12570    $fields[1] = property_ref('_Perl_LB')->table($fields[1])->full_name;
12571    $fields[1] = 'Inseparable' if lc $fields[1] eq 'inseperable';
12572    $_ = join '; ', @fields;
12573}
12574
12575sub filter_old_style_arabic_shaping {
12576    # Early versions used a different term for the later one.
12577
12578    my @fields = split /\s*;\s*/;
12579    $fields[3] =~ s/<no shaping>/No_Joining_Group/;
12580    $fields[3] =~ s/\s+/_/g;                # Change spaces to underscores
12581    $_ = join ';', @fields;
12582    return;
12583}
12584
12585{ # Closure
12586    my $lc; # Table for lowercase mapping
12587    my $tc;
12588    my $uc;
12589    my %special_casing_code_points;
12590
12591    sub setup_special_casing {
12592        # SpecialCasing.txt contains the non-simple case change mappings.  The
12593        # simple ones are in UnicodeData.txt, which should already have been
12594        # read in to the full property data structures, so as to initialize
12595        # these with the simple ones.  Then the SpecialCasing.txt entries
12596        # add or overwrite the ones which have different full mappings.
12597
12598        # This routine sees if the simple mappings are to be output, and if
12599        # so, copies what has already been put into the full mapping tables,
12600        # while they still contain only the simple mappings.
12601
12602        # The reason it is done this way is that the simple mappings are
12603        # probably not going to be output, so it saves work to initialize the
12604        # full tables with the simple mappings, and then overwrite those
12605        # relatively few entries in them that have different full mappings,
12606        # and thus skip the simple mapping tables altogether.
12607
12608        my $file= shift;
12609        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12610
12611        $lc = property_ref('lc');
12612        $tc = property_ref('tc');
12613        $uc = property_ref('uc');
12614
12615        # For each of the case change mappings...
12616        foreach my $full_casing_table ($lc, $tc, $uc) {
12617            my $full_casing_name = $full_casing_table->name;
12618            my $full_casing_full_name = $full_casing_table->full_name;
12619            unless (defined $full_casing_table
12620                    && ! $full_casing_table->is_empty)
12621            {
12622                Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing.  Only special casing will be generated.");
12623            }
12624
12625            # Create a table in the old-style format and with the original
12626            # file name for backwards compatibility with applications that
12627            # read it directly.  The new tables contain both the simple and
12628            # full maps, and the old are missing simple maps when there is a
12629            # conflicting full one.  Probably it would have been ok to add
12630            # those to the legacy version, as was already done in 5.14 to the
12631            # case folding one, but this was not done, out of an abundance of
12632            # caution.  The tables are set up here before we deal with the
12633            # full maps so that as we handle those, we can override the simple
12634            # maps for them in the legacy table, and merely add them in the
12635            # new-style one.
12636            my $legacy = Property->new("Legacy_" . $full_casing_full_name,
12637                                File => $full_casing_full_name
12638                                                          =~ s/case_Mapping//r,
12639                                Format => $HEX_FORMAT,
12640                                Default_Map => $CODE_POINT,
12641                                Initialize => $full_casing_table,
12642                                Replacement_Property => $full_casing_full_name,
12643            );
12644
12645            $full_casing_table->add_comment(join_lines( <<END
12646This file includes both the simple and full case changing maps.  The simple
12647ones are in the main body of the table below, and the full ones adding to or
12648overriding them are in the hash.
12649END
12650            ));
12651
12652            # The simple version's name in each mapping merely has an 's' in
12653            # front of the full one's
12654            my $simple_name = 's' . $full_casing_name;
12655            my $simple = property_ref($simple_name);
12656            $simple->initialize($full_casing_table) if $simple->to_output_map();
12657        }
12658
12659        return;
12660    }
12661
12662    sub filter_2_1_8_special_casing_line {
12663
12664        # This version had duplicate entries in this file.  Delete all but the
12665        # first one
12666        my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
12667                                              # fields
12668        if (exists $special_casing_code_points{$fields[0]}) {
12669            $_ = "";
12670            return;
12671        }
12672
12673        $special_casing_code_points{$fields[0]} = 1;
12674        filter_special_casing_line(@_);
12675    }
12676
12677    sub filter_special_casing_line {
12678        # Change the format of $_ from SpecialCasing.txt into something that
12679        # the generic handler understands.  Each input line contains three
12680        # case mappings.  This will generate three lines to pass to the
12681        # generic handler for each of those.
12682
12683        # The input syntax (after stripping comments and trailing white space
12684        # is like one of the following (with the final two being entries that
12685        # we ignore):
12686        # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
12687        # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
12688        # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
12689        # Note the trailing semi-colon, unlike many of the input files.  That
12690        # means that there will be an extra null field generated by the split
12691
12692        my $file = shift;
12693        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12694
12695        my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
12696                                              # fields
12697
12698        # field #4 is when this mapping is conditional.  If any of these get
12699        # implemented, it would be by hard-coding in the casing functions in
12700        # the Perl core, not through tables.  But if there is a new condition
12701        # we don't know about, output a warning.  We know about all the
12702        # conditions through 6.0
12703        if ($fields[4] ne "") {
12704            my @conditions = split ' ', $fields[4];
12705            if ($conditions[0] ne 'tr'  # We know that these languages have
12706                                        # conditions, and some are multiple
12707                && $conditions[0] ne 'az'
12708                && $conditions[0] ne 'lt'
12709
12710                # And, we know about a single condition Final_Sigma, but
12711                # nothing else.
12712                && ($v_version gt v5.2.0
12713                    && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
12714            {
12715                $file->carp_bad_line("Unknown condition '$fields[4]'.  You should inspect it and either add code to handle it, or add to list of those that are to ignore");
12716            }
12717            elsif ($conditions[0] ne 'Final_Sigma') {
12718
12719                    # Don't print out a message for Final_Sigma, because we
12720                    # have hard-coded handling for it.  (But the standard
12721                    # could change what the rule should be, but it wouldn't
12722                    # show up here anyway.
12723
12724                    print "# SKIPPING Special Casing: $_\n"
12725                                                    if $verbosity >= $VERBOSE;
12726            }
12727            $_ = "";
12728            return;
12729        }
12730        elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
12731            $file->carp_bad_line('Extra fields');
12732            $_ = "";
12733            return;
12734        }
12735
12736        my $decimal_code_point = hex $fields[0];
12737
12738        # Loop to handle each of the three mappings in the input line, in
12739        # order, with $i indicating the current field number.
12740        my $i = 0;
12741        for my $object ($lc, $tc, $uc) {
12742            $i++;   # First time through, $i = 0 ... 3rd time = 3
12743
12744            my $value = $object->value_of($decimal_code_point);
12745            $value = ($value eq $CODE_POINT)
12746                      ? $decimal_code_point
12747                      : hex $value;
12748
12749            # If this isn't a multi-character mapping, it should already have
12750            # been read in.
12751            if ($fields[$i] !~ / /) {
12752                if ($value != hex $fields[$i]) {
12753                    Carp::my_carp("Bad news. UnicodeData.txt thinks "
12754                                  . $object->name
12755                                  . "(0x$fields[0]) is $value"
12756                                  . " and SpecialCasing.txt thinks it is "
12757                                  . hex($fields[$i])
12758                                  . ".  Good luck.  Retaining UnicodeData value, and proceeding anyway.");
12759                }
12760            }
12761            else {
12762
12763                # The mapping goes into both the legacy table, in which it
12764                # replaces the simple one...
12765                $file->insert_adjusted_lines("$fields[0]; Legacy_"
12766                                             . $object->full_name
12767                                             . "; $fields[$i]");
12768
12769                # ... and the regular table, in which it is additional,
12770                # beyond the simple mapping.
12771                $file->insert_adjusted_lines("$fields[0]; "
12772                                             . $object->name
12773                                            . "; "
12774                                            . $CMD_DELIM
12775                                            . "$REPLACE_CMD=$MULTIPLE_BEFORE"
12776                                            . $CMD_DELIM
12777                                            . $fields[$i]);
12778            }
12779        }
12780
12781        # Everything has been handled by the insert_adjusted_lines()
12782        $_ = "";
12783
12784        return;
12785    }
12786}
12787
12788sub filter_old_style_case_folding {
12789    # This transforms $_ containing the case folding style of 3.0.1, to 3.1
12790    # and later style.  Different letters were used in the earlier.
12791
12792    my $file = shift;
12793    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12794
12795    my @fields = split /\s*;\s*/;
12796
12797    if ($fields[1] eq 'L') {
12798        $fields[1] = 'C';             # L => C always
12799    }
12800    elsif ($fields[1] eq 'E') {
12801        if ($fields[2] =~ / /) {      # E => C if one code point; F otherwise
12802            $fields[1] = 'F'
12803        }
12804        else {
12805            $fields[1] = 'C'
12806        }
12807    }
12808    else {
12809        $file->carp_bad_line("Expecting L or E in second field");
12810        $_ = "";
12811        return;
12812    }
12813    $_ = join("; ", @fields) . ';';
12814    return;
12815}
12816
12817{ # Closure for case folding
12818
12819    # Create the map for simple only if are going to output it, for otherwise
12820    # it takes no part in anything we do.
12821    my $to_output_simple;
12822
12823    sub setup_case_folding($) {
12824        # Read in the case foldings in CaseFolding.txt.  This handles both
12825        # simple and full case folding.
12826
12827        $to_output_simple
12828                        = property_ref('Simple_Case_Folding')->to_output_map;
12829
12830        if (! $to_output_simple) {
12831            property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
12832        }
12833
12834        # If we ever wanted to show that these tables were combined, a new
12835        # property method could be created, like set_combined_props()
12836        property_ref('Case_Folding')->add_comment(join_lines( <<END
12837This file includes both the simple and full case folding maps.  The simple
12838ones are in the main body of the table below, and the full ones adding to or
12839overriding them are in the hash.
12840END
12841        ));
12842        return;
12843    }
12844
12845    sub filter_case_folding_line {
12846        # Called for each line in CaseFolding.txt
12847        # Input lines look like:
12848        # 0041; C; 0061; # LATIN CAPITAL LETTER A
12849        # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
12850        # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
12851        #
12852        # 'C' means that folding is the same for both simple and full
12853        # 'F' that it is only for full folding
12854        # 'S' that it is only for simple folding
12855        # 'T' is locale-dependent, and ignored
12856        # 'I' is a type of 'F' used in some early releases.
12857        # Note the trailing semi-colon, unlike many of the input files.  That
12858        # means that there will be an extra null field generated by the split
12859        # below, which we ignore and hence is not an error.
12860
12861        my $file = shift;
12862        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12863
12864        my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
12865        if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
12866            $file->carp_bad_line('Extra fields');
12867            $_ = "";
12868            return;
12869        }
12870
12871        if ($type =~ / ^ [IT] $/x) {   # Skip Turkic case folding, is locale dependent
12872            $_ = "";
12873            return;
12874        }
12875
12876        # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
12877        # I are all full foldings; S is single-char.  For S, there is always
12878        # an F entry, so we must allow multiple values for the same code
12879        # point.  Fortunately this table doesn't need further manipulation
12880        # which would preclude using multiple-values.  The S is now included
12881        # so that _swash_inversion_hash() is able to construct closures
12882        # without having to worry about F mappings.
12883        if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
12884            $_ = "$range; Case_Folding; "
12885                 . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
12886        }
12887        else {
12888            $_ = "";
12889            $file->carp_bad_line('Expecting C F I S or T in second field');
12890        }
12891
12892        # C and S are simple foldings, but simple case folding is not needed
12893        # unless we explicitly want its map table output.
12894        if ($to_output_simple && $type eq 'C' || $type eq 'S') {
12895            $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
12896        }
12897
12898        return;
12899    }
12900
12901} # End case fold closure
12902
12903sub filter_jamo_line {
12904    # Filter Jamo.txt lines.  This routine mainly is used to populate hashes
12905    # from this file that is used in generating the Name property for Jamo
12906    # code points.  But, it also is used to convert early versions' syntax
12907    # into the modern form.  Here are two examples:
12908    # 1100; G   # HANGUL CHOSEONG KIYEOK            # Modern syntax
12909    # U+1100; G; HANGUL CHOSEONG KIYEOK             # 2.0 syntax
12910    #
12911    # The input is $_, the output is $_ filtered.
12912
12913    my @fields = split /\s*;\s*/, $_, -1;  # -1 => retain trailing null fields
12914
12915    # Let the caller handle unexpected input.  In earlier versions, there was
12916    # a third field which is supposed to be a comment, but did not have a '#'
12917    # before it.
12918    return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
12919
12920    $fields[0] =~ s/^U\+//;     # Also, early versions had this extraneous
12921                                # beginning.
12922
12923    # Some 2.1 versions had this wrong.  Causes havoc with the algorithm.
12924    $fields[1] = 'R' if $fields[0] eq '1105';
12925
12926    # Add to structure so can generate Names from it.
12927    my $cp = hex $fields[0];
12928    my $short_name = $fields[1];
12929    $Jamo{$cp} = $short_name;
12930    if ($cp <= $LBase + $LCount) {
12931        $Jamo_L{$short_name} = $cp - $LBase;
12932    }
12933    elsif ($cp <= $VBase + $VCount) {
12934        $Jamo_V{$short_name} = $cp - $VBase;
12935    }
12936    elsif ($cp <= $TBase + $TCount) {
12937        $Jamo_T{$short_name} = $cp - $TBase;
12938    }
12939    else {
12940        Carp::my_carp_bug("Unexpected Jamo code point in $_");
12941    }
12942
12943
12944    # Reassemble using just the first two fields to look like a typical
12945    # property file line
12946    $_ = "$fields[0]; $fields[1]";
12947
12948    return;
12949}
12950
12951sub register_fraction($) {
12952    # This registers the input rational number so that it can be passed on to
12953    # utf8_heavy.pl, both in rational and floating forms.
12954
12955    my $rational = shift;
12956
12957    my $float = eval $rational;
12958    $nv_floating_to_rational{$float} = $rational;
12959    return;
12960}
12961
12962sub gcd($$) {   # Greatest-common-divisor; from
12963                # http://en.wikipedia.org/wiki/Euclidean_algorithm
12964    my ($a, $b) = @_;
12965
12966    use integer;
12967
12968    while ($b != 0) {
12969       my $temp = $b;
12970       $b = $a % $b;
12971       $a = $temp;
12972    }
12973    return $a;
12974}
12975
12976sub reduce_fraction($) {
12977    my $fraction_ref = shift;
12978
12979    # Reduce a fraction to lowest terms.  The Unicode data may be reducible,
12980    # hence this is needed.  The argument is a reference to the
12981    # string denoting the fraction, which must be of the form:
12982    if ($$fraction_ref !~ / ^ (-?) (\d+) \/ (\d+) $ /ax) {
12983        Carp::my_carp_bug("Non-fraction input '$$fraction_ref'.  Unchanged");
12984        return;
12985    }
12986
12987    my $sign = $1;
12988    my $numerator = $2;
12989    my $denominator = $3;
12990
12991    use integer;
12992
12993    # Find greatest common divisor
12994    my $gcd = gcd($numerator, $denominator);
12995
12996    # And reduce using the gcd.
12997    if ($gcd != 1) {
12998        $numerator    /= $gcd;
12999        $denominator  /= $gcd;
13000        $$fraction_ref = "$sign$numerator/$denominator";
13001    }
13002
13003    return;
13004}
13005
13006sub filter_numeric_value_line {
13007    # DNumValues contains lines of a different syntax than the typical
13008    # property file:
13009    # 0F33          ; -0.5 ; ; -1/2 # No       TIBETAN DIGIT HALF ZERO
13010    #
13011    # This routine transforms $_ containing the anomalous syntax to the
13012    # typical, by filtering out the extra columns, and convert early version
13013    # decimal numbers to strings that look like rational numbers.
13014
13015    my $file = shift;
13016    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13017
13018    # Starting in 5.1, there is a rational field.  Just use that, omitting the
13019    # extra columns.  Otherwise convert the decimal number in the second field
13020    # to a rational, and omit extraneous columns.
13021    my @fields = split /\s*;\s*/, $_, -1;
13022    my $rational;
13023
13024    if ($v_version ge v5.1.0) {
13025        if (@fields != 4) {
13026            $file->carp_bad_line('Not 4 semi-colon separated fields');
13027            $_ = "";
13028            return;
13029        }
13030        reduce_fraction(\$fields[3]) if $fields[3] =~ qr{/};
13031        $rational = $fields[3];
13032
13033        $_ = join '; ', @fields[ 0, 3 ];
13034    }
13035    else {
13036
13037        # Here, is an older Unicode file, which has decimal numbers instead of
13038        # rationals in it.  Use the fraction to calculate the denominator and
13039        # convert to rational.
13040
13041        if (@fields != 2 && @fields != 3) {
13042            $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
13043            $_ = "";
13044            return;
13045        }
13046
13047        my $codepoints = $fields[0];
13048        my $decimal = $fields[1];
13049        if ($decimal =~ s/\.0+$//) {
13050
13051            # Anything ending with a decimal followed by nothing but 0's is an
13052            # integer
13053            $_ = "$codepoints; $decimal";
13054            $rational = $decimal;
13055        }
13056        else {
13057
13058            my $denominator;
13059            if ($decimal =~ /\.50*$/) {
13060                $denominator = 2;
13061            }
13062
13063            # Here have the hardcoded repeating decimals in the fraction, and
13064            # the denominator they imply.  There were only a few denominators
13065            # in the older Unicode versions of this file which this code
13066            # handles, so it is easy to convert them.
13067
13068            # The 4 is because of a round-off error in the Unicode 3.2 files
13069            elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
13070                $denominator = 3;
13071            }
13072            elsif ($decimal =~ /\.[27]50*$/) {
13073                $denominator = 4;
13074            }
13075            elsif ($decimal =~ /\.[2468]0*$/) {
13076                $denominator = 5;
13077            }
13078            elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
13079                $denominator = 6;
13080            }
13081            elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
13082                $denominator = 8;
13083            }
13084            if ($denominator) {
13085                my $sign = ($decimal < 0) ? "-" : "";
13086                my $numerator = int((abs($decimal) * $denominator) + .5);
13087                $rational = "$sign$numerator/$denominator";
13088                $_ = "$codepoints; $rational";
13089            }
13090            else {
13091                $file->carp_bad_line("Can't cope with number '$decimal'.");
13092                $_ = "";
13093                return;
13094            }
13095        }
13096    }
13097
13098    register_fraction($rational) if $rational =~ qr{/};
13099    return;
13100}
13101
13102{ # Closure
13103    my %unihan_properties;
13104
13105    sub construct_unihan {
13106
13107        my $file_object = shift;
13108
13109        return unless file_exists($file_object->file);
13110
13111        if ($v_version lt v4.0.0) {
13112            push @cjk_properties, 'URS ; Unicode_Radical_Stroke';
13113            push @cjk_property_values, split "\n", <<'END';
13114# @missing: 0000..10FFFF; Unicode_Radical_Stroke; <none>
13115END
13116        }
13117
13118        if ($v_version ge v3.0.0) {
13119            push @cjk_properties, split "\n", <<'END';
13120cjkIRG_GSource; kIRG_GSource
13121cjkIRG_JSource; kIRG_JSource
13122cjkIRG_KSource; kIRG_KSource
13123cjkIRG_TSource; kIRG_TSource
13124cjkIRG_VSource; kIRG_VSource
13125END
13126        push @cjk_property_values, split "\n", <<'END';
13127# @missing: 0000..10FFFF; cjkIRG_GSource; <none>
13128# @missing: 0000..10FFFF; cjkIRG_JSource; <none>
13129# @missing: 0000..10FFFF; cjkIRG_KSource; <none>
13130# @missing: 0000..10FFFF; cjkIRG_TSource; <none>
13131# @missing: 0000..10FFFF; cjkIRG_VSource; <none>
13132END
13133        }
13134        if ($v_version ge v3.1.0) {
13135            push @cjk_properties, 'cjkIRG_HSource; kIRG_HSource';
13136            push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_HSource; <none>';
13137        }
13138        if ($v_version ge v3.1.1) {
13139            push @cjk_properties, 'cjkIRG_KPSource; kIRG_KPSource';
13140            push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_KPSource; <none>';
13141        }
13142        if ($v_version ge v3.2.0) {
13143            push @cjk_properties, split "\n", <<'END';
13144cjkAccountingNumeric; kAccountingNumeric
13145cjkCompatibilityVariant; kCompatibilityVariant
13146cjkOtherNumeric; kOtherNumeric
13147cjkPrimaryNumeric; kPrimaryNumeric
13148END
13149            push @cjk_property_values, split "\n", <<'END';
13150# @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
13151# @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
13152# @missing: 0000..10FFFF; cjkOtherNumeric; NaN
13153# @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
13154END
13155        }
13156        if ($v_version gt v4.0.0) {
13157            push @cjk_properties, 'cjkIRG_USource; kIRG_USource';
13158            push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_USource; <none>';
13159        }
13160
13161        if ($v_version ge v4.1.0) {
13162            push @cjk_properties, 'cjkIICore ; kIICore';
13163            push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIICore; <none>';
13164        }
13165    }
13166
13167    sub setup_unihan {
13168        # Do any special setup for Unihan properties.
13169
13170        # This property gives the wrong computed type, so override.
13171        my $usource = property_ref('kIRG_USource');
13172        $usource->set_type($STRING) if defined $usource;
13173
13174        # This property is to be considered binary (it says so in
13175        # http://www.unicode.org/reports/tr38/)
13176        my $iicore = property_ref('kIICore');
13177        if (defined $iicore) {
13178            $iicore->set_type($FORCED_BINARY);
13179            $iicore->table("Y")->add_note("Matches any code point which has a non-null value for this property; see unicode.org UAX #38.");
13180
13181            # Unicode doesn't include the maps for this property, so don't
13182            # warn that they are missing.
13183            $iicore->set_pre_declared_maps(0);
13184            $iicore->add_comment(join_lines( <<END
13185This property contains string values, but any non-empty ones are considered to
13186be 'core', so Perl creates tables for both: 1) its string values, plus 2)
13187tables so that \\p{kIICore} matches any code point which has a non-empty
13188value for this property.
13189END
13190            ));
13191        }
13192
13193        return;
13194    }
13195
13196    sub filter_unihan_line {
13197        # Change unihan db lines to look like the others in the db.  Here is
13198        # an input sample:
13199        #   U+341C        kCangjie        IEKN
13200
13201        # Tabs are used instead of semi-colons to separate fields; therefore
13202        # they may have semi-colons embedded in them.  Change these to periods
13203        # so won't screw up the rest of the code.
13204        s/;/./g;
13205
13206        # Remove lines that don't look like ones we accept.
13207        if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
13208            $_ = "";
13209            return;
13210        }
13211
13212        # Extract the property, and save a reference to its object.
13213        my $property = $1;
13214        if (! exists $unihan_properties{$property}) {
13215            $unihan_properties{$property} = property_ref($property);
13216        }
13217
13218        # Don't do anything unless the property is one we're handling, which
13219        # we determine by seeing if there is an object defined for it or not
13220        if (! defined $unihan_properties{$property}) {
13221            $_ = "";
13222            return;
13223        }
13224
13225        # Convert the tab separators to our standard semi-colons, and convert
13226        # the U+HHHH notation to the rest of the standard's HHHH
13227        s/\t/;/g;
13228        s/\b U \+ (?= $code_point_re )//xg;
13229
13230        #local $to_trace = 1 if main::DEBUG;
13231        trace $_ if main::DEBUG && $to_trace;
13232
13233        return;
13234    }
13235}
13236
13237sub filter_blocks_lines {
13238    # In the Blocks.txt file, the names of the blocks don't quite match the
13239    # names given in PropertyValueAliases.txt, so this changes them so they
13240    # do match:  Blanks and hyphens are changed into underscores.  Also makes
13241    # early release versions look like later ones
13242    #
13243    # $_ is transformed to the correct value.
13244
13245    my $file = shift;
13246        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13247
13248    if ($v_version lt v3.2.0) {
13249        if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
13250            $_ = "";
13251            return;
13252        }
13253
13254        # Old versions used a different syntax to mark the range.
13255        $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
13256    }
13257
13258    my @fields = split /\s*;\s*/, $_, -1;
13259    if (@fields != 2) {
13260        $file->carp_bad_line("Expecting exactly two fields");
13261        $_ = "";
13262        return;
13263    }
13264
13265    # Change hyphens and blanks in the block name field only
13266    $fields[1] =~ s/[ -]/_/g;
13267    $fields[1] =~ s/_ ( [a-z] ) /_\u$1/xg;   # Capitalize first letter of word
13268
13269    $_ = join("; ", @fields);
13270    return;
13271}
13272
13273{ # Closure
13274    my $current_property;
13275
13276    sub filter_old_style_proplist {
13277        # PropList.txt has been in Unicode since version 2.0.  Until 3.1, it
13278        # was in a completely different syntax.  Ken Whistler of Unicode says
13279        # that it was something he used as an aid for his own purposes, but
13280        # was never an official part of the standard.  Many of the properties
13281        # in it were incorporated into the later PropList.txt, but some were
13282        # not.  This program uses this early file to generate property tables
13283        # that are otherwise not accessible in the early UCD's.  It does this
13284        # for the ones that eventually became official, and don't appear to be
13285        # too different in their contents from the later official version, and
13286        # throws away the rest.  It could be argued that the ones it generates
13287        # were probably not really official at that time, so should be
13288        # ignored.  You can easily modify things to skip all of them by
13289        # changing this function to just set $_ to "", and return; and to skip
13290        # certain of them by by simply removing their declarations from
13291        # get_old_property_aliases().
13292        #
13293        # Here is a list of all the ones that are thrown away:
13294        #   Alphabetic                   The definitions for this are very
13295        #                                defective, so better to not mislead
13296        #                                people into thinking it works.
13297        #                                Instead the Perl extension of the
13298        #                                same name is constructed from first
13299        #                                principles.
13300        #   Bidi=*                       duplicates UnicodeData.txt
13301        #   Combining                    never made into official property;
13302        #                                is \P{ccc=0}
13303        #   Composite                    never made into official property.
13304        #   Currency Symbol              duplicates UnicodeData.txt: gc=sc
13305        #   Decimal Digit                duplicates UnicodeData.txt: gc=nd
13306        #   Delimiter                    never made into official property;
13307        #                                removed in 3.0.1
13308        #   Format Control               never made into official property;
13309        #                                similar to gc=cf
13310        #   High Surrogate               duplicates Blocks.txt
13311        #   Ignorable Control            never made into official property;
13312        #                                similar to di=y
13313        #   ISO Control                  duplicates UnicodeData.txt: gc=cc
13314        #   Left of Pair                 never made into official property;
13315        #   Line Separator               duplicates UnicodeData.txt: gc=zl
13316        #   Low Surrogate                duplicates Blocks.txt
13317        #   Non-break                    was actually listed as a property
13318        #                                in 3.2, but without any code
13319        #                                points.  Unicode denies that this
13320        #                                was ever an official property
13321        #   Non-spacing                  duplicate UnicodeData.txt: gc=mn
13322        #   Numeric                      duplicates UnicodeData.txt: gc=cc
13323        #   Paired Punctuation           never made into official property;
13324        #                                appears to be gc=ps + gc=pe
13325        #   Paragraph Separator          duplicates UnicodeData.txt: gc=cc
13326        #   Private Use                  duplicates UnicodeData.txt: gc=co
13327        #   Private Use High Surrogate   duplicates Blocks.txt
13328        #   Punctuation                  duplicates UnicodeData.txt: gc=p
13329        #   Space                        different definition than eventual
13330        #                                one.
13331        #   Titlecase                    duplicates UnicodeData.txt: gc=lt
13332        #   Unassigned Code Value        duplicates UnicodeData.txt: gc=cn
13333        #   Zero-width                   never made into official property;
13334        #                                subset of gc=cf
13335        # Most of the properties have the same names in this file as in later
13336        # versions, but a couple do not.
13337        #
13338        # This subroutine filters $_, converting it from the old style into
13339        # the new style.  Here's a sample of the old-style
13340        #
13341        #   *******************************************
13342        #
13343        #   Property dump for: 0x100000A0 (Join Control)
13344        #
13345        #   200C..200D  (2 chars)
13346        #
13347        # In the example, the property is "Join Control".  It is kept in this
13348        # closure between calls to the subroutine.  The numbers beginning with
13349        # 0x were internal to Ken's program that generated this file.
13350
13351        # If this line contains the property name, extract it.
13352        if (/^Property dump for: [^(]*\((.*)\)/) {
13353            $_ = $1;
13354
13355            # Convert white space to underscores.
13356            s/ /_/g;
13357
13358            # Convert the few properties that don't have the same name as
13359            # their modern counterparts
13360            s/Identifier_Part/ID_Continue/
13361            or s/Not_a_Character/NChar/;
13362
13363            # If the name matches an existing property, use it.
13364            if (defined property_ref($_)) {
13365                trace "new property=", $_ if main::DEBUG && $to_trace;
13366                $current_property = $_;
13367            }
13368            else {        # Otherwise discard it
13369                trace "rejected property=", $_ if main::DEBUG && $to_trace;
13370                undef $current_property;
13371            }
13372            $_ = "";    # The property is saved for the next lines of the
13373                        # file, but this defining line is of no further use,
13374                        # so clear it so that the caller won't process it
13375                        # further.
13376        }
13377        elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
13378
13379            # Here, the input line isn't a header defining a property for the
13380            # following section, and either we aren't in such a section, or
13381            # the line doesn't look like one that defines the code points in
13382            # such a section.  Ignore this line.
13383            $_ = "";
13384        }
13385        else {
13386
13387            # Here, we have a line defining the code points for the current
13388            # stashed property.  Anything starting with the first blank is
13389            # extraneous.  Otherwise, it should look like a normal range to
13390            # the caller.  Append the property name so that it looks just like
13391            # a modern PropList entry.
13392
13393            $_ =~ s/\s.*//;
13394            $_ .= "; $current_property";
13395        }
13396        trace $_ if main::DEBUG && $to_trace;
13397        return;
13398    }
13399} # End closure for old style proplist
13400
13401sub filter_old_style_normalization_lines {
13402    # For early releases of Unicode, the lines were like:
13403    #        74..2A76    ; NFKD_NO
13404    # For later releases this became:
13405    #        74..2A76    ; NFKD_QC; N
13406    # Filter $_ to look like those in later releases.
13407    # Similarly for MAYBEs
13408
13409    s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
13410
13411    # Also, the property FC_NFKC was abbreviated to FNC
13412    s/FNC/FC_NFKC/;
13413    return;
13414}
13415
13416sub setup_script_extensions {
13417    # The Script_Extensions property starts out with a clone of the Script
13418    # property.
13419
13420    $scx = property_ref("Script_Extensions");
13421    return unless defined $scx;
13422
13423    $scx->_set_format($STRING_WHITE_SPACE_LIST);
13424    $scx->initialize($script);
13425    $scx->set_default_map($script->default_map);
13426    $scx->set_pre_declared_maps(0);     # PropValueAliases doesn't list these
13427    $scx->add_comment(join_lines( <<END
13428The values for code points that appear in one script are just the same as for
13429the 'Script' property.  Likewise the values for those that appear in many
13430scripts are either 'Common' or 'Inherited', same as with 'Script'.  But the
13431values of code points that appear in a few scripts are a space separated list
13432of those scripts.
13433END
13434    ));
13435
13436    # Initialize scx's tables and the aliases for them to be the same as sc's
13437    foreach my $table ($script->tables) {
13438        my $scx_table = $scx->add_match_table($table->name,
13439                                Full_Name => $table->full_name);
13440        foreach my $alias ($table->aliases) {
13441            $scx_table->add_alias($alias->name);
13442        }
13443    }
13444}
13445
13446sub  filter_script_extensions_line {
13447    # The Scripts file comes with the full name for the scripts; the
13448    # ScriptExtensions, with the short name.  The final mapping file is a
13449    # combination of these, and without adjustment, would have inconsistent
13450    # entries.  This filters the latter file to convert to full names.
13451    # Entries look like this:
13452    # 064B..0655    ; Arab Syrc # Mn  [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
13453
13454    my @fields = split /\s*;\s*/;
13455
13456    # This script was erroneously omitted in this Unicode version.
13457    $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/;
13458
13459    my @full_names;
13460    foreach my $short_name (split " ", $fields[1]) {
13461        push @full_names, $script->table($short_name)->full_name;
13462    }
13463    $fields[1] = join " ", @full_names;
13464    $_ = join "; ", @fields;
13465
13466    return;
13467}
13468
13469sub generate_hst {
13470
13471    # Populates the Hangul Syllable Type property from first principles
13472
13473    my $file= shift;
13474    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13475
13476    # These few ranges are hard-coded in.
13477    $file->insert_lines(split /\n/, <<'END'
134781100..1159    ; L
13479115F          ; L
134801160..11A2    ; V
1348111A8..11F9    ; T
13482END
13483);
13484
13485    # The Hangul syllables in version 1 are at different code points than
13486    # those that came along starting in version 2, and have different names;
13487    # they comprise about 60% of the code points of the later version.
13488    # From my (khw) research on them (see <558493EB.4000807@att.net>), the
13489    # initial set is a subset of the later version, with different English
13490    # transliterations.  I did not see an easy mapping between them.  The
13491    # later set includes essentially all possibilities, even ones that aren't
13492    # in modern use (if they ever were), and over 96% of the new ones are type
13493    # LVT.  Mathematically, the early set must also contain a preponderance of
13494    # LVT values.  In lieu of doing nothing, we just set them all to LVT, and
13495    # expect that this will be right most of the time, which is better than
13496    # not being right at all.
13497    if ($v_version lt v2.0.0) {
13498        my $property = property_ref($file->property);
13499        $file->insert_lines(sprintf("%04X..%04X; LVT\n",
13500                                    $FIRST_REMOVED_HANGUL_SYLLABLE,
13501                                    $FINAL_REMOVED_HANGUL_SYLLABLE));
13502        push @tables_that_may_be_empty, $property->table('LV')->complete_name;
13503        return;
13504    }
13505
13506    # The algorithmically derived syllables are almost all LVT ones, so
13507    # initialize the whole range with that.
13508    $file->insert_lines(sprintf "%04X..%04X; LVT\n",
13509                        $SBase, $SBase + $SCount -1);
13510
13511    # Those ones that aren't LVT are LV, and they occur at intervals of
13512    # $TCount code points, starting with the first code point, at $SBase.
13513    for (my $i = $SBase; $i < $SBase + $SCount; $i += $TCount) {
13514        $file->insert_lines(sprintf "%04X..%04X; LV\n", $i, $i);
13515    }
13516
13517    return;
13518}
13519
13520sub generate_GCB {
13521
13522    # Populates the Grapheme Cluster Break property from first principles
13523
13524    my $file= shift;
13525    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13526
13527    # All these definitions are from
13528    # http://www.unicode.org/reports/tr29/tr29-3.html with confirmation
13529    # from http://www.unicode.org/reports/tr29/tr29-4.html
13530
13531    foreach my $range ($gc->ranges) {
13532
13533        # Extend includes gc=Me and gc=Mn, while Control includes gc=Cc
13534        # and gc=Cf
13535        if ($range->value =~ / ^ M [en] $ /x) {
13536            $file->insert_lines(sprintf "%04X..%04X; Extend",
13537                                $range->start,  $range->end);
13538        }
13539        elsif ($range->value =~ / ^ C [cf] $ /x) {
13540            $file->insert_lines(sprintf "%04X..%04X; Control",
13541                                $range->start,  $range->end);
13542        }
13543    }
13544    $file->insert_lines("2028; Control"); # Line Separator
13545    $file->insert_lines("2029; Control"); # Paragraph Separator
13546
13547    $file->insert_lines("000D; CR");
13548    $file->insert_lines("000A; LF");
13549
13550    # Also from http://www.unicode.org/reports/tr29/tr29-3.html.
13551    foreach my $code_point ( qw{
13552                                09BE 09D7 0B3E 0B57 0BBE 0BD7 0CC2 0CD5 0CD6
13553                                0D3E 0D57 0DCF 0DDF FF9E FF9F 1D165 1D16E 1D16F
13554                                }
13555    ) {
13556        my $category = $gc->value_of(hex $code_point);
13557        next if ! defined $category || $category eq 'Cn'; # But not if
13558                                                          # unassigned in this
13559                                                          # release
13560        $file->insert_lines("$code_point; Extend");
13561    }
13562
13563    my $hst = property_ref('Hangul_Syllable_Type');
13564    if ($hst->count > 0) {
13565        foreach my $range ($hst->ranges) {
13566            $file->insert_lines(sprintf "%04X..%04X; %s",
13567                                    $range->start, $range->end, $range->value);
13568        }
13569    }
13570    else {
13571        generate_hst($file);
13572    }
13573
13574    main::process_generic_property_file($file);
13575}
13576
13577
13578sub fixup_early_perl_name_alias {
13579
13580    # Different versions of Unicode have varying support for the name synonyms
13581    # below.  Just include everything.  As of 6.1, all these are correct in
13582    # the Unicode-supplied file.
13583
13584    my $file= shift;
13585    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13586
13587
13588    # ALERT did not come along until 6.0, at which point it became preferred
13589    # over BELL.  By inserting it last in early releases, BELL is preferred
13590    # over it; and vice-vers in 6.0
13591    my $type_for_bell = ($v_version lt v6.0.0)
13592               ? 'correction'
13593               : 'alternate';
13594    $file->insert_lines(split /\n/, <<END
135950007;BELL; $type_for_bell
13596000A;LINE FEED (LF);alternate
13597000C;FORM FEED (FF);alternate
13598000D;CARRIAGE RETURN (CR);alternate
135990085;NEXT LINE (NEL);alternate
13600END
13601
13602    );
13603
13604    # One might think that the the 'Unicode_1_Name' field, could work for most
13605    # of the above names, but sadly that field varies depending on the
13606    # release.  Version 1.1.5 had no names for any of the controls; Version
13607    # 2.0 introduced names for the C0 controls, and 3.0 introduced C1 names.
13608    # 3.0.1 removed the name INDEX; and 3.2 changed some names:
13609    #   changed to parenthesized versions like "NEXT LINE" to
13610    #       "NEXT LINE (NEL)";
13611    #   changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD
13612    #   changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;;
13613    #   changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR
13614    #
13615    # All these are present in the 6.1 NameAliases.txt
13616
13617    return;
13618}
13619
13620sub filter_later_version_name_alias_line {
13621
13622    # This file has an extra entry per line for the alias type.  This is
13623    # handled by creating a compound entry: "$alias: $type";  First, split
13624    # the line into components.
13625    my ($range, $alias, $type, @remainder)
13626        = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
13627
13628    # This file contains multiple entries for some components, so tell the
13629    # downstream code to allow this in our internal tables; the
13630    # $MULTIPLE_AFTER preserves the input ordering.
13631    $_ = join ";", $range, $CMD_DELIM
13632                           . $REPLACE_CMD
13633                           . '='
13634                           . $MULTIPLE_AFTER
13635                           . $CMD_DELIM
13636                           . "$alias: $type",
13637                   @remainder;
13638    return;
13639}
13640
13641sub filter_early_version_name_alias_line {
13642
13643    # Early versions did not have the trailing alias type field; implicitly it
13644    # was 'correction'.
13645    $_ .= "; correction";
13646
13647    filter_later_version_name_alias_line;
13648    return;
13649}
13650
13651sub filter_all_caps_script_names {
13652
13653    # Some early Unicode releases had the script names in all CAPS.  This
13654    # converts them to just the first letter of each word being capital.
13655
13656    my ($range, $script, @remainder)
13657        = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
13658    my @words = split /[_-]/, $script;
13659    for my $word (@words) {
13660        $word =
13661            ucfirst(lc($word)) if $word ne 'CJK';
13662    }
13663    $script = join "_", @words;
13664    $_ = join ";", $range, $script, @remainder;
13665}
13666
13667sub finish_Unicode() {
13668    # This routine should be called after all the Unicode files have been read
13669    # in.  It:
13670    # 1) Creates properties that are missing from the version of Unicode being
13671    #    compiled, and which, for whatever reason, are needed for the Perl
13672    #    core to function properly.  These are minimally populated as
13673    #    necessary.
13674    # 2) Adds the mappings for code points missing from the files which have
13675    #    defaults specified for them.
13676    # 3) At this this point all mappings are known, so it computes the type of
13677    #    each property whose type hasn't been determined yet.
13678    # 4) Calculates all the regular expression match tables based on the
13679    #    mappings.
13680    # 5) Calculates and adds the tables which are defined by Unicode, but
13681    #    which aren't derived by them, and certain derived tables that Perl
13682    #    uses.
13683
13684    # Folding information was introduced later into Unicode data.  To get
13685    # Perl's case ignore (/i) to work at all in releases that don't have
13686    # folding, use the best available alternative, which is lower casing.
13687    my $fold = property_ref('Case_Folding');
13688    if ($fold->is_empty) {
13689        $fold->initialize(property_ref('Lowercase_Mapping'));
13690        $fold->add_note(join_lines(<<END
13691WARNING: This table uses lower case as a substitute for missing fold
13692information
13693END
13694        ));
13695    }
13696
13697    # Multiple-character mapping was introduced later into Unicode data, so it
13698    # is by default the simple version.  If to output the simple versions and
13699    # not present, just use the regular (which in these Unicode versions is
13700    # the simple as well).
13701    foreach my $map (qw {   Uppercase_Mapping
13702                            Lowercase_Mapping
13703                            Titlecase_Mapping
13704                            Case_Folding
13705                        } )
13706    {
13707        my $comment = <<END;
13708
13709Note that although the Perl core uses this file, it has the standard values
13710for code points from U+0000 to U+00FF compiled in, so changing this table will
13711not change the core's behavior with respect to these code points.  Use
13712Unicode::Casing to override this table.
13713END
13714        if ($map eq 'Case_Folding') {
13715            $comment .= <<END;
13716(/i regex matching is not overridable except by using a custom regex engine)
13717END
13718        }
13719        property_ref($map)->add_comment(join_lines($comment));
13720        my $simple = property_ref("Simple_$map");
13721        next if ! $simple->is_empty;
13722        if ($simple->to_output_map) {
13723            $simple->initialize(property_ref($map));
13724        }
13725        else {
13726            property_ref($map)->set_proxy_for($simple->name);
13727        }
13728    }
13729
13730    # For each property, fill in any missing mappings, and calculate the re
13731    # match tables.  If a property has more than one missing mapping, the
13732    # default is a reference to a data structure, and may require data from
13733    # other properties to resolve.  The sort is used to cause these to be
13734    # processed last, after all the other properties have been calculated.
13735    # (Fortunately, the missing properties so far don't depend on each other.)
13736    foreach my $property
13737        (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
13738        property_ref('*'))
13739    {
13740        # $perl has been defined, but isn't one of the Unicode properties that
13741        # need to be finished up.
13742        next if $property == $perl;
13743
13744        # Nor do we need to do anything with properties that aren't going to
13745        # be output.
13746        next if $property->fate == $SUPPRESSED;
13747
13748        # Handle the properties that have more than one possible default
13749        if (ref $property->default_map) {
13750            my $default_map = $property->default_map;
13751
13752            # These properties have stored in the default_map:
13753            # One or more of:
13754            #   1)  A default map which applies to all code points in a
13755            #       certain class
13756            #   2)  an expression which will evaluate to the list of code
13757            #       points in that class
13758            # And
13759            #   3) the default map which applies to every other missing code
13760            #      point.
13761            #
13762            # Go through each list.
13763            while (my ($default, $eval) = $default_map->get_next_defaults) {
13764
13765                # Get the class list, and intersect it with all the so-far
13766                # unspecified code points yielding all the code points
13767                # in the class that haven't been specified.
13768                my $list = eval $eval;
13769                if ($@) {
13770                    Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
13771                    last;
13772                }
13773
13774                # Narrow down the list to just those code points we don't have
13775                # maps for yet.
13776                $list = $list & $property->inverse_list;
13777
13778                # Add mappings to the property for each code point in the list
13779                foreach my $range ($list->ranges) {
13780                    $property->add_map($range->start, $range->end, $default,
13781                    Replace => $CROAK);
13782                }
13783            }
13784
13785            # All remaining code points have the other mapping.  Set that up
13786            # so the normal single-default mapping code will work on them
13787            $property->set_default_map($default_map->other_default);
13788
13789            # And fall through to do that
13790        }
13791
13792        # We should have enough data now to compute the type of the property.
13793        my $property_name = $property->name;
13794        $property->compute_type;
13795        my $property_type = $property->type;
13796
13797        next if ! $property->to_create_match_tables;
13798
13799        # Here want to create match tables for this property
13800
13801        # The Unicode db always (so far, and they claim into the future) have
13802        # the default for missing entries in binary properties be 'N' (unless
13803        # there is a '@missing' line that specifies otherwise)
13804        if (! defined $property->default_map) {
13805            if ($property_type == $BINARY) {
13806                $property->set_default_map('N');
13807            }
13808            elsif ($property_type == $ENUM) {
13809                Carp::my_carp("Property '$property_name doesn't have a default mapping.  Using a fake one");
13810                $property->set_default_map('XXX This makes sure there is a default map');
13811            }
13812        }
13813
13814        # Add any remaining code points to the mapping, using the default for
13815        # missing code points.
13816        my $default_table;
13817        my $default_map = $property->default_map;
13818        if ($property_type == $FORCED_BINARY) {
13819
13820            # A forced binary property creates a 'Y' table that matches all
13821            # non-default values.  The actual string values are also written out
13822            # as a map table.  (The default value will almost certainly be the
13823            # empty string, so the pod glosses over the distinction, and just
13824            # talks about empty vs non-empty.)
13825            my $yes = $property->table("Y");
13826            foreach my $range ($property->ranges) {
13827                next if $range->value eq $default_map;
13828                $yes->add_range($range->start, $range->end);
13829            }
13830            $property->table("N")->set_complement($yes);
13831        }
13832        else {
13833            if (defined $default_map) {
13834
13835                # Make sure there is a match table for the default
13836                if (! defined ($default_table = $property->table($default_map)))
13837                {
13838                    $default_table = $property->add_match_table($default_map);
13839                }
13840
13841                # And, if the property is binary, the default table will just
13842                # be the complement of the other table.
13843                if ($property_type == $BINARY) {
13844                    my $non_default_table;
13845
13846                    # Find the non-default table.
13847                    for my $table ($property->tables) {
13848                        if ($table == $default_table) {
13849                            if ($v_version le v5.0.0) {
13850                                $table->add_alias($_) for qw(N No F False);
13851                            }
13852                            next;
13853                        } elsif ($v_version le v5.0.0) {
13854                            $table->add_alias($_) for qw(Y Yes T True);
13855                        }
13856                        $non_default_table = $table;
13857                    }
13858                    $default_table->set_complement($non_default_table);
13859                }
13860                else {
13861
13862                    # This fills in any missing values with the default.  It's
13863                    # not necessary to do this with binary properties, as the
13864                    # default is defined completely in terms of the Y table.
13865                    $property->add_map(0, $MAX_WORKING_CODEPOINT,
13866                                    $default_map, Replace => $NO);
13867                }
13868            }
13869
13870            # Have all we need to populate the match tables.
13871            my $maps_should_be_defined = $property->pre_declared_maps;
13872            foreach my $range ($property->ranges) {
13873                my $map = $range->value;
13874                my $table = $property->table($map);
13875                if (! defined $table) {
13876
13877                    # Integral and rational property values are not
13878                    # necessarily defined in PropValueAliases, but whether all
13879                    # the other ones should be depends on the property.
13880                    if ($maps_should_be_defined
13881                        && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
13882                    {
13883                        Carp::my_carp("Table '$property_name=$map' should "
13884                                    . "have been defined.  Defining it now.")
13885                    }
13886                    $table = $property->add_match_table($map);
13887                }
13888
13889                next if $table->complement != 0; # Don't need to populate these
13890                $table->add_range($range->start, $range->end);
13891            }
13892        }
13893
13894        # For Perl 5.6 compatibility, all properties matchable in regexes can
13895        # have an optional 'Is_' prefix.  This is now done in utf8_heavy.pl.
13896        # But warn if this creates a conflict with a (new) Unicode property
13897        # name, although it appears that Unicode has made a decision never to
13898        # begin a property name with 'Is_', so this shouldn't happen.
13899        foreach my $alias ($property->aliases) {
13900            my $Is_name = 'Is_' . $alias->name;
13901            if (defined (my $pre_existing = property_ref($Is_name))) {
13902                Carp::my_carp(<<END
13903There is already an alias named $Is_name (from " . $pre_existing . "), so
13904creating one for $property won't work.  This is bad news.  If it is not too
13905late, get Unicode to back off.  Otherwise go back to the old scheme (findable
13906from the git blame log for this area of the code that suppressed individual
13907aliases that conflict with the new Unicode names.  Proceeding anyway.
13908END
13909                );
13910            }
13911        } # End of loop through aliases for this property
13912    } # End of loop through all Unicode properties.
13913
13914    # Fill in the mappings that Unicode doesn't completely furnish.  First the
13915    # single letter major general categories.  If Unicode were to start
13916    # delivering the values, this would be redundant, but better that than to
13917    # try to figure out if should skip and not get it right.  Ths could happen
13918    # if a new major category were to be introduced, and the hard-coded test
13919    # wouldn't know about it.
13920    # This routine depends on the standard names for the general categories
13921    # being what it thinks they are, like 'Cn'.  The major categories are the
13922    # union of all the general category tables which have the same first
13923    # letters. eg. L = Lu + Lt + Ll + Lo + Lm
13924    foreach my $minor_table ($gc->tables) {
13925        my $minor_name = $minor_table->name;
13926        next if length $minor_name == 1;
13927        if (length $minor_name != 2) {
13928            Carp::my_carp_bug("Unexpected general category '$minor_name'.  Skipped.");
13929            next;
13930        }
13931
13932        my $major_name = uc(substr($minor_name, 0, 1));
13933        my $major_table = $gc->table($major_name);
13934        $major_table += $minor_table;
13935    }
13936
13937    # LC is Ll, Lu, and Lt.  (used to be L& or L_, but PropValueAliases.txt
13938    # defines it as LC)
13939    my $LC = $gc->table('LC');
13940    $LC->add_alias('L_', Status => $DISCOURAGED);   # For backwards...
13941    $LC->add_alias('L&', Status => $DISCOURAGED);   # compatibility.
13942
13943
13944    if ($LC->is_empty) { # Assume if not empty that Unicode has started to
13945                         # deliver the correct values in it
13946        $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
13947
13948        # Lt not in release 1.
13949        if (defined $gc->table('Lt')) {
13950            $LC += $gc->table('Lt');
13951            $gc->table('Lt')->set_caseless_equivalent($LC);
13952        }
13953    }
13954    $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
13955
13956    $gc->table('Ll')->set_caseless_equivalent($LC);
13957    $gc->table('Lu')->set_caseless_equivalent($LC);
13958
13959    # Create digit and case fold tables with the original file names for
13960    # backwards compatibility with applications that read them directly.
13961    my $Digit = Property->new("Legacy_Perl_Decimal_Digit",
13962                              Default_Map => "",
13963                              File => 'Digit',    # Trad. location
13964                              Directory => $map_directory,
13965                              Type => $STRING,
13966                              Replacement_Property => "Perl_Decimal_Digit",
13967                              Initialize => property_ref('Perl_Decimal_Digit'),
13968                            );
13969    $Digit->add_comment(join_lines(<<END
13970This file gives the mapping of all code points which represent a single
13971decimal digit [0-9] to their respective digits.  For example, the code point
13972U+0031 (an ASCII '1') is mapped to a numeric 1.  These code points are those
13973that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
13974numerals.
13975END
13976    ));
13977
13978    # Make sure this assumption in perl core code is valid in this Unicode
13979    # release, with known exceptions
13980    foreach my $range (property_ref('Numeric-Type')->table('Decimal')->ranges) {
13981        next if $range->end - $range->start == 9;
13982        next if $range->start == 0x1D7CE;   # This whole range was added in 3.1
13983        next if $range->end == 0x19DA && $v_version eq v5.2.0;
13984        next if $range->end - $range->start < 9 && $v_version le 4.0.0;
13985        Carp::my_carp("Range $range unexpectedly doesn't contain 10"
13986                    . " decimal digits.  Code in regcomp.c assumes it does,"
13987                    . " and will have to be fixed.  Proceeding anyway.");
13988    }
13989
13990    Property->new('Legacy_Case_Folding',
13991                    File => "Fold",
13992                    Directory => $map_directory,
13993                    Default_Map => $CODE_POINT,
13994                    Type => $STRING,
13995                    Replacement_Property => "Case_Folding",
13996                    Format => $HEX_FORMAT,
13997                    Initialize => property_ref('cf'),
13998    );
13999
14000    # The Script_Extensions property started out as a clone of the Script
14001    # property.  But processing its data file caused some elements to be
14002    # replaced with different data.  (These elements were for the Common and
14003    # Inherited properties.)  This data is a qw() list of all the scripts that
14004    # the code points in the given range are in.  An example line is:
14005    # 060C          ; Arab Syrc Thaa # Po       ARABIC COMMA
14006    #
14007    # The code above has created a new match table named "Arab Syrc Thaa"
14008    # which contains 060C.  (The cloned table started out with this code point
14009    # mapping to "Common".)  Now we add 060C to each of the Arab, Syrc, and
14010    # Thaa match tables.  Then we delete the now spurious "Arab Syrc Thaa"
14011    # match table.  This is repeated for all these tables and ranges.  The map
14012    # data is retained in the map table for reference, but the spurious match
14013    # tables are deleted.
14014
14015    if (defined $scx) {
14016        foreach my $table ($scx->tables) {
14017            next unless $table->name =~ /\s/;   # All the new and only the new
14018                                                # tables have a space in their
14019                                                # names
14020            my @scripts = split /\s+/, $table->name;
14021            foreach my $script (@scripts) {
14022                my $script_table = $scx->table($script);
14023                $script_table += $table;
14024            }
14025            $scx->delete_match_table($table);
14026        }
14027
14028        # Mark the scx table as the parent of the corresponding sc table for
14029        # those which are identical.  This causes the pod for the script table
14030        # to refer to the corresponding scx one.
14031        #
14032        # This has to be in a separate loop from above, so as to wait until
14033        # the tables are stabilized before checking for equivalency.
14034        if (defined $pod_directory) {
14035            foreach my $table ($scx->tables) {
14036                my $plain_sc_equiv = $script->table($table->name);
14037                if ($table->matches_identically_to($plain_sc_equiv)) {
14038                    $plain_sc_equiv->set_equivalent_to($table, Related => 1);
14039                }
14040            }
14041        }
14042    }
14043
14044    return;
14045}
14046
14047sub pre_3_dot_1_Nl () {
14048
14049    # Return a range list for gc=nl for Unicode versions prior to 3.1, which
14050    # is when Unicode's became fully usable.  These code points were
14051    # determined by inspection and experimentation.  gc=nl is important for
14052    # certain Perl-extension properties that should be available in all
14053    # releases.
14054
14055    my $Nl = Range_List->new();
14056    if (defined (my $official = $gc->table('Nl'))) {
14057        $Nl += $official;
14058    }
14059    else {
14060        $Nl->add_range(0x2160, 0x2182);
14061        $Nl->add_range(0x3007, 0x3007);
14062        $Nl->add_range(0x3021, 0x3029);
14063    }
14064    $Nl->add_range(0xFE20, 0xFE23);
14065    $Nl->add_range(0x16EE, 0x16F0) if $v_version ge v3.0.0; # 3.0 was when
14066                                                            # these were added
14067    return $Nl;
14068}
14069
14070sub calculate_Assigned() {  # Set $Assigned to the gc != Cn code points; may be
14071                            # called before the Cn's are completely filled.
14072                            # Works on Unicodes earlier than ones that
14073                            # explicitly specify Cn.
14074    return if defined $Assigned;
14075
14076    if (! defined $gc || $gc->is_empty()) {
14077        Carp::my_carp_bug("calculate_Assigned() called before $gc is populated");
14078    }
14079
14080    $Assigned = $perl->add_match_table('Assigned',
14081                                Description  => "All assigned code points",
14082                                );
14083    while (defined (my $range = $gc->each_range())) {
14084        my $standard_value = standardize($range->value);
14085        next if $standard_value eq 'cn' || $standard_value eq 'unassigned';
14086        $Assigned->add_range($range->start, $range->end);
14087    }
14088}
14089
14090sub calculate_DI() {    # Set $DI to a Range_List equivalent to the
14091                        # Default_Ignorable_Code_Point property.  Works on
14092                        # Unicodes earlier than ones that explicitly specify
14093                        # DI.
14094    return if defined $DI;
14095
14096    if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) {
14097        $DI = $di->table('Y');
14098    }
14099    else {
14100        $DI = Range_List->new(Initialize => [ 0x180B .. 0x180D,
14101                                              0x2060 .. 0x206F,
14102                                              0xFE00 .. 0xFE0F,
14103                                              0xFFF0 .. 0xFFFB,
14104                                            ]);
14105        if ($v_version ge v2.0) {
14106            $DI += $gc->table('Cf')
14107                +  $gc->table('Cs');
14108
14109            # These are above the Unicode version 1 max
14110            $DI->add_range(0xE0000, 0xE0FFF);
14111        }
14112        $DI += $gc->table('Cc')
14113             - ord("\t")
14114             - utf8::unicode_to_native(0x0A)  # LINE FEED
14115             - utf8::unicode_to_native(0x0B)  # VERTICAL TAB
14116             - ord("\f")
14117             - utf8::unicode_to_native(0x0D)  # CARRIAGE RETURN
14118             - utf8::unicode_to_native(0x85); # NEL
14119    }
14120}
14121
14122sub calculate_NChar() {  # Create a Perl extension match table which is the
14123                         # same as the Noncharacter_Code_Point property, and
14124                         # set $NChar to point to it.  Works on Unicodes
14125                         # earlier than ones that explicitly specify NChar
14126    return if defined $NChar;
14127
14128    $NChar = $perl->add_match_table('_Perl_Nchar',
14129                                    Perl_Extension => 1,
14130                                    Fate => $INTERNAL_ONLY);
14131    if (defined (my $off_nchar = property_ref('NChar'))) {
14132        $NChar->initialize($off_nchar->table('Y'));
14133    }
14134    else {
14135        $NChar->initialize([ 0xFFFE .. 0xFFFF ]);
14136        if ($v_version ge v2.0) {   # First release with these nchars
14137            for (my $i = 0x1FFFE; $i <= 0x10FFFE; $i += 0x10000) {
14138                $NChar += [ $i .. $i+1 ];
14139            }
14140        }
14141    }
14142}
14143
14144sub handle_compare_versions () {
14145    # This fixes things up for the $compare_versions capability, where we
14146    # compare Unicode version X with version Y (with Y > X), and we are
14147    # running it on the Unicode Data for version Y.
14148    #
14149    # It works by calculating the code points whose meaning has been specified
14150    # after release X, by using the Age property.  The complement of this set
14151    # is the set of code points whose meaning is unchanged between the
14152    # releases.  This is the set the program restricts itself to.  It includes
14153    # everything whose meaning has been specified by the time version X came
14154    # along, plus those still unassigned by the time of version Y.  (We will
14155    # continue to use the word 'assigned' to mean 'meaning has been
14156    # specified', as it's shorter and is accurate in all cases except the
14157    # Noncharacter code points.)
14158    #
14159    # This function is run after all the properties specified by Unicode have
14160    # been calculated for release Y.  This makes sure we get all the nuances
14161    # of Y's rules.  (It is done before the Perl extensions are calculated, as
14162    # those are based entirely on the Unicode ones.)  But doing it after the
14163    # Unicode table calculations means we have to fix up the Unicode tables.
14164    # We do this by subtracting the code points that have been assigned since
14165    # X (which is actually done by ANDing each table of assigned code points
14166    # with the set of unchanged code points).  Most Unicode properties are of
14167    # the form such that all unassigned code points have a default, grab-bag,
14168    # property value which is changed when the code point gets assigned.  For
14169    # these, we just remove the changed code points from the table for the
14170    # latter property value, and add them back in to the grab-bag one.  A few
14171    # other properties are not entirely of this form and have values for some
14172    # or all unassigned code points that are not the grab-bag one.  These have
14173    # to be handled specially, and are hard-coded in to this routine based on
14174    # manual inspection of the Unicode character database.  A list of the
14175    # outlier code points is made for each of these properties, and those
14176    # outliers are excluded from adding and removing from tables.
14177    #
14178    # Note that there are glitches when comparing against Unicode 1.1, as some
14179    # Hangul syllables in it were later ripped out and eventually replaced
14180    # with other things.
14181
14182    print "Fixing up for version comparison\n" if $verbosity >= $PROGRESS;
14183
14184    my $after_first_version = "All matching code points were added after "
14185                            . "Unicode $string_compare_versions";
14186
14187    # Calculate the delta as those code points that have been newly assigned
14188    # since the first compare version.
14189    my $delta = Range_List->new();
14190    foreach my $table ($age->tables) {
14191        use version;
14192        next if $table == $age->table('Unassigned');
14193        next if version->parse($table->name)
14194             le version->parse($string_compare_versions);
14195        $delta += $table;
14196    }
14197    if ($delta->is_empty) {
14198        die ("No changes; perhaps you need a 'DAge.txt' file?");
14199    }
14200
14201    my $unchanged = ~ $delta;
14202
14203    calculate_Assigned() if ! defined $Assigned;
14204    $Assigned &= $unchanged;
14205
14206    # $Assigned now contains the code points that were assigned as of Unicode
14207    # version X.
14208
14209    # A block is all or nothing.  If nothing is assigned in it, it all goes
14210    # back to the No_Block pool; but if even one code point is assigned, the
14211    # block is retained.
14212    my $no_block = $block->table('No_Block');
14213    foreach my $this_block ($block->tables) {
14214        next if     $this_block == $no_block
14215                ||  ! ($this_block & $Assigned)->is_empty;
14216        $this_block->set_fate($SUPPRESSED, $after_first_version);
14217        foreach my $range ($this_block->ranges) {
14218            $block->replace_map($range->start, $range->end, 'No_Block')
14219        }
14220        $no_block += $this_block;
14221    }
14222
14223    my @special_delta_properties;   # List of properties that have to be
14224                                    # handled specially.
14225    my %restricted_delta;           # Keys are the entries in
14226                                    # @special_delta_properties;  values
14227                                    # are the range list of the code points
14228                                    # that behave normally when they get
14229                                    # assigned.
14230
14231    # In the next three properties, the Default Ignorable code points are
14232    # outliers.
14233    calculate_DI();
14234    $DI &= $unchanged;
14235
14236    push @special_delta_properties, property_ref('_Perl_GCB');
14237    $restricted_delta{$special_delta_properties[-1]} = ~ $DI;
14238
14239    if (defined (my $cwnfkcc = property_ref('Changes_When_NFKC_Casefolded')))
14240    {
14241        push @special_delta_properties, $cwnfkcc;
14242        $restricted_delta{$special_delta_properties[-1]} = ~ $DI;
14243    }
14244
14245    calculate_NChar();      # Non-character code points
14246    $NChar &= $unchanged;
14247
14248    # This may have to be updated from time-to-time to get the most accurate
14249    # results.
14250    my $default_BC_non_LtoR = Range_List->new(Initialize =>
14251                        # These came from the comments in v8.0 DBidiClass.txt
14252                                                        [ # AL
14253                                                            0x0600 .. 0x07BF,
14254                                                            0x08A0 .. 0x08FF,
14255                                                            0xFB50 .. 0xFDCF,
14256                                                            0xFDF0 .. 0xFDFF,
14257                                                            0xFE70 .. 0xFEFF,
14258                                                            0x1EE00 .. 0x1EEFF,
14259                                                           # R
14260                                                            0x0590 .. 0x05FF,
14261                                                            0x07C0 .. 0x089F,
14262                                                            0xFB1D .. 0xFB4F,
14263                                                            0x10800 .. 0x10FFF,
14264                                                            0x1E800 .. 0x1EDFF,
14265                                                            0x1EF00 .. 0x1EFFF,
14266                                                           # ET
14267                                                            0x20A0 .. 0x20CF,
14268                                                         ]
14269                                          );
14270    $default_BC_non_LtoR += $DI + $NChar;
14271    push @special_delta_properties, property_ref('BidiClass');
14272    $restricted_delta{$special_delta_properties[-1]} = ~ $default_BC_non_LtoR;
14273
14274    if (defined (my $eaw = property_ref('East_Asian_Width'))) {
14275
14276        my $default_EA_width_W = Range_List->new(Initialize =>
14277                                    # From comments in v8.0 EastAsianWidth.txt
14278                                                [
14279                                                    0x3400 .. 0x4DBF,
14280                                                    0x4E00 .. 0x9FFF,
14281                                                    0xF900 .. 0xFAFF,
14282                                                    0x20000 .. 0x2A6DF,
14283                                                    0x2A700 .. 0x2B73F,
14284                                                    0x2B740 .. 0x2B81F,
14285                                                    0x2B820 .. 0x2CEAF,
14286                                                    0x2F800 .. 0x2FA1F,
14287                                                    0x20000 .. 0x2FFFD,
14288                                                    0x30000 .. 0x3FFFD,
14289                                                ]
14290                                             );
14291        push @special_delta_properties, $eaw;
14292        $restricted_delta{$special_delta_properties[-1]}
14293                                                       = ~ $default_EA_width_W;
14294
14295        # Line break came along in the same release as East_Asian_Width, and
14296        # the non-grab-bag default set is a superset of the EAW one.
14297        if (defined (my $lb = property_ref('Line_Break'))) {
14298            my $default_LB_non_XX = Range_List->new(Initialize =>
14299                                        # From comments in v8.0 LineBreak.txt
14300                                                        [ 0x20A0 .. 0x20CF ]);
14301            $default_LB_non_XX += $default_EA_width_W;
14302            push @special_delta_properties, $lb;
14303            $restricted_delta{$special_delta_properties[-1]}
14304                                                        = ~ $default_LB_non_XX;
14305        }
14306    }
14307
14308    # Go through every property, skipping those we've already worked on, those
14309    # that are immutable, and the perl ones that will be calculated after this
14310    # routine has done its fixup.
14311    foreach my $property (property_ref('*')) {
14312        next if    $property == $perl     # Done later in the program
14313                || $property == $block    # Done just above
14314                || $property == $DI       # Done just above
14315                || $property == $NChar    # Done just above
14316
14317                   # The next two are invariant across Unicode versions
14318                || $property == property_ref('Pattern_Syntax')
14319                || $property == property_ref('Pattern_White_Space');
14320
14321        #  Find the grab-bag value.
14322        my $default_map = $property->default_map;
14323
14324        if (! $property->to_create_match_tables) {
14325
14326            # Here there aren't any match tables.  So far, all such properties
14327            # have a default map, and don't require special handling.  Just
14328            # change each newly assigned code point back to the default map,
14329            # as if they were unassigned.
14330            foreach my $range ($delta->ranges) {
14331                $property->add_map($range->start,
14332                                $range->end,
14333                                $default_map,
14334                                Replace => $UNCONDITIONALLY);
14335            }
14336        }
14337        else {  # Here there are match tables.  Find the one (if any) for the
14338                # grab-bag value that unassigned code points go to.
14339            my $default_table;
14340            if (defined $default_map) {
14341                $default_table = $property->table($default_map);
14342            }
14343
14344            # If some code points don't go back to the the grab-bag when they
14345            # are considered unassigned, exclude them from the list that does
14346            # that.
14347            my $this_delta = $delta;
14348            my $this_unchanged = $unchanged;
14349            if (grep { $_ == $property } @special_delta_properties) {
14350                $this_delta = $delta & $restricted_delta{$property};
14351                $this_unchanged = ~ $this_delta;
14352            }
14353
14354            # Fix up each match table for this property.
14355            foreach my $table ($property->tables) {
14356                if (defined $default_table && $table == $default_table) {
14357
14358                    # The code points assigned after release X (the ones we
14359                    # are excluding in this routine) go back on to the default
14360                    # (grab-bag) table.  However, some of these tables don't
14361                    # actually exist, but are specified solely by the other
14362                    # tables.  (In a binary property, we don't need to
14363                    # actually have an 'N' table, as it's just the complement
14364                    # of the 'Y' table.)  Such tables will be locked, so just
14365                    # skip those.
14366                    $table += $this_delta unless $table->locked;
14367                }
14368                else {
14369
14370                    # Here the table is not for the default value.  We need to
14371                    # subtract the code points we are ignoring for this
14372                    # comparison (the deltas) from it.  But if the table
14373                    # started out with nothing, no need to exclude anything,
14374                    # and want to skip it here anyway, so it gets listed
14375                    # properly in the pod.
14376                    next if $table->is_empty;
14377
14378                    # Save the deltas for later, before we do the subtraction
14379                    my $deltas = $table & $this_delta;
14380
14381                    $table &= $this_unchanged;
14382
14383                    # Suppress the table if the subtraction left it with
14384                    # nothing in it
14385                    if ($table->is_empty) {
14386                        if ($property->type == $BINARY) {
14387                            push @tables_that_may_be_empty, $table->complete_name;
14388                        }
14389                        else {
14390                            $table->set_fate($SUPPRESSED, $after_first_version);
14391                        }
14392                    }
14393
14394                    # Now we add the removed code points to the property's
14395                    # map, as they should now map to the grab-bag default
14396                    # property (which they did in the first comparison
14397                    # version).  But we don't have to do this if the map is
14398                    # only for internal use.
14399                    if (defined $default_map && $property->to_output_map) {
14400
14401                        # The gc property has pseudo property values whose names
14402                        # have length 1.  These are the union of all the
14403                        # property values whose name is longer than 1 and
14404                        # whose first letter is all the same.  The replacement
14405                        # is done once for the longer-named tables.
14406                        next if $property == $gc && length $table->name == 1;
14407
14408                        foreach my $range ($deltas->ranges) {
14409                            $property->add_map($range->start,
14410                                            $range->end,
14411                                            $default_map,
14412                                            Replace => $UNCONDITIONALLY);
14413                        }
14414                    }
14415                }
14416            }
14417        }
14418    }
14419
14420    # The above code doesn't work on 'gc=C', as it is a superset of the default
14421    # ('Cn') table.  It's easiest to just special case it here.
14422    my $C = $gc->table('C');
14423    $C += $gc->table('Cn');
14424
14425    return;
14426}
14427
14428sub compile_perl() {
14429    # Create perl-defined tables.  Almost all are part of the pseudo-property
14430    # named 'perl' internally to this program.  Many of these are recommended
14431    # in UTS#18 "Unicode Regular Expressions", and their derivations are based
14432    # on those found there.
14433    # Almost all of these are equivalent to some Unicode property.
14434    # A number of these properties have equivalents restricted to the ASCII
14435    # range, with their names prefaced by 'Posix', to signify that these match
14436    # what the Posix standard says they should match.  A couple are
14437    # effectively this, but the name doesn't have 'Posix' in it because there
14438    # just isn't any Posix equivalent.  'XPosix' are the Posix tables extended
14439    # to the full Unicode range, by our guesses as to what is appropriate.
14440
14441    # 'All' is all code points.  As an error check, instead of just setting it
14442    # to be that, construct it to be the union of all the major categories
14443    $All = $perl->add_match_table('All',
14444      Description
14445        => "All code points, including those above Unicode.  Same as qr/./s",
14446      Matches_All => 1);
14447
14448    foreach my $major_table ($gc->tables) {
14449
14450        # Major categories are the ones with single letter names.
14451        next if length($major_table->name) != 1;
14452
14453        $All += $major_table;
14454    }
14455
14456    if ($All->max != $MAX_WORKING_CODEPOINT) {
14457        Carp::my_carp_bug("Generated highest code point ("
14458           . sprintf("%X", $All->max)
14459           . ") doesn't match expected value $MAX_WORKING_CODEPOINT_STRING.")
14460    }
14461    if ($All->range_count != 1 || $All->min != 0) {
14462     Carp::my_carp_bug("Generated table 'All' doesn't match all code points.")
14463    }
14464
14465    my $Any = $perl->add_match_table('Any',
14466                                    Description  => "All Unicode code points");
14467    $Any->add_range(0, $MAX_UNICODE_CODEPOINT);
14468    $Any->add_alias('Unicode');
14469
14470    calculate_Assigned();
14471
14472    # Our internal-only property should be treated as more than just a
14473    # synonym; grandfather it in to the pod.
14474    $perl->add_match_table('_CombAbove', Re_Pod_Entry => 1,
14475                            Fate => $INTERNAL_ONLY, Status => $DISCOURAGED)
14476            ->set_equivalent_to(property_ref('ccc')->table('Above'),
14477                                                                Related => 1);
14478
14479    my $ASCII = $perl->add_match_table('ASCII');
14480    if (defined $block) {   # This is equivalent to the block if have it.
14481        my $Unicode_ASCII = $block->table('Basic_Latin');
14482        if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
14483            $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
14484        }
14485    }
14486
14487    # Very early releases didn't have blocks, so initialize ASCII ourselves if
14488    # necessary
14489    if ($ASCII->is_empty) {
14490        if (! NON_ASCII_PLATFORM) {
14491            $ASCII->add_range(0, 127);
14492        }
14493        else {
14494            for my $i (0 .. 127) {
14495                $ASCII->add_range(utf8::unicode_to_native($i),
14496                                  utf8::unicode_to_native($i));
14497            }
14498        }
14499    }
14500
14501    # Get the best available case definitions.  Early Unicode versions didn't
14502    # have Uppercase and Lowercase defined, so use the general category
14503    # instead for them, modified by hard-coding in the code points each is
14504    # missing.
14505    my $Lower = $perl->add_match_table('XPosixLower');
14506    my $Unicode_Lower = property_ref('Lowercase');
14507    if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
14508        $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
14509
14510    }
14511    else {
14512        $Lower += $gc->table('Lowercase_Letter');
14513
14514        # There are quite a few code points in Lower, that aren't in gc=lc,
14515        # and not all are in all releases.
14516        my $temp = Range_List->new(Initialize => [
14517                                                utf8::unicode_to_native(0xAA),
14518                                                utf8::unicode_to_native(0xBA),
14519                                                0x02B0 .. 0x02B8,
14520                                                0x02C0 .. 0x02C1,
14521                                                0x02E0 .. 0x02E4,
14522                                                0x0345,
14523                                                0x037A,
14524                                                0x1D2C .. 0x1D6A,
14525                                                0x1D78,
14526                                                0x1D9B .. 0x1DBF,
14527                                                0x2071,
14528                                                0x207F,
14529                                                0x2090 .. 0x209C,
14530                                                0x2170 .. 0x217F,
14531                                                0x24D0 .. 0x24E9,
14532                                                0x2C7C .. 0x2C7D,
14533                                                0xA770,
14534                                                0xA7F8 .. 0xA7F9,
14535                                ]);
14536        $Lower += $temp & $Assigned;
14537    }
14538    my $Posix_Lower = $perl->add_match_table("PosixLower",
14539                            Initialize => $Lower & $ASCII,
14540                            );
14541
14542    my $Upper = $perl->add_match_table("XPosixUpper");
14543    my $Unicode_Upper = property_ref('Uppercase');
14544    if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
14545        $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
14546    }
14547    else {
14548
14549        # Unlike Lower, there are only two ranges in Upper that aren't in
14550        # gc=Lu, and all code points were assigned in all releases.
14551        $Upper += $gc->table('Uppercase_Letter');
14552        $Upper->add_range(0x2160, 0x216F);  # Uppercase Roman numerals
14553        $Upper->add_range(0x24B6, 0x24CF);  # Circled Latin upper case letters
14554    }
14555    my $Posix_Upper = $perl->add_match_table("PosixUpper",
14556                            Initialize => $Upper & $ASCII,
14557                            );
14558
14559    # Earliest releases didn't have title case.  Initialize it to empty if not
14560    # otherwise present
14561    my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
14562                                       Description => '(= \p{Gc=Lt})');
14563    my $lt = $gc->table('Lt');
14564
14565    # Earlier versions of mktables had this related to $lt since they have
14566    # identical code points, but their caseless equivalents are not the same,
14567    # one being 'Cased' and the other being 'LC', and so now must be kept as
14568    # separate entities.
14569    if (defined $lt) {
14570        $Title += $lt;
14571    }
14572    else {
14573        push @tables_that_may_be_empty, $Title->complete_name;
14574    }
14575
14576    my $Unicode_Cased = property_ref('Cased');
14577    if (defined $Unicode_Cased) {
14578        my $yes = $Unicode_Cased->table('Y');
14579        my $no = $Unicode_Cased->table('N');
14580        $Title->set_caseless_equivalent($yes);
14581        if (defined $Unicode_Upper) {
14582            $Unicode_Upper->table('Y')->set_caseless_equivalent($yes);
14583            $Unicode_Upper->table('N')->set_caseless_equivalent($no);
14584        }
14585        $Upper->set_caseless_equivalent($yes);
14586        if (defined $Unicode_Lower) {
14587            $Unicode_Lower->table('Y')->set_caseless_equivalent($yes);
14588            $Unicode_Lower->table('N')->set_caseless_equivalent($no);
14589        }
14590        $Lower->set_caseless_equivalent($yes);
14591    }
14592    else {
14593        # If this Unicode version doesn't have Cased, set up the Perl
14594        # extension from first principles.  From Unicode 5.1: Definition D120:
14595        # A character C is defined to be cased if and only if C has the
14596        # Lowercase or Uppercase property or has a General_Category value of
14597        # Titlecase_Letter.
14598        my $cased = $perl->add_match_table('Cased',
14599                        Initialize => $Lower + $Upper + $Title,
14600                        Description => 'Uppercase or Lowercase or Titlecase',
14601                        );
14602        # $notcased is purely for the caseless equivalents below
14603        my $notcased = $perl->add_match_table('_Not_Cased',
14604                                Initialize => ~ $cased,
14605                                Fate => $INTERNAL_ONLY,
14606                                Description => 'All not-cased code points');
14607        $Title->set_caseless_equivalent($cased);
14608        if (defined $Unicode_Upper) {
14609            $Unicode_Upper->table('Y')->set_caseless_equivalent($cased);
14610            $Unicode_Upper->table('N')->set_caseless_equivalent($notcased);
14611        }
14612        $Upper->set_caseless_equivalent($cased);
14613        if (defined $Unicode_Lower) {
14614            $Unicode_Lower->table('Y')->set_caseless_equivalent($cased);
14615            $Unicode_Lower->table('N')->set_caseless_equivalent($notcased);
14616        }
14617        $Lower->set_caseless_equivalent($cased);
14618    }
14619
14620    # Similarly, set up our own Case_Ignorable property if this Unicode
14621    # version doesn't have it.  From Unicode 5.1: Definition D121: A character
14622    # C is defined to be case-ignorable if C has the value MidLetter or the
14623    # value MidNumLet for the Word_Break property or its General_Category is
14624    # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
14625    # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
14626
14627    # Perl has long had an internal-only alias for this property; grandfather
14628    # it in to the pod, but discourage its use.
14629    my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable',
14630                                                     Re_Pod_Entry => 1,
14631                                                     Fate => $INTERNAL_ONLY,
14632                                                     Status => $DISCOURAGED);
14633    my $case_ignorable = property_ref('Case_Ignorable');
14634    if (defined $case_ignorable && ! $case_ignorable->is_empty) {
14635        $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
14636                                                                Related => 1);
14637    }
14638    else {
14639
14640        $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
14641
14642        # The following three properties are not in early releases
14643        $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
14644        $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
14645        $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
14646
14647        # For versions 4.1 - 5.0, there is no MidNumLet property, and
14648        # correspondingly the case-ignorable definition lacks that one.  For
14649        # 4.0, it appears that it was meant to be the same definition, but was
14650        # inadvertently omitted from the standard's text, so add it if the
14651        # property actually is there
14652        my $wb = property_ref('Word_Break');
14653        if (defined $wb) {
14654            my $midlet = $wb->table('MidLetter');
14655            $perl_case_ignorable += $midlet if defined $midlet;
14656            my $midnumlet = $wb->table('MidNumLet');
14657            $perl_case_ignorable += $midnumlet if defined $midnumlet;
14658        }
14659        else {
14660
14661            # In earlier versions of the standard, instead of the above two
14662            # properties , just the following characters were used:
14663            $perl_case_ignorable +=
14664                            ord("'")
14665                        +   utf8::unicode_to_native(0xAD)  # SOFT HYPHEN (SHY)
14666                        +   0x2019; # RIGHT SINGLE QUOTATION MARK
14667        }
14668    }
14669
14670    # The remaining perl defined tables are mostly based on Unicode TR 18,
14671    # "Annex C: Compatibility Properties".  All of these have two versions,
14672    # one whose name generally begins with Posix that is posix-compliant, and
14673    # one that matches Unicode characters beyond the Posix, ASCII range
14674
14675    my $Alpha = $perl->add_match_table('XPosixAlpha');
14676
14677    # Alphabetic was not present in early releases
14678    my $Alphabetic = property_ref('Alphabetic');
14679    if (defined $Alphabetic && ! $Alphabetic->is_empty) {
14680        $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
14681    }
14682    else {
14683
14684        # The Alphabetic property doesn't exist for early releases, so
14685        # generate it.  The actual definition, in 5.2 terms is:
14686        #
14687        # gc=L + gc=Nl + Other_Alphabetic
14688        #
14689        # Other_Alphabetic is also not defined in these early releases, but it
14690        # contains one gc=So range plus most of gc=Mn and gc=Mc, so we add
14691        # those last two as well, then subtract the relatively few of them that
14692        # shouldn't have been added.  (The gc=So range is the circled capital
14693        # Latin characters.  Early releases mistakenly didn't also include the
14694        # lower-case versions of these characters, and so we don't either, to
14695        # maintain consistency with those releases that first had this
14696        # property.
14697        $Alpha->initialize($gc->table('Letter')
14698                           + pre_3_dot_1_Nl()
14699                           + $gc->table('Mn')
14700                           + $gc->table('Mc')
14701                        );
14702        $Alpha->add_range(0x24D0, 0x24E9);  # gc=So
14703        foreach my $range (     [ 0x0300, 0x0344 ],
14704                                [ 0x0346, 0x034E ],
14705                                [ 0x0360, 0x0362 ],
14706                                [ 0x0483, 0x0486 ],
14707                                [ 0x0591, 0x05AF ],
14708                                [ 0x06DF, 0x06E0 ],
14709                                [ 0x06EA, 0x06EC ],
14710                                [ 0x0740, 0x074A ],
14711                                0x093C,
14712                                0x094D,
14713                                [ 0x0951, 0x0954 ],
14714                                0x09BC,
14715                                0x09CD,
14716                                0x0A3C,
14717                                0x0A4D,
14718                                0x0ABC,
14719                                0x0ACD,
14720                                0x0B3C,
14721                                0x0B4D,
14722                                0x0BCD,
14723                                0x0C4D,
14724                                0x0CCD,
14725                                0x0D4D,
14726                                0x0DCA,
14727                                [ 0x0E47, 0x0E4C ],
14728                                0x0E4E,
14729                                [ 0x0EC8, 0x0ECC ],
14730                                [ 0x0F18, 0x0F19 ],
14731                                0x0F35,
14732                                0x0F37,
14733                                0x0F39,
14734                                [ 0x0F3E, 0x0F3F ],
14735                                [ 0x0F82, 0x0F84 ],
14736                                [ 0x0F86, 0x0F87 ],
14737                                0x0FC6,
14738                                0x1037,
14739                                0x1039,
14740                                [ 0x17C9, 0x17D3 ],
14741                                [ 0x20D0, 0x20DC ],
14742                                0x20E1,
14743                                [ 0x302A, 0x302F ],
14744                                [ 0x3099, 0x309A ],
14745                                [ 0xFE20, 0xFE23 ],
14746                                [ 0x1D165, 0x1D169 ],
14747                                [ 0x1D16D, 0x1D172 ],
14748                                [ 0x1D17B, 0x1D182 ],
14749                                [ 0x1D185, 0x1D18B ],
14750                                [ 0x1D1AA, 0x1D1AD ],
14751        ) {
14752            if (ref $range) {
14753                $Alpha->delete_range($range->[0], $range->[1]);
14754            }
14755            else {
14756                $Alpha->delete_range($range, $range);
14757            }
14758        }
14759        $Alpha->add_description('Alphabetic');
14760        $Alpha->add_alias('Alphabetic');
14761    }
14762    my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
14763                            Initialize => $Alpha & $ASCII,
14764                            );
14765    $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
14766    $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
14767
14768    my $Alnum = $perl->add_match_table('Alnum', Full_Name => 'XPosixAlnum',
14769                        Description => 'Alphabetic and (decimal) Numeric',
14770                        Initialize => $Alpha + $gc->table('Decimal_Number'),
14771                        );
14772    $perl->add_match_table("PosixAlnum",
14773                            Initialize => $Alnum & $ASCII,
14774                            );
14775
14776    my $Word = $perl->add_match_table('Word', Full_Name => 'XPosixWord',
14777                                Description => '\w, including beyond ASCII;'
14778                                            . ' = \p{Alnum} + \pM + \p{Pc}'
14779                                            . ' + \p{Join_Control}',
14780                                Initialize => $Alnum + $gc->table('Mark'),
14781                                );
14782    my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
14783    if (defined $Pc) {
14784        $Word += $Pc;
14785    }
14786    else {
14787        $Word += ord('_');  # Make sure this is a $Word
14788    }
14789    my $JC = property_ref('Join_Control');  # Wasn't in release 1
14790    if (defined $JC) {
14791        $Word += $JC->table('Y');
14792    }
14793    else {
14794        $Word += 0x200C + 0x200D;
14795    }
14796
14797    # This is a Perl extension, so the name doesn't begin with Posix.
14798    my $PerlWord = $perl->add_match_table('PosixWord',
14799                    Description => '\w, restricted to ASCII',
14800                    Initialize => $Word & $ASCII,
14801                    );
14802    $PerlWord->add_alias('PerlWord');
14803
14804    my $Blank = $perl->add_match_table('Blank', Full_Name => 'XPosixBlank',
14805                                Description => '\h, Horizontal white space',
14806
14807                                # 200B is Zero Width Space which is for line
14808                                # break control, and was listed as
14809                                # Space_Separator in early releases
14810                                Initialize => $gc->table('Space_Separator')
14811                                            +   ord("\t")
14812                                            -   0x200B, # ZWSP
14813                                );
14814    $Blank->add_alias('HorizSpace');        # Another name for it.
14815    $perl->add_match_table("PosixBlank",
14816                            Initialize => $Blank & $ASCII,
14817                            );
14818
14819    my $VertSpace = $perl->add_match_table('VertSpace',
14820                            Description => '\v',
14821                            Initialize =>
14822                               $gc->table('Line_Separator')
14823                             + $gc->table('Paragraph_Separator')
14824                             + utf8::unicode_to_native(0x0A)  # LINE FEED
14825                             + utf8::unicode_to_native(0x0B)  # VERTICAL TAB
14826                             + ord("\f")
14827                             + utf8::unicode_to_native(0x0D)  # CARRIAGE RETURN
14828                             + utf8::unicode_to_native(0x85)  # NEL
14829                    );
14830    # No Posix equivalent for vertical space
14831
14832    my $Space = $perl->add_match_table('XPosixSpace',
14833                Description => '\s including beyond ASCII and vertical tab',
14834                Initialize => $Blank + $VertSpace,
14835    );
14836    $Space->add_alias('XPerlSpace');    # Pre-existing synonyms
14837    $Space->add_alias('SpacePerl');
14838    $Space->add_alias('Space') if $v_version lt v4.1.0;
14839
14840    my $Posix_space = $perl->add_match_table("PosixSpace",
14841                            Initialize => $Space & $ASCII,
14842                            );
14843    $Posix_space->add_alias('PerlSpace'); # A pre-existing synonym
14844
14845    my $Cntrl = $perl->add_match_table('Cntrl', Full_Name => 'XPosixCntrl',
14846                                        Description => 'Control characters');
14847    $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
14848    $perl->add_match_table("PosixCntrl",
14849                            Description => "ASCII control characters",
14850                            Definition =>  "ACK, BEL, BS, CAN, CR, DC1, DC2,"
14851                                         . " DC3, DC4, DEL, DLE, ENQ, EOM,"
14852                                         . " EOT, ESC, ETB, ETX, FF, FS, GS,"
14853                                         . " HT, LF, NAK, NUL, RS, SI, SO,"
14854                                         . " SOH, STX, SUB, SYN, US, VT",
14855                            Initialize => $Cntrl & $ASCII,
14856                            );
14857
14858    my $perl_surrogate = $perl->add_match_table('_Perl_Surrogate');
14859    my $Cs = $gc->table('Cs');
14860    if (defined $Cs && ! $Cs->is_empty) {
14861        $perl_surrogate += $Cs;
14862    }
14863    else {
14864        push @tables_that_may_be_empty, '_Perl_Surrogate';
14865    }
14866
14867    # $controls is a temporary used to construct Graph.
14868    my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
14869                                                + $gc->table('Control')
14870                                                + $perl_surrogate);
14871
14872    # Graph is  ~space &  ~(Cc|Cs|Cn) = ~(space + $controls)
14873    my $Graph = $perl->add_match_table('Graph', Full_Name => 'XPosixGraph',
14874                        Description => 'Characters that are graphical',
14875                        Initialize => ~ ($Space + $controls),
14876                        );
14877    $perl->add_match_table("PosixGraph",
14878                            Initialize => $Graph & $ASCII,
14879                            );
14880
14881    $print = $perl->add_match_table('Print', Full_Name => 'XPosixPrint',
14882                        Description => 'Characters that are graphical plus space characters (but no controls)',
14883                        Initialize => $Blank + $Graph - $gc->table('Control'),
14884                        );
14885    $perl->add_match_table("PosixPrint",
14886                            Initialize => $print & $ASCII,
14887                            );
14888
14889    my $Punct = $perl->add_match_table('Punct');
14890    $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
14891
14892    # \p{punct} doesn't include the symbols, which posix does
14893    my $XPosixPunct = $perl->add_match_table('XPosixPunct',
14894                    Description => '\p{Punct} + ASCII-range \p{Symbol}',
14895                    Initialize => $gc->table('Punctuation')
14896                                + ($ASCII & $gc->table('Symbol')),
14897                                Perl_Extension => 1
14898        );
14899    $perl->add_match_table('PosixPunct', Perl_Extension => 1,
14900        Initialize => $ASCII & $XPosixPunct,
14901        );
14902
14903    my $Digit = $perl->add_match_table('Digit', Full_Name => 'XPosixDigit',
14904                            Description => '[0-9] + all other decimal digits');
14905    $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
14906    my $PosixDigit = $perl->add_match_table("PosixDigit",
14907                                            Initialize => $Digit & $ASCII,
14908                                            );
14909
14910    # Hex_Digit was not present in first release
14911    my $Xdigit = $perl->add_match_table('XDigit', Full_Name => 'XPosixXDigit');
14912    my $Hex = property_ref('Hex_Digit');
14913    if (defined $Hex && ! $Hex->is_empty) {
14914        $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
14915    }
14916    else {
14917        $Xdigit->initialize([ ord('0') .. ord('9'),
14918                              ord('A') .. ord('F'),
14919                              ord('a') .. ord('f'),
14920                              0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
14921    }
14922
14923    # AHex was not present in early releases
14924    my $PosixXDigit = $perl->add_match_table('PosixXDigit');
14925    my $AHex = property_ref('ASCII_Hex_Digit');
14926    if (defined $AHex && ! $AHex->is_empty) {
14927        $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
14928    }
14929    else {
14930        $PosixXDigit->initialize($Xdigit & $ASCII);
14931        $PosixXDigit->add_alias('AHex');
14932        $PosixXDigit->add_alias('Ascii_Hex_Digit');
14933    }
14934
14935    my $any_folds = $perl->add_match_table("_Perl_Any_Folds",
14936                    Description => "Code points that particpate in some fold",
14937                    );
14938    my $loc_problem_folds = $perl->add_match_table(
14939               "_Perl_Problematic_Locale_Folds",
14940               Description =>
14941                   "Code points that are in some way problematic under locale",
14942    );
14943
14944    # This allows regexec.c to skip some work when appropriate.  Some of the
14945    # entries in _Perl_Problematic_Locale_Folds are multi-character folds,
14946    my $loc_problem_folds_start = $perl->add_match_table(
14947               "_Perl_Problematic_Locale_Foldeds_Start",
14948               Description =>
14949                   "The first character of every sequence in _Perl_Problematic_Locale_Folds",
14950    );
14951
14952    my $cf = property_ref('Case_Folding');
14953
14954    # Every character 0-255 is problematic because what each folds to depends
14955    # on the current locale
14956    $loc_problem_folds->add_range(0, 255);
14957    $loc_problem_folds_start += $loc_problem_folds;
14958
14959    # Also problematic are anything these fold to outside the range.  Likely
14960    # forever the only thing folded to by these outside the 0-255 range is the
14961    # GREEK SMALL MU (from the MICRO SIGN), but it's easy to make the code
14962    # completely general, which should catch any unexpected changes or errors.
14963    # We look at each code point 0-255, and add its fold (including each part
14964    # of a multi-char fold) to the list.  See commit message
14965    # 31f05a37c4e9c37a7263491f2fc0237d836e1a80 for a more complete description
14966    # of the MU issue.
14967    foreach my $range ($loc_problem_folds->ranges) {
14968        foreach my $code_point ($range->start .. $range->end) {
14969            my $fold_range = $cf->containing_range($code_point);
14970            next unless defined $fold_range;
14971
14972            # Skip if folds to itself
14973            next if $fold_range->value eq $CODE_POINT;
14974
14975            my @hex_folds = split " ", $fold_range->value;
14976            my $start_cp = $hex_folds[0];
14977            next if $start_cp eq $CODE_POINT;
14978            $start_cp = hex $start_cp;
14979            foreach my $i (0 .. @hex_folds - 1) {
14980                my $cp = $hex_folds[$i];
14981                next if $cp eq $CODE_POINT;
14982                $cp = hex $cp;
14983                next unless $cp > 255;    # Already have the < 256 ones
14984
14985                $loc_problem_folds->add_range($cp, $cp);
14986                $loc_problem_folds_start->add_range($start_cp, $start_cp);
14987            }
14988        }
14989    }
14990
14991    my $folds_to_multi_char = $perl->add_match_table(
14992         "_Perl_Folds_To_Multi_Char",
14993         Description =>
14994              "Code points whose fold is a string of more than one character",
14995    );
14996    if ($v_version lt v3.0.1) {
14997        push @tables_that_may_be_empty, '_Perl_Folds_To_Multi_Char';
14998    }
14999
15000    # Look through all the known folds to populate these tables.
15001    foreach my $range ($cf->ranges) {
15002        next if $range->value eq $CODE_POINT;
15003        my $start = $range->start;
15004        my $end = $range->end;
15005        $any_folds->add_range($start, $end);
15006
15007        my @hex_folds = split " ", $range->value;
15008        if (@hex_folds > 1) {   # Is multi-char fold
15009            $folds_to_multi_char->add_range($start, $end);
15010        }
15011
15012        my $found_locale_problematic = 0;
15013
15014        # Look at each of the folded-to characters...
15015        foreach my $i (0 .. @hex_folds - 1) {
15016            my $cp = hex $hex_folds[$i];
15017            $any_folds->add_range($cp, $cp);
15018
15019            # The fold is problematic if any of the folded-to characters is
15020            # already considered problematic.
15021            if ($loc_problem_folds->contains($cp)) {
15022                $loc_problem_folds->add_range($start, $end);
15023                $found_locale_problematic = 1;
15024            }
15025        }
15026
15027        # If this is a problematic fold, add to the start chars the
15028        # folding-from characters and first folded-to character.
15029        if ($found_locale_problematic) {
15030            $loc_problem_folds_start->add_range($start, $end);
15031            my $cp = hex $hex_folds[0];
15032            $loc_problem_folds_start->add_range($cp, $cp);
15033        }
15034    }
15035
15036    my $dt = property_ref('Decomposition_Type');
15037    $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
15038        Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
15039        Perl_Extension => 1,
15040        Note => 'Union of all non-canonical decompositions',
15041        );
15042
15043    # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
15044    # than SD appeared, construct it ourselves, based on the first release SD
15045    # was in.  A pod entry is grandfathered in for it
15046    my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ', Re_Pod_Entry => 1,
15047                                           Perl_Extension => 1,
15048                                           Fate => $INTERNAL_ONLY,
15049                                           Status => $DISCOURAGED);
15050    my $soft_dotted = property_ref('Soft_Dotted');
15051    if (defined $soft_dotted && ! $soft_dotted->is_empty) {
15052        $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
15053    }
15054    else {
15055
15056        # This list came from 3.2 Soft_Dotted; all of these code points are in
15057        # all releases
15058        $CanonDCIJ->initialize([ ord('i'),
15059                                 ord('j'),
15060                                 0x012F,
15061                                 0x0268,
15062                                 0x0456,
15063                                 0x0458,
15064                                 0x1E2D,
15065                                 0x1ECB,
15066                               ]);
15067        $CanonDCIJ = $CanonDCIJ & $Assigned;
15068    }
15069
15070    # For backward compatibility, Perl has its own definition for IDStart.
15071    # It is regular XID_Start plus the underscore, but all characters must be
15072    # Word characters as well
15073    my $XID_Start = property_ref('XID_Start');
15074    my $perl_xids = $perl->add_match_table('_Perl_IDStart',
15075                                            Perl_Extension => 1,
15076                                            Fate => $INTERNAL_ONLY,
15077                                            Initialize => ord('_')
15078                                            );
15079    if (defined $XID_Start
15080        || defined ($XID_Start = property_ref('ID_Start')))
15081    {
15082        $perl_xids += $XID_Start->table('Y');
15083    }
15084    else {
15085        # For Unicode versions that don't have the property, construct our own
15086        # from first principles.  The actual definition is:
15087        #     Letters
15088        #   + letter numbers (Nl)
15089        #   - Pattern_Syntax
15090        #   - Pattern_White_Space
15091        #   + stability extensions
15092        #   - NKFC modifications
15093        #
15094        # What we do in the code below is to include the identical code points
15095        # that are in the first release that had Unicode's version of this
15096        # property, essentially extrapolating backwards.  There were no
15097        # stability extensions until v4.1, so none are included; likewise in
15098        # no Unicode version so far do subtracting PatSyn and PatWS make any
15099        # difference, so those also are ignored.
15100        $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl();
15101
15102        # We do subtract the NFKC modifications that are in the first version
15103        # that had this property.  We don't bother to test if they are in the
15104        # version in question, because if they aren't, the operation is a
15105        # no-op.  The NKFC modifications are discussed in
15106        # http://www.unicode.org/reports/tr31/#NFKC_Modifications
15107        foreach my $range ( 0x037A,
15108                            0x0E33,
15109                            0x0EB3,
15110                            [ 0xFC5E, 0xFC63 ],
15111                            [ 0xFDFA, 0xFE70 ],
15112                            [ 0xFE72, 0xFE76 ],
15113                            0xFE78,
15114                            0xFE7A,
15115                            0xFE7C,
15116                            0xFE7E,
15117                            [ 0xFF9E, 0xFF9F ],
15118        ) {
15119            if (ref $range) {
15120                $perl_xids->delete_range($range->[0], $range->[1]);
15121            }
15122            else {
15123                $perl_xids->delete_range($range, $range);
15124            }
15125        }
15126    }
15127
15128    $perl_xids &= $Word;
15129
15130    my $perl_xidc = $perl->add_match_table('_Perl_IDCont',
15131                                        Perl_Extension => 1,
15132                                        Fate => $INTERNAL_ONLY);
15133    my $XIDC = property_ref('XID_Continue');
15134    if (defined $XIDC
15135        || defined ($XIDC = property_ref('ID_Continue')))
15136    {
15137        $perl_xidc += $XIDC->table('Y');
15138    }
15139    else {
15140        # Similarly, we construct our own XIDC if necessary for early Unicode
15141        # versions.  The definition is:
15142        #     everything in XIDS
15143        #   + Gc=Mn
15144        #   + Gc=Mc
15145        #   + Gc=Nd
15146        #   + Gc=Pc
15147        #   - Pattern_Syntax
15148        #   - Pattern_White_Space
15149        #   + stability extensions
15150        #   - NFKC modifications
15151        #
15152        # The same thing applies to this as with XIDS for the PatSyn, PatWS,
15153        # and stability extensions.  There is a somewhat different set of NFKC
15154        # mods to remove (and add in this case).  The ones below make this
15155        # have identical code points as in the first release that defined it.
15156        $perl_xidc += $perl_xids
15157                    + $gc->table('L')
15158                    + $gc->table('Mn')
15159                    + $gc->table('Mc')
15160                    + $gc->table('Nd')
15161                    + utf8::unicode_to_native(0xB7)
15162                    ;
15163        if (defined (my $pc = $gc->table('Pc'))) {
15164            $perl_xidc += $pc;
15165        }
15166        else {  # 1.1.5 didn't have Pc, but these should have been in it
15167            $perl_xidc += 0xFF3F;
15168            $perl_xidc->add_range(0x203F, 0x2040);
15169            $perl_xidc->add_range(0xFE33, 0xFE34);
15170            $perl_xidc->add_range(0xFE4D, 0xFE4F);
15171        }
15172
15173        # Subtract the NFKC mods
15174        foreach my $range ( 0x037A,
15175                            [ 0xFC5E, 0xFC63 ],
15176                            [ 0xFDFA, 0xFE1F ],
15177                            0xFE70,
15178                            [ 0xFE72, 0xFE76 ],
15179                            0xFE78,
15180                            0xFE7A,
15181                            0xFE7C,
15182                            0xFE7E,
15183        ) {
15184            if (ref $range) {
15185                $perl_xidc->delete_range($range->[0], $range->[1]);
15186            }
15187            else {
15188                $perl_xidc->delete_range($range, $range);
15189            }
15190        }
15191    }
15192
15193    $perl_xidc &= $Word;
15194
15195    my $charname_begin = $perl->add_match_table('_Perl_Charname_Begin',
15196                    Perl_Extension => 1,
15197                    Fate => $INTERNAL_ONLY,
15198                    Initialize => $gc->table('Letter') & $Alpha & $perl_xids,
15199                    );
15200
15201    my $charname_continue = $perl->add_match_table('_Perl_Charname_Continue',
15202                        Perl_Extension => 1,
15203                        Fate => $INTERNAL_ONLY,
15204                        Initialize => $perl_xidc
15205                                    + ord(" ")
15206                                    + ord("(")
15207                                    + ord(")")
15208                                    + ord("-")
15209                        );
15210
15211    my @composition = ('Name', 'Unicode_1_Name', '_Perl_Name_Alias');
15212
15213    if (@named_sequences) {
15214        push @composition, 'Named_Sequence';
15215        foreach my $sequence (@named_sequences) {
15216            $perl_charname->add_anomalous_entry($sequence);
15217        }
15218    }
15219
15220    my $alias_sentence = "";
15221    my %abbreviations;
15222    my $alias = property_ref('_Perl_Name_Alias');
15223    $perl_charname->set_proxy_for('_Perl_Name_Alias');
15224
15225    # Add each entry in _Perl_Name_Alias to Perl_Charnames.  Where these go
15226    # with respect to any existing entry depends on the entry type.
15227    # Corrections go before said entry, as they should be returned in
15228    # preference over the existing entry.  (A correction to a correction
15229    # should be later in the _Perl_Name_Alias table, so it will correctly
15230    # precede the erroneous correction in Perl_Charnames.)
15231    #
15232    # Abbreviations go after everything else, so they are saved temporarily in
15233    # a hash for later.
15234    #
15235    # Everything else is added added afterwards, which preserves the input
15236    # ordering
15237
15238    foreach my $range ($alias->ranges) {
15239        next if $range->value eq "";
15240        my $code_point = $range->start;
15241        if ($code_point != $range->end) {
15242            Carp::my_carp_bug("Bad News.  Expecting only one code point in the range $range.  Just to keep going, using only the first code point;");
15243        }
15244        my ($value, $type) = split ': ', $range->value;
15245        my $replace_type;
15246        if ($type eq 'correction') {
15247            $replace_type = $MULTIPLE_BEFORE;
15248        }
15249        elsif ($type eq 'abbreviation') {
15250
15251            # Save for later
15252            $abbreviations{$value} = $code_point;
15253            next;
15254        }
15255        else {
15256            $replace_type = $MULTIPLE_AFTER;
15257        }
15258
15259        # Actually add; before or after current entry(ies) as determined
15260        # above.
15261
15262        $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
15263    }
15264    $alias_sentence = <<END;
15265The _Perl_Name_Alias property adds duplicate code point entries that are
15266alternatives to the original name.  If an addition is a corrected
15267name, it will be physically first in the table.  The original (less correct,
15268but still valid) name will be next; then any alternatives, in no particular
15269order; and finally any abbreviations, again in no particular order.
15270END
15271
15272    # Now add the Unicode_1 names for the controls.  The Unicode_1 names had
15273    # precedence before 6.1, including the awful ones like "LINE FEED (LF)",
15274    # so should be first in the file; the other names have precedence starting
15275    # in 6.1,
15276    my $before_or_after = ($v_version lt v6.1.0)
15277                          ? $MULTIPLE_BEFORE
15278                          : $MULTIPLE_AFTER;
15279
15280    foreach my $range (property_ref('Unicode_1_Name')->ranges) {
15281        my $code_point = $range->start;
15282        my $unicode_1_value = $range->value;
15283        next if $unicode_1_value eq "";     # Skip if name doesn't exist.
15284
15285        if ($code_point != $range->end) {
15286            Carp::my_carp_bug("Bad News.  Expecting only one code point in the range $range.  Just to keep going, using only the first code point;");
15287        }
15288
15289        # To handle EBCDIC, we don't hard code in the code points of the
15290        # controls; instead realizing that all of them are below 256.
15291        last if $code_point > 255;
15292
15293        # We only add in the controls.
15294        next if $gc->value_of($code_point) ne 'Cc';
15295
15296        # We reject this Unicode1 name for later Perls, as it is used for
15297        # another code point
15298        next if $unicode_1_value eq 'BELL' && $^V ge v5.17.0;
15299
15300        # This won't add an exact duplicate.
15301        $perl_charname->add_duplicate($code_point, $unicode_1_value,
15302                                        Replace => $before_or_after);
15303    }
15304
15305    # Now that have everything added, add in abbreviations after
15306    # everything else.  Sort so results don't change between runs of this
15307    # program
15308    foreach my $value (sort keys %abbreviations) {
15309        $perl_charname->add_duplicate($abbreviations{$value}, $value,
15310                                        Replace => $MULTIPLE_AFTER);
15311    }
15312
15313    my $comment;
15314    if (@composition <= 2) { # Always at least 2
15315        $comment = join " and ", @composition;
15316    }
15317    else {
15318        $comment = join ", ", @composition[0 .. scalar @composition - 2];
15319        $comment .= ", and $composition[-1]";
15320    }
15321
15322    $perl_charname->add_comment(join_lines( <<END
15323This file is for charnames.pm.  It is the union of the $comment properties.
15324Unicode_1_Name entries are used only for nameless code points in the Name
15325property.
15326$alias_sentence
15327This file doesn't include the algorithmically determinable names.  For those,
15328use 'unicore/Name.pm'
15329END
15330    ));
15331    property_ref('Name')->add_comment(join_lines( <<END
15332This file doesn't include the algorithmically determinable names.  For those,
15333use 'unicore/Name.pm'
15334END
15335    ));
15336
15337    # Construct the Present_In property from the Age property.
15338    if (-e 'DAge.txt' && defined $age) {
15339        my $default_map = $age->default_map;
15340        my $in = Property->new('In',
15341                                Default_Map => $default_map,
15342                                Full_Name => "Present_In",
15343                                Perl_Extension => 1,
15344                                Type => $ENUM,
15345                                Initialize => $age,
15346                                );
15347        $in->add_comment(join_lines(<<END
15348THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE.  The values in this file are the
15349same as for $age, and not for what $in really means.  This is because anything
15350defined in a given release should have multiple values: that release and all
15351higher ones.  But only one value per code point can be represented in a table
15352like this.
15353END
15354        ));
15355
15356        # The Age tables are named like 1.5, 2.0, 2.1, ....  Sort so that the
15357        # lowest numbered (earliest) come first, with the non-numeric one
15358        # last.
15359        my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
15360                                            ? 1
15361                                            : ($b->name !~ /^[\d.]*$/)
15362                                                ? -1
15363                                                : $a->name <=> $b->name
15364                                            } $age->tables;
15365
15366        # The Present_In property is the cumulative age properties.  The first
15367        # one hence is identical to the first age one.
15368        my $previous_in = $in->add_match_table($first_age->name);
15369        $previous_in->set_equivalent_to($first_age, Related => 1);
15370
15371        my $description_start = "Code point's usage introduced in version ";
15372        $first_age->add_description($description_start . $first_age->name);
15373
15374        # To construct the accumulated values, for each of the age tables
15375        # starting with the 2nd earliest, merge the earliest with it, to get
15376        # all those code points existing in the 2nd earliest.  Repeat merging
15377        # the new 2nd earliest with the 3rd earliest to get all those existing
15378        # in the 3rd earliest, and so on.
15379        foreach my $current_age (@rest_ages) {
15380            next if $current_age->name !~ /^[\d.]*$/;   # Skip the non-numeric
15381
15382            my $current_in = $in->add_match_table(
15383                                    $current_age->name,
15384                                    Initialize => $current_age + $previous_in,
15385                                    Description => $description_start
15386                                                    . $current_age->name
15387                                                    . ' or earlier',
15388                                    );
15389            foreach my $alias ($current_age->aliases) {
15390                $current_in->add_alias($alias->name);
15391            }
15392            $previous_in = $current_in;
15393
15394            # Add clarifying material for the corresponding age file.  This is
15395            # in part because of the confusing and contradictory information
15396            # given in the Standard's documentation itself, as of 5.2.
15397            $current_age->add_description(
15398                            "Code point's usage was introduced in version "
15399                            . $current_age->name);
15400            $current_age->add_note("See also $in");
15401
15402        }
15403
15404        # And finally the code points whose usages have yet to be decided are
15405        # the same in both properties.  Note that permanently unassigned code
15406        # points actually have their usage assigned (as being permanently
15407        # unassigned), so that these tables are not the same as gc=cn.
15408        my $unassigned = $in->add_match_table($default_map);
15409        my $age_default = $age->table($default_map);
15410        $age_default->add_description(<<END
15411Code point's usage has not been assigned in any Unicode release thus far.
15412END
15413        );
15414        $unassigned->set_equivalent_to($age_default, Related => 1);
15415    }
15416
15417    my $patws = $perl->add_match_table('_Perl_PatWS',
15418                                       Perl_Extension => 1,
15419                                       Fate => $INTERNAL_ONLY);
15420    if (defined (my $off_patws = property_ref('Pattern_White_Space'))) {
15421        $patws->initialize($off_patws->table('Y'));
15422    }
15423    else {
15424        $patws->initialize([ ord("\t"),
15425                             ord("\n"),
15426                             utf8::unicode_to_native(0x0B), # VT
15427                             ord("\f"),
15428                             ord("\r"),
15429                             ord(" "),
15430                             utf8::unicode_to_native(0x85), # NEL
15431                             0x200E..0x200F,             # Left, Right marks
15432                             0x2028..0x2029              # Line, Paragraph seps
15433                           ] );
15434    }
15435
15436    # See L<perlfunc/quotemeta>
15437    my $quotemeta = $perl->add_match_table('_Perl_Quotemeta',
15438                                           Perl_Extension => 1,
15439                                           Fate => $INTERNAL_ONLY,
15440
15441                                           # Initialize to what's common in
15442                                           # all Unicode releases.
15443                                           Initialize =>
15444                                                  $gc->table('Control')
15445                                                + $Space
15446                                                + $patws
15447                                                + ((~ $Word) & $ASCII)
15448                           );
15449
15450    if (defined (my $patsyn = property_ref('Pattern_Syntax'))) {
15451        $quotemeta += $patsyn->table('Y');
15452    }
15453    else {
15454        $quotemeta += ((~ $Word) & Range->new(0, 255))
15455                    - utf8::unicode_to_native(0xA8)
15456                    - utf8::unicode_to_native(0xAF)
15457                    - utf8::unicode_to_native(0xB2)
15458                    - utf8::unicode_to_native(0xB3)
15459                    - utf8::unicode_to_native(0xB4)
15460                    - utf8::unicode_to_native(0xB7)
15461                    - utf8::unicode_to_native(0xB8)
15462                    - utf8::unicode_to_native(0xB9)
15463                    - utf8::unicode_to_native(0xBC)
15464                    - utf8::unicode_to_native(0xBD)
15465                    - utf8::unicode_to_native(0xBE);
15466        $quotemeta += [ # These are above-Latin1 patsyn; hence should be the
15467                        # same in all releases
15468                        0x2010 .. 0x2027,
15469                        0x2030 .. 0x203E,
15470                        0x2041 .. 0x2053,
15471                        0x2055 .. 0x205E,
15472                        0x2190 .. 0x245F,
15473                        0x2500 .. 0x2775,
15474                        0x2794 .. 0x2BFF,
15475                        0x2E00 .. 0x2E7F,
15476                        0x3001 .. 0x3003,
15477                        0x3008 .. 0x3020,
15478                        0x3030 .. 0x3030,
15479                        0xFD3E .. 0xFD3F,
15480                        0xFE45 .. 0xFE46
15481                      ];
15482    }
15483
15484    if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) {
15485        $quotemeta += $di->table('Y')
15486    }
15487    else {
15488        if ($v_version ge v2.0) {
15489            $quotemeta += $gc->table('Cf')
15490                       +  $gc->table('Cs');
15491
15492            # These are above the Unicode version 1 max
15493            $quotemeta->add_range(0xE0000, 0xE0FFF);
15494        }
15495        $quotemeta += $gc->table('Cc')
15496                    - $Space;
15497        my $temp = Range_List->new(Initialize => [ 0x180B .. 0x180D,
15498                                                   0x2060 .. 0x206F,
15499                                                   0xFE00 .. 0xFE0F,
15500                                                   0xFFF0 .. 0xFFFB,
15501                                                  ]);
15502        $temp->add_range(0xE0000, 0xE0FFF) if $v_version ge v2.0;
15503        $quotemeta += $temp;
15504    }
15505    calculate_DI();
15506    $quotemeta += $DI;
15507
15508    calculate_NChar();
15509
15510    # Finished creating all the perl properties.  All non-internal non-string
15511    # ones have a synonym of 'Is_' prefixed.  (Internal properties begin with
15512    # an underscore.)  These do not get a separate entry in the pod file
15513    foreach my $table ($perl->tables) {
15514        foreach my $alias ($table->aliases) {
15515            next if $alias->name =~ /^_/;
15516            $table->add_alias('Is_' . $alias->name,
15517                               Re_Pod_Entry => 0,
15518                               UCD => 0,
15519                               Status => $alias->status,
15520                               OK_as_Filename => 0);
15521        }
15522    }
15523
15524    # Perl tailors the WordBreak property so that \b{wb} doesn't split
15525    # adjacent spaces into separate words.  First create a copy of the regular
15526    # WB property as '_Perl_WB'.  (On Unicode releases earlier than when WB
15527    # was defined for, this will already have been done by the substitute file
15528    # portion for 'Input_file' code for WB.)
15529    my $perl_wb = property_ref('_Perl_WB');
15530    if (! defined $perl_wb) {
15531        $perl_wb = Property->new('_Perl_WB',
15532                                 Fate => $INTERNAL_ONLY,
15533                                 Perl_Extension => 1,
15534                                 Directory => $map_directory,
15535                                 Type => $STRING);
15536        my $wb = property_ref('Word_Break');
15537        $perl_wb->initialize($wb);
15538        $perl_wb->set_default_map($wb->default_map);
15539    }
15540
15541    # And simply replace the mappings of horizontal space characters that
15542    # otherwise would map to the default to instead map to our tailoring.
15543    my $default = $perl_wb->default_map;
15544    for my $range ($Blank->ranges) {
15545        for my $i ($range->start .. $range->end) {
15546            next unless $perl_wb->value_of($i) eq $default;
15547            $perl_wb->add_map($i, $i, 'Perl_Tailored_HSpace',
15548                              Replace => $UNCONDITIONALLY);
15549        }
15550    }
15551
15552    # Create a version of the LineBreak property with the mappings that are
15553    # omitted in the default algorithm remapped to what
15554    # http://www.unicode.org/reports/tr14 says they should be.
15555    #
15556    # Original 	   Resolved  General_Category
15557    # AI, SG, XX      AL      Any
15558    # SA              CM      Only Mn or Mc
15559    # SA              AL      Any except Mn and Mc
15560    # CJ              NS      Any
15561    #
15562    # All property values are also written out in their long form, as
15563    # regen/mk_invlist.pl expects that.  This also fixes occurrences of the
15564    # typo in early Unicode versions: 'inseperable'.
15565    my $perl_lb = property_ref('_Perl_LB');
15566    if (! defined $perl_lb) {
15567        $perl_lb = Property->new('_Perl_LB',
15568                                 Fate => $INTERNAL_ONLY,
15569                                 Perl_Extension => 1,
15570                                 Directory => $map_directory,
15571                                 Type => $STRING);
15572        my $lb = property_ref('Line_Break');
15573
15574        # Populate from $lb, but use full name and fix typo.
15575        foreach my $range ($lb->ranges) {
15576            my $full_name = $lb->table($range->value)->full_name;
15577            $full_name = 'Inseparable'
15578                                if standardize($full_name) eq 'inseperable';
15579            $perl_lb->add_map($range->start, $range->end, $full_name);
15580        }
15581    }
15582
15583    $perl_lb->set_default_map('Alphabetic', 'full_name');    # XX -> AL
15584
15585    for my $range ($perl_lb->ranges) {
15586        my $value = standardize($range->value);
15587        if (   $value eq standardize('Unknown')
15588            || $value eq standardize('Ambiguous')
15589            || $value eq standardize('Surrogate'))
15590        {
15591            $perl_lb->add_map($range->start, $range->end, 'Alphabetic',
15592                              Replace => $UNCONDITIONALLY);
15593        }
15594        elsif ($value eq standardize('Conditional_Japanese_Starter')) {
15595            $perl_lb->add_map($range->start, $range->end, 'Nonstarter',
15596                              Replace => $UNCONDITIONALLY);
15597        }
15598        elsif ($value eq standardize('Complex_Context')) {
15599            for my $i ($range->start .. $range->end) {
15600                my $gc_val = $gc->value_of($i);
15601                if ($gc_val eq 'Mn' || $gc_val eq 'Mc') {
15602                    $perl_lb->add_map($i, $i, 'Combining_Mark',
15603                                      Replace => $UNCONDITIONALLY);
15604                }
15605                else {
15606                    $perl_lb->add_map($i, $i, 'Alphabetic',
15607                                      Replace => $UNCONDITIONALLY);
15608                }
15609            }
15610        }
15611    }
15612
15613    # This property is a modification of the scx property
15614    my $perl_scx = Property->new('_Perl_SCX',
15615                                 Fate => $INTERNAL_ONLY,
15616                                 Perl_Extension => 1,
15617                                 Directory => $map_directory,
15618                                 Type => $ENUM);
15619    my $source;
15620
15621    # Use scx if available; otherwise sc;  if neither is there (a very old
15622    # Unicode version, just say that everything is 'Common'
15623    if (defined $scx) {
15624        $source = $scx;
15625        $perl_scx->set_default_map('Unknown');
15626    }
15627    elsif (defined $script) {
15628        $source = $script;
15629
15630        # Early versions of 'sc', had everything be 'Common'
15631        if (defined $script->table('Unknown')) {
15632            $perl_scx->set_default_map('Unknown');
15633        }
15634        else {
15635            $perl_scx->set_default_map('Common');
15636        }
15637    } else {
15638        $perl_scx->add_match_table('Common');
15639        $perl_scx->add_map(0, $MAX_UNICODE_CODEPOINT, 'Common');
15640
15641        $perl_scx->add_match_table('Unknown');
15642        $perl_scx->set_default_map('Unknown');
15643    }
15644
15645    $perl_scx->_set_format($STRING_WHITE_SPACE_LIST);
15646    $perl_scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these
15647
15648    if (defined $source) {
15649        $perl_scx->initialize($source);
15650
15651        # UTS 39 says that the scx property should be modified for these
15652        # countries where certain mixed scripts are commonly used.
15653        for my $range ($perl_scx->ranges) {
15654            my $value = $range->value;
15655            my $changed = $value =~ s/ ( \b Han i? \b ) /$1 Hanb Jpan Kore/xi;
15656             $changed |=  $value =~ s/ ( \b Hira (gana)? \b ) /$1 Jpan/xi;
15657             $changed |=  $value =~ s/ ( \b Kata (kana)? \b ) /$1 Jpan/xi;
15658             $changed |=  $value =~ s{ ( \b Katakana_or_Hiragana \b ) }
15659                                     {$1 Katakana Hiragana Jpan}xi;
15660             $changed |=  $value =~ s/ ( \b Hang (ul)? \b ) /$1 Kore/xi;
15661             $changed |=  $value =~ s/ ( \b Bopo (mofo)? \b ) /$1 Hanb/xi;
15662
15663            if ($changed) {
15664                $value = join " ", uniques split " ", $value;
15665                $range->set_value($value)
15666            }
15667        }
15668
15669        foreach my $table ($source->tables) {
15670            my $scx_table = $perl_scx->add_match_table($table->name,
15671                                    Full_Name => $table->full_name);
15672            foreach my $alias ($table->aliases) {
15673                $scx_table->add_alias($alias->name);
15674            }
15675        }
15676    }
15677
15678    # Here done with all the basic stuff.  Ready to populate the information
15679    # about each character if annotating them.
15680    if ($annotate) {
15681
15682        # See comments at its declaration
15683        $annotate_ranges = Range_Map->new;
15684
15685        # This separates out the non-characters from the other unassigneds, so
15686        # can give different annotations for each.
15687        $unassigned_sans_noncharacters = Range_List->new(
15688                                    Initialize => $gc->table('Unassigned'));
15689        $unassigned_sans_noncharacters &= (~ $NChar);
15690
15691        for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT + 1; $i++ ) {
15692            $i = populate_char_info($i);    # Note sets $i so may cause skips
15693
15694        }
15695    }
15696
15697    return;
15698}
15699
15700sub add_perl_synonyms() {
15701    # A number of Unicode tables have Perl synonyms that are expressed in
15702    # the single-form, \p{name}.  These are:
15703    #   All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
15704    #       \p{Is_Name} as synonyms
15705    #   \p{Script_Extensions=Value} gets \p{Value}, \p{Is_Value} as synonyms
15706    #   \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
15707    #   \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
15708    #       conflict, \p{Value} and \p{Is_Value} as well
15709    #
15710    # This routine generates these synonyms, warning of any unexpected
15711    # conflicts.
15712
15713    # Construct the list of tables to get synonyms for.  Start with all the
15714    # binary and the General_Category ones.
15715    my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY }
15716                                                            property_ref('*');
15717    push @tables, $gc->tables;
15718
15719    # If the version of Unicode includes the Script Extensions (preferably),
15720    # or Script property, add its tables
15721    if (defined $scx) {
15722        push @tables, $scx->tables;
15723    }
15724    else {
15725        push @tables, $script->tables if defined $script;
15726    }
15727
15728    # The Block tables are kept separate because they are treated differently.
15729    # And the earliest versions of Unicode didn't include them, so add only if
15730    # there are some.
15731    my @blocks;
15732    push @blocks, $block->tables if defined $block;
15733
15734    # Here, have the lists of tables constructed.  Process blocks last so that
15735    # if there are name collisions with them, blocks have lowest priority.
15736    # Should there ever be other collisions, manual intervention would be
15737    # required.  See the comments at the beginning of the program for a
15738    # possible way to handle those semi-automatically.
15739    foreach my $table (@tables,  @blocks) {
15740
15741        # For non-binary properties, the synonym is just the name of the
15742        # table, like Greek, but for binary properties the synonym is the name
15743        # of the property, and means the code points in its 'Y' table.
15744        my $nominal = $table;
15745        my $nominal_property = $nominal->property;
15746        my $actual;
15747        if (! $nominal->isa('Property')) {
15748            $actual = $table;
15749        }
15750        else {
15751
15752            # Here is a binary property.  Use the 'Y' table.  Verify that is
15753            # there
15754            my $yes = $nominal->table('Y');
15755            unless (defined $yes) {  # Must be defined, but is permissible to
15756                                     # be empty.
15757                Carp::my_carp_bug("Undefined $nominal, 'Y'.  Skipping.");
15758                next;
15759            }
15760            $actual = $yes;
15761        }
15762
15763        foreach my $alias ($nominal->aliases) {
15764
15765            # Attempt to create a table in the perl directory for the
15766            # candidate table, using whatever aliases in it that don't
15767            # conflict.  Also add non-conflicting aliases for all these
15768            # prefixed by 'Is_' (and/or 'In_' for Block property tables)
15769            PREFIX:
15770            foreach my $prefix ("", 'Is_', 'In_') {
15771
15772                # Only Block properties can have added 'In_' aliases.
15773                next if $prefix eq 'In_' and $nominal_property != $block;
15774
15775                my $proposed_name = $prefix . $alias->name;
15776
15777                # No Is_Is, In_In, nor combinations thereof
15778                trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
15779                next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
15780
15781                trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
15782
15783                # Get a reference to any existing table in the perl
15784                # directory with the desired name.
15785                my $pre_existing = $perl->table($proposed_name);
15786
15787                if (! defined $pre_existing) {
15788
15789                    # No name collision, so OK to add the perl synonym.
15790
15791                    my $make_re_pod_entry;
15792                    my $ok_as_filename;
15793                    my $status = $alias->status;
15794                    if ($nominal_property == $block) {
15795
15796                        # For block properties, only the compound form is
15797                        # preferred for external use; the others are
15798                        # discouraged.  The pod file contains wild cards for
15799                        # the 'In' and 'Is' forms so no entries for those; and
15800                        # we don't want people using the name without any
15801                        # prefix, so discourage that.
15802                        if ($prefix eq "") {
15803                            $make_re_pod_entry = 1;
15804                            $status = $status || $DISCOURAGED;
15805                            $ok_as_filename = 0;
15806                        }
15807                        elsif ($prefix eq 'In_') {
15808                            $make_re_pod_entry = 0;
15809                            $status = $status || $DISCOURAGED;
15810                            $ok_as_filename = 1;
15811                        }
15812                        else {
15813                            $make_re_pod_entry = 0;
15814                            $status = $status || $DISCOURAGED;
15815                            $ok_as_filename = 0;
15816                        }
15817                    }
15818                    elsif ($prefix ne "") {
15819
15820                        # The 'Is' prefix is handled in the pod by a wild
15821                        # card, and we won't use it for an external name
15822                        $make_re_pod_entry = 0;
15823                        $status = $status || $NORMAL;
15824                        $ok_as_filename = 0;
15825                    }
15826                    else {
15827
15828                        # Here, is an empty prefix, non block.  This gets its
15829                        # own pod entry and can be used for an external name.
15830                        $make_re_pod_entry = 1;
15831                        $status = $status || $NORMAL;
15832                        $ok_as_filename = 1;
15833                    }
15834
15835                    # Here, there isn't a perl pre-existing table with the
15836                    # name.  Look through the list of equivalents of this
15837                    # table to see if one is a perl table.
15838                    foreach my $equivalent ($actual->leader->equivalents) {
15839                        next if $equivalent->property != $perl;
15840
15841                        # Here, have found a table for $perl.  Add this alias
15842                        # to it, and are done with this prefix.
15843                        $equivalent->add_alias($proposed_name,
15844                                        Re_Pod_Entry => $make_re_pod_entry,
15845
15846                                        # Currently don't output these in the
15847                                        # ucd pod, as are strongly discouraged
15848                                        # from being used
15849                                        UCD => 0,
15850
15851                                        Status => $status,
15852                                        OK_as_Filename => $ok_as_filename);
15853                        trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
15854                        next PREFIX;
15855                    }
15856
15857                    # Here, $perl doesn't already have a table that is a
15858                    # synonym for this property, add one.
15859                    my $added_table = $perl->add_match_table($proposed_name,
15860                                            Re_Pod_Entry => $make_re_pod_entry,
15861
15862                                            # See UCD comment just above
15863                                            UCD => 0,
15864
15865                                            Status => $status,
15866                                            OK_as_Filename => $ok_as_filename);
15867                    # And it will be related to the actual table, since it is
15868                    # based on it.
15869                    $added_table->set_equivalent_to($actual, Related => 1);
15870                    trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
15871                    next;
15872                } # End of no pre-existing.
15873
15874                # Here, there is a pre-existing table that has the proposed
15875                # name.  We could be in trouble, but not if this is just a
15876                # synonym for another table that we have already made a child
15877                # of the pre-existing one.
15878                if ($pre_existing->is_set_equivalent_to($actual)) {
15879                    trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
15880                    $pre_existing->add_alias($proposed_name);
15881                    next;
15882                }
15883
15884                # Here, there is a name collision, but it still could be OK if
15885                # the tables match the identical set of code points, in which
15886                # case, we can combine the names.  Compare each table's code
15887                # point list to see if they are identical.
15888                trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
15889                if ($pre_existing->matches_identically_to($actual)) {
15890
15891                    # Here, they do match identically.  Not a real conflict.
15892                    # Make the perl version a child of the Unicode one, except
15893                    # in the non-obvious case of where the perl name is
15894                    # already a synonym of another Unicode property.  (This is
15895                    # excluded by the test for it being its own parent.)  The
15896                    # reason for this exclusion is that then the two Unicode
15897                    # properties become related; and we don't really know if
15898                    # they are or not.  We generate documentation based on
15899                    # relatedness, and this would be misleading.  Code
15900                    # later executed in the process will cause the tables to
15901                    # be represented by a single file anyway, without making
15902                    # it look in the pod like they are necessarily related.
15903                    if ($pre_existing->parent == $pre_existing
15904                        && ($pre_existing->property == $perl
15905                            || $actual->property == $perl))
15906                    {
15907                        trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
15908                        $pre_existing->set_equivalent_to($actual, Related => 1);
15909                    }
15910                    elsif (main::DEBUG && $to_trace) {
15911                        trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
15912                        trace $pre_existing->parent;
15913                    }
15914                    next PREFIX;
15915                }
15916
15917                # Here they didn't match identically, there is a real conflict
15918                # between our new name and a pre-existing property.
15919                $actual->add_conflicting($proposed_name, 'p', $pre_existing);
15920                $pre_existing->add_conflicting($nominal->full_name,
15921                                               'p',
15922                                               $actual);
15923
15924                # Don't output a warning for aliases for the block
15925                # properties (unless they start with 'In_') as it is
15926                # expected that there will be conflicts and the block
15927                # form loses.
15928                if ($verbosity >= $NORMAL_VERBOSITY
15929                    && ($actual->property != $block || $prefix eq 'In_'))
15930                {
15931                    print simple_fold(join_lines(<<END
15932There is already an alias named $proposed_name (from $pre_existing),
15933so not creating this alias for $actual
15934END
15935                    ), "", 4);
15936                }
15937
15938                # Keep track for documentation purposes.
15939                $has_In_conflicts++ if $prefix eq 'In_';
15940                $has_Is_conflicts++ if $prefix eq 'Is_';
15941            }
15942        }
15943    }
15944
15945    # There are some properties which have No and Yes (and N and Y) as
15946    # property values, but aren't binary, and could possibly be confused with
15947    # binary ones.  So create caveats for them.  There are tables that are
15948    # named 'No', and tables that are named 'N', but confusion is not likely
15949    # unless they are the same table.  For example, N meaning Number or
15950    # Neutral is not likely to cause confusion, so don't add caveats to things
15951    # like them.
15952    foreach my $property (grep { $_->type != $BINARY
15953                                 && $_->type != $FORCED_BINARY }
15954                                                            property_ref('*'))
15955    {
15956        my $yes = $property->table('Yes');
15957        if (defined $yes) {
15958            my $y = $property->table('Y');
15959            if (defined $y && $yes == $y) {
15960                foreach my $alias ($property->aliases) {
15961                    $yes->add_conflicting($alias->name);
15962                }
15963            }
15964        }
15965        my $no = $property->table('No');
15966        if (defined $no) {
15967            my $n = $property->table('N');
15968            if (defined $n && $no == $n) {
15969                foreach my $alias ($property->aliases) {
15970                    $no->add_conflicting($alias->name, 'P');
15971                }
15972            }
15973        }
15974    }
15975
15976    return;
15977}
15978
15979sub register_file_for_name($$$) {
15980    # Given info about a table and a datafile that it should be associated
15981    # with, register that association
15982
15983    my $table = shift;
15984    my $directory_ref = shift;   # Array of the directory path for the file
15985    my $file = shift;            # The file name in the final directory.
15986    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
15987
15988    trace "table=$table, file=$file, directory=@$directory_ref, fate=", $table->fate if main::DEBUG && $to_trace;
15989
15990    if ($table->isa('Property')) {
15991        $table->set_file_path(@$directory_ref, $file);
15992        push @map_properties, $table;
15993
15994        # No swash means don't do the rest of this.
15995        return if $table->fate != $ORDINARY
15996                  && ! ($table->name =~ /^_/ && $table->fate == $INTERNAL_ONLY);
15997
15998        # Get the path to the file
15999        my @path = $table->file_path;
16000
16001        # Use just the file name if no subdirectory.
16002        shift @path if $path[0] eq File::Spec->curdir();
16003
16004        my $file = join '/', @path;
16005
16006        # Create a hash entry for utf8_heavy to get the file that stores this
16007        # property's map table
16008        foreach my $alias ($table->aliases) {
16009            my $name = $alias->name;
16010            if ($name =~ /^_/) {
16011                $strict_property_to_file_of{lc $name} = $file;
16012            }
16013            else {
16014                $loose_property_to_file_of{standardize($name)} = $file;
16015            }
16016        }
16017
16018        # And a way for utf8_heavy to find the proper key in the SwashInfo
16019        # hash for this property.
16020        $file_to_swash_name{$file} = "To" . $table->swash_name;
16021        return;
16022    }
16023
16024    # Do all of the work for all equivalent tables when called with the leader
16025    # table, so skip if isn't the leader.
16026    return if $table->leader != $table;
16027
16028    # If this is a complement of another file, use that other file instead,
16029    # with a ! prepended to it.
16030    my $complement;
16031    if (($complement = $table->complement) != 0) {
16032        my @directories = $complement->file_path;
16033
16034        # This assumes that the 0th element is something like 'lib',
16035        # the 1th element the property name (in its own directory), like
16036        # 'AHex', and the 2th element the file like 'Y' which will have a .pl
16037        # appended to it later.
16038        $directories[1] =~ s/^/!/;
16039        $file = pop @directories;
16040        $directory_ref =\@directories;
16041    }
16042
16043    # Join all the file path components together, using slashes.
16044    my $full_filename = join('/', @$directory_ref, $file);
16045
16046    # All go in the same subdirectory of unicore, or the special
16047    # pseudo-directory '#'
16048    if ($directory_ref->[0] !~ / ^ $matches_directory | \# $ /x) {
16049        Carp::my_carp("Unexpected directory in "
16050                .  join('/', @{$directory_ref}, $file));
16051    }
16052
16053    # For this table and all its equivalents ...
16054    foreach my $table ($table, $table->equivalents) {
16055
16056        # Associate it with its file internally.  Don't include the
16057        # $matches_directory first component
16058        $table->set_file_path(@$directory_ref, $file);
16059
16060        # No swash means don't do the rest of this.
16061        next if $table->isa('Map_Table') && $table->fate != $ORDINARY;
16062
16063        my $sub_filename = join('/', $directory_ref->[1, -1], $file);
16064
16065        my $property = $table->property;
16066        my $property_name = ($property == $perl)
16067                             ? ""  # 'perl' is never explicitly stated
16068                             : standardize($property->name) . '=';
16069
16070        my $is_default = 0; # Is this table the default one for the property?
16071
16072        # To calculate $is_default, we find if this table is the same as the
16073        # default one for the property.  But this is complicated by the
16074        # possibility that there is a master table for this one, and the
16075        # information is stored there instead of here.
16076        my $parent = $table->parent;
16077        my $leader_prop = $parent->property;
16078        my $default_map = $leader_prop->default_map;
16079        if (defined $default_map) {
16080            my $default_table = $leader_prop->table($default_map);
16081            $is_default = 1 if defined $default_table && $parent == $default_table;
16082        }
16083
16084        # Calculate the loose name for this table.  Mostly it's just its name,
16085        # standardized.  But in the case of Perl tables that are single-form
16086        # equivalents to Unicode properties, it is the latter's name.
16087        my $loose_table_name =
16088                        ($property != $perl || $leader_prop == $perl)
16089                        ? standardize($table->name)
16090                        : standardize($parent->name);
16091
16092        my $deprecated = ($table->status eq $DEPRECATED)
16093                         ? $table->status_info
16094                         : "";
16095        my $caseless_equivalent = $table->caseless_equivalent;
16096
16097        # And for each of the table's aliases...  This inner loop eventually
16098        # goes through all aliases in the UCD that we generate regex match
16099        # files for
16100        foreach my $alias ($table->aliases) {
16101            my $standard = utf8_heavy_name($table, $alias);
16102
16103            # Generate an entry in either the loose or strict hashes, which
16104            # will translate the property and alias names combination into the
16105            # file where the table for them is stored.
16106            if ($alias->loose_match) {
16107                if (exists $loose_to_file_of{$standard}) {
16108                    Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
16109                }
16110                else {
16111                    $loose_to_file_of{$standard} = $sub_filename;
16112                }
16113            }
16114            else {
16115                if (exists $stricter_to_file_of{$standard}) {
16116                    Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
16117                }
16118                else {
16119                    $stricter_to_file_of{$standard} = $sub_filename;
16120
16121                    # Tightly coupled with how utf8_heavy.pl works, for a
16122                    # floating point number that is a whole number, get rid of
16123                    # the trailing decimal point and 0's, so that utf8_heavy
16124                    # will work.  Also note that this assumes that such a
16125                    # number is matched strictly; so if that were to change,
16126                    # this would be wrong.
16127                    if ((my $integer_name = $alias->name)
16128                            =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
16129                    {
16130                        $stricter_to_file_of{$property_name . $integer_name}
16131                                                            = $sub_filename;
16132                    }
16133                }
16134            }
16135
16136            # For Unicode::UCD, create a mapping of the prop=value to the
16137            # canonical =value for that property.
16138            if ($standard =~ /=/) {
16139
16140                # This could happen if a strict name mapped into an existing
16141                # loose name.  In that event, the strict names would have to
16142                # be moved to a new hash.
16143                if (exists($loose_to_standard_value{$standard})) {
16144                    Carp::my_carp_bug("'$standard' conflicts with a pre-existing use.  Bad News.  Continuing anyway");
16145                }
16146                $loose_to_standard_value{$standard} = $loose_table_name;
16147            }
16148
16149            # Keep a list of the deprecated properties and their filenames
16150            if ($deprecated && $complement == 0) {
16151                $utf8::why_deprecated{$sub_filename} = $deprecated;
16152            }
16153
16154            # And a substitute table, if any, for case-insensitive matching
16155            if ($caseless_equivalent != 0) {
16156                $caseless_equivalent_to{$standard} = $caseless_equivalent;
16157            }
16158
16159            # Add to defaults list if the table this alias belongs to is the
16160            # default one
16161            $loose_defaults{$standard} = 1 if $is_default;
16162        }
16163    }
16164
16165    return;
16166}
16167
16168{   # Closure
16169    my %base_names;  # Names already used for avoiding DOS 8.3 filesystem
16170                     # conflicts
16171    my %full_dir_name_of;   # Full length names of directories used.
16172
16173    sub construct_filename($$$) {
16174        # Return a file name for a table, based on the table name, but perhaps
16175        # changed to get rid of non-portable characters in it, and to make
16176        # sure that it is unique on a file system that allows the names before
16177        # any period to be at most 8 characters (DOS).  While we're at it
16178        # check and complain if there are any directory conflicts.
16179
16180        my $name = shift;       # The name to start with
16181        my $mutable = shift;    # Boolean: can it be changed?  If no, but
16182                                # yet it must be to work properly, a warning
16183                                # is given
16184        my $directories_ref = shift;  # A reference to an array containing the
16185                                # path to the file, with each element one path
16186                                # component.  This is used because the same
16187                                # name can be used in different directories.
16188        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
16189
16190        my $warn = ! defined wantarray;  # If true, then if the name is
16191                                # changed, a warning is issued as well.
16192
16193        if (! defined $name) {
16194            Carp::my_carp("Undefined name in directory "
16195                          . File::Spec->join(@$directories_ref)
16196                          . ". '_' used");
16197            return '_';
16198        }
16199
16200        # Make sure that no directory names conflict with each other.  Look at
16201        # each directory in the input file's path.  If it is already in use,
16202        # assume it is correct, and is merely being re-used, but if we
16203        # truncate it to 8 characters, and find that there are two directories
16204        # that are the same for the first 8 characters, but differ after that,
16205        # then that is a problem.
16206        foreach my $directory (@$directories_ref) {
16207            my $short_dir = substr($directory, 0, 8);
16208            if (defined $full_dir_name_of{$short_dir}) {
16209                next if $full_dir_name_of{$short_dir} eq $directory;
16210                Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}.  Bad News.  Continuing anyway");
16211            }
16212            else {
16213                $full_dir_name_of{$short_dir} = $directory;
16214            }
16215        }
16216
16217        my $path = join '/', @$directories_ref;
16218        $path .= '/' if $path;
16219
16220        # Remove interior underscores.
16221        (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
16222
16223        # Convert the dot in floating point numbers to an underscore
16224        $filename =~ s/\./_/ if $filename =~ / ^ \d+ \. \d+ $ /x;
16225
16226        my $suffix = "";
16227
16228        # Extract any suffix, delete any non-word character, and truncate to 3
16229        # after the dot
16230        if ($filename =~ m/ ( .*? ) ( \. .* ) /x) {
16231            $filename = $1;
16232            $suffix = $2;
16233            $suffix =~ s/\W+//g;
16234            substr($suffix, 4) = "" if length($suffix) > 4;
16235        }
16236
16237        # Change any non-word character outside the suffix into an underscore,
16238        # and truncate to 8.
16239        $filename =~ s/\W+/_/g;   # eg., "L&" -> "L_"
16240        substr($filename, 8) = "" if length($filename) > 8;
16241
16242        # Make sure the basename doesn't conflict with something we
16243        # might have already written. If we have, say,
16244        #     InGreekExtended1
16245        #     InGreekExtended2
16246        # they become
16247        #     InGreekE
16248        #     InGreek2
16249        my $warned = 0;
16250        while (my $num = $base_names{$path}{lc "$filename$suffix"}++) {
16251            $num++; # so basenames with numbers start with '2', which
16252                    # just looks more natural.
16253
16254            # Want to append $num, but if it'll make the basename longer
16255            # than 8 characters, pre-truncate $filename so that the result
16256            # is acceptable.
16257            my $delta = length($filename) + length($num) - 8;
16258            if ($delta > 0) {
16259                substr($filename, -$delta) = $num;
16260            }
16261            else {
16262                $filename .= $num;
16263            }
16264            if ($warn && ! $warned) {
16265                $warned = 1;
16266                Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS).  Proceeding anyway.");
16267            }
16268        }
16269
16270        return $filename if $mutable;
16271
16272        # If not changeable, must return the input name, but warn if needed to
16273        # change it beyond shortening it.
16274        if ($name ne $filename
16275            && substr($name, 0, length($filename)) ne $filename) {
16276            Carp::my_carp("'$path$name' had to be changed into '$filename'.  Bad News.  Proceeding anyway.");
16277        }
16278        return $name;
16279    }
16280}
16281
16282# The pod file contains a very large table.  Many of the lines in that table
16283# would exceed a typical output window's size, and so need to be wrapped with
16284# a hanging indent to make them look good.  The pod language is really
16285# insufficient here.  There is no general construct to do that in pod, so it
16286# is done here by beginning each such line with a space to cause the result to
16287# be output without formatting, and doing all the formatting here.  This leads
16288# to the result that if the eventual display window is too narrow it won't
16289# look good, and if the window is too wide, no advantage is taken of that
16290# extra width.  A further complication is that the output may be indented by
16291# the formatter so that there is less space than expected.  What I (khw) have
16292# done is to assume that that indent is a particular number of spaces based on
16293# what it is in my Linux system;  people can always resize their windows if
16294# necessary, but this is obviously less than desirable, but the best that can
16295# be expected.
16296my $automatic_pod_indent = 8;
16297
16298# Try to format so that uses fewest lines, but few long left column entries
16299# slide into the right column.  An experiment on 5.1 data yielded the
16300# following percentages that didn't cut into the other side along with the
16301# associated first-column widths
16302# 69% = 24
16303# 80% not too bad except for a few blocks
16304# 90% = 33; # , cuts 353/3053 lines from 37 = 12%
16305# 95% = 37;
16306my $indent_info_column = 27;    # 75% of lines didn't have overlap
16307
16308my $FILLER = 3;     # Length of initial boiler-plate columns in a pod line
16309                    # The 3 is because of:
16310                    #   1   for the leading space to tell the pod formatter to
16311                    #       output as-is
16312                    #   1   for the flag
16313                    #   1   for the space between the flag and the main data
16314
16315sub format_pod_line ($$$;$$) {
16316    # Take a pod line and return it, formatted properly
16317
16318    my $first_column_width = shift;
16319    my $entry = shift;  # Contents of left column
16320    my $info = shift;   # Contents of right column
16321
16322    my $status = shift || "";   # Any flag
16323
16324    my $loose_match = shift;    # Boolean.
16325    $loose_match = 1 unless defined $loose_match;
16326
16327    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
16328
16329    my $flags = "";
16330    $flags .= $STRICTER if ! $loose_match;
16331
16332    $flags .= $status if $status;
16333
16334    # There is a blank in the left column to cause the pod formatter to
16335    # output the line as-is.
16336    return sprintf " %-*s%-*s %s\n",
16337                    # The first * in the format is replaced by this, the -1 is
16338                    # to account for the leading blank.  There isn't a
16339                    # hard-coded blank after this to separate the flags from
16340                    # the rest of the line, so that in the unlikely event that
16341                    # multiple flags are shown on the same line, they both
16342                    # will get displayed at the expense of that separation,
16343                    # but since they are left justified, a blank will be
16344                    # inserted in the normal case.
16345                    $FILLER - 1,
16346                    $flags,
16347
16348                    # The other * in the format is replaced by this number to
16349                    # cause the first main column to right fill with blanks.
16350                    # The -1 is for the guaranteed blank following it.
16351                    $first_column_width - $FILLER - 1,
16352                    $entry,
16353                    $info;
16354}
16355
16356my @zero_match_tables;  # List of tables that have no matches in this release
16357
16358sub make_re_pod_entries($) {
16359    # This generates the entries for the pod file for a given table.
16360    # Also done at this time are any children tables.  The output looks like:
16361    # \p{Common}              \p{Script=Common} (Short: \p{Zyyy}) (5178)
16362
16363    my $input_table = shift;        # Table the entry is for
16364    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
16365
16366    # Generate parent and all its children at the same time.
16367    return if $input_table->parent != $input_table;
16368
16369    my $property = $input_table->property;
16370    my $type = $property->type;
16371    my $full_name = $property->full_name;
16372
16373    my $count = $input_table->count;
16374    my $unicode_count;
16375    my $non_unicode_string;
16376    if ($count > $MAX_UNICODE_CODEPOINTS) {
16377        $unicode_count = $count - ($MAX_WORKING_CODEPOINT
16378                                    - $MAX_UNICODE_CODEPOINT);
16379        $non_unicode_string = " plus all above-Unicode code points";
16380    }
16381    else {
16382        $unicode_count = $count;
16383        $non_unicode_string = "";
16384    }
16385
16386    my $string_count = clarify_number($unicode_count) . $non_unicode_string;
16387
16388    my $definition = $input_table->calculate_table_definition;
16389    if ($definition) {
16390
16391        # Save the definition for later use.
16392        $input_table->set_definition($definition);
16393
16394        $definition = ": $definition";
16395    }
16396
16397    my $status = $input_table->status;
16398    my $status_info = $input_table->status_info;
16399    my $caseless_equivalent = $input_table->caseless_equivalent;
16400
16401    # Don't mention a placeholder equivalent as it isn't to be listed in the
16402    # pod
16403    $caseless_equivalent = 0 if $caseless_equivalent != 0
16404                                && $caseless_equivalent->fate > $ORDINARY;
16405
16406    my $entry_for_first_table; # The entry for the first table output.
16407                           # Almost certainly, it is the parent.
16408
16409    # For each related table (including itself), we will generate a pod entry
16410    # for each name each table goes by
16411    foreach my $table ($input_table, $input_table->children) {
16412
16413        # utf8_heavy.pl cannot deal with null string property values, so skip
16414        # any tables that have no non-null names.
16415        next if ! grep { $_->name ne "" } $table->aliases;
16416
16417        # First, gather all the info that applies to this table as a whole.
16418
16419        push @zero_match_tables, $table if $count == 0
16420                                            # Don't mention special tables
16421                                            # as being zero length
16422                                           && $table->fate == $ORDINARY;
16423
16424        my $table_property = $table->property;
16425
16426        # The short name has all the underscores removed, while the full name
16427        # retains them.  Later, we decide whether to output a short synonym
16428        # for the full one, we need to compare apples to apples, so we use the
16429        # short name's length including underscores.
16430        my $table_property_short_name_length;
16431        my $table_property_short_name
16432            = $table_property->short_name(\$table_property_short_name_length);
16433        my $table_property_full_name = $table_property->full_name;
16434
16435        # Get how much savings there is in the short name over the full one
16436        # (delta will always be <= 0)
16437        my $table_property_short_delta = $table_property_short_name_length
16438                                         - length($table_property_full_name);
16439        my @table_description = $table->description;
16440        my @table_note = $table->note;
16441
16442        # Generate an entry for each alias in this table.
16443        my $entry_for_first_alias;  # saves the first one encountered.
16444        foreach my $alias ($table->aliases) {
16445
16446            # Skip if not to go in pod.
16447            next unless $alias->make_re_pod_entry;
16448
16449            # Start gathering all the components for the entry
16450            my $name = $alias->name;
16451
16452            # Skip if name is empty, as can't be accessed by regexes.
16453            next if $name eq "";
16454
16455            my $entry;      # Holds the left column, may include extras
16456            my $entry_ref;  # To refer to the left column's contents from
16457                            # another entry; has no extras
16458
16459            # First the left column of the pod entry.  Tables for the $perl
16460            # property always use the single form.
16461            if ($table_property == $perl) {
16462                $entry = "\\p{$name}";
16463                $entry .= " \\p$name" if length $name == 1; # Show non-braced
16464                                                            # form too
16465                $entry_ref = "\\p{$name}";
16466            }
16467            else {    # Compound form.
16468
16469                # Only generate one entry for all the aliases that mean true
16470                # or false in binary properties.  Append a '*' to indicate
16471                # some are missing.  (The heading comment notes this.)
16472                my $rhs;
16473                if ($type == $BINARY) {
16474                    next if $name ne 'N' && $name ne 'Y';
16475                    $rhs = "$name*";
16476                }
16477                elsif ($type != $FORCED_BINARY) {
16478                    $rhs = $name;
16479                }
16480                else {
16481
16482                    # Forced binary properties require special handling.  It
16483                    # has two sets of tables, one set is true/false; and the
16484                    # other set is everything else.  Entries are generated for
16485                    # each set.  Use the Bidi_Mirrored property (which appears
16486                    # in all Unicode versions) to get a list of the aliases
16487                    # for the true/false tables.  Of these, only output the N
16488                    # and Y ones, the same as, a regular binary property.  And
16489                    # output all the rest, same as a non-binary property.
16490                    my $bm = property_ref("Bidi_Mirrored");
16491                    if ($name eq 'N' || $name eq 'Y') {
16492                        $rhs = "$name*";
16493                    } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases,
16494                                                        $bm->table("N")->aliases)
16495                    {
16496                        next;
16497                    }
16498                    else {
16499                        $rhs = $name;
16500                    }
16501                }
16502
16503                # Colon-space is used to give a little more space to be easier
16504                # to read;
16505                $entry = "\\p{"
16506                        . $table_property_full_name
16507                        . ": $rhs}";
16508
16509                # But for the reference to this entry, which will go in the
16510                # right column, where space is at a premium, use equals
16511                # without a space
16512                $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
16513            }
16514
16515            # Then the right (info) column.  This is stored as components of
16516            # an array for the moment, then joined into a string later.  For
16517            # non-internal only properties, begin the info with the entry for
16518            # the first table we encountered (if any), as things are ordered
16519            # so that that one is the most descriptive.  This leads to the
16520            # info column of an entry being a more descriptive version of the
16521            # name column
16522            my @info;
16523            if ($name =~ /^_/) {
16524                push @info,
16525                        '(For internal use by Perl, not necessarily stable)';
16526            }
16527            elsif ($entry_for_first_alias) {
16528                push @info, $entry_for_first_alias;
16529            }
16530
16531            # If this entry is equivalent to another, add that to the info,
16532            # using the first such table we encountered
16533            if ($entry_for_first_table) {
16534                if (@info) {
16535                    push @info, "(= $entry_for_first_table)";
16536                }
16537                else {
16538                    push @info, $entry_for_first_table;
16539                }
16540            }
16541
16542            # If the name is a large integer, add an equivalent with an
16543            # exponent for better readability
16544            if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
16545                push @info, sprintf "(= %.1e)", $name
16546            }
16547
16548            my $parenthesized = "";
16549            if (! $entry_for_first_alias) {
16550
16551                # This is the first alias for the current table.  The alias
16552                # array is ordered so that this is the fullest, most
16553                # descriptive alias, so it gets the fullest info.  The other
16554                # aliases are mostly merely pointers to this one, using the
16555                # information already added above.
16556
16557                # Display any status message, but only on the parent table
16558                if ($status && ! $entry_for_first_table) {
16559                    push @info, $status_info;
16560                }
16561
16562                # Put out any descriptive info
16563                if (@table_description || @table_note) {
16564                    push @info, join "; ", @table_description, @table_note;
16565                }
16566
16567                # Look to see if there is a shorter name we can point people
16568                # at
16569                my $standard_name = standardize($name);
16570                my $short_name;
16571                my $proposed_short = $table->short_name;
16572                if (defined $proposed_short) {
16573                    my $standard_short = standardize($proposed_short);
16574
16575                    # If the short name is shorter than the standard one, or
16576                    # even it it's not, but the combination of it and its
16577                    # short property name (as in \p{prop=short} ($perl doesn't
16578                    # have this form)) saves at least two characters, then,
16579                    # cause it to be listed as a shorter synonym.
16580                    if (length $standard_short < length $standard_name
16581                        || ($table_property != $perl
16582                            && (length($standard_short)
16583                                - length($standard_name)
16584                                + $table_property_short_delta)  # (<= 0)
16585                                < -2))
16586                    {
16587                        $short_name = $proposed_short;
16588                        if ($table_property != $perl) {
16589                            $short_name = $table_property_short_name
16590                                          . "=$short_name";
16591                        }
16592                        $short_name = "\\p{$short_name}";
16593                    }
16594                }
16595
16596                # And if this is a compound form name, see if there is a
16597                # single form equivalent
16598                my $single_form;
16599                if ($table_property != $perl && $table_property != $block) {
16600
16601                    # Special case the binary N tables, so that will print
16602                    # \P{single}, but use the Y table values to populate
16603                    # 'single', as we haven't likewise populated the N table.
16604                    # For forced binary tables, we can't just look at the N
16605                    # table, but must see if this table is equivalent to the N
16606                    # one, as there are two equivalent beasts in these
16607                    # properties.
16608                    my $test_table;
16609                    my $p;
16610                    if (   ($type == $BINARY
16611                            && $input_table == $property->table('No'))
16612                        || ($type == $FORCED_BINARY
16613                            && $property->table('No')->
16614                                        is_set_equivalent_to($input_table)))
16615                    {
16616                        $test_table = $property->table('Yes');
16617                        $p = 'P';
16618                    }
16619                    else {
16620                        $test_table = $input_table;
16621                        $p = 'p';
16622                    }
16623
16624                    # Look for a single form amongst all the children.
16625                    foreach my $table ($test_table->children) {
16626                        next if $table->property != $perl;
16627                        my $proposed_name = $table->short_name;
16628                        next if ! defined $proposed_name;
16629
16630                        # Don't mention internal-only properties as a possible
16631                        # single form synonym
16632                        next if substr($proposed_name, 0, 1) eq '_';
16633
16634                        $proposed_name = "\\$p\{$proposed_name}";
16635                        if (! defined $single_form
16636                            || length($proposed_name) < length $single_form)
16637                        {
16638                            $single_form = $proposed_name;
16639
16640                            # The goal here is to find a single form; not the
16641                            # shortest possible one.  We've already found a
16642                            # short name.  So, stop at the first single form
16643                            # found, which is likely to be closer to the
16644                            # original.
16645                            last;
16646                        }
16647                    }
16648                }
16649
16650                # Output both short and single in the same parenthesized
16651                # expression, but with only one of 'Single', 'Short' if there
16652                # are both items.
16653                if ($short_name || $single_form || $table->conflicting) {
16654                    $parenthesized .= "Short: $short_name" if $short_name;
16655                    if ($short_name && $single_form) {
16656                        $parenthesized .= ', ';
16657                    }
16658                    elsif ($single_form) {
16659                        $parenthesized .= 'Single: ';
16660                    }
16661                    $parenthesized .= $single_form if $single_form;
16662                }
16663            }
16664
16665            if ($caseless_equivalent != 0) {
16666                $parenthesized .=  '; ' if $parenthesized ne "";
16667                $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
16668            }
16669
16670
16671            # Warn if this property isn't the same as one that a
16672            # semi-casual user might expect.  The other components of this
16673            # parenthesized structure are calculated only for the first entry
16674            # for this table, but the conflicting is deemed important enough
16675            # to go on every entry.
16676            my $conflicting = join " NOR ", $table->conflicting;
16677            if ($conflicting) {
16678                $parenthesized .=  '; ' if $parenthesized ne "";
16679                $parenthesized .= "NOT $conflicting";
16680            }
16681
16682            push @info, "($parenthesized)" if $parenthesized;
16683
16684            if ($name =~ /_$/ && $alias->loose_match) {
16685                push @info, "Note the trailing '_' matters in spite of loose matching rules.";
16686            }
16687
16688            if ($table_property != $perl && $table->perl_extension) {
16689                push @info, '(Perl extension)';
16690            }
16691            my $definition = $table->definition // "";
16692            $definition = "" if $entry_for_first_alias;
16693            $definition = ": $definition" if $definition;
16694            push @info, "($string_count$definition)";
16695
16696            # Now, we have both the entry and info so add them to the
16697            # list of all the properties.
16698            push @match_properties,
16699                format_pod_line($indent_info_column,
16700                                $entry,
16701                                join( " ", @info),
16702                                $alias->status,
16703                                $alias->loose_match);
16704
16705            $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
16706        } # End of looping through the aliases for this table.
16707
16708        if (! $entry_for_first_table) {
16709            $entry_for_first_table = $entry_for_first_alias;
16710        }
16711    } # End of looping through all the related tables
16712    return;
16713}
16714
16715sub make_ucd_table_pod_entries {
16716    my $table = shift;
16717
16718    # Generate the entries for the UCD section of the pod for $table.  This
16719    # also calculates if names are ambiguous, so has to be called even if the
16720    # pod is not being output
16721
16722    my $short_name = $table->name;
16723    my $standard_short_name = standardize($short_name);
16724    my $full_name = $table->full_name;
16725    my $standard_full_name = standardize($full_name);
16726
16727    my $full_info = "";     # Text of info column for full-name entries
16728    my $other_info = "";    # Text of info column for short-name entries
16729    my $short_info = "";    # Text of info column for other entries
16730    my $meaning = "";       # Synonym of this table
16731
16732    my $property = ($table->isa('Property'))
16733                   ? $table
16734                   : $table->parent->property;
16735
16736    my $perl_extension = $table->perl_extension;
16737    my $is_perl_extension_match_table_but_not_dollar_perl
16738                                                        = $property != $perl
16739                                                       && $perl_extension
16740                                                       && $property != $table;
16741
16742    # Get the more official name for for perl extensions that aren't
16743    # stand-alone properties
16744    if ($is_perl_extension_match_table_but_not_dollar_perl) {
16745        if ($property->type == $BINARY) {
16746            $meaning = $property->full_name;
16747        }
16748        else {
16749            $meaning = $table->parent->complete_name;
16750        }
16751    }
16752
16753    # There are three types of info column.  One for the short name, one for
16754    # the full name, and one for everything else.  They mostly are the same,
16755    # so initialize in the same loop.
16756
16757    foreach my $info_ref (\$full_info, \$short_info, \$other_info) {
16758        if ($info_ref != \$full_info) {
16759
16760            # The non-full name columns include the full name
16761            $$info_ref .= $full_name;
16762        }
16763
16764
16765        if ($is_perl_extension_match_table_but_not_dollar_perl) {
16766
16767            # Add the synonymous name for the non-full name entries; and to
16768            # the full-name entry if it adds extra information
16769            if (   standardize($meaning) ne $standard_full_name
16770                || $info_ref == \$other_info
16771                || $info_ref == \$short_info)
16772            {
16773                my $parenthesized =  $info_ref != \$full_info;
16774                $$info_ref .= " " if $$info_ref && $parenthesized;
16775                $$info_ref .= "(=" if $parenthesized;
16776                $$info_ref .= "$meaning";
16777                $$info_ref .= ")" if $parenthesized;
16778                $$info_ref .= ".";
16779            }
16780        }
16781
16782        # And the full-name entry includes the short name, if shorter
16783        if ($info_ref == \$full_info
16784            && length $standard_short_name < length $standard_full_name)
16785        {
16786            $full_info =~ s/\.\Z//;
16787            $full_info .= "  " if $full_info;
16788            $full_info .= "(Short: $short_name)";
16789        }
16790
16791        if ($table->perl_extension) {
16792            $$info_ref =~ s/\.\Z//;
16793            $$info_ref .= ".  " if $$info_ref;
16794            $$info_ref .= "(Perl extension)";
16795        }
16796    }
16797
16798    my $definition;
16799    my $definition_table;
16800    my $type = $table->property->type;
16801    if ($type == $BINARY || $type == $FORCED_BINARY) {
16802        $definition_table = $table->property->table('Y');
16803    }
16804    elsif ($table->isa('Match_Table')) {
16805        $definition_table = $table;
16806    }
16807
16808    $definition = $definition_table->calculate_table_definition
16809                                            if defined $definition_table
16810                                                    && $definition_table != 0;
16811
16812    # Add any extra annotations to the full name entry
16813    foreach my $more_info ($table->description,
16814                            $definition,
16815                            $table->note,
16816                            $table->status_info)
16817    {
16818        next unless $more_info;
16819        $full_info =~ s/\.\Z//;
16820        $full_info .= ".  " if $full_info;
16821        $full_info .= $more_info;
16822    }
16823    if ($table->property->type == $FORCED_BINARY) {
16824        if ($full_info) {
16825            $full_info =~ s/\.\Z//;
16826            $full_info .= ".  ";
16827        }
16828        $full_info .= "This is a combination property which has both:"
16829                    . " 1) a map to various string values; and"
16830                    . " 2) a map to boolean Y/N, where 'Y' means the"
16831                    . " string value is non-empty.  Add the prefix 'is'"
16832                    . " to the prop_invmap() call to get the latter";
16833    }
16834
16835    # These keep track if have created full and short name pod entries for the
16836    # property
16837    my $done_full = 0;
16838    my $done_short = 0;
16839
16840    # Every possible name is kept track of, even those that aren't going to be
16841    # output.  This way we can be sure to find the ambiguities.
16842    foreach my $alias ($table->aliases) {
16843        my $name = $alias->name;
16844        my $standard = standardize($name);
16845        my $info;
16846        my $output_this = $alias->ucd;
16847
16848        # If the full and short names are the same, we want to output the full
16849        # one's entry, so it has priority.
16850        if ($standard eq $standard_full_name) {
16851            next if $done_full;
16852            $done_full = 1;
16853            $info = $full_info;
16854        }
16855        elsif ($standard eq $standard_short_name) {
16856            next if $done_short;
16857            $done_short = 1;
16858            next if $standard_short_name eq $standard_full_name;
16859            $info = $short_info;
16860        }
16861        else {
16862            $info = $other_info;
16863        }
16864
16865        $combination_property{$standard} = 1
16866                                  if $table->property->type == $FORCED_BINARY;
16867
16868        # Here, we have set up the two columns for this entry.  But if an
16869        # entry already exists for this name, we have to decide which one
16870        # we're going to later output.
16871        if (exists $ucd_pod{$standard}) {
16872
16873            # If the two entries refer to the same property, it's not going to
16874            # be ambiguous.  (Likely it's because the names when standardized
16875            # are the same.)  But that means if they are different properties,
16876            # there is ambiguity.
16877            if ($ucd_pod{$standard}->{'property'} != $property) {
16878
16879                # Here, we have an ambiguity.  This code assumes that one is
16880                # scheduled to be output and one not and that one is a perl
16881                # extension (which is not to be output) and the other isn't.
16882                # If those assumptions are wrong, things have to be rethought.
16883                if ($ucd_pod{$standard}{'output_this'} == $output_this
16884                    || $ucd_pod{$standard}{'perl_extension'} == $perl_extension
16885                    || $output_this == $perl_extension)
16886                {
16887                    Carp::my_carp("Bad news.  $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations.  Proceeding anyway.");
16888                }
16889
16890                # We modify the info column of the one being output to
16891                # indicate the ambiguity.  Set $which to point to that one's
16892                # info.
16893                my $which;
16894                if ($ucd_pod{$standard}{'output_this'}) {
16895                    $which = \$ucd_pod{$standard}->{'info'};
16896                }
16897                else {
16898                    $which = \$info;
16899                    $meaning = $ucd_pod{$standard}{'meaning'};
16900                }
16901
16902                chomp $$which;
16903                $$which =~ s/\.\Z//;
16904                $$which .= "; NOT '$standard' meaning '$meaning'";
16905
16906                $ambiguous_names{$standard} = 1;
16907            }
16908
16909            # Use the non-perl-extension variant
16910            next unless $ucd_pod{$standard}{'perl_extension'};
16911        }
16912
16913        # Store enough information about this entry that we can later look for
16914        # ambiguities, and output it properly.
16915        $ucd_pod{$standard} = { 'name' => $name,
16916                                'info' => $info,
16917                                'meaning' => $meaning,
16918                                'output_this' => $output_this,
16919                                'perl_extension' => $perl_extension,
16920                                'property' => $property,
16921                                'status' => $alias->status,
16922        };
16923    } # End of looping through all this table's aliases
16924
16925    return;
16926}
16927
16928sub pod_alphanumeric_sort {
16929    # Sort pod entries alphanumerically.
16930
16931    # The first few character columns are filler, plus the '\p{'; and get rid
16932    # of all the trailing stuff, starting with the trailing '}', so as to sort
16933    # on just 'Name=Value'
16934    (my $a = lc $a) =~ s/^ .*? \{ //x;
16935    $a =~ s/}.*//;
16936    (my $b = lc $b) =~ s/^ .*? \{ //x;
16937    $b =~ s/}.*//;
16938
16939    # Determine if the two operands are both internal only or both not.
16940    # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
16941    # should be the underscore that begins internal only
16942    my $a_is_internal = (substr($a, 0, 1) eq '_');
16943    my $b_is_internal = (substr($b, 0, 1) eq '_');
16944
16945    # Sort so the internals come last in the table instead of first (which the
16946    # leading underscore would otherwise indicate).
16947    if ($a_is_internal != $b_is_internal) {
16948        return 1 if $a_is_internal;
16949        return -1
16950    }
16951
16952    # Determine if the two operands are compound or not, and if so if are
16953    # "numeric" property values or not, like \p{Age: 3.0}.  But there are also
16954    # things like \p{Canonical_Combining_Class: CCC133} and \p{Age: V10_0},
16955    # all of which this considers numeric, and for sorting, looks just at the
16956    # numeric parts.  It can also be a rational like \p{Numeric Value=-1/2}.
16957    my $split_re = qr/
16958        ^ ( [^:=]+ ) # $1 is undef if not a compound form, otherwise is the
16959                     # property name
16960        [:=] \s*     # The syntax for the compound form
16961        (?:          # followed by ...
16962            (        # $2 gets defined if what follows is a "numeric"
16963                     # expression, which is ...
16964              ( -? \d+ (?: [.\/] \d+)?  # An integer, float, or rational
16965                                        # number, optionally signed
16966               | [[:alpha:]]{2,} \d+ $ ) # or something like CCC131.  Either
16967                                         # of these go into $3
16968             | ( V \d+ _ \d+ )           # or a Unicode's Age property version
16969                                         # number, into $4
16970            )
16971            | .* $    # If not "numeric", accept anything so that $1 gets
16972                      # defined if it is any compound form
16973        ) /ix;
16974    my ($a_initial, $a_numeric, $a_number, $a_version) = ($a =~ $split_re);
16975    my ($b_initial, $b_numeric, $b_number, $b_version) = ($b =~ $split_re);
16976
16977    # Sort alphabeticlly on the whole property name if either operand isn't
16978    # compound, or they differ.
16979    return $a cmp $b if   ! defined $a_initial
16980                       || ! defined $b_initial
16981                       || $a_initial ne $b_initial;
16982
16983    if (! defined $a_numeric) {
16984
16985        # If neither is numeric, use alpha sort
16986        return $a cmp $b if ! defined $b_numeric;
16987        return 1;  # Sort numeric ahead of alpha
16988    }
16989
16990    # Here $a is numeric
16991    return -1 if ! defined $b_numeric;  # Numeric sorts before alpha
16992
16993    # Here they are both numeric in the same property.
16994    # Convert version numbers into regular numbers
16995    if (defined $a_version) {
16996        ($a_number = $a_version) =~ s/^V//i;
16997        $a_number =~ s/_/./;
16998    }
16999    else {  # Otherwise get rid of the, e.g., CCC in CCC9 */
17000        $a_number =~ s/ ^ [[:alpha:]]+ //x;
17001    }
17002    if (defined $b_version) {
17003        ($b_number = $b_version) =~ s/^V//i;
17004        $b_number =~ s/_/./;
17005    }
17006    else {
17007        $b_number =~ s/ ^ [[:alpha:]]+ //x;
17008    }
17009
17010    # Convert rationals to floating for the comparison.
17011    $a_number = eval $a_number if $a_number =~ qr{/};
17012    $b_number = eval $b_number if $b_number =~ qr{/};
17013
17014    return $a_number <=> $b_number || $a cmp $b;
17015}
17016
17017sub make_pod () {
17018    # Create the .pod file.  This generates the various subsections and then
17019    # combines them in one big HERE document.
17020
17021    my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
17022
17023    return unless defined $pod_directory;
17024    print "Making pod file\n" if $verbosity >= $PROGRESS;
17025
17026    my $exception_message =
17027    '(Any exceptions are individually noted beginning with the word NOT.)';
17028    my @block_warning;
17029    if (-e 'Blocks.txt') {
17030
17031        # Add the line: '\p{In_*}    \p{Block: *}', with the warning message
17032        # if the global $has_In_conflicts indicates we have them.
17033        push @match_properties, format_pod_line($indent_info_column,
17034                                                '\p{In_*}',
17035                                                '\p{Block: *}'
17036                                                    . (($has_In_conflicts)
17037                                                      ? " $exception_message"
17038                                                      : ""),
17039                                                 $DISCOURAGED);
17040        @block_warning = << "END";
17041
17042In particular, matches in the Block property have single forms
17043defined by Perl that begin with C<"In_">, C<"Is_>, or even with no prefix at
17044all,  Like all B<DISCOURAGED> forms, these are not stable.  For example,
17045C<\\p{Block=Deseret}> can currently be written as C<\\p{In_Deseret}>,
17046C<\\p{Is_Deseret}>, or C<\\p{Deseret}>.  But, a new Unicode version may
17047come along that would force Perl to change the meaning of one or more of
17048these, and your program would no longer be correct.  Currently there are no
17049such conflicts with the form that begins C<"In_">, but there are many with the
17050other two shortcuts, and Unicode continues to define new properties that begin
17051with C<"In">, so it's quite possible that a conflict will occur in the future.
17052The compound form is guaranteed to not become obsolete, and its meaning is
17053clearer anyway.  See L<perlunicode/"Blocks"> for more information about this.
17054END
17055    }
17056    my $text = $Is_flags_text;
17057    $text = "$exception_message $text" if $has_Is_conflicts;
17058
17059    # And the 'Is_ line';
17060    push @match_properties, format_pod_line($indent_info_column,
17061                                            '\p{Is_*}',
17062                                            "\\p{*} $text");
17063
17064    # Sort the properties array for output.  It is sorted alphabetically
17065    # except numerically for numeric properties, and only output unique lines.
17066    @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
17067
17068    my $formatted_properties = simple_fold(\@match_properties,
17069                                        "",
17070                                        # indent succeeding lines by two extra
17071                                        # which looks better
17072                                        $indent_info_column + 2,
17073
17074                                        # shorten the line length by how much
17075                                        # the formatter indents, so the folded
17076                                        # line will fit in the space
17077                                        # presumably available
17078                                        $automatic_pod_indent);
17079    # Add column headings, indented to be a little more centered, but not
17080    # exactly
17081    $formatted_properties =  format_pod_line($indent_info_column,
17082                                                    '    NAME',
17083                                                    '           INFO')
17084                                    . "\n"
17085                                    . $formatted_properties;
17086
17087    # Generate pod documentation lines for the tables that match nothing
17088    my $zero_matches = "";
17089    if (@zero_match_tables) {
17090        @zero_match_tables = uniques(@zero_match_tables);
17091        $zero_matches = join "\n\n",
17092                        map { $_ = '=item \p{' . $_->complete_name . "}" }
17093                            sort { $a->complete_name cmp $b->complete_name }
17094                            @zero_match_tables;
17095
17096        $zero_matches = <<END;
17097
17098=head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
17099
17100Unicode has some property-value pairs that currently don't match anything.
17101This happens generally either because they are obsolete, or they exist for
17102symmetry with other forms, but no language has yet been encoded that uses
17103them.  In this version of Unicode, the following match zero code points:
17104
17105=over 4
17106
17107$zero_matches
17108
17109=back
17110
17111END
17112    }
17113
17114    # Generate list of properties that we don't accept, grouped by the reasons
17115    # why.  This is so only put out the 'why' once, and then list all the
17116    # properties that have that reason under it.
17117
17118    my %why_list;   # The keys are the reasons; the values are lists of
17119                    # properties that have the key as their reason
17120
17121    # For each property, add it to the list that are suppressed for its reason
17122    # The sort will cause the alphabetically first properties to be added to
17123    # each list first, so each list will be sorted.
17124    foreach my $property (sort keys %why_suppressed) {
17125        next unless $why_suppressed{$property};
17126        push @{$why_list{$why_suppressed{$property}}}, $property;
17127    }
17128
17129    # For each reason (sorted by the first property that has that reason)...
17130    my @bad_re_properties;
17131    foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
17132                     keys %why_list)
17133    {
17134        # Add to the output, all the properties that have that reason.
17135        my $has_item = 0;   # Flag if actually output anything.
17136        foreach my $name (@{$why_list{$why}}) {
17137
17138            # Split compound names into $property and $table components
17139            my $property = $name;
17140            my $table;
17141            if ($property =~ / (.*) = (.*) /x) {
17142                $property = $1;
17143                $table = $2;
17144            }
17145
17146            # This release of Unicode may not have a property that is
17147            # suppressed, so don't reference a non-existent one.
17148            $property = property_ref($property);
17149            next if ! defined $property;
17150
17151            # And since this list is only for match tables, don't list the
17152            # ones that don't have match tables.
17153            next if ! $property->to_create_match_tables;
17154
17155            # Find any abbreviation, and turn it into a compound name if this
17156            # is a property=value pair.
17157            my $short_name = $property->name;
17158            $short_name .= '=' . $property->table($table)->name if $table;
17159
17160            # Start with an empty line.
17161            push @bad_re_properties, "\n\n" unless $has_item;
17162
17163            # And add the property as an item for the reason.
17164            push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
17165            $has_item = 1;
17166        }
17167
17168        # And add the reason under the list of properties, if such a list
17169        # actually got generated.  Note that the header got added
17170        # unconditionally before.  But pod ignores extra blank lines, so no
17171        # harm.
17172        push @bad_re_properties, "\n$why\n" if $has_item;
17173
17174    } # End of looping through each reason.
17175
17176    if (! @bad_re_properties) {
17177        push @bad_re_properties,
17178                "*** This installation accepts ALL non-Unihan properties ***";
17179    }
17180    else {
17181        # Add =over only if non-empty to avoid an empty =over/=back section,
17182        # which is considered bad form.
17183        unshift @bad_re_properties, "\n=over 4\n";
17184        push @bad_re_properties, "\n=back\n";
17185    }
17186
17187    # Similarly, generate a list of files that we don't use, grouped by the
17188    # reasons why (Don't output if the reason is empty).  First, create a hash
17189    # whose keys are the reasons, and whose values are anonymous arrays of all
17190    # the files that share that reason.
17191    my %grouped_by_reason;
17192    foreach my $file (keys %skipped_files) {
17193        next unless $skipped_files{$file};
17194        push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
17195    }
17196
17197    # Then, sort each group.
17198    foreach my $group (keys %grouped_by_reason) {
17199        @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b }
17200                                        @{$grouped_by_reason{$group}} ;
17201    }
17202
17203    # Finally, create the output text.  For each reason (sorted by the
17204    # alphabetically first file that has that reason)...
17205    my @unused_files;
17206    foreach my $reason (sort { lc $grouped_by_reason{$a}->[0]
17207                               cmp lc $grouped_by_reason{$b}->[0]
17208                              }
17209                         keys %grouped_by_reason)
17210    {
17211        # Add all the files that have that reason to the output.  Start
17212        # with an empty line.
17213        push @unused_files, "\n\n";
17214        push @unused_files, map { "\n=item F<$_> \n" }
17215                            @{$grouped_by_reason{$reason}};
17216        # And add the reason under the list of files
17217        push @unused_files, "\n$reason\n";
17218    }
17219
17220    # Similarly, create the output text for the UCD section of the pod
17221    my @ucd_pod;
17222    foreach my $key (keys %ucd_pod) {
17223        next unless $ucd_pod{$key}->{'output_this'};
17224        push @ucd_pod, format_pod_line($indent_info_column,
17225                                       $ucd_pod{$key}->{'name'},
17226                                       $ucd_pod{$key}->{'info'},
17227                                       $ucd_pod{$key}->{'status'},
17228                                      );
17229    }
17230
17231    # Sort alphabetically, and fold for output
17232    @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod;
17233    my $ucd_pod = simple_fold(\@ucd_pod,
17234                           ' ',
17235                           $indent_info_column,
17236                           $automatic_pod_indent);
17237    $ucd_pod =  format_pod_line($indent_info_column, 'NAME', '  INFO')
17238                . "\n"
17239                . $ucd_pod;
17240    my $space_hex = sprintf("%02x", ord " ");
17241    local $" = "";
17242
17243    # Everything is ready to assemble.
17244    my @OUT = << "END";
17245=begin comment
17246
17247$HEADER
17248
17249To change this file, edit $0 instead.
17250
17251=end comment
17252
17253=head1 NAME
17254
17255$pod_file - Index of Unicode Version $unicode_version character properties in Perl
17256
17257=head1 DESCRIPTION
17258
17259This document provides information about the portion of the Unicode database
17260that deals with character properties, that is the portion that is defined on
17261single code points.  (L</Other information in the Unicode data base>
17262below briefly mentions other data that Unicode provides.)
17263
17264Perl can provide access to all non-provisional Unicode character properties,
17265though not all are enabled by default.  The omitted ones are the Unihan
17266properties (accessible via the CPAN module L<Unicode::Unihan>) and certain
17267deprecated or Unicode-internal properties.  (An installation may choose to
17268recompile Perl's tables to change this.  See L<Unicode character
17269properties that are NOT accepted by Perl>.)
17270
17271For most purposes, access to Unicode properties from the Perl core is through
17272regular expression matches, as described in the next section.
17273For some special purposes, and to access the properties that are not suitable
17274for regular expression matching, all the Unicode character properties that
17275Perl handles are accessible via the standard L<Unicode::UCD> module, as
17276described in the section L</Properties accessible through Unicode::UCD>.
17277
17278Perl also provides some additional extensions and short-cut synonyms
17279for Unicode properties.
17280
17281This document merely lists all available properties and does not attempt to
17282explain what each property really means.  There is a brief description of each
17283Perl extension; see L<perlunicode/Other Properties> for more information on
17284these.  There is some detail about Blocks, Scripts, General_Category,
17285and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
17286official Unicode properties, refer to the Unicode standard.  A good starting
17287place is L<$unicode_reference_url>.
17288
17289Note that you can define your own properties; see
17290L<perlunicode/"User-Defined Character Properties">.
17291
17292=head1 Properties accessible through C<\\p{}> and C<\\P{}>
17293
17294The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
17295most of the Unicode character properties.  The table below shows all these
17296constructs, both single and compound forms.
17297
17298B<Compound forms> consist of two components, separated by an equals sign or a
17299colon.  The first component is the property name, and the second component is
17300the particular value of the property to match against, for example,
17301C<\\p{Script_Extensions: Greek}> and C<\\p{Script_Extensions=Greek}> both mean
17302to match characters whose Script_Extensions property value is Greek.
17303(C<Script_Extensions> is an improved version of the C<Script> property.)
17304
17305B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
17306their equivalent compound forms.  The table shows these equivalences.  (In our
17307example, C<\\p{Greek}> is a just a shortcut for
17308C<\\p{Script_Extensions=Greek}>).  There are also a few Perl-defined single
17309forms that are not shortcuts for a compound form.  One such is C<\\p{Word}>.
17310These are also listed in the table.
17311
17312In parsing these constructs, Perl always ignores Upper/lower case differences
17313everywhere within the {braces}.  Thus C<\\p{Greek}> means the same thing as
17314C<\\p{greek}>.  But note that changing the case of the C<"p"> or C<"P"> before
17315the left brace completely changes the meaning of the construct, from "match"
17316(for C<\\p{}>) to "doesn't match" (for C<\\P{}>).  Casing in this document is
17317for improved legibility.
17318
17319Also, white space, hyphens, and underscores are normally ignored
17320everywhere between the {braces}, and hence can be freely added or removed
17321even if the C</x> modifier hasn't been specified on the regular expression.
17322But in the table below $a_bold_stricter at the beginning of an entry
17323means that tighter (stricter) rules are used for that entry:
17324
17325=over 4
17326
17327=over 4
17328
17329=item Single form (C<\\p{name}>) tighter rules:
17330
17331White space, hyphens, and underscores ARE significant
17332except for:
17333
17334=over 4
17335
17336=item * white space adjacent to a non-word character
17337
17338=item * underscores separating digits in numbers
17339
17340=back
17341
17342That means, for example, that you can freely add or remove white space
17343adjacent to (but within) the braces without affecting the meaning.
17344
17345=item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
17346
17347The tighter rules given above for the single form apply to everything to the
17348right of the colon or equals; the looser rules still apply to everything to
17349the left.
17350
17351That means, for example, that you can freely add or remove white space
17352adjacent to (but within) the braces and the colon or equal sign.
17353
17354=back
17355
17356=back
17357
17358Some properties are considered obsolete by Unicode, but still available.
17359There are several varieties of obsolescence:
17360
17361=over 4
17362
17363=over 4
17364
17365=item Stabilized
17366
17367A property may be stabilized.  Such a determination does not indicate
17368that the property should or should not be used; instead it is a declaration
17369that the property will not be maintained nor extended for newly encoded
17370characters.  Such properties are marked with $a_bold_stabilized in the
17371table.
17372
17373=item Deprecated
17374
17375A property may be deprecated, perhaps because its original intent
17376has been replaced by another property, or because its specification was
17377somehow defective.  This means that its use is strongly
17378discouraged, so much so that a warning will be issued if used, unless the
17379regular expression is in the scope of a C<S<no warnings 'deprecated'>>
17380statement.  $A_bold_deprecated flags each such entry in the table, and
17381the entry there for the longest, most descriptive version of the property will
17382give the reason it is deprecated, and perhaps advice.  Perl may issue such a
17383warning, even for properties that aren't officially deprecated by Unicode,
17384when there used to be characters or code points that were matched by them, but
17385no longer.  This is to warn you that your program may not work like it did on
17386earlier Unicode releases.
17387
17388A deprecated property may be made unavailable in a future Perl version, so it
17389is best to move away from them.
17390
17391A deprecated property may also be stabilized, but this fact is not shown.
17392
17393=item Obsolete
17394
17395Properties marked with $a_bold_obsolete in the table are considered (plain)
17396obsolete.  Generally this designation is given to properties that Unicode once
17397used for internal purposes (but not any longer).
17398
17399=item Discouraged
17400
17401This is not actually a Unicode-specified obsolescence, but applies to certain
17402Perl extensions that are present for backwards compatibility, but are
17403discouraged from being used.  These are not obsolete, but their meanings are
17404not stable.  Future Unicode versions could force any of these extensions to be
17405removed without warning, replaced by another property with the same name that
17406means something different.  $A_bold_discouraged flags each such entry in the
17407table.  Use the equivalent shown instead.
17408
17409@block_warning
17410
17411=back
17412
17413=back
17414
17415The table below has two columns.  The left column contains the C<\\p{}>
17416constructs to look up, possibly preceded by the flags mentioned above; and
17417the right column contains information about them, like a description, or
17418synonyms.  The table shows both the single and compound forms for each
17419property that has them.  If the left column is a short name for a property,
17420the right column will give its longer, more descriptive name; and if the left
17421column is the longest name, the right column will show any equivalent shortest
17422name, in both single and compound forms if applicable.
17423
17424If braces are not needed to specify a property (e.g., C<\\pL>), the left
17425column contains both forms, with and without braces.
17426
17427The right column will also caution you if a property means something different
17428than what might normally be expected.
17429
17430All single forms are Perl extensions; a few compound forms are as well, and
17431are noted as such.
17432
17433Numbers in (parentheses) indicate the total number of Unicode code points
17434matched by the property.  For the entries that give the longest, most
17435descriptive version of the property, the count is followed by a list of some
17436of the code points matched by it.  The list includes all the matched
17437characters in the 0-255 range, enclosed in the familiar [brackets] the same as
17438a regular expression bracketed character class.  Following that, the next few
17439higher matching ranges are also given.  To avoid visual ambiguity, the SPACE
17440character is represented as C<\\x$space_hex>.
17441
17442For emphasis, those properties that match no code points at all are listed as
17443well in a separate section following the table.
17444
17445Most properties match the same code points regardless of whether C<"/i">
17446case-insensitive matching is specified or not.  But a few properties are
17447affected.  These are shown with the notation S<C<(/i= I<other_property>)>>
17448in the second column.  Under case-insensitive matching they match the
17449same code pode points as the property I<other_property>.
17450
17451There is no description given for most non-Perl defined properties (See
17452L<$unicode_reference_url> for that).
17453
17454For compactness, 'B<*>' is used as a wildcard instead of showing all possible
17455combinations.  For example, entries like:
17456
17457 \\p{Gc: *}                                  \\p{General_Category: *}
17458
17459mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
17460for the latter is also valid for the former.  Similarly,
17461
17462 \\p{Is_*}                                   \\p{*}
17463
17464means that if and only if, for example, C<\\p{Foo}> exists, then
17465C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
17466And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
17467C<\\p{IsFoo=Bar}>.  "*" here is restricted to something not beginning with an
17468underscore.
17469
17470Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
17471And 'No', 'F', and 'False' are all synonyms for 'N'.  The table shows 'Y*' and
17472'N*' to indicate this, and doesn't have separate entries for the other
17473possibilities.  Note that not all properties which have values 'Yes' and 'No'
17474are binary, and they have all their values spelled out without using this wild
17475card, and a C<NOT> clause in their description that highlights their not being
17476binary.  These also require the compound form to match them, whereas true
17477binary properties have both single and compound forms available.
17478
17479Note that all non-essential underscores are removed in the display of the
17480short names below.
17481
17482B<Legend summary:>
17483
17484=over 4
17485
17486=item Z<>B<*> is a wild-card
17487
17488=item B<(\\d+)> in the info column gives the number of Unicode code points matched
17489by this property.
17490
17491=item B<$DEPRECATED> means this is deprecated.
17492
17493=item B<$OBSOLETE> means this is obsolete.
17494
17495=item B<$STABILIZED> means this is stabilized.
17496
17497=item B<$STRICTER> means tighter (stricter) name matching applies.
17498
17499=item B<$DISCOURAGED> means use of this form is discouraged, and may not be
17500stable.
17501
17502=back
17503
17504$formatted_properties
17505
17506$zero_matches
17507
17508=head1 Properties accessible through Unicode::UCD
17509
17510The value of any Unicode (not including Perl extensions) character
17511property mentioned above for any single code point is available through
17512L<Unicode::UCD/charprop()>.  L<Unicode::UCD/charprops_all()> returns the
17513values of all the Unicode properties for a given code point.
17514
17515Besides these, all the Unicode character properties mentioned above
17516(except for those marked as for internal use by Perl) are also
17517accessible by L<Unicode::UCD/prop_invlist()>.
17518
17519Due to their nature, not all Unicode character properties are suitable for
17520regular expression matches, nor C<prop_invlist()>.  The remaining
17521non-provisional, non-internal ones are accessible via
17522L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation
17523hasn't included; see L<below for which those are|/Unicode character properties
17524that are NOT accepted by Perl>).
17525
17526For compatibility with other parts of Perl, all the single forms given in the
17527table in the L<section above|/Properties accessible through \\p{} and \\P{}>
17528are recognized.  BUT, there are some ambiguities between some Perl extensions
17529and the Unicode properties, all of which are silently resolved in favor of the
17530official Unicode property.  To avoid surprises, you should only use
17531C<prop_invmap()> for forms listed in the table below, which omits the
17532non-recommended ones.  The affected forms are the Perl single form equivalents
17533of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of
17534C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property,
17535whose short name is C<sc>.  The table indicates the current ambiguities in the
17536INFO column, beginning with the word C<"NOT">.
17537
17538The standard Unicode properties listed below are documented in
17539L<$unicode_reference_url>; Perl_Decimal_Digit is documented in
17540L<Unicode::UCD/prop_invmap()>.  The other Perl extensions are in
17541L<perlunicode/Other Properties>;
17542
17543The first column in the table is a name for the property; the second column is
17544an alternative name, if any, plus possibly some annotations.  The alternative
17545name is the property's full name, unless that would simply repeat the first
17546column, in which case the second column indicates the property's short name
17547(if different).  The annotations are given only in the entry for the full
17548name.  The annotations for binary properties include a list of the first few
17549ranges that the property matches.  To avoid any ambiguity, the SPACE character
17550is represented as C<\\x$space_hex>.
17551
17552If a property is obsolete, etc, the entry will be flagged with the same
17553characters used in the table in the L<section above|/Properties accessible
17554through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>.
17555
17556$ucd_pod
17557
17558=head1 Properties accessible through other means
17559
17560Certain properties are accessible also via core function calls.  These are:
17561
17562 Lowercase_Mapping          lc() and lcfirst()
17563 Titlecase_Mapping          ucfirst()
17564 Uppercase_Mapping          uc()
17565
17566Also, Case_Folding is accessible through the C</i> modifier in regular
17567expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>>
17568operator.
17569
17570And, the Name and Name_Aliases properties are accessible through the C<\\N{}>
17571interpolation in double-quoted strings and regular expressions; and functions
17572C<charnames::viacode()>, C<charnames::vianame()>, and
17573C<charnames::string_vianame()> (which require a C<use charnames ();> to be
17574specified.
17575
17576Finally, most properties related to decomposition are accessible via
17577L<Unicode::Normalize>.
17578
17579=head1 Unicode character properties that are NOT accepted by Perl
17580
17581Perl will generate an error for a few character properties in Unicode when
17582used in a regular expression.  The non-Unihan ones are listed below, with the
17583reasons they are not accepted, perhaps with work-arounds.  The short names for
17584the properties are listed enclosed in (parentheses).
17585As described after the list, an installation can change the defaults and choose
17586to accept any of these.  The list is machine generated based on the
17587choices made for the installation that generated this document.
17588
17589@bad_re_properties
17590
17591An installation can choose to allow any of these to be matched by downloading
17592the Unicode database from L<http://www.unicode.org/Public/> to
17593C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
17594controlling lists contained in the program
17595C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
17596(C<\%Config> is available from the Config module).
17597
17598Also, perl can be recompiled to operate on an earlier version of the Unicode
17599standard.  Further information is at
17600C<\$Config{privlib}>/F<unicore/README.perl>.
17601
17602=head1 Other information in the Unicode data base
17603
17604The Unicode data base is delivered in two different formats.  The XML version
17605is valid for more modern Unicode releases.  The other version is a collection
17606of files.  The two are intended to give equivalent information.  Perl uses the
17607older form; this allows you to recompile Perl to use early Unicode releases.
17608
17609The only non-character property that Perl currently supports is Named
17610Sequences, in which a sequence of code points
17611is given a name and generally treated as a single entity.  (Perl supports
17612these via the C<\\N{...}> double-quotish construct,
17613L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>.
17614
17615Below is a list of the files in the Unicode data base that Perl doesn't
17616currently use, along with very brief descriptions of their purposes.
17617Some of the names of the files have been shortened from those that Unicode
17618uses, in order to allow them to be distinguishable from similarly named files
17619on file systems for which only the first 8 characters of a name are
17620significant.
17621
17622=over 4
17623
17624@unused_files
17625
17626=back
17627
17628=head1 SEE ALSO
17629
17630L<$unicode_reference_url>
17631
17632L<perlrecharclass>
17633
17634L<perlunicode>
17635
17636END
17637
17638    # And write it.  The 0 means no utf8.
17639    main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
17640    return;
17641}
17642
17643sub make_Heavy () {
17644    # Create and write Heavy.pl, which passes info about the tables to
17645    # utf8_heavy.pl
17646
17647    # Stringify structures for output
17648    my $loose_property_name_of
17649                           = simple_dumper(\%loose_property_name_of, ' ' x 4);
17650    chomp $loose_property_name_of;
17651
17652    my $strict_property_name_of
17653                           = simple_dumper(\%strict_property_name_of, ' ' x 4);
17654    chomp $strict_property_name_of;
17655
17656    my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
17657    chomp $stricter_to_file_of;
17658
17659    my $inline_definitions = simple_dumper(\@inline_definitions, " " x 4);
17660    chomp $inline_definitions;
17661
17662    my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
17663    chomp $loose_to_file_of;
17664
17665    my $nv_floating_to_rational
17666                           = simple_dumper(\%nv_floating_to_rational, ' ' x 4);
17667    chomp $nv_floating_to_rational;
17668
17669    my $why_deprecated = simple_dumper(\%utf8::why_deprecated, ' ' x 4);
17670    chomp $why_deprecated;
17671
17672    # We set the key to the file when we associated files with tables, but we
17673    # couldn't do the same for the value then, as we might not have the file
17674    # for the alternate table figured out at that time.
17675    foreach my $cased (keys %caseless_equivalent_to) {
17676        my @path = $caseless_equivalent_to{$cased}->file_path;
17677        my $path;
17678        if ($path[0] eq "#") {  # Pseudo-directory '#'
17679            $path = join '/', @path;
17680        }
17681        else {  # Gets rid of lib/
17682            $path = join '/', @path[1, -1];
17683        }
17684        $caseless_equivalent_to{$cased} = $path;
17685    }
17686    my $caseless_equivalent_to
17687                           = simple_dumper(\%caseless_equivalent_to, ' ' x 4);
17688    chomp $caseless_equivalent_to;
17689
17690    my $loose_property_to_file_of
17691                        = simple_dumper(\%loose_property_to_file_of, ' ' x 4);
17692    chomp $loose_property_to_file_of;
17693
17694    my $strict_property_to_file_of
17695                        = simple_dumper(\%strict_property_to_file_of, ' ' x 4);
17696    chomp $strict_property_to_file_of;
17697
17698    my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
17699    chomp $file_to_swash_name;
17700
17701    my @heavy = <<END;
17702$HEADER
17703$INTERNAL_ONLY_HEADER
17704
17705# This file is for the use of utf8_heavy.pl and Unicode::UCD
17706
17707# Maps Unicode (not Perl single-form extensions) property names in loose
17708# standard form to their corresponding standard names
17709\%utf8::loose_property_name_of = (
17710$loose_property_name_of
17711);
17712
17713# Same, but strict names
17714\%utf8::strict_property_name_of = (
17715$strict_property_name_of
17716);
17717
17718# Gives the definitions (in the form of inversion lists) for those properties
17719# whose definitions aren't kept in files
17720\@utf8::inline_definitions = (
17721$inline_definitions
17722);
17723
17724# Maps property, table to file for those using stricter matching.  For paths
17725# whose directory is '#', the file is in the form of a numeric index into
17726# \@inline_definitions
17727\%utf8::stricter_to_file_of = (
17728$stricter_to_file_of
17729);
17730
17731# Maps property, table to file for those using loose matching.  For paths
17732# whose directory is '#', the file is in the form of a numeric index into
17733# \@inline_definitions
17734\%utf8::loose_to_file_of = (
17735$loose_to_file_of
17736);
17737
17738# Maps floating point to fractional form
17739\%utf8::nv_floating_to_rational = (
17740$nv_floating_to_rational
17741);
17742
17743# If a floating point number doesn't have enough digits in it to get this
17744# close to a fraction, it isn't considered to be that fraction even if all the
17745# digits it does have match.
17746\$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
17747
17748# Deprecated tables to generate a warning for.  The key is the file containing
17749# the table, so as to avoid duplication, as many property names can map to the
17750# file, but we only need one entry for all of them.
17751\%utf8::why_deprecated = (
17752$why_deprecated
17753);
17754
17755# A few properties have different behavior under /i matching.  This maps
17756# those to substitute files to use under /i.
17757\%utf8::caseless_equivalent = (
17758$caseless_equivalent_to
17759);
17760
17761# Property names to mapping files
17762\%utf8::loose_property_to_file_of = (
17763$loose_property_to_file_of
17764);
17765
17766# Property names to mapping files
17767\%utf8::strict_property_to_file_of = (
17768$strict_property_to_file_of
17769);
17770
17771# Files to the swash names within them.
17772\%utf8::file_to_swash_name = (
17773$file_to_swash_name
17774);
17775
177761;
17777END
17778
17779    main::write("Heavy.pl", 0, \@heavy);  # The 0 means no utf8.
17780    return;
17781}
17782
17783sub make_Name_pm () {
17784    # Create and write Name.pm, which contains subroutines and data to use in
17785    # conjunction with Name.pl
17786
17787    # Maybe there's nothing to do.
17788    return unless $has_hangul_syllables || @code_points_ending_in_code_point;
17789
17790    my @name = <<END;
17791$HEADER
17792$INTERNAL_ONLY_HEADER
17793END
17794
17795    # Convert these structures to output format.
17796    my $code_points_ending_in_code_point =
17797        main::simple_dumper(\@code_points_ending_in_code_point,
17798                            ' ' x 8);
17799    my $names = main::simple_dumper(\%names_ending_in_code_point,
17800                                    ' ' x 8);
17801    my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
17802                                    ' ' x 8);
17803
17804    # Do the same with the Hangul names,
17805    my $jamo;
17806    my $jamo_l;
17807    my $jamo_v;
17808    my $jamo_t;
17809    my $jamo_re;
17810    if ($has_hangul_syllables) {
17811
17812        # Construct a regular expression of all the possible
17813        # combinations of the Hangul syllables.
17814        my @L_re;   # Leading consonants
17815        for my $i ($LBase .. $LBase + $LCount - 1) {
17816            push @L_re, $Jamo{$i}
17817        }
17818        my @V_re;   # Middle vowels
17819        for my $i ($VBase .. $VBase + $VCount - 1) {
17820            push @V_re, $Jamo{$i}
17821        }
17822        my @T_re;   # Trailing consonants
17823        for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
17824            push @T_re, $Jamo{$i}
17825        }
17826
17827        # The whole re is made up of the L V T combination.
17828        $jamo_re = '('
17829                    . join ('|', sort @L_re)
17830                    . ')('
17831                    . join ('|', sort @V_re)
17832                    . ')('
17833                    . join ('|', sort @T_re)
17834                    . ')?';
17835
17836        # These hashes needed by the algorithm were generated
17837        # during reading of the Jamo.txt file
17838        $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
17839        $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
17840        $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
17841        $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
17842    }
17843
17844    push @name, <<END;
17845
17846package charnames;
17847
17848# This module contains machine-generated tables and code for the
17849# algorithmically-determinable Unicode character names.  The following
17850# routines can be used to translate between name and code point and vice versa
17851
17852{ # Closure
17853
17854    # Matches legal code point.  4-6 hex numbers, If there are 6, the first
17855    # two must be 10; if there are 5, the first must not be a 0.  Written this
17856    # way to decrease backtracking.  The first regex allows the code point to
17857    # be at the end of a word, but to work properly, the word shouldn't end
17858    # with a valid hex character.  The second one won't match a code point at
17859    # the end of a word, and doesn't have the run-on issue
17860    my \$run_on_code_point_re = qr/$run_on_code_point_re/;
17861    my \$code_point_re = qr/$code_point_re/;
17862
17863    # In the following hash, the keys are the bases of names which include
17864    # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The value
17865    # of each key is another hash which is used to get the low and high ends
17866    # for each range of code points that apply to the name.
17867    my %names_ending_in_code_point = (
17868$names
17869    );
17870
17871    # The following hash is a copy of the previous one, except is for loose
17872    # matching, so each name has blanks and dashes squeezed out
17873    my %loose_names_ending_in_code_point = (
17874$loose_names
17875    );
17876
17877    # And the following array gives the inverse mapping from code points to
17878    # names.  Lowest code points are first
17879    my \@code_points_ending_in_code_point = (
17880$code_points_ending_in_code_point
17881    );
17882END
17883    # Earlier releases didn't have Jamos.  No sense outputting
17884    # them unless will be used.
17885    if ($has_hangul_syllables) {
17886        push @name, <<END;
17887
17888    # Convert from code point to Jamo short name for use in composing Hangul
17889    # syllable names
17890    my %Jamo = (
17891$jamo
17892    );
17893
17894    # Leading consonant (can be null)
17895    my %Jamo_L = (
17896$jamo_l
17897    );
17898
17899    # Vowel
17900    my %Jamo_V = (
17901$jamo_v
17902    );
17903
17904    # Optional trailing consonant
17905    my %Jamo_T = (
17906$jamo_t
17907    );
17908
17909    # Computed re that splits up a Hangul name into LVT or LV syllables
17910    my \$syllable_re = qr/$jamo_re/;
17911
17912    my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
17913    my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
17914
17915    # These constants names and values were taken from the Unicode standard,
17916    # version 5.1, section 3.12.  They are used in conjunction with Hangul
17917    # syllables
17918    my \$SBase = $SBase_string;
17919    my \$LBase = $LBase_string;
17920    my \$VBase = $VBase_string;
17921    my \$TBase = $TBase_string;
17922    my \$SCount = $SCount;
17923    my \$LCount = $LCount;
17924    my \$VCount = $VCount;
17925    my \$TCount = $TCount;
17926    my \$NCount = \$VCount * \$TCount;
17927END
17928    } # End of has Jamos
17929
17930    push @name, << 'END';
17931
17932    sub name_to_code_point_special {
17933        my ($name, $loose) = @_;
17934
17935        # Returns undef if not one of the specially handled names; otherwise
17936        # returns the code point equivalent to the input name
17937        # $loose is non-zero if to use loose matching, 'name' in that case
17938        # must be input as upper case with all blanks and dashes squeezed out.
17939END
17940    if ($has_hangul_syllables) {
17941        push @name, << 'END';
17942
17943        if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
17944            || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
17945        {
17946            return if $name !~ qr/^$syllable_re$/;
17947            my $L = $Jamo_L{$1};
17948            my $V = $Jamo_V{$2};
17949            my $T = (defined $3) ? $Jamo_T{$3} : 0;
17950            return ($L * $VCount + $V) * $TCount + $T + $SBase;
17951        }
17952END
17953    }
17954    push @name, << 'END';
17955
17956        # Name must end in 'code_point' for this to handle.
17957        return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
17958                   || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
17959
17960        my $base = $1;
17961        my $code_point = CORE::hex $2;
17962        my $names_ref;
17963
17964        if ($loose) {
17965            $names_ref = \%loose_names_ending_in_code_point;
17966        }
17967        else {
17968            return if $base !~ s/-$//;
17969            $names_ref = \%names_ending_in_code_point;
17970        }
17971
17972        # Name must be one of the ones which has the code point in it.
17973        return if ! $names_ref->{$base};
17974
17975        # Look through the list of ranges that apply to this name to see if
17976        # the code point is in one of them.
17977        for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
17978            return if $names_ref->{$base}{'low'}->[$i] > $code_point;
17979            next if $names_ref->{$base}{'high'}->[$i] < $code_point;
17980
17981            # Here, the code point is in the range.
17982            return $code_point;
17983        }
17984
17985        # Here, looked like the name had a code point number in it, but
17986        # did not match one of the valid ones.
17987        return;
17988    }
17989
17990    sub code_point_to_name_special {
17991        my $code_point = shift;
17992
17993        # Returns the name of a code point if algorithmically determinable;
17994        # undef if not
17995END
17996    if ($has_hangul_syllables) {
17997        push @name, << 'END';
17998
17999        # If in the Hangul range, calculate the name based on Unicode's
18000        # algorithm
18001        if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
18002            use integer;
18003            my $SIndex = $code_point - $SBase;
18004            my $L = $LBase + $SIndex / $NCount;
18005            my $V = $VBase + ($SIndex % $NCount) / $TCount;
18006            my $T = $TBase + $SIndex % $TCount;
18007            $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
18008            $name .= $Jamo{$T} if $T != $TBase;
18009            return $name;
18010        }
18011END
18012    }
18013    push @name, << 'END';
18014
18015        # Look through list of these code points for one in range.
18016        foreach my $hash (@code_points_ending_in_code_point) {
18017            return if $code_point < $hash->{'low'};
18018            if ($code_point <= $hash->{'high'}) {
18019                return sprintf("%s-%04X", $hash->{'name'}, $code_point);
18020            }
18021        }
18022        return;            # None found
18023    }
18024} # End closure
18025
180261;
18027END
18028
18029    main::write("Name.pm", 0, \@name);  # The 0 means no utf8.
18030    return;
18031}
18032
18033sub make_UCD () {
18034    # Create and write UCD.pl, which passes info about the tables to
18035    # Unicode::UCD
18036
18037    # Create a mapping from each alias of Perl single-form extensions to all
18038    # its equivalent aliases, for quick look-up.
18039    my %perlprop_to_aliases;
18040    foreach my $table ($perl->tables) {
18041
18042        # First create the list of the aliases of each extension
18043        my @aliases_list;    # List of legal aliases for this extension
18044
18045        my $table_name = $table->name;
18046        my $standard_table_name = standardize($table_name);
18047        my $table_full_name = $table->full_name;
18048        my $standard_table_full_name = standardize($table_full_name);
18049
18050        # Make sure that the list has both the short and full names
18051        push @aliases_list, $table_name, $table_full_name;
18052
18053        my $found_ucd = 0;  # ? Did we actually get an alias that should be
18054                            # output for this table
18055
18056        # Go through all the aliases (including the two just added), and add
18057        # any new unique ones to the list
18058        foreach my $alias ($table->aliases) {
18059
18060            # Skip non-legal names
18061            next unless $alias->ok_as_filename;
18062            next unless $alias->ucd;
18063
18064            $found_ucd = 1;     # have at least one legal name
18065
18066            my $name = $alias->name;
18067            my $standard = standardize($name);
18068
18069            # Don't repeat a name that is equivalent to one already on the
18070            # list
18071            next if $standard eq $standard_table_name;
18072            next if $standard eq $standard_table_full_name;
18073
18074            push @aliases_list, $name;
18075        }
18076
18077        # If there were no legal names, don't output anything.
18078        next unless $found_ucd;
18079
18080        # To conserve memory in the program reading these in, omit full names
18081        # that are identical to the short name, when those are the only two
18082        # aliases for the property.
18083        if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) {
18084            pop @aliases_list;
18085        }
18086
18087        # Here, @aliases_list is the list of all the aliases that this
18088        # extension legally has.  Now can create a map to it from each legal
18089        # standardized alias
18090        foreach my $alias ($table->aliases) {
18091            next unless $alias->ucd;
18092            next unless $alias->ok_as_filename;
18093            push @{$perlprop_to_aliases{standardize($alias->name)}},
18094                 @aliases_list;
18095        }
18096    }
18097
18098    # Make a list of all combinations of properties/values that are suppressed.
18099    my @suppressed;
18100    if (! $debug_skip) {    # This tends to fail in this debug mode
18101        foreach my $property_name (keys %why_suppressed) {
18102
18103            # Just the value
18104            my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
18105
18106            # The hash may contain properties not in this release of Unicode
18107            next unless defined (my $property = property_ref($property_name));
18108
18109            # Find all combinations
18110            foreach my $prop_alias ($property->aliases) {
18111                my $prop_alias_name = standardize($prop_alias->name);
18112
18113                # If no =value, there's just one combination possible for this
18114                if (! $value_name) {
18115
18116                    # The property may be suppressed, but there may be a proxy
18117                    # for it, so it shouldn't be listed as suppressed
18118                    next if $prop_alias->ucd;
18119                    push @suppressed, $prop_alias_name;
18120                }
18121                else {  # Otherwise
18122                    foreach my $value_alias
18123                                    ($property->table($value_name)->aliases)
18124                    {
18125                        next if $value_alias->ucd;
18126
18127                        push @suppressed, "$prop_alias_name="
18128                                        .  standardize($value_alias->name);
18129                    }
18130                }
18131            }
18132        }
18133    }
18134    @suppressed = sort @suppressed; # So doesn't change between runs of this
18135                                    # program
18136
18137    # Convert the structure below (designed for Name.pm) to a form that UCD
18138    # wants, so it doesn't have to modify it at all; i.e. so that it includes
18139    # an element for the Hangul syllables in the appropriate place, and
18140    # otherwise changes the name to include the "-<code point>" suffix.
18141    my @algorithm_names;
18142    my $done_hangul = $v_version lt v2.0.0;  # Hanguls as we know them came
18143                                             # along in this version
18144    # Copy it linearly.
18145    for my $i (0 .. @code_points_ending_in_code_point - 1) {
18146
18147        # Insert the hanguls in the correct place.
18148        if (! $done_hangul
18149            && $code_points_ending_in_code_point[$i]->{'low'} > $SBase)
18150        {
18151            $done_hangul = 1;
18152            push @algorithm_names, { low => $SBase,
18153                                     high => $SBase + $SCount - 1,
18154                                     name => '<hangul syllable>',
18155                                    };
18156        }
18157
18158        # Copy the current entry, modified.
18159        push @algorithm_names, {
18160            low => $code_points_ending_in_code_point[$i]->{'low'},
18161            high => $code_points_ending_in_code_point[$i]->{'high'},
18162            name =>
18163               "$code_points_ending_in_code_point[$i]->{'name'}-<code point>",
18164        };
18165    }
18166
18167    # Serialize these structures for output.
18168    my $loose_to_standard_value
18169                          = simple_dumper(\%loose_to_standard_value, ' ' x 4);
18170    chomp $loose_to_standard_value;
18171
18172    my $string_property_loose_to_name
18173                    = simple_dumper(\%string_property_loose_to_name, ' ' x 4);
18174    chomp $string_property_loose_to_name;
18175
18176    my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4);
18177    chomp $perlprop_to_aliases;
18178
18179    my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4);
18180    chomp $prop_aliases;
18181
18182    my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4);
18183    chomp $prop_value_aliases;
18184
18185    my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : "";
18186    chomp $suppressed;
18187
18188    my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4);
18189    chomp $algorithm_names;
18190
18191    my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4);
18192    chomp $ambiguous_names;
18193
18194    my $combination_property = simple_dumper(\%combination_property, ' ' x 4);
18195    chomp $combination_property;
18196
18197    my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4);
18198    chomp $loose_defaults;
18199
18200    my @ucd = <<END;
18201$HEADER
18202$INTERNAL_ONLY_HEADER
18203
18204# This file is for the use of Unicode::UCD
18205
18206# Highest legal Unicode code point
18207\$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING;
18208
18209# Hangul syllables
18210\$Unicode::UCD::HANGUL_BEGIN = $SBase_string;
18211\$Unicode::UCD::HANGUL_COUNT = $SCount;
18212
18213# Keys are all the possible "prop=value" combinations, in loose form; values
18214# are the standard loose name for the 'value' part of the key
18215\%Unicode::UCD::loose_to_standard_value = (
18216$loose_to_standard_value
18217);
18218
18219# String property loose names to standard loose name
18220\%Unicode::UCD::string_property_loose_to_name = (
18221$string_property_loose_to_name
18222);
18223
18224# Keys are Perl extensions in loose form; values are each one's list of
18225# aliases
18226\%Unicode::UCD::loose_perlprop_to_name = (
18227$perlprop_to_aliases
18228);
18229
18230# Keys are standard property name; values are each one's aliases
18231\%Unicode::UCD::prop_aliases = (
18232$prop_aliases
18233);
18234
18235# Keys of top level are standard property name; values are keys to another
18236# hash,  Each one is one of the property's values, in standard form.  The
18237# values are that prop-val's aliases.  If only one specified, the short and
18238# long alias are identical.
18239\%Unicode::UCD::prop_value_aliases = (
18240$prop_value_aliases
18241);
18242
18243# Ordered (by code point ordinal) list of the ranges of code points whose
18244# names are algorithmically determined.  Each range entry is an anonymous hash
18245# of the start and end points and a template for the names within it.
18246\@Unicode::UCD::algorithmic_named_code_points = (
18247$algorithm_names
18248);
18249
18250# The properties that as-is have two meanings, and which must be disambiguated
18251\%Unicode::UCD::ambiguous_names = (
18252$ambiguous_names
18253);
18254
18255# Keys are the prop-val combinations which are the default values for the
18256# given property, expressed in standard loose form
18257\%Unicode::UCD::loose_defaults = (
18258$loose_defaults
18259);
18260
18261# The properties that are combinations, in that they have both a map table and
18262# a match table.  This is actually for UCD.t, so it knows how to test for
18263# these.
18264\%Unicode::UCD::combination_property = (
18265$combination_property
18266);
18267
18268# All combinations of names that are suppressed.
18269# This is actually for UCD.t, so it knows which properties shouldn't have
18270# entries.  If it got any bigger, would probably want to put it in its own
18271# file to use memory only when it was needed, in testing.
18272\@Unicode::UCD::suppressed_properties = (
18273$suppressed
18274);
18275
182761;
18277END
18278
18279    main::write("UCD.pl", 0, \@ucd);  # The 0 means no utf8.
18280    return;
18281}
18282
18283sub write_all_tables() {
18284    # Write out all the tables generated by this program to files, as well as
18285    # the supporting data structures, pod file, and .t file.
18286
18287    my @writables;              # List of tables that actually get written
18288    my %match_tables_to_write;  # Used to collapse identical match tables
18289                                # into one file.  Each key is a hash function
18290                                # result to partition tables into buckets.
18291                                # Each value is an array of the tables that
18292                                # fit in the bucket.
18293
18294    # For each property ...
18295    # (sort so that if there is an immutable file name, it has precedence, so
18296    # some other property can't come in and take over its file name.  (We
18297    # don't care if both defined, as they had better be different anyway.)
18298    # The property named 'Perl' needs to be first (it doesn't have any
18299    # immutable file name) because empty properties are defined in terms of
18300    # its table named 'All' under the -annotate option.)   We also sort by
18301    # the property's name.  This is just for repeatability of the outputs
18302    # between runs of this program, but does not affect correctness.
18303    PROPERTY:
18304    foreach my $property ($perl,
18305                          sort { return -1 if defined $a->file;
18306                                 return 1 if defined $b->file;
18307                                 return $a->name cmp $b->name;
18308                                } grep { $_ != $perl } property_ref('*'))
18309    {
18310        my $type = $property->type;
18311
18312        # And for each table for that property, starting with the mapping
18313        # table for it ...
18314        TABLE:
18315        foreach my $table($property,
18316
18317                        # and all the match tables for it (if any), sorted so
18318                        # the ones with the shortest associated file name come
18319                        # first.  The length sorting prevents problems of a
18320                        # longer file taking a name that might have to be used
18321                        # by a shorter one.  The alphabetic sorting prevents
18322                        # differences between releases
18323                        sort {  my $ext_a = $a->external_name;
18324                                return 1 if ! defined $ext_a;
18325                                my $ext_b = $b->external_name;
18326                                return -1 if ! defined $ext_b;
18327
18328                                # But return the non-complement table before
18329                                # the complement one, as the latter is defined
18330                                # in terms of the former, and needs to have
18331                                # the information for the former available.
18332                                return 1 if $a->complement != 0;
18333                                return -1 if $b->complement != 0;
18334
18335                                # Similarly, return a subservient table after
18336                                # a leader
18337                                return 1 if $a->leader != $a;
18338                                return -1 if $b->leader != $b;
18339
18340                                my $cmp = length $ext_a <=> length $ext_b;
18341
18342                                # Return result if lengths not equal
18343                                return $cmp if $cmp;
18344
18345                                # Alphabetic if lengths equal
18346                                return $ext_a cmp $ext_b
18347                        } $property->tables
18348                    )
18349        {
18350
18351            # Here we have a table associated with a property.  It could be
18352            # the map table (done first for each property), or one of the
18353            # other tables.  Determine which type.
18354            my $is_property = $table->isa('Property');
18355
18356            my $name = $table->name;
18357            my $complete_name = $table->complete_name;
18358
18359            # See if should suppress the table if is empty, but warn if it
18360            # contains something.
18361            my $suppress_if_empty_warn_if_not
18362                    = $why_suppress_if_empty_warn_if_not{$complete_name} || 0;
18363
18364            # Calculate if this table should have any code points associated
18365            # with it or not.
18366            my $expected_empty =
18367
18368                # $perl should be empty
18369                ($is_property && ($table == $perl))
18370
18371                # Match tables in properties we skipped populating should be
18372                # empty
18373                || (! $is_property && ! $property->to_create_match_tables)
18374
18375                # Tables and properties that are expected to have no code
18376                # points should be empty
18377                || $suppress_if_empty_warn_if_not
18378            ;
18379
18380            # Set a boolean if this table is the complement of an empty binary
18381            # table
18382            my $is_complement_of_empty_binary =
18383                $type == $BINARY &&
18384                (($table == $property->table('Y')
18385                    && $property->table('N')->is_empty)
18386                || ($table == $property->table('N')
18387                    && $property->table('Y')->is_empty));
18388
18389            if ($table->is_empty) {
18390
18391                if ($suppress_if_empty_warn_if_not) {
18392                    $table->set_fate($SUPPRESSED,
18393                                     $suppress_if_empty_warn_if_not);
18394                }
18395
18396                # Suppress (by skipping them) expected empty tables.
18397                next TABLE if $expected_empty;
18398
18399                # And setup to later output a warning for those that aren't
18400                # known to be allowed to be empty.  Don't do the warning if
18401                # this table is a child of another one to avoid duplicating
18402                # the warning that should come from the parent one.
18403                if (($table == $property || $table->parent == $table)
18404                    && $table->fate != $SUPPRESSED
18405                    && $table->fate != $MAP_PROXIED
18406                    && ! grep { $complete_name =~ /^$_$/ }
18407                                                    @tables_that_may_be_empty)
18408                {
18409                    push @unhandled_properties, "$table";
18410                }
18411
18412                # The old way of expressing an empty match list was to
18413                # complement the list that matches everything.  The new way is
18414                # to create an empty inversion list, but this doesn't work for
18415                # annotating, so use the old way then.
18416                $table->set_complement($All) if $annotate
18417                                                && $table != $property;
18418            }
18419            elsif ($expected_empty) {
18420                my $because = "";
18421                if ($suppress_if_empty_warn_if_not) {
18422                    $because = " because $suppress_if_empty_warn_if_not";
18423                }
18424
18425                Carp::my_carp("Not expecting property $table$because.  Generating file for it anyway.");
18426            }
18427
18428            # Some tables should match everything
18429            my $expected_full =
18430                ($table->fate == $SUPPRESSED)
18431                ? 0
18432                : ($is_property)
18433                  ? # All these types of map tables will be full because
18434                    # they will have been populated with defaults
18435                    ($type == $ENUM)
18436
18437                  : # A match table should match everything if its method
18438                    # shows it should
18439                    ($table->matches_all
18440
18441                    # The complement of an empty binary table will match
18442                    # everything
18443                    || $is_complement_of_empty_binary
18444                    )
18445            ;
18446
18447            my $count = $table->count;
18448            if ($expected_full) {
18449                if ($count != $MAX_WORKING_CODEPOINTS) {
18450                    Carp::my_carp("$table matches only "
18451                    . clarify_number($count)
18452                    . " Unicode code points but should match "
18453                    . clarify_number($MAX_WORKING_CODEPOINTS)
18454                    . " (off by "
18455                    .  clarify_number(abs($MAX_WORKING_CODEPOINTS - $count))
18456                    . ").  Proceeding anyway.");
18457                }
18458
18459                # Here is expected to be full.  If it is because it is the
18460                # complement of an (empty) binary table that is to be
18461                # suppressed, then suppress this one as well.
18462                if ($is_complement_of_empty_binary) {
18463                    my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
18464                    my $opposing = $property->table($opposing_name);
18465                    my $opposing_status = $opposing->status;
18466                    if ($opposing_status) {
18467                        $table->set_status($opposing_status,
18468                                           $opposing->status_info);
18469                    }
18470                }
18471            }
18472            elsif ($count == $MAX_UNICODE_CODEPOINTS
18473                   && $name ne "Any"
18474                   && ($table == $property || $table->leader == $table)
18475                   && $table->property->status ne $NORMAL)
18476            {
18477                    Carp::my_carp("$table unexpectedly matches all Unicode code points.  Proceeding anyway.");
18478            }
18479
18480            if ($table->fate >= $SUPPRESSED) {
18481                if (! $is_property) {
18482                    my @children = $table->children;
18483                    foreach my $child (@children) {
18484                        if ($child->fate < $SUPPRESSED) {
18485                            Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
18486                        }
18487                    }
18488                }
18489                next TABLE;
18490
18491            }
18492
18493            if (! $is_property) {
18494
18495                make_ucd_table_pod_entries($table) if $table->property == $perl;
18496
18497                # Several things need to be done just once for each related
18498                # group of match tables.  Do them on the parent.
18499                if ($table->parent == $table) {
18500
18501                    # Add an entry in the pod file for the table; it also does
18502                    # the children.
18503                    make_re_pod_entries($table) if defined $pod_directory;
18504
18505                    # See if the table matches identical code points with
18506                    # something that has already been processed and is ready
18507                    # for output.  In that case, no need to have two files
18508                    # with the same code points in them.  We use the table's
18509                    # hash() method to store these in buckets, so that it is
18510                    # quite likely that if two tables are in the same bucket
18511                    # they will be identical, so don't have to compare tables
18512                    # frequently.  The tables have to have the same status to
18513                    # share a file, so add this to the bucket hash.  (The
18514                    # reason for this latter is that Heavy.pl associates a
18515                    # status with a file.) We don't check tables that are
18516                    # inverses of others, as it would lead to some coding
18517                    # complications, and checking all the regular ones should
18518                    # find everything.
18519                    if ($table->complement == 0) {
18520                        my $hash = $table->hash . ';' . $table->status;
18521
18522                        # Look at each table that is in the same bucket as
18523                        # this one would be.
18524                        foreach my $comparison
18525                                            (@{$match_tables_to_write{$hash}})
18526                        {
18527                            # If the table doesn't point back to this one, we
18528                            # see if it matches identically
18529                            if (   $comparison->leader != $table
18530                                && $table->matches_identically_to($comparison))
18531                            {
18532                                $table->set_equivalent_to($comparison,
18533                                                                Related => 0);
18534                                next TABLE;
18535                            }
18536                        }
18537
18538                        # Here, not equivalent, add this table to the bucket.
18539                        push @{$match_tables_to_write{$hash}}, $table;
18540                    }
18541                }
18542            }
18543            else {
18544
18545                # Here is the property itself.
18546                # Don't write out or make references to the $perl property
18547                next if $table == $perl;
18548
18549                make_ucd_table_pod_entries($table);
18550
18551                # There is a mapping stored of the various synonyms to the
18552                # standardized name of the property for utf8_heavy.pl.
18553                # Also, the pod file contains entries of the form:
18554                # \p{alias: *}         \p{full: *}
18555                # rather than show every possible combination of things.
18556
18557                my @property_aliases = $property->aliases;
18558
18559                my $full_property_name = $property->full_name;
18560                my $property_name = $property->name;
18561                my $standard_property_name = standardize($property_name);
18562                my $standard_property_full_name
18563                                        = standardize($full_property_name);
18564
18565                # We also create for Unicode::UCD a list of aliases for
18566                # the property.  The list starts with the property name;
18567                # then its full name.  Legacy properties are not listed in
18568                # Unicode::UCD.
18569                my @property_list;
18570                my @standard_list;
18571                if ( $property->fate <= $MAP_PROXIED) {
18572                    @property_list = ($property_name, $full_property_name);
18573                    @standard_list = ($standard_property_name,
18574                                        $standard_property_full_name);
18575                }
18576
18577                # For each synonym ...
18578                for my $i (0 .. @property_aliases - 1)  {
18579                    my $alias = $property_aliases[$i];
18580                    my $alias_name = $alias->name;
18581                    my $alias_standard = standardize($alias_name);
18582
18583
18584                    # Add other aliases to the list of property aliases
18585                    if ($property->fate <= $MAP_PROXIED
18586                        && ! grep { $alias_standard eq $_ } @standard_list)
18587                    {
18588                        push @property_list, $alias_name;
18589                        push @standard_list, $alias_standard;
18590                    }
18591
18592                    # For utf8_heavy, set the mapping of the alias to the
18593                    # property
18594                    if ($type == $STRING) {
18595                        if ($property->fate <= $MAP_PROXIED) {
18596                            $string_property_loose_to_name{$alias_standard}
18597                                            = $standard_property_name;
18598                        }
18599                    }
18600                    else {
18601                        my $hash_ref = ($alias_standard =~ /^_/)
18602                                       ? \%strict_property_name_of
18603                                       : \%loose_property_name_of;
18604                        if (exists $hash_ref->{$alias_standard}) {
18605                            Carp::my_carp("There already is a property with the same standard name as $alias_name: $hash_ref->{$alias_standard}.  Old name is retained");
18606                        }
18607                        else {
18608                            $hash_ref->{$alias_standard}
18609                                                = $standard_property_name;
18610                        }
18611
18612                        # Now for the re pod entry for this alias.  Skip if not
18613                        # outputting a pod; skip the first one, which is the
18614                        # full name so won't have an entry like: '\p{full: *}
18615                        # \p{full: *}', and skip if don't want an entry for
18616                        # this one.
18617                        next if $i == 0
18618                                || ! defined $pod_directory
18619                                || ! $alias->make_re_pod_entry;
18620
18621                        my $rhs = "\\p{$full_property_name: *}";
18622                        if ($property != $perl && $table->perl_extension) {
18623                            $rhs .= ' (Perl extension)';
18624                        }
18625                        push @match_properties,
18626                            format_pod_line($indent_info_column,
18627                                        '\p{' . $alias->name . ': *}',
18628                                        $rhs,
18629                                        $alias->status);
18630                    }
18631                }
18632
18633                # The list of all possible names is attached to each alias, so
18634                # lookup is easy
18635                if (@property_list) {
18636                    push @{$prop_aliases{$standard_list[0]}}, @property_list;
18637                }
18638
18639                if ($property->fate <= $MAP_PROXIED) {
18640
18641                    # Similarly, we create for Unicode::UCD a list of
18642                    # property-value aliases.
18643
18644                    # Look at each table in the property...
18645                    foreach my $table ($property->tables) {
18646                        my @values_list;
18647                        my $table_full_name = $table->full_name;
18648                        my $standard_table_full_name
18649                                              = standardize($table_full_name);
18650                        my $table_name = $table->name;
18651                        my $standard_table_name = standardize($table_name);
18652
18653                        # The list starts with the table name and its full
18654                        # name.
18655                        push @values_list, $table_name, $table_full_name;
18656
18657                        # We add to the table each unique alias that isn't
18658                        # discouraged from use.
18659                        foreach my $alias ($table->aliases) {
18660                            next if $alias->status
18661                                 && $alias->status eq $DISCOURAGED;
18662                            my $name = $alias->name;
18663                            my $standard = standardize($name);
18664                            next if $standard eq $standard_table_name;
18665                            next if $standard eq $standard_table_full_name;
18666                            push @values_list, $name;
18667                        }
18668
18669                        # Here @values_list is a list of all the aliases for
18670                        # the table.  That is, all the property-values given
18671                        # by this table.  By agreement with Unicode::UCD,
18672                        # if the name and full name are identical, and there
18673                        # are no other names, drop the duplicate entry to save
18674                        # memory.
18675                        if (@values_list == 2
18676                            && $values_list[0] eq $values_list[1])
18677                        {
18678                            pop @values_list
18679                        }
18680
18681                        # To save memory, unlike the similar list for property
18682                        # aliases above, only the standard forms have the list.
18683                        # This forces an extra step of converting from input
18684                        # name to standard name, but the savings are
18685                        # considerable.  (There is only marginal savings if we
18686                        # did this with the property aliases.)
18687                        push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list;
18688                    }
18689                }
18690
18691                # Don't write out a mapping file if not desired.
18692                next if ! $property->to_output_map;
18693            }
18694
18695            # Here, we know we want to write out the table, but don't do it
18696            # yet because there may be other tables that come along and will
18697            # want to share the file, and the file's comments will change to
18698            # mention them.  So save for later.
18699            push @writables, $table;
18700
18701        } # End of looping through the property and all its tables.
18702    } # End of looping through all properties.
18703
18704    # Now have all the tables that will have files written for them.  Do it.
18705    foreach my $table (@writables) {
18706        my @directory;
18707        my $filename;
18708        my $property = $table->property;
18709        my $is_property = ($table == $property);
18710
18711        # For very short tables, instead of writing them out to actual files,
18712        # we in-line their inversion list definitions into Heavy.pl.  The
18713        # definition replaces the file name, and the special pseudo-directory
18714        # '#' is used to signal this.  This significantly cuts down the number
18715        # of files written at little extra cost to the hashes in Heavy.pl.
18716        # And it means, no run-time files to read to get the definitions.
18717        if (! $is_property
18718            && ! $annotate  # For annotation, we want to explicitly show
18719                            # everything, so keep in files
18720            && $table->ranges <= 3)
18721        {
18722            my @ranges = $table->ranges;
18723            my $count = @ranges;
18724            if ($count == 0) {  # 0th index reserved for 0-length lists
18725                $filename = 0;
18726            }
18727            elsif ($table->leader != $table) {
18728
18729                # Here, is a table that is equivalent to another; code
18730                # in register_file_for_name() causes its leader's definition
18731                # to be used
18732
18733                next;
18734            }
18735            else {  # No equivalent table so far.
18736
18737                # Build up its definition range-by-range.
18738                my $definition = "";
18739                while (defined (my $range = shift @ranges)) {
18740                    my $end = $range->end;
18741                    if ($end < $MAX_WORKING_CODEPOINT) {
18742                        $count++;
18743                        $end = "\n" . ($end + 1);
18744                    }
18745                    else {  # Extends to infinity, hence no 'end'
18746                        $end = "";
18747                    }
18748                    $definition .= "\n" . $range->start . $end;
18749                }
18750                $definition = "V$count" . $definition;
18751                $filename = @inline_definitions;
18752                push @inline_definitions, $definition;
18753            }
18754            @directory = "#";
18755            register_file_for_name($table, \@directory, $filename);
18756            next;
18757        }
18758
18759        if (! $is_property) {
18760            # Match tables for the property go in lib/$subdirectory, which is
18761            # the property's name.  Don't use the standard file name for this,
18762            # as may get an unfamiliar alias
18763            @directory = ($matches_directory, $property->external_name);
18764        }
18765        else {
18766
18767            @directory = $table->directory;
18768            $filename = $table->file;
18769        }
18770
18771        # Use specified filename if available, or default to property's
18772        # shortest name.  We need an 8.3 safe filename (which means "an 8
18773        # safe" filename, since after the dot is only 'pl', which is < 3)
18774        # The 2nd parameter is if the filename shouldn't be changed, and
18775        # it shouldn't iff there is a hard-coded name for this table.
18776        $filename = construct_filename(
18777                                $filename || $table->external_name,
18778                                ! $filename,    # mutable if no filename
18779                                \@directory);
18780
18781        register_file_for_name($table, \@directory, $filename);
18782
18783        # Only need to write one file when shared by more than one
18784        # property
18785        next if ! $is_property
18786                && ($table->leader != $table || $table->complement != 0);
18787
18788        # Construct a nice comment to add to the file
18789        $table->set_final_comment;
18790
18791        $table->write;
18792    }
18793
18794
18795    # Write out the pod file
18796    make_pod;
18797
18798    # And Heavy.pl, Name.pm, UCD.pl
18799    make_Heavy;
18800    make_Name_pm;
18801    make_UCD;
18802
18803    make_property_test_script() if $make_test_script;
18804    make_normalization_test_script() if $make_norm_test_script;
18805    return;
18806}
18807
18808my @white_space_separators = ( # This used only for making the test script.
18809                            "",
18810                            ' ',
18811                            "\t",
18812                            '   '
18813                        );
18814
18815sub generate_separator($) {
18816    # This used only for making the test script.  It generates the colon or
18817    # equal separator between the property and property value, with random
18818    # white space surrounding the separator
18819
18820    my $lhs = shift;
18821
18822    return "" if $lhs eq "";  # No separator if there's only one (the r) side
18823
18824    # Choose space before and after randomly
18825    my $spaces_before =$white_space_separators[rand(@white_space_separators)];
18826    my $spaces_after = $white_space_separators[rand(@white_space_separators)];
18827
18828    # And return the whole complex, half the time using a colon, half the
18829    # equals
18830    return $spaces_before
18831            . (rand() < 0.5) ? '=' : ':'
18832            . $spaces_after;
18833}
18834
18835sub generate_tests($$$$$) {
18836    # This used only for making the test script.  It generates test cases that
18837    # are expected to compile successfully in perl.  Note that the LHS and
18838    # RHS are assumed to already be as randomized as the caller wants.
18839
18840    my $lhs = shift;           # The property: what's to the left of the colon
18841                               #  or equals separator
18842    my $rhs = shift;           # The property value; what's to the right
18843    my $valid_code = shift;    # A code point that's known to be in the
18844                               # table given by LHS=RHS; undef if table is
18845                               # empty
18846    my $invalid_code = shift;  # A code point known to not be in the table;
18847                               # undef if the table is all code points
18848    my $warning = shift;
18849
18850    # Get the colon or equal
18851    my $separator = generate_separator($lhs);
18852
18853    # The whole 'property=value'
18854    my $name = "$lhs$separator$rhs";
18855
18856    my @output;
18857    # Create a complete set of tests, with complements.
18858    if (defined $valid_code) {
18859        push @output, <<"EOC"
18860Expect(1, $valid_code, '\\p{$name}', $warning);
18861Expect(0, $valid_code, '\\p{^$name}', $warning);
18862Expect(0, $valid_code, '\\P{$name}', $warning);
18863Expect(1, $valid_code, '\\P{^$name}', $warning);
18864EOC
18865    }
18866    if (defined $invalid_code) {
18867        push @output, <<"EOC"
18868Expect(0, $invalid_code, '\\p{$name}', $warning);
18869Expect(1, $invalid_code, '\\p{^$name}', $warning);
18870Expect(1, $invalid_code, '\\P{$name}', $warning);
18871Expect(0, $invalid_code, '\\P{^$name}', $warning);
18872EOC
18873    }
18874    return @output;
18875}
18876
18877sub generate_error($$$) {
18878    # This used only for making the test script.  It generates test cases that
18879    # are expected to not only not match, but to be syntax or similar errors
18880
18881    my $lhs = shift;                # The property: what's to the left of the
18882                                    # colon or equals separator
18883    my $rhs = shift;                # The property value; what's to the right
18884    my $already_in_error = shift;   # Boolean; if true it's known that the
18885                                # unmodified LHS and RHS will cause an error.
18886                                # This routine should not force another one
18887    # Get the colon or equal
18888    my $separator = generate_separator($lhs);
18889
18890    # Since this is an error only, don't bother to randomly decide whether to
18891    # put the error on the left or right side; and assume that the RHS is
18892    # loosely matched, again for convenience rather than rigor.
18893    $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
18894
18895    my $property = $lhs . $separator . $rhs;
18896
18897    return <<"EOC";
18898Error('\\p{$property}');
18899Error('\\P{$property}');
18900EOC
18901}
18902
18903# These are used only for making the test script
18904# XXX Maybe should also have a bad strict seps, which includes underscore.
18905
18906my @good_loose_seps = (
18907            " ",
18908            "-",
18909            "\t",
18910            "",
18911            "_",
18912           );
18913my @bad_loose_seps = (
18914           "/a/",
18915           ':=',
18916          );
18917
18918sub randomize_stricter_name {
18919    # This used only for making the test script.  Take the input name and
18920    # return a randomized, but valid version of it under the stricter matching
18921    # rules.
18922
18923    my $name = shift;
18924    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
18925
18926    # If the name looks like a number (integer, floating, or rational), do
18927    # some extra work
18928    if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
18929        my $sign = $1;
18930        my $number = $2;
18931        my $separator = $3;
18932
18933        # If there isn't a sign, part of the time add a plus
18934        # Note: Not testing having any denominator having a minus sign
18935        if (! $sign) {
18936            $sign = '+' if rand() <= .3;
18937        }
18938
18939        # And add 0 or more leading zeros.
18940        $name = $sign . ('0' x int rand(10)) . $number;
18941
18942        if (defined $separator) {
18943            my $extra_zeros = '0' x int rand(10);
18944
18945            if ($separator eq '.') {
18946
18947                # Similarly, add 0 or more trailing zeros after a decimal
18948                # point
18949                $name .= $extra_zeros;
18950            }
18951            else {
18952
18953                # Or, leading zeros before the denominator
18954                $name =~ s,/,/$extra_zeros,;
18955            }
18956        }
18957    }
18958
18959    # For legibility of the test, only change the case of whole sections at a
18960    # time.  To do this, first split into sections.  The split returns the
18961    # delimiters
18962    my @sections;
18963    for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
18964        trace $section if main::DEBUG && $to_trace;
18965
18966        if (length $section > 1 && $section !~ /\D/) {
18967
18968            # If the section is a sequence of digits, about half the time
18969            # randomly add underscores between some of them.
18970            if (rand() > .5) {
18971
18972                # Figure out how many underscores to add.  max is 1 less than
18973                # the number of digits.  (But add 1 at the end to make sure
18974                # result isn't 0, and compensate earlier by subtracting 2
18975                # instead of 1)
18976                my $num_underscores = int rand(length($section) - 2) + 1;
18977
18978                # And add them evenly throughout, for convenience, not rigor
18979                use integer;
18980                my $spacing = (length($section) - 1)/ $num_underscores;
18981                my $temp = $section;
18982                $section = "";
18983                for my $i (1 .. $num_underscores) {
18984                    $section .= substr($temp, 0, $spacing, "") . '_';
18985                }
18986                $section .= $temp;
18987            }
18988            push @sections, $section;
18989        }
18990        else {
18991
18992            # Here not a sequence of digits.  Change the case of the section
18993            # randomly
18994            my $switch = int rand(4);
18995            if ($switch == 0) {
18996                push @sections, uc $section;
18997            }
18998            elsif ($switch == 1) {
18999                push @sections, lc $section;
19000            }
19001            elsif ($switch == 2) {
19002                push @sections, ucfirst $section;
19003            }
19004            else {
19005                push @sections, $section;
19006            }
19007        }
19008    }
19009    trace "returning", join "", @sections if main::DEBUG && $to_trace;
19010    return join "", @sections;
19011}
19012
19013sub randomize_loose_name($;$) {
19014    # This used only for making the test script
19015
19016    my $name = shift;
19017    my $want_error = shift;  # if true, make an error
19018    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
19019
19020    $name = randomize_stricter_name($name);
19021
19022    my @parts;
19023    push @parts, $good_loose_seps[rand(@good_loose_seps)];
19024
19025    # Preserve trailing ones for the sake of not stripping the underscore from
19026    # 'L_'
19027    for my $part (split /[-\s_]+ (?= . )/, $name) {
19028        if (@parts) {
19029            if ($want_error and rand() < 0.3) {
19030                push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
19031                $want_error = 0;
19032            }
19033            else {
19034                push @parts, $good_loose_seps[rand(@good_loose_seps)];
19035            }
19036        }
19037        push @parts, $part;
19038    }
19039    my $new = join("", @parts);
19040    trace "$name => $new" if main::DEBUG && $to_trace;
19041
19042    if ($want_error) {
19043        if (rand() >= 0.5) {
19044            $new .= $bad_loose_seps[rand(@bad_loose_seps)];
19045        }
19046        else {
19047            $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
19048        }
19049    }
19050    return $new;
19051}
19052
19053# Used to make sure don't generate duplicate test cases.
19054my %test_generated;
19055
19056sub make_property_test_script() {
19057    # This used only for making the test script
19058    # this written directly -- it's huge.
19059
19060    print "Making test script\n" if $verbosity >= $PROGRESS;
19061
19062    # This uses randomness to test different possibilities without testing all
19063    # possibilities.  To ensure repeatability, set the seed to 0.  But if
19064    # tests are added, it will perturb all later ones in the .t file
19065    srand 0;
19066
19067    $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
19068
19069    # Keep going down an order of magnitude
19070    # until find that adding this quantity to
19071    # 1 remains 1; but put an upper limit on
19072    # this so in case this algorithm doesn't
19073    # work properly on some platform, that we
19074    # won't loop forever.
19075    my $digits = 0;
19076    my $min_floating_slop = 1;
19077    while (1+ $min_floating_slop != 1
19078            && $digits++ < 50)
19079    {
19080        my $next = $min_floating_slop / 10;
19081        last if $next == 0; # If underflows,
19082                            # use previous one
19083        $min_floating_slop = $next;
19084    }
19085
19086    # It doesn't matter whether the elements of this array contain single lines
19087    # or multiple lines. main::write doesn't count the lines.
19088    my @output;
19089
19090    push @output, <<'EOF_CODE';
19091Error('\p{Script=InGreek}');    # Bug #69018
19092Test_GCB("1100 $nobreak 1161");  # Bug #70940
19093Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
19094Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
19095Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726
19096
19097# Make sure this gets tested; it was not part of the official test suite at
19098# the time this was added.  Note that this is as it would appear in the
19099# official suite, and gets modified to check for the perl tailoring by
19100# Test_WB()
19101Test_WB("$breakable 0020 $breakable 0020 $breakable 0308 $breakable");
19102Test_LB("$nobreak 200B $nobreak 0020 $nobreak 0020 $breakable 2060 $breakable");
19103EOF_CODE
19104
19105    # Sort these so get results in same order on different runs of this
19106    # program
19107    foreach my $property (sort { $a->has_dependency <=> $b->has_dependency
19108                                    or
19109                                 lc $a->name cmp lc $b->name
19110                               } property_ref('*'))
19111    {
19112        # Non-binary properties should not match \p{};  Test all for that.
19113        if ($property->type != $BINARY) {
19114            my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS }
19115                                                            $property->aliases;
19116            foreach my $property_alias ($property->aliases) {
19117                my $name = standardize($property_alias->name);
19118
19119                # But some names are ambiguous, meaning a binary property with
19120                # the same name when used in \p{}, and a different
19121                # (non-binary) property in other contexts.
19122                next if grep { $name eq $_ } keys %ambiguous_names;
19123
19124                push @output, <<"EOF_CODE";
19125Error('\\p{$name}');
19126Error('\\P{$name}');
19127EOF_CODE
19128            }
19129        }
19130        foreach my $table (sort { $a->has_dependency <=> $b->has_dependency
19131                                    or
19132                                  lc $a->name cmp lc $b->name
19133                                } $property->tables)
19134        {
19135
19136            # Find code points that match, and don't match this table.
19137            my $valid = $table->get_valid_code_point;
19138            my $invalid = $table->get_invalid_code_point;
19139            my $warning = ($table->status eq $DEPRECATED)
19140                            ? "'deprecated'"
19141                            : '""';
19142
19143            # Test each possible combination of the property's aliases with
19144            # the table's.  If this gets to be too many, could do what is done
19145            # in the set_final_comment() for Tables
19146            my @table_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->aliases;
19147            next unless @table_aliases;
19148            my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->property->aliases;
19149            next unless @property_aliases;
19150
19151            # Every property can be optionally be prefixed by 'Is_', so test
19152            # that those work, by creating such a new alias for each
19153            # pre-existing one.
19154            push @property_aliases, map { Alias->new("Is_" . $_->name,
19155                                                    $_->loose_match,
19156                                                    $_->make_re_pod_entry,
19157                                                    $_->ok_as_filename,
19158                                                    $_->status,
19159                                                    $_->ucd,
19160                                                    )
19161                                         } @property_aliases;
19162            my $max = max(scalar @table_aliases, scalar @property_aliases);
19163            for my $j (0 .. $max - 1) {
19164
19165                # The current alias for property is the next one on the list,
19166                # or if beyond the end, start over.  Similarly for table
19167                my $property_name
19168                            = $property_aliases[$j % @property_aliases]->name;
19169
19170                $property_name = "" if $table->property == $perl;
19171                my $table_alias = $table_aliases[$j % @table_aliases];
19172                my $table_name = $table_alias->name;
19173                my $loose_match = $table_alias->loose_match;
19174
19175                # If the table doesn't have a file, any test for it is
19176                # already guaranteed to be in error
19177                my $already_error = ! $table->file_path;
19178
19179                # Generate error cases for this alias.
19180                push @output, generate_error($property_name,
19181                                             $table_name,
19182                                             $already_error);
19183
19184                # If the table is guaranteed to always generate an error,
19185                # quit now without generating success cases.
19186                next if $already_error;
19187
19188                # Now for the success cases.
19189                my $random;
19190                if ($loose_match) {
19191
19192                    # For loose matching, create an extra test case for the
19193                    # standard name.
19194                    my $standard = standardize($table_name);
19195
19196                    # $test_name should be a unique combination for each test
19197                    # case; used just to avoid duplicate tests
19198                    my $test_name = "$property_name=$standard";
19199
19200                    # Don't output duplicate test cases.
19201                    if (! exists $test_generated{$test_name}) {
19202                        $test_generated{$test_name} = 1;
19203                        push @output, generate_tests($property_name,
19204                                                     $standard,
19205                                                     $valid,
19206                                                     $invalid,
19207                                                     $warning,
19208                                                 );
19209                    }
19210                    $random = randomize_loose_name($table_name)
19211                }
19212                else { # Stricter match
19213                    $random = randomize_stricter_name($table_name);
19214                }
19215
19216                # Now for the main test case for this alias.
19217                my $test_name = "$property_name=$random";
19218                if (! exists $test_generated{$test_name}) {
19219                    $test_generated{$test_name} = 1;
19220                    push @output, generate_tests($property_name,
19221                                                 $random,
19222                                                 $valid,
19223                                                 $invalid,
19224                                                 $warning,
19225                                             );
19226
19227                    # If the name is a rational number, add tests for the
19228                    # floating point equivalent.
19229                    if ($table_name =~ qr{/}) {
19230
19231                        # Calculate the float, and find just the fraction.
19232                        my $float = eval $table_name;
19233                        my ($whole, $fraction)
19234                                            = $float =~ / (.*) \. (.*) /x;
19235
19236                        # Starting with one digit after the decimal point,
19237                        # create a test for each possible precision (number of
19238                        # digits past the decimal point) until well beyond the
19239                        # native number found on this machine.  (If we started
19240                        # with 0 digits, it would be an integer, which could
19241                        # well match an unrelated table)
19242                        PLACE:
19243                        for my $i (1 .. $min_floating_slop + 3) {
19244                            my $table_name = sprintf("%.*f", $i, $float);
19245                            if ($i < $MIN_FRACTION_LENGTH) {
19246
19247                                # If the test case has fewer digits than the
19248                                # minimum acceptable precision, it shouldn't
19249                                # succeed, so we expect an error for it.
19250                                # E.g., 2/3 = .7 at one decimal point, and we
19251                                # shouldn't say it matches .7.  We should make
19252                                # it be .667 at least before agreeing that the
19253                                # intent was to match 2/3.  But at the
19254                                # less-than- acceptable level of precision, it
19255                                # might actually match an unrelated number.
19256                                # So don't generate a test case if this
19257                                # conflating is possible.  In our example, we
19258                                # don't want 2/3 matching 7/10, if there is
19259                                # a 7/10 code point.
19260
19261                                # First, integers are not in the rationals
19262                                # table.  Don't generate an error if this
19263                                # rounds to an integer using the given
19264                                # precision.
19265                                my $round = sprintf "%.0f", $table_name;
19266                                next PLACE if abs($table_name - $round)
19267                                                        < $MAX_FLOATING_SLOP;
19268
19269                                # Here, isn't close enough to an integer to be
19270                                # confusable with one.  Now, see it it's
19271                                # "close" to a known rational
19272                                for my $existing
19273                                        (keys %nv_floating_to_rational)
19274                                {
19275                                    next PLACE
19276                                        if abs($table_name - $existing)
19277                                                < $MAX_FLOATING_SLOP;
19278                                }
19279                                push @output, generate_error($property_name,
19280                                                             $table_name,
19281                                                             1   # 1 => already an error
19282                                              );
19283                            }
19284                            else {
19285
19286                                # Here the number of digits exceeds the
19287                                # minimum we think is needed.  So generate a
19288                                # success test case for it.
19289                                push @output, generate_tests($property_name,
19290                                                             $table_name,
19291                                                             $valid,
19292                                                             $invalid,
19293                                                             $warning,
19294                                             );
19295                            }
19296                        }
19297                    }
19298                }
19299            }
19300            $table->DESTROY();
19301        }
19302        $property->DESTROY();
19303    }
19304
19305    # Make any test of the boundary (break) properties TODO if the code
19306    # doesn't match the version being compiled
19307    my $TODO_FAILING_BREAKS = ($version_of_mk_invlist_bounds ne $v_version)
19308                             ? "\nsub TODO_FAILING_BREAKS { 1 }\n"
19309                             : "\nsub TODO_FAILING_BREAKS { 0 }\n";
19310
19311    @output= map {
19312        map s/^/    /mgr,
19313        map "$_;\n",
19314        split /;\n/, $_
19315    } @output;
19316
19317    # Cause there to be 'if' statements to only execute a portion of this
19318    # long-running test each time, so that we can have a bunch of .t's running
19319    # in parallel
19320    my $chunks = 10     # Number of test files
19321               - 1      # For GCB & SB
19322               - 1      # For WB
19323               - 4;     # LB split into this many files
19324    my @output_chunked;
19325    my $chunk_count=0;
19326    my $chunk_size= int(@output / $chunks) + 1;
19327    while (@output) {
19328        $chunk_count++;
19329        my @chunk= splice @output, 0, $chunk_size;
19330        push @output_chunked,
19331            "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19332                @chunk,
19333            "}\n";
19334    }
19335
19336    $chunk_count++;
19337    push @output_chunked,
19338        "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19339            (map {"    Test_GCB('$_');\n"} @backslash_X_tests),
19340            (map {"    Test_SB('$_');\n"} @SB_tests),
19341        "}\n";
19342
19343
19344    $chunk_size= int(@LB_tests / 4) + 1;
19345    @LB_tests = map {"    Test_LB('$_');\n"} @LB_tests;
19346    while (@LB_tests) {
19347        $chunk_count++;
19348        my @chunk= splice @LB_tests, 0, $chunk_size;
19349        push @output_chunked,
19350            "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19351                @chunk,
19352            "}\n";
19353    }
19354
19355    $chunk_count++;
19356    push @output_chunked,
19357        "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19358            (map {"    Test_WB('$_');\n"} @WB_tests),
19359        "}\n";
19360
19361    &write($t_path,
19362           0,           # Not utf8;
19363           [$HEADER,
19364            $TODO_FAILING_BREAKS,
19365            <DATA>,
19366            @output_chunked,
19367            "Finished();\n",
19368           ]);
19369
19370    return;
19371}
19372
19373sub make_normalization_test_script() {
19374    print "Making normalization test script\n" if $verbosity >= $PROGRESS;
19375
19376    my $n_path = 'TestNorm.pl';
19377
19378    unshift @normalization_tests, <<'END';
19379use utf8;
19380use Test::More;
19381
19382sub ord_string {    # Convert packed ords to printable string
19383    use charnames ();
19384    return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' }
19385                                                unpack "U*", shift) .  "'";
19386    #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) .  "'";
19387}
19388
19389sub Test_N {
19390    my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_;
19391    my $display_source = ord_string($source);
19392    my $display_nfc = ord_string($nfc);
19393    my $display_nfd = ord_string($nfd);
19394    my $display_nfkc = ord_string($nfkc);
19395    my $display_nfkd = ord_string($nfkd);
19396
19397    use Unicode::Normalize;
19398    #    NFC
19399    #      nfc ==  toNFC(source) ==  toNFC(nfc) ==  toNFC(nfd)
19400    #      nfkc ==  toNFC(nfkc) ==  toNFC(nfkd)
19401    #
19402    #    NFD
19403    #      nfd ==  toNFD(source) ==  toNFD(nfc) ==  toNFD(nfd)
19404    #      nfkd ==  toNFD(nfkc) ==  toNFD(nfkd)
19405    #
19406    #    NFKC
19407    #      nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) ==
19408    #      toNFKC(nfkc) == toNFKC(nfkd)
19409    #
19410    #    NFKD
19411    #      nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) ==
19412    #      toNFKD(nfkc) == toNFKD(nfkd)
19413
19414    is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc");
19415    is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc");
19416    is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc");
19417    is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc");
19418    is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc");
19419
19420    is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd");
19421    is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd");
19422    is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd");
19423    is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd");
19424    is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd");
19425
19426    is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc");
19427    is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc");
19428    is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc");
19429    is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc");
19430    is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc");
19431
19432    is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd");
19433    is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd");
19434    is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd");
19435    is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd");
19436    is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd");
19437}
19438END
19439
19440    &write($n_path,
19441           1,           # Is utf8;
19442           [
19443            @normalization_tests,
19444            'done_testing();'
19445            ]);
19446    return;
19447}
19448
19449# Skip reasons, so will be exact same text and hence the files with each
19450# reason will get grouped together in perluniprops.
19451my $Documentation = "Documentation";
19452my $Indic_Skip
19453            = "Provisional; for the analysis and processing of Indic scripts";
19454my $Validation = "Validation Tests";
19455my $Validation_Documentation = "Documentation of validation Tests";
19456
19457# This is a list of the input files and how to handle them.  The files are
19458# processed in their order in this list.  Some reordering is possible if
19459# desired, but the PropertyAliases and PropValueAliases files should be first,
19460# and the extracted before the others (as data in an extracted file can be
19461# over-ridden by the non-extracted.  Some other files depend on data derived
19462# from an earlier file, like UnicodeData requires data from Jamo, and the case
19463# changing and folding requires data from Unicode.  Mostly, it is safest to
19464# order by first version releases in (except the Jamo).
19465#
19466# The version strings allow the program to know whether to expect a file or
19467# not, but if a file exists in the directory, it will be processed, even if it
19468# is in a version earlier than expected, so you can copy files from a later
19469# release into an earlier release's directory.
19470my @input_file_objects = (
19471    Input_file->new('PropertyAliases.txt', v3.2,
19472                    Handler => \&process_PropertyAliases,
19473                    Early => [ \&substitute_PropertyAliases ],
19474                    Required_Even_in_Debug_Skip => 1,
19475                   ),
19476    Input_file->new(undef, v0,  # No file associated with this
19477                    Progress_Message => 'Finishing property setup',
19478                    Handler => \&finish_property_setup,
19479                   ),
19480    Input_file->new('PropValueAliases.txt', v3.2,
19481                     Handler => \&process_PropValueAliases,
19482                     Early => [ \&substitute_PropValueAliases ],
19483                     Has_Missings_Defaults => $NOT_IGNORED,
19484                     Required_Even_in_Debug_Skip => 1,
19485                    ),
19486    Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
19487                    Property => 'General_Category',
19488                   ),
19489    Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
19490                    Property => 'Canonical_Combining_Class',
19491                    Has_Missings_Defaults => $NOT_IGNORED,
19492                   ),
19493    Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
19494                    Property => 'Numeric_Type',
19495                    Has_Missings_Defaults => $NOT_IGNORED,
19496                   ),
19497    Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
19498                    Property => 'East_Asian_Width',
19499                    Has_Missings_Defaults => $NOT_IGNORED,
19500                   ),
19501    Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
19502                    Property => 'Line_Break',
19503                    Has_Missings_Defaults => $NOT_IGNORED,
19504                   ),
19505    Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
19506                    Property => 'Bidi_Class',
19507                    Has_Missings_Defaults => $NOT_IGNORED,
19508                   ),
19509    Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
19510                    Property => 'Decomposition_Type',
19511                    Has_Missings_Defaults => $NOT_IGNORED,
19512                   ),
19513    Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
19514    Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
19515                    Property => 'Numeric_Value',
19516                    Each_Line_Handler => \&filter_numeric_value_line,
19517                    Has_Missings_Defaults => $NOT_IGNORED,
19518                   ),
19519    Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
19520                    Property => 'Joining_Group',
19521                    Has_Missings_Defaults => $NOT_IGNORED,
19522                   ),
19523
19524    Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
19525                    Property => 'Joining_Type',
19526                    Has_Missings_Defaults => $NOT_IGNORED,
19527                   ),
19528    Input_file->new("${EXTRACTED}DName.txt", v10.0.0,
19529                    Skip => 'This file adds no new information not already'
19530                          . ' present in other files',
19531                    # And it's unnecessary programmer work to handle this new
19532                    # format.  Previous Derived files actually had bug fixes
19533                    # in them that were useful, but that should not be the
19534                    # case here.
19535                   ),
19536    Input_file->new('Jamo.txt', v2.0.0,
19537                    Property => 'Jamo_Short_Name',
19538                    Each_Line_Handler => \&filter_jamo_line,
19539                   ),
19540    Input_file->new('UnicodeData.txt', v1.1.5,
19541                    Pre_Handler => \&setup_UnicodeData,
19542
19543                    # We clean up this file for some early versions.
19544                    Each_Line_Handler => [ (($v_version lt v2.0.0 )
19545                                            ? \&filter_v1_ucd
19546                                            : ($v_version eq v2.1.5)
19547                                                ? \&filter_v2_1_5_ucd
19548
19549                                                # And for 5.14 Perls with 6.0,
19550                                                # have to also make changes
19551                                                : ($v_version ge v6.0.0
19552                                                   && $^V lt v5.17.0)
19553                                                    ? \&filter_v6_ucd
19554                                                    : undef),
19555
19556                                            # Early versions did not have the
19557                                            # proper Unicode_1 names for the
19558                                            # controls
19559                                            (($v_version lt v3.0.0)
19560                                            ? \&filter_early_U1_names
19561                                            : undef),
19562
19563                                            # Early versions did not correctly
19564                                            # use the later method for giving
19565                                            # decimal digit values
19566                                            (($v_version le v3.2.0)
19567                                            ? \&filter_bad_Nd_ucd
19568                                            : undef),
19569
19570                                            # And the main filter
19571                                            \&filter_UnicodeData_line,
19572                                         ],
19573                    EOF_Handler => \&EOF_UnicodeData,
19574                   ),
19575    Input_file->new('CJKXREF.TXT', v1.1.5,
19576                    Withdrawn => v2.0.0,
19577                    Skip => 'Gives the mapping of CJK code points '
19578                          . 'between Unicode and various other standards',
19579                   ),
19580    Input_file->new('ArabicShaping.txt', v2.0.0,
19581                    Each_Line_Handler =>
19582                        ($v_version lt 4.1.0)
19583                                    ? \&filter_old_style_arabic_shaping
19584                                    : undef,
19585                    # The first field after the range is a "schematic name"
19586                    # not used by Perl
19587                    Properties => [ '<ignored>', 'Joining_Type', 'Joining_Group' ],
19588                    Has_Missings_Defaults => $NOT_IGNORED,
19589                   ),
19590    Input_file->new('Blocks.txt', v2.0.0,
19591                    Property => 'Block',
19592                    Has_Missings_Defaults => $NOT_IGNORED,
19593                    Each_Line_Handler => \&filter_blocks_lines
19594                   ),
19595    Input_file->new('Index.txt', v2.0.0,
19596                    Skip => 'Alphabetical index of Unicode characters',
19597                   ),
19598    Input_file->new('NamesList.txt', v2.0.0,
19599                    Skip => 'Annotated list of characters',
19600                   ),
19601    Input_file->new('PropList.txt', v2.0.0,
19602                    Each_Line_Handler => (($v_version lt v3.1.0)
19603                                            ? \&filter_old_style_proplist
19604                                            : undef),
19605                   ),
19606    Input_file->new('Props.txt', v2.0.0,
19607                    Withdrawn => v3.0.0,
19608                    Skip => 'A subset of F<PropList.txt> (which is used instead)',
19609                   ),
19610    Input_file->new('ReadMe.txt', v2.0.0,
19611                    Skip => $Documentation,
19612                   ),
19613    Input_file->new('Unihan.txt', v2.0.0,
19614                    Withdrawn => v5.2.0,
19615                    Construction_Time_Handler => \&construct_unihan,
19616                    Pre_Handler => \&setup_unihan,
19617                    Optional => [ "",
19618                                  'Unicode_Radical_Stroke'
19619                                ],
19620                    Each_Line_Handler => \&filter_unihan_line,
19621                   ),
19622    Input_file->new('SpecialCasing.txt', v2.1.8,
19623                    Each_Line_Handler => ($v_version eq 2.1.8)
19624                                         ? \&filter_2_1_8_special_casing_line
19625                                         : \&filter_special_casing_line,
19626                    Pre_Handler => \&setup_special_casing,
19627                    Has_Missings_Defaults => $IGNORED,
19628                   ),
19629    Input_file->new(
19630                    'LineBreak.txt', v3.0.0,
19631                    Has_Missings_Defaults => $NOT_IGNORED,
19632                    Property => 'Line_Break',
19633                    # Early versions had problematic syntax
19634                    Each_Line_Handler => ($v_version ge v3.1.0)
19635                                          ? undef
19636                                          : ($v_version lt v3.0.0)
19637                                            ? \&filter_substitute_lb
19638                                            : \&filter_early_ea_lb,
19639                    # Must use long names for property values see comments at
19640                    # sub filter_substitute_lb
19641                    Early => [ "LBsubst.txt", '_Perl_LB', 'Alphabetic',
19642                               'Alphabetic', # default to this because XX ->
19643                                             # AL
19644
19645                               # Don't use _Perl_LB as a synonym for
19646                               # Line_Break in later perls, as it is tailored
19647                               # and isn't the same as Line_Break
19648                               'ONLY_EARLY' ],
19649                   ),
19650    Input_file->new('EastAsianWidth.txt', v3.0.0,
19651                    Property => 'East_Asian_Width',
19652                    Has_Missings_Defaults => $NOT_IGNORED,
19653                    # Early versions had problematic syntax
19654                    Each_Line_Handler => (($v_version lt v3.1.0)
19655                                        ? \&filter_early_ea_lb
19656                                        : undef),
19657                   ),
19658    Input_file->new('CompositionExclusions.txt', v3.0.0,
19659                    Property => 'Composition_Exclusion',
19660                   ),
19661    Input_file->new('UnicodeData.html', v3.0.0,
19662                    Withdrawn => v4.0.1,
19663                    Skip => $Documentation,
19664                   ),
19665    Input_file->new('BidiMirroring.txt', v3.0.1,
19666                    Property => 'Bidi_Mirroring_Glyph',
19667                    Has_Missings_Defaults => ($v_version lt v6.2.0)
19668                                              ? $NO_DEFAULTS
19669                                              # Is <none> which doesn't mean
19670                                              # anything to us, we will use the
19671                                              # null string
19672                                              : $IGNORED,
19673                   ),
19674    Input_file->new('NamesList.html', v3.0.0,
19675                    Skip => 'Describes the format and contents of '
19676                          . 'F<NamesList.txt>',
19677                   ),
19678    Input_file->new('UnicodeCharacterDatabase.html', v3.0.0,
19679                    Withdrawn => v5.1,
19680                    Skip => $Documentation,
19681                   ),
19682    Input_file->new('CaseFolding.txt', v3.0.1,
19683                    Pre_Handler => \&setup_case_folding,
19684                    Each_Line_Handler =>
19685                        [ ($v_version lt v3.1.0)
19686                                 ? \&filter_old_style_case_folding
19687                                 : undef,
19688                           \&filter_case_folding_line
19689                        ],
19690                    Has_Missings_Defaults => $IGNORED,
19691                   ),
19692    Input_file->new("NormTest.txt", v3.0.1,
19693                     Handler => \&process_NormalizationsTest,
19694                     Skip => ($make_norm_test_script) ? 0 : $Validation,
19695                   ),
19696    Input_file->new('DCoreProperties.txt', v3.1.0,
19697                    # 5.2 changed this file
19698                    Has_Missings_Defaults => (($v_version ge v5.2.0)
19699                                            ? $NOT_IGNORED
19700                                            : $NO_DEFAULTS),
19701                   ),
19702    Input_file->new('DProperties.html', v3.1.0,
19703                    Withdrawn => v3.2.0,
19704                    Skip => $Documentation,
19705                   ),
19706    Input_file->new('PropList.html', v3.1.0,
19707                    Withdrawn => v5.1,
19708                    Skip => $Documentation,
19709                   ),
19710    Input_file->new('Scripts.txt', v3.1.0,
19711                    Property => 'Script',
19712                    Each_Line_Handler => (($v_version le v4.0.0)
19713                                          ? \&filter_all_caps_script_names
19714                                          : undef),
19715                    Has_Missings_Defaults => $NOT_IGNORED,
19716                   ),
19717    Input_file->new('DNormalizationProps.txt', v3.1.0,
19718                    Has_Missings_Defaults => $NOT_IGNORED,
19719                    Each_Line_Handler => (($v_version lt v4.0.1)
19720                                      ? \&filter_old_style_normalization_lines
19721                                      : undef),
19722                   ),
19723    Input_file->new('DerivedProperties.html', v3.1.1,
19724                    Withdrawn => v5.1,
19725                    Skip => $Documentation,
19726                   ),
19727    Input_file->new('DAge.txt', v3.2.0,
19728                    Has_Missings_Defaults => $NOT_IGNORED,
19729                    Property => 'Age'
19730                   ),
19731    Input_file->new('HangulSyllableType.txt', v4.0,
19732                    Has_Missings_Defaults => $NOT_IGNORED,
19733                    Early => [ \&generate_hst, 'Hangul_Syllable_Type' ],
19734                    Property => 'Hangul_Syllable_Type'
19735                   ),
19736    Input_file->new('NormalizationCorrections.txt', v3.2.0,
19737                     # This documents the cumulative fixes to erroneous
19738                     # normalizations in earlier Unicode versions.  Its main
19739                     # purpose is so that someone running on an earlier
19740                     # version can use this file to override what got
19741                     # published in that earlier release.  It would be easy
19742                     # for mktables to handle this file.  But all the
19743                     # corrections in it should already be in the other files
19744                     # for the release it is.  To get it to actually mean
19745                     # something useful, someone would have to be using an
19746                     # earlier Unicode release, and copy it into the directory
19747                     # for that release and recompile.  So far there has been
19748                     # no demand to do that, so this hasn't been implemented.
19749                    Skip => 'Documentation of corrections already '
19750                          . 'incorporated into the Unicode data base',
19751                   ),
19752    Input_file->new('StandardizedVariants.html', v3.2.0,
19753                    Skip => 'Obsoleted as of Unicode 9.0, but previously '
19754                          . 'provided a visual display of the standard '
19755                          . 'variant sequences derived from '
19756                          . 'F<StandardizedVariants.txt>.',
19757                        # I don't know why the html came earlier than the
19758                        # .txt, but both are skipped anyway, so it doesn't
19759                        # matter.
19760                   ),
19761    Input_file->new('StandardizedVariants.txt', v4.0.0,
19762                    Skip => 'Certain glyph variations for character display '
19763                          . 'are standardized.  This lists the non-Unihan '
19764                          . 'ones; the Unihan ones are also not used by '
19765                          . 'Perl, and are in a separate Unicode data base '
19766                          . 'L<http://www.unicode.org/ivd>',
19767                   ),
19768    Input_file->new('UCD.html', v4.0.0,
19769                    Withdrawn => v5.2,
19770                    Skip => $Documentation,
19771                   ),
19772    Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
19773                    Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter',
19774
19775                               # Don't use _Perl_WB as a synonym for
19776                               # Word_Break in later perls, as it is tailored
19777                               # and isn't the same as Word_Break
19778                               'ONLY_EARLY' ],
19779                    Property => 'Word_Break',
19780                    Has_Missings_Defaults => $NOT_IGNORED,
19781                   ),
19782    Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1,
19783                    Early => [ \&generate_GCB, '_Perl_GCB' ],
19784                    Property => 'Grapheme_Cluster_Break',
19785                    Has_Missings_Defaults => $NOT_IGNORED,
19786                   ),
19787    Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
19788                    Handler => \&process_GCB_test,
19789                    retain_trailing_comments => 1,
19790                   ),
19791    Input_file->new("$AUXILIARY/GraphemeBreakTest.html", v4.1.0,
19792                    Skip => $Validation_Documentation,
19793                   ),
19794    Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
19795                    Handler => \&process_SB_test,
19796                    retain_trailing_comments => 1,
19797                   ),
19798    Input_file->new("$AUXILIARY/SentenceBreakTest.html", v4.1.0,
19799                    Skip => $Validation_Documentation,
19800                   ),
19801    Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
19802                    Handler => \&process_WB_test,
19803                    retain_trailing_comments => 1,
19804                   ),
19805    Input_file->new("$AUXILIARY/WordBreakTest.html", v4.1.0,
19806                    Skip => $Validation_Documentation,
19807                   ),
19808    Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
19809                    Property => 'Sentence_Break',
19810                    Early => [ "SBsubst.txt", '_Perl_SB', 'OLetter' ],
19811                    Has_Missings_Defaults => $NOT_IGNORED,
19812                   ),
19813    Input_file->new('NamedSequences.txt', v4.1.0,
19814                    Handler => \&process_NamedSequences
19815                   ),
19816    Input_file->new('Unihan.html', v4.1.0,
19817                    Withdrawn => v5.2,
19818                    Skip => $Documentation,
19819                   ),
19820    Input_file->new('NameAliases.txt', v5.0,
19821                    Property => 'Name_Alias',
19822                    Each_Line_Handler => ($v_version le v6.0.0)
19823                                   ? \&filter_early_version_name_alias_line
19824                                   : \&filter_later_version_name_alias_line,
19825                   ),
19826        # NameAliases.txt came along in v5.0.  The above constructor handles
19827        # this.  But until 6.1, it was lacking some information needed by core
19828        # perl.  The constructor below handles that.  It is either a kludge or
19829        # clever, depending on your point of view.  The 'Withdrawn' parameter
19830        # indicates not to use it at all starting in 6.1 (so the above
19831        # constructor applies), and the 'v6.1' parameter indicates to use the
19832        # Early parameter before 6.1.  Therefore 'Early" is always used,
19833        # yielding the internal-only property '_Perl_Name_Alias', which it
19834        # gets from a NameAliases.txt from 6.1 or later stored in
19835        # N_Asubst.txt.  In combination with the above constructor,
19836        # 'Name_Alias' is publicly accessible starting with v5.0, and the
19837        # better 6.1 version is accessible to perl core in all releases.
19838    Input_file->new("NameAliases.txt", v6.1,
19839                    Withdrawn => v6.1,
19840                    Early => [ "N_Asubst.txt", '_Perl_Name_Alias', "" ],
19841                    Property => 'Name_Alias',
19842                    EOF_Handler => \&fixup_early_perl_name_alias,
19843                    Each_Line_Handler =>
19844                                       \&filter_later_version_name_alias_line,
19845                   ),
19846    Input_file->new('NamedSqProv.txt', v5.0.0,
19847                    Skip => 'Named sequences proposed for inclusion in a '
19848                          . 'later version of the Unicode Standard; if you '
19849                          . 'need them now, you can append this file to '
19850                          . 'F<NamedSequences.txt> and recompile perl',
19851                   ),
19852    Input_file->new("$AUXILIARY/LBTest.txt", v5.1.0,
19853                    Handler => \&process_LB_test,
19854                    retain_trailing_comments => 1,
19855                   ),
19856    Input_file->new("$AUXILIARY/LineBreakTest.html", v5.1.0,
19857                    Skip => $Validation_Documentation,
19858                   ),
19859    Input_file->new("BidiTest.txt", v5.2.0,
19860                    Skip => $Validation,
19861                   ),
19862    Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
19863                    Optional => "",
19864                    Each_Line_Handler => \&filter_unihan_line,
19865                   ),
19866    Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
19867                    Optional => "",
19868                    Each_Line_Handler => \&filter_unihan_line,
19869                   ),
19870    Input_file->new('UnihanIRGSources.txt', v5.2.0,
19871                    Optional => [ "",
19872                                  'kCompatibilityVariant',
19873                                  'kIICore',
19874                                  'kIRG_GSource',
19875                                  'kIRG_HSource',
19876                                  'kIRG_JSource',
19877                                  'kIRG_KPSource',
19878                                  'kIRG_MSource',
19879                                  'kIRG_KSource',
19880                                  'kIRG_TSource',
19881                                  'kIRG_USource',
19882                                  'kIRG_VSource',
19883                               ],
19884                    Pre_Handler => \&setup_unihan,
19885                    Each_Line_Handler => \&filter_unihan_line,
19886                   ),
19887    Input_file->new('UnihanNumericValues.txt', v5.2.0,
19888                    Optional => [ "",
19889                                  'kAccountingNumeric',
19890                                  'kOtherNumeric',
19891                                  'kPrimaryNumeric',
19892                                ],
19893                    Each_Line_Handler => \&filter_unihan_line,
19894                   ),
19895    Input_file->new('UnihanOtherMappings.txt', v5.2.0,
19896                    Optional => "",
19897                    Each_Line_Handler => \&filter_unihan_line,
19898                   ),
19899    Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
19900                    Optional => [ "",
19901                                  'Unicode_Radical_Stroke'
19902                                ],
19903                    Each_Line_Handler => \&filter_unihan_line,
19904                   ),
19905    Input_file->new('UnihanReadings.txt', v5.2.0,
19906                    Optional => "",
19907                    Each_Line_Handler => \&filter_unihan_line,
19908                   ),
19909    Input_file->new('UnihanVariants.txt', v5.2.0,
19910                    Optional => "",
19911                    Each_Line_Handler => \&filter_unihan_line,
19912                   ),
19913    Input_file->new('CJKRadicals.txt', v5.2.0,
19914                    Skip => 'Maps the kRSUnicode property values to '
19915                          . 'corresponding code points',
19916                   ),
19917    Input_file->new('EmojiSources.txt', v6.0.0,
19918                    Skip => 'Maps certain Unicode code points to their '
19919                          . 'legacy Japanese cell-phone values',
19920                   ),
19921    Input_file->new('ScriptExtensions.txt', v6.0.0,
19922                    Property => 'Script_Extensions',
19923                    Early => [ sub {} ], # Doesn't do anything but ensures
19924                                         # that this isn't skipped for early
19925                                         # versions
19926                    Pre_Handler => \&setup_script_extensions,
19927                    Each_Line_Handler => \&filter_script_extensions_line,
19928                    Has_Missings_Defaults => (($v_version le v6.0.0)
19929                                            ? $NO_DEFAULTS
19930                                            : $IGNORED),
19931                   ),
19932    # These two Indic files are actually not usable as-is until 6.1.0,
19933    # because their property values are missing from PropValueAliases.txt
19934    # until that release, so that further work would have to be done to get
19935    # them to work properly, which isn't worth it because of them being
19936    # provisional.
19937    Input_file->new('IndicMatraCategory.txt', v6.0.0,
19938                    Withdrawn => v8.0.0,
19939                    Property => 'Indic_Matra_Category',
19940                    Has_Missings_Defaults => $NOT_IGNORED,
19941                    Skip => $Indic_Skip,
19942                   ),
19943    Input_file->new('IndicSyllabicCategory.txt', v6.0.0,
19944                    Property => 'Indic_Syllabic_Category',
19945                    Has_Missings_Defaults => $NOT_IGNORED,
19946                    Skip => (($v_version lt v8.0.0)
19947                              ? $Indic_Skip
19948                              : 0),
19949                   ),
19950    Input_file->new('USourceData.txt', v6.2.0,
19951                    Skip => 'Documentation of status and cross reference of '
19952                          . 'proposals for encoding by Unicode of Unihan '
19953                          . 'characters',
19954                   ),
19955    Input_file->new('USourceGlyphs.pdf', v6.2.0,
19956                    Skip => 'Pictures of the characters in F<USourceData.txt>',
19957                   ),
19958    Input_file->new('BidiBrackets.txt', v6.3.0,
19959                    Properties => [ 'Bidi_Paired_Bracket',
19960                                    'Bidi_Paired_Bracket_Type'
19961                                  ],
19962                    Has_Missings_Defaults => $NO_DEFAULTS,
19963                   ),
19964    Input_file->new("BidiCharacterTest.txt", v6.3.0,
19965                    Skip => $Validation,
19966                   ),
19967    Input_file->new('IndicPositionalCategory.txt', v8.0.0,
19968                    Property => 'Indic_Positional_Category',
19969                    Has_Missings_Defaults => $NOT_IGNORED,
19970                   ),
19971    Input_file->new('TangutSources.txt', v9.0.0,
19972                    Skip => 'Specifies source mappings for Tangut ideographs'
19973                          . ' and components. This data file also includes'
19974                          . ' informative radical-stroke values that are used'
19975                          . ' internally by Unicode',
19976                   ),
19977    Input_file->new('VerticalOrientation.txt', v10.0.0,
19978                    Property => 'Vertical_Orientation',
19979                    Has_Missings_Defaults => $NOT_IGNORED,
19980                   ),
19981    Input_file->new('NushuSources.txt', v10.0.0,
19982                    Skip => 'Specifies source material for Nushu characters',
19983                   ),
19984);
19985
19986# End of all the preliminaries.
19987# Do it...
19988
19989if (@missing_early_files) {
19990    print simple_fold(join_lines(<<END
19991
19992The compilation cannot be completed because one or more required input files,
19993listed below, are missing.  This is because you are compiling Unicode version
19994$unicode_version, which predates the existence of these file(s).  To fully
19995function, perl needs the data that these files would have contained if they
19996had been in this release.  To work around this, create copies of later
19997versions of the missing files in the directory containing '$0'.  (Perl will
19998make the necessary adjustments to the data to compensate for it not being the
19999same version as is being compiled.)  The files are available from unicode.org,
20000via either ftp or http.  If using http, they will be under
20001www.unicode.org/versions/.  Below are listed the source file name of each
20002missing file, the Unicode version to copy it from, and the name to store it
20003as.  (Note that the listed source file name may not be exactly the one that
20004Unicode calls it.  If you don't find it, you can look it up in 'README.perl'
20005to get the correct name.)
20006END
20007    ));
20008    print simple_fold(join_lines("\n$_")) for @missing_early_files;
20009    exit 2;
20010}
20011
20012if ($compare_versions) {
20013    Carp::my_carp(<<END
20014Warning.  \$compare_versions is set.  Output is not suitable for production
20015END
20016    );
20017}
20018
20019# Put into %potential_files a list of all the files in the directory structure
20020# that could be inputs to this program
20021File::Find::find({
20022    wanted=>sub {
20023        return unless / \. ( txt | htm l? ) $ /xi;  # Some platforms change the
20024                                                    # name's case
20025        my $full = lc(File::Spec->rel2abs($_));
20026        $potential_files{$full} = 1;
20027        return;
20028    }
20029}, File::Spec->curdir());
20030
20031my @mktables_list_output_files;
20032my $old_start_time = 0;
20033my $old_options = "";
20034
20035if (! -e $file_list) {
20036    print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
20037    $write_unchanged_files = 1;
20038} elsif ($write_unchanged_files) {
20039    print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
20040}
20041else {
20042    print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
20043    my $file_handle;
20044    if (! open $file_handle, "<", $file_list) {
20045        Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
20046        $glob_list = 1;
20047    }
20048    else {
20049        my @input;
20050
20051        # Read and parse mktables.lst, placing the results from the first part
20052        # into @input, and the second part into @mktables_list_output_files
20053        for my $list ( \@input, \@mktables_list_output_files ) {
20054            while (<$file_handle>) {
20055                s/^ \s+ | \s+ $//xg;
20056                if (/^ \s* \# \s* Autogenerated\ starting\ on\ (\d+)/x) {
20057                    $old_start_time = $1;
20058                    next;
20059                }
20060                if (/^ \s* \# \s* From\ options\ (.+) /x) {
20061                    $old_options = $1;
20062                    next;
20063                }
20064                next if /^ \s* (?: \# .* )? $/x;
20065                last if /^ =+ $/x;
20066                my ( $file ) = split /\t/;
20067                push @$list, $file;
20068            }
20069            @$list = uniques(@$list);
20070            next;
20071        }
20072
20073        # Look through all the input files
20074        foreach my $input (@input) {
20075            next if $input eq 'version'; # Already have checked this.
20076
20077            # Ignore if doesn't exist.  The checking about whether we care or
20078            # not is done via the Input_file object.
20079            next if ! file_exists($input);
20080
20081            # The paths are stored with relative names, and with '/' as the
20082            # delimiter; convert to absolute on this machine
20083            my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
20084            $potential_files{lc $full} = 1;
20085        }
20086    }
20087
20088    close $file_handle;
20089}
20090
20091if ($glob_list) {
20092
20093    # Here wants to process all .txt files in the directory structure.
20094    # Convert them to full path names.  They are stored in the platform's
20095    # relative style
20096    my @known_files;
20097    foreach my $object (@input_file_objects) {
20098        my $file = $object->file;
20099        next unless defined $file;
20100        push @known_files, File::Spec->rel2abs($file);
20101    }
20102
20103    my @unknown_input_files;
20104    foreach my $file (keys %potential_files) {  # The keys are stored in lc
20105        next if grep { $file eq lc($_) } @known_files;
20106
20107        # Here, the file is unknown to us.  Get relative path name
20108        $file = File::Spec->abs2rel($file);
20109        push @unknown_input_files, $file;
20110
20111        # What will happen is we create a data structure for it, and add it to
20112        # the list of input files to process.  First get the subdirectories
20113        # into an array
20114        my (undef, $directories, undef) = File::Spec->splitpath($file);
20115        $directories =~ s;/$;;;     # Can have extraneous trailing '/'
20116        my @directories = File::Spec->splitdir($directories);
20117
20118        # If the file isn't extracted (meaning none of the directories is the
20119        # extracted one), just add it to the end of the list of inputs.
20120        if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
20121            push @input_file_objects, Input_file->new($file, v0);
20122        }
20123        else {
20124
20125            # Here, the file is extracted.  It needs to go ahead of most other
20126            # processing.  Search for the first input file that isn't a
20127            # special required property (that is, find one whose first_release
20128            # is non-0), and isn't extracted.  Also, the Age property file is
20129            # processed before the extracted ones, just in case
20130            # $compare_versions is set.
20131            for (my $i = 0; $i < @input_file_objects; $i++) {
20132                if ($input_file_objects[$i]->first_released ne v0
20133                    && lc($input_file_objects[$i]->file) ne 'dage.txt'
20134                    && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
20135                {
20136                    splice @input_file_objects, $i, 0,
20137                                                Input_file->new($file, v0);
20138                    last;
20139                }
20140            }
20141
20142        }
20143    }
20144    if (@unknown_input_files) {
20145        print STDERR simple_fold(join_lines(<<END
20146
20147The following files are unknown as to how to handle.  Assuming they are
20148typical property files.  You'll know by later error messages if it worked or
20149not:
20150END
20151        ) . " " . join(", ", @unknown_input_files) . "\n\n");
20152    }
20153} # End of looking through directory structure for more .txt files.
20154
20155# Create the list of input files from the objects we have defined, plus
20156# version
20157my @input_files = qw(version Makefile);
20158foreach my $object (@input_file_objects) {
20159    my $file = $object->file;
20160    next if ! defined $file;    # Not all objects have files
20161    next if defined $object->skip;;
20162    push @input_files,  $file;
20163}
20164
20165if ( $verbosity >= $VERBOSE ) {
20166    print "Expecting ".scalar( @input_files )." input files. ",
20167         "Checking ".scalar( @mktables_list_output_files )." output files.\n";
20168}
20169
20170# We set $most_recent to be the most recently changed input file, including
20171# this program itself (done much earlier in this file)
20172foreach my $in (@input_files) {
20173    next unless -e $in;        # Keep going even if missing a file
20174    my $mod_time = (stat $in)[9];
20175    $most_recent = $mod_time if $mod_time > $most_recent;
20176
20177    # See that the input files have distinct names, to warn someone if they
20178    # are adding a new one
20179    if ($make_list) {
20180        my ($volume, $directories, $file ) = File::Spec->splitpath($in);
20181        $directories =~ s;/$;;;     # Can have extraneous trailing '/'
20182        my @directories = File::Spec->splitdir($directories);
20183        construct_filename($file, 'mutable', \@directories);
20184    }
20185}
20186
20187# We use 'Makefile' just to see if it has changed since the last time we
20188# rebuilt.  Now discard it.
20189@input_files = grep { $_ ne 'Makefile' } @input_files;
20190
20191my $rebuild = $write_unchanged_files    # Rebuild: if unconditional rebuild
20192              || ! scalar @mktables_list_output_files  # or if no outputs known
20193              || $old_start_time < $most_recent        # or out-of-date
20194              || $old_options ne $command_line_arguments; # or with different
20195                                                          # options
20196
20197# Now we check to see if any output files are older than youngest, if
20198# they are, we need to continue on, otherwise we can presumably bail.
20199if (! $rebuild) {
20200    foreach my $out (@mktables_list_output_files) {
20201        if ( ! file_exists($out)) {
20202            print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
20203            $rebuild = 1;
20204            last;
20205         }
20206        #local $to_trace = 1 if main::DEBUG;
20207        trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
20208        if ( (stat $out)[9] <= $most_recent ) {
20209            #trace "$out:  most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
20210            print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
20211            $rebuild = 1;
20212            last;
20213        }
20214    }
20215}
20216if (! $rebuild) {
20217    print "$0: Files seem to be ok, not bothering to rebuild.  Add '-w' option to force build\n";
20218    exit(0);
20219}
20220print "$0: Must rebuild tables.\n" if $verbosity >= $VERBOSE;
20221
20222# Ready to do the major processing.  First create the perl pseudo-property.
20223$perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
20224
20225# Process each input file
20226foreach my $file (@input_file_objects) {
20227    $file->run;
20228}
20229
20230# Finish the table generation.
20231
20232print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
20233finish_Unicode();
20234
20235# For the very specialized case of comparing two Unicode versions...
20236if (DEBUG && $compare_versions) {
20237    handle_compare_versions();
20238}
20239
20240print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
20241compile_perl();
20242
20243print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
20244add_perl_synonyms();
20245
20246print "Writing tables\n" if $verbosity >= $PROGRESS;
20247write_all_tables();
20248
20249# Write mktables.lst
20250if ( $file_list and $make_list ) {
20251
20252    print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
20253    foreach my $file (@input_files, @files_actually_output) {
20254        my (undef, $directories, $basefile) = File::Spec->splitpath($file);
20255        my @directories = grep length, File::Spec->splitdir($directories);
20256        $file = join '/', @directories, $basefile;
20257    }
20258
20259    my $ofh;
20260    if (! open $ofh,">",$file_list) {
20261        Carp::my_carp("Can't write to '$file_list'.  Skipping: $!");
20262        return
20263    }
20264    else {
20265        my $localtime = localtime $start_time;
20266        print $ofh <<"END";
20267#
20268# $file_list -- File list for $0.
20269#
20270#   Autogenerated starting on $start_time ($localtime)
20271#   From options $command_line_arguments
20272#
20273# - First section is input files
20274#   ($0 itself is not listed but is automatically considered an input)
20275# - Section separator is /^=+\$/
20276# - Second section is a list of output files.
20277# - Lines matching /^\\s*#/ are treated as comments
20278#   which along with blank lines are ignored.
20279#
20280
20281# Input files:
20282
20283END
20284        print $ofh "$_\n" for sort(@input_files);
20285        print $ofh "\n=================================\n# Output files:\n\n";
20286        print $ofh "$_\n" for sort @files_actually_output;
20287        print $ofh "\n# ",scalar(@input_files)," input files\n",
20288                "# ",scalar(@files_actually_output)+1," output files\n\n",
20289                "# End list\n";
20290        close $ofh
20291            or Carp::my_carp("Failed to close $ofh: $!");
20292
20293        print "Filelist has ",scalar(@input_files)," input files and ",
20294            scalar(@files_actually_output)+1," output files\n"
20295            if $verbosity >= $VERBOSE;
20296    }
20297}
20298
20299# Output these warnings unless -q explicitly specified.
20300if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
20301    if (@unhandled_properties) {
20302        print "\nProperties and tables that unexpectedly have no code points\n";
20303        foreach my $property (sort @unhandled_properties) {
20304            print $property, "\n";
20305        }
20306    }
20307
20308    if (%potential_files) {
20309        print "\nInput files that are not considered:\n";
20310        foreach my $file (sort keys %potential_files) {
20311            print File::Spec->abs2rel($file), "\n";
20312        }
20313    }
20314    print "\nAll done\n" if $verbosity >= $VERBOSE;
20315}
20316
20317if ($version_of_mk_invlist_bounds lt $v_version) {
20318    Carp::my_carp("WARNING: \\b{} algorithms (regen/mk_invlist.pl) need"
20319                . " to be checked and possibly updated to Unicode"
20320                . " $string_version");
20321}
20322
20323exit(0);
20324
20325# TRAILING CODE IS USED BY make_property_test_script()
20326__DATA__
20327
20328use strict;
20329use warnings;
20330
20331# Test qr/\X/ and the \p{} regular expression constructs.  This file is
20332# constructed by mktables from the tables it generates, so if mktables is
20333# buggy, this won't necessarily catch those bugs.  Tests are generated for all
20334# feasible properties; a few aren't currently feasible; see
20335# is_code_point_usable() in mktables for details.
20336
20337# Standard test packages are not used because this manipulates SIG_WARN.  It
20338# exits 0 if every non-skipped test succeeded; -1 if any failed.
20339
20340my $Tests = 0;
20341my $Fails = 0;
20342
20343# loc_tools.pl requires this function to be defined
20344sub ok($@) {
20345    my ($pass, @msg) = @_;
20346    print "not " unless $pass;
20347    print "ok ";
20348    print ++$Tests;
20349    print " - ", join "", @msg if @msg;
20350    print "\n";
20351}
20352
20353sub Expect($$$$) {
20354    my $expected = shift;
20355    my $ord = shift;
20356    my $regex  = shift;
20357    my $warning_type = shift;   # Type of warning message, like 'deprecated'
20358                                # or empty if none
20359    my $line   = (caller)[2];
20360
20361    # Convert the code point to hex form
20362    my $string = sprintf "\"\\x{%04X}\"", $ord;
20363
20364    my @tests = "";
20365
20366    # The first time through, use all warnings.  If the input should generate
20367    # a warning, add another time through with them turned off
20368    push @tests, "no warnings '$warning_type';" if $warning_type;
20369
20370    foreach my $no_warnings (@tests) {
20371
20372        # Store any warning messages instead of outputting them
20373        local $SIG{__WARN__} = $SIG{__WARN__};
20374        my $warning_message;
20375        $SIG{__WARN__} = sub { $warning_message = $_[0] };
20376
20377        $Tests++;
20378
20379        # A string eval is needed because of the 'no warnings'.
20380        # Assumes no parentheses in the regular expression
20381        my $result = eval "$no_warnings
20382                            my \$RegObj = qr($regex);
20383                            $string =~ \$RegObj ? 1 : 0";
20384        if (not defined $result) {
20385            print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
20386            $Fails++;
20387        }
20388        elsif ($result ^ $expected) {
20389            print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
20390            $Fails++;
20391        }
20392        elsif ($warning_message) {
20393            if (! $warning_type || ($warning_type && $no_warnings)) {
20394                print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
20395                $Fails++;
20396            }
20397            else {
20398                print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
20399            }
20400        }
20401        elsif ($warning_type && ! $no_warnings) {
20402            print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
20403            $Fails++;
20404        }
20405        else {
20406            print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
20407        }
20408    }
20409    return;
20410}
20411
20412sub Error($) {
20413    my $regex  = shift;
20414    $Tests++;
20415    if (eval { 'x' =~ qr/$regex/; 1 }) {
20416        $Fails++;
20417        my $line = (caller)[2];
20418        print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
20419    }
20420    else {
20421        my $line = (caller)[2];
20422        print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
20423    }
20424    return;
20425}
20426
20427# Break test files (e.g. GCBTest.txt) character that break allowed here
20428my $breakable_utf8 = my $breakable = chr(utf8::unicode_to_native(0xF7));
20429utf8::upgrade($breakable_utf8);
20430
20431# Break test files (e.g. GCBTest.txt) character that indicates can't break
20432# here
20433my $nobreak_utf8 = my $nobreak = chr(utf8::unicode_to_native(0xD7));
20434utf8::upgrade($nobreak_utf8);
20435
20436my $are_ctype_locales_available;
20437my $utf8_locale;
20438chdir 't' if -d 't';
20439eval { require "./loc_tools.pl" };
20440if (defined &locales_enabled) {
20441    $are_ctype_locales_available = locales_enabled('LC_CTYPE');
20442    if ($are_ctype_locales_available) {
20443        $utf8_locale = &find_utf8_ctype_locale;
20444    }
20445}
20446
20447# Eval'd so can run on versions earlier than the property is available in
20448my $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}\p{WB=ZWJ}]/';
20449if (! defined $WB_Extend_or_Format_re) {
20450    $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}]/';
20451}
20452
20453sub _test_break($$) {
20454    # Test various break property matches.  The 2nd parameter gives the
20455    # property name.  The input is a line from auxiliary/*Test.txt for the
20456    # given property.  Each such line is a sequence of Unicode (not native)
20457    # code points given by their hex numbers, separated by the two characters
20458    # defined just before this subroutine that indicate that either there can
20459    # or cannot be a break between the adjacent code points.  All these are
20460    # tested.
20461    #
20462    # For the gcb property extra tests are made.  if there isn't a break, that
20463    # means the sequence forms an extended grapheme cluster, which means that
20464    # \X should match the whole thing.  If there is a break, \X should stop
20465    # there.  This is all converted by this routine into a match: $string =~
20466    # /(\X)/, Each \X should match the next cluster; and that is what is
20467    # checked.
20468
20469    my $template = shift;
20470    my $break_type = shift;
20471
20472    my $line   = (caller 1)[2];   # Line number
20473    my $comment = "";
20474
20475    if ($template =~ / ( .*? ) \s* \# (.*) /x) {
20476        $template = $1;
20477        $comment = $2;
20478
20479        # Replace leading spaces with a single one.
20480        $comment =~ s/ ^ \s* / # /x;
20481    }
20482
20483    # The line contains characters above the ASCII range, but in Latin1.  It
20484    # may or may not be in utf8, and if it is, it may or may not know it.  So,
20485    # convert these characters to 8 bits.  If knows is in utf8, simply
20486    # downgrade.
20487    if (utf8::is_utf8($template)) {
20488        utf8::downgrade($template);
20489    } else {
20490
20491        # Otherwise, if it is in utf8, but doesn't know it, the next lines
20492        # convert the two problematic characters to their 8-bit equivalents.
20493        # If it isn't in utf8, they don't harm anything.
20494        use bytes;
20495        $template =~ s/$nobreak_utf8/$nobreak/g;
20496        $template =~ s/$breakable_utf8/$breakable/g;
20497    }
20498
20499    # Perl customizes wb.  So change the official tests accordingly
20500    if ($break_type eq 'wb' && $WB_Extend_or_Format_re) {
20501
20502        # Split into elements that alternate between code point and
20503        # break/no-break
20504        my @line = split / +/, $template;
20505
20506        # Look at each code point and its following one
20507        for (my $i = 1; $i <  @line - 1 - 1; $i+=2) {
20508
20509            # The customization only involves changing some breaks to
20510            # non-breaks.
20511            next if $line[$i+1] =~ /$nobreak/;
20512
20513            my $lhs = chr utf8::unicode_to_native(hex $line[$i]);
20514            my $rhs = chr utf8::unicode_to_native(hex $line[$i+2]);
20515
20516            # And it only affects adjacent space characters.
20517            next if $lhs !~ /\s/u;
20518
20519            # But, we want to make sure to test spaces followed by a Extend
20520            # or Format.
20521            next if $rhs !~ /\s|$WB_Extend_or_Format_re/;
20522
20523            # To test the customization, add some white-space before this to
20524            # create a span.  The $lhs white space may or may not be bound to
20525            # that span, and also with the $rhs.  If the $rhs is a binding
20526            # character, the $lhs is bound to it and not to the span, unless
20527            # $lhs is vertical space.  In all other cases, the $lhs is bound
20528            # to the span.  If the $rhs is white space, it is bound to the
20529            # $lhs
20530            my $bound;
20531            my $span;
20532            if ($rhs =~ /$WB_Extend_or_Format_re/) {
20533                if ($lhs =~ /\v/) {
20534                    $bound = $breakable;
20535                    $span = $nobreak;
20536                }
20537                else {
20538                    $bound = $nobreak;
20539                    $span = $breakable;
20540                }
20541            }
20542            else {
20543                $span = $nobreak;
20544                $bound = $nobreak;
20545            }
20546
20547            splice @line, $i, 0, ( '0020', $nobreak, '0020', $span);
20548            $i += 4;
20549            $line[$i+1] = $bound;
20550        }
20551        $template = join " ", @line;
20552    }
20553
20554    # The input is just the break/no-break symbols and sequences of Unicode
20555    # code points as hex digits separated by spaces for legibility. e.g.:
20556    # ÷ 0020 × 0308 ÷ 0020 ÷
20557    # Convert to native \x format
20558    $template =~ s/ \s* ( [[:xdigit:]]+ ) \s* /sprintf("\\x{%02X}", utf8::unicode_to_native(hex $1))/gex;
20559    $template =~ s/ \s* //gx;   # Probably the line above removed all spaces;
20560                                # but be sure
20561
20562    # Make a copy of the input with the symbols replaced by \b{} and \B{} as
20563    # appropriate
20564    my $break_pattern = $template =~ s/ $breakable /\\b{$break_type}/grx;
20565    $break_pattern =~ s/ $nobreak /\\B{$break_type}/gx;
20566
20567    my $display_string = $template =~ s/[$breakable$nobreak]//gr;
20568    my $string = eval "\"$display_string\"";
20569
20570    # The remaining massaging of the input is for the \X tests.  Get rid of
20571    # the leading and trailing breakables
20572    $template =~ s/^ \s* $breakable \s* //x;
20573    $template =~ s/ \s* $breakable \s* $ //x;
20574
20575    # Delete no-breaks
20576    $template =~ s/ \s* $nobreak \s* //xg;
20577
20578    # Split the input into segments that are breakable between them.
20579    my @should_display = split /\s*$breakable\s*/, $template;
20580    my @should_match = map { eval "\"$_\"" } @should_display;
20581
20582    # If a string can be represented in both non-ut8 and utf8, test both cases
20583    my $display_upgrade = "";
20584    UPGRADE:
20585    for my $to_upgrade (0 .. 1) {
20586
20587        if ($to_upgrade) {
20588
20589            # If already in utf8, would just be a repeat
20590            next UPGRADE if utf8::is_utf8($string);
20591
20592            utf8::upgrade($string);
20593            $display_upgrade = " (utf8-upgraded)";
20594        }
20595
20596        my @modifiers = qw(a aa d u i);
20597        if ($are_ctype_locales_available) {
20598            push @modifiers, "l$utf8_locale" if defined $utf8_locale;
20599
20600            # The /l modifier has C after it to indicate the locale to try
20601            push @modifiers, "lC";
20602        }
20603
20604        # Test for each of the regex modifiers.
20605        for my $modifier (@modifiers) {
20606            my $display_locale = "";
20607
20608            # For /l, set the locale to what it says to.
20609            if ($modifier =~ / ^ l (.*) /x) {
20610                my $locale = $1;
20611                $display_locale = "(locale = $locale)";
20612                POSIX::setlocale(&POSIX::LC_CTYPE, $locale);
20613                $modifier = 'l';
20614            }
20615
20616            no warnings qw(locale regexp surrogate);
20617            my $pattern = "(?$modifier:$break_pattern)";
20618
20619            # Actually do the test
20620            my $matched_text;
20621            my $matched = $string =~ qr/$pattern/;
20622            if ($matched) {
20623                $matched_text = "matched";
20624            }
20625            else {
20626                $matched_text = "failed to match";
20627                print "not ";
20628
20629                if (TODO_FAILING_BREAKS) {
20630                    $comment = " # $comment" unless $comment =~ / ^ \s* \# /x;
20631                    $comment =~ s/#/# TODO/;
20632                }
20633            }
20634            print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$pattern/$display_upgrade; line $line $display_locale$comment\n";
20635
20636            # Only print the comment on the first use of this line
20637            $comment = "";
20638
20639            # Repeat with the first \B{} in the pattern.  This makes sure the
20640            # code in regexec.c:find_byclass() for \B gets executed
20641            if ($pattern =~ / ( .*? : ) .* ( \\B\{ .* ) /x) {
20642                my $B_pattern = "$1$2";
20643                $matched = $string =~ qr/$B_pattern/;
20644                print "not " unless $matched;
20645                $matched_text = ($matched) ? "matched" : "failed to match";
20646                print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$B_pattern/$display_upgrade; line $line $display_locale";
20647                print " # TODO" if TODO_FAILING_BREAKS && ! $matched;
20648                print "\n";
20649            }
20650        }
20651
20652        next if $break_type ne 'gcb';
20653
20654        # Finally, do the \X match.
20655        my @matches = $string =~ /(\X)/g;
20656
20657        # Look through each matched cluster to verify that it matches what we
20658        # expect.
20659        my $min = (@matches < @should_match) ? @matches : @should_match;
20660        for my $i (0 .. $min - 1) {
20661            $Tests++;
20662            if ($matches[$i] eq $should_match[$i]) {
20663                print "ok $Tests - ";
20664                if ($i == 0) {
20665                    print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
20666                } else {
20667                    print "And \\X #", $i + 1,
20668                }
20669                print " correctly matched $should_display[$i]; line $line\n";
20670            } else {
20671                $matches[$i] = join("", map { sprintf "\\x{%04X}", ord $_ }
20672                                                    split "", $matches[$i]);
20673                print "not ok $Tests -";
20674                print " # TODO" if TODO_FAILING_BREAKS;
20675                print " In \"$display_string\" =~ /(\\X)/g, \\X #",
20676                    $i + 1,
20677                    " should have matched $should_display[$i]",
20678                    " but instead matched $matches[$i]",
20679                    ".  Abandoning rest of line $line\n";
20680                next UPGRADE;
20681            }
20682        }
20683
20684        # And the number of matches should equal the number of expected matches.
20685        $Tests++;
20686        if (@matches == @should_match) {
20687            print "ok $Tests - Nothing was left over; line $line\n";
20688        } else {
20689            print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line";
20690            print " # TODO" if TODO_FAILING_BREAKS;
20691            print "\n";
20692        }
20693    }
20694
20695    return;
20696}
20697
20698sub Test_GCB($) {
20699    _test_break(shift, 'gcb');
20700}
20701
20702sub Test_LB($) {
20703    _test_break(shift, 'lb');
20704}
20705
20706sub Test_SB($) {
20707    _test_break(shift, 'sb');
20708}
20709
20710sub Test_WB($) {
20711    _test_break(shift, 'wb');
20712}
20713
20714sub Finished() {
20715    print "1..$Tests\n";
20716    exit($Fails ? -1 : 0);
20717}
20718
20719