1#!/usr/bin/perl -w 2 3# !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!! 4# Any files created or read by this program should be listed in 'mktables.lst' 5# Use -makelist to regenerate it. 6 7# Needs 'no overloading' to run faster on miniperl. Code commented out at the 8# subroutine objaddr can be used instead to work as far back (untested) as 9# 5.8: needs pack "U". But almost all occurrences of objaddr have been 10# removed in favor of using 'no overloading'. You also would have to go 11# through and replace occurrences like: 12# my $addr = do { no overloading; pack 'J', $self; } 13# with 14# my $addr = main::objaddr $self; 15# (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b 16# that instituted the change to main::objaddr, and subsequent commits that 17# changed 0+$self to pack 'J', $self.) 18 19my $start_time; 20BEGIN { # Get the time the script started running; do it at compilation to 21 # get it as close as possible 22 $start_time= time; 23} 24 25require 5.010_001; 26use strict; 27use warnings; 28use Carp; 29use Config; 30use File::Find; 31use File::Path; 32use File::Spec; 33use Text::Tabs; 34use re "/aa"; 35 36sub DEBUG () { 0 } # Set to 0 for production; 1 for development 37my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/; 38 39########################################################################## 40# 41# mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl), 42# from the Unicode database files (lib/unicore/.../*.txt), It also generates 43# a pod file and .t files, depending on option parameters. 44# 45# The structure of this file is: 46# First these introductory comments; then 47# code needed for everywhere, such as debugging stuff; then 48# code to handle input parameters; then 49# data structures likely to be of external interest (some of which depend on 50# the input parameters, so follows them; then 51# more data structures and subroutine and package (class) definitions; then 52# the small actual loop to process the input files and finish up; then 53# a __DATA__ section, for the .t tests 54# 55# This program works on all releases of Unicode so far. The outputs have been 56# scrutinized most intently for release 5.1. The others have been checked for 57# somewhat more than just sanity. It can handle all non-provisional Unicode 58# character properties in those releases. 59# 60# This program is mostly about Unicode character (or code point) properties. 61# A property describes some attribute or quality of a code point, like if it 62# is lowercase or not, its name, what version of Unicode it was first defined 63# in, or what its uppercase equivalent is. Unicode deals with these disparate 64# possibilities by making all properties into mappings from each code point 65# into some corresponding value. In the case of it being lowercase or not, 66# the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each 67# property maps each Unicode code point to a single value, called a "property 68# value". (Some more recently defined properties, map a code point to a set 69# of values.) 70# 71# When using a property in a regular expression, what is desired isn't the 72# mapping of the code point to its property's value, but the reverse (or the 73# mathematical "inverse relation"): starting with the property value, "Does a 74# code point map to it?" These are written in a "compound" form: 75# \p{property=value}, e.g., \p{category=punctuation}. This program generates 76# files containing the lists of code points that map to each such regular 77# expression property value, one file per list 78# 79# There is also a single form shortcut that Perl adds for many of the commonly 80# used properties. This happens for all binary properties, plus script, 81# general_category, and block properties. 82# 83# Thus the outputs of this program are files. There are map files, mostly in 84# the 'To' directory; and there are list files for use in regular expression 85# matching, all in subdirectories of the 'lib' directory, with each 86# subdirectory being named for the property that the lists in it are for. 87# Bookkeeping, test, and documentation files are also generated. 88 89my $matches_directory = 'lib'; # Where match (\p{}) files go. 90my $map_directory = 'To'; # Where map files go. 91 92# DATA STRUCTURES 93# 94# The major data structures of this program are Property, of course, but also 95# Table. There are two kinds of tables, very similar to each other. 96# "Match_Table" is the data structure giving the list of code points that have 97# a particular property value, mentioned above. There is also a "Map_Table" 98# data structure which gives the property's mapping from code point to value. 99# There are two structures because the match tables need to be combined in 100# various ways, such as constructing unions, intersections, complements, etc., 101# and the map ones don't. And there would be problems, perhaps subtle, if 102# a map table were inadvertently operated on in some of those ways. 103# The use of separate classes with operations defined on one but not the other 104# prevents accidentally confusing the two. 105# 106# At the heart of each table's data structure is a "Range_List", which is just 107# an ordered list of "Ranges", plus ancillary information, and methods to 108# operate on them. A Range is a compact way to store property information. 109# Each range has a starting code point, an ending code point, and a value that 110# is meant to apply to all the code points between the two end points, 111# inclusive. For a map table, this value is the property value for those 112# code points. Two such ranges could be written like this: 113# 0x41 .. 0x5A, 'Upper', 114# 0x61 .. 0x7A, 'Lower' 115# 116# Each range also has a type used as a convenience to classify the values. 117# Most ranges in this program will be Type 0, or normal, but there are some 118# ranges that have a non-zero type. These are used only in map tables, and 119# are for mappings that don't fit into the normal scheme of things. Mappings 120# that require a hash entry to communicate with utf8.c are one example; 121# another example is mappings for charnames.pm to use which indicate a name 122# that is algorithmically determinable from its code point (and the reverse). 123# These are used to significantly compact these tables, instead of listing 124# each one of the tens of thousands individually. 125# 126# In a match table, the value of a range is irrelevant (and hence the type as 127# well, which will always be 0), and arbitrarily set to the null string. 128# Using the example above, there would be two match tables for those two 129# entries, one named Upper would contain the 0x41..0x5A range, and the other 130# named Lower would contain 0x61..0x7A. 131# 132# Actually, there are two types of range lists, "Range_Map" is the one 133# associated with map tables, and "Range_List" with match tables. 134# Again, this is so that methods can be defined on one and not the others so 135# as to prevent operating on them in incorrect ways. 136# 137# Eventually, most tables are written out to files to be read by utf8_heavy.pl 138# in the perl core. All tables could in theory be written, but some are 139# suppressed because there is no current practical use for them. It is easy 140# to change which get written by changing various lists that are near the top 141# of the actual code in this file. The table data structures contain enough 142# ancillary information to allow them to be treated as separate entities for 143# writing, such as the path to each one's file. There is a heading in each 144# map table that gives the format of its entries, and what the map is for all 145# the code points missing from it. (This allows tables to be more compact.) 146# 147# The Property data structure contains one or more tables. All properties 148# contain a map table (except the $perl property which is a 149# pseudo-property containing only match tables), and any properties that 150# are usable in regular expression matches also contain various matching 151# tables, one for each value the property can have. A binary property can 152# have two values, True and False (or Y and N, which are preferred by Unicode 153# terminology). Thus each of these properties will have a map table that 154# takes every code point and maps it to Y or N (but having ranges cuts the 155# number of entries in that table way down), and two match tables, one 156# which has a list of all the code points that map to Y, and one for all the 157# code points that map to N. (For each binary property, a third table is also 158# generated for the pseudo Perl property. It contains the identical code 159# points as the Y table, but can be written in regular expressions, not in the 160# compound form, but in a "single" form like \p{IsUppercase}.) Many 161# properties are binary, but some properties have several possible values, 162# some have many, and properties like Name have a different value for every 163# named code point. Those will not, unless the controlling lists are changed, 164# have their match tables written out. But all the ones which can be used in 165# regular expression \p{} and \P{} constructs will. Prior to 5.14, generally 166# a property would have either its map table or its match tables written but 167# not both. Again, what gets written is controlled by lists which can easily 168# be changed. Starting in 5.14, advantage was taken of this, and all the map 169# tables needed to reconstruct the Unicode db are now written out, while 170# suppressing the Unicode .txt files that contain the data. Our tables are 171# much more compact than the .txt files, so a significant space savings was 172# achieved. Also, tables are not written out that are trivially derivable 173# from tables that do get written. So, there typically is no file containing 174# the code points not matched by a binary property (the table for \P{} versus 175# lowercase \p{}), since you just need to invert the True table to get the 176# False table. 177 178# Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on 179# how many match tables there are and the content of the maps. This 'Type' is 180# different than a range 'Type', so don't get confused by the two concepts 181# having the same name. 182# 183# For information about the Unicode properties, see Unicode's UAX44 document: 184 185my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/'; 186 187# As stated earlier, this program will work on any release of Unicode so far. 188# Most obvious problems in earlier data have NOT been corrected except when 189# necessary to make Perl or this program work reasonably, and to keep out 190# potential security issues. For example, no folding information was given in 191# early releases, so this program substitutes lower case instead, just so that 192# a regular expression with the /i option will do something that actually 193# gives the right results in many cases. There are also a couple other 194# corrections for version 1.1.5, commented at the point they are made. As an 195# example of corrections that weren't made (but could be) is this statement 196# from DerivedAge.txt: "The supplementary private use code points and the 197# non-character code points were assigned in version 2.0, but not specifically 198# listed in the UCD until versions 3.0 and 3.1 respectively." (To be precise 199# it was 3.0.1 not 3.0.0) More information on Unicode version glitches is 200# further down in these introductory comments. 201# 202# This program works on all non-provisional properties as of the current 203# Unicode release, though the files for some are suppressed for various 204# reasons. You can change which are output by changing lists in this program. 205# 206# The old version of mktables emphasized the term "Fuzzy" to mean Unicode's 207# loose matchings rules (from Unicode TR18): 208# 209# The recommended names for UCD properties and property values are in 210# PropertyAliases.txt [Prop] and PropertyValueAliases.txt 211# [PropValue]. There are both abbreviated names and longer, more 212# descriptive names. It is strongly recommended that both names be 213# recognized, and that loose matching of property names be used, 214# whereby the case distinctions, whitespace, hyphens, and underbar 215# are ignored. 216# 217# The program still allows Fuzzy to override its determination of if loose 218# matching should be used, but it isn't currently used, as it is no longer 219# needed; the calculations it makes are good enough. 220# 221# SUMMARY OF HOW IT WORKS: 222# 223# Process arguments 224# 225# A list is constructed containing each input file that is to be processed 226# 227# Each file on the list is processed in a loop, using the associated handler 228# code for each: 229# The PropertyAliases.txt and PropValueAliases.txt files are processed 230# first. These files name the properties and property values. 231# Objects are created of all the property and property value names 232# that the rest of the input should expect, including all synonyms. 233# The other input files give mappings from properties to property 234# values. That is, they list code points and say what the mapping 235# is under the given property. Some files give the mappings for 236# just one property; and some for many. This program goes through 237# each file and populates the properties and their map tables from 238# them. Some properties are listed in more than one file, and 239# Unicode has set up a precedence as to which has priority if there 240# is a conflict. Thus the order of processing matters, and this 241# program handles the conflict possibility by processing the 242# overriding input files last, so that if necessary they replace 243# earlier values. 244# After this is all done, the program creates the property mappings not 245# furnished by Unicode, but derivable from what it does give. 246# The tables of code points that match each property value in each 247# property that is accessible by regular expressions are created. 248# The Perl-defined properties are created and populated. Many of these 249# require data determined from the earlier steps 250# Any Perl-defined synonyms are created, and name clashes between Perl 251# and Unicode are reconciled and warned about. 252# All the properties are written to files 253# Any other files are written, and final warnings issued. 254# 255# For clarity, a number of operators have been overloaded to work on tables: 256# ~ means invert (take all characters not in the set). The more 257# conventional '!' is not used because of the possibility of confusing 258# it with the actual boolean operation. 259# + means union 260# - means subtraction 261# & means intersection 262# The precedence of these is the order listed. Parentheses should be 263# copiously used. These are not a general scheme. The operations aren't 264# defined for a number of things, deliberately, to avoid getting into trouble. 265# Operations are done on references and affect the underlying structures, so 266# that the copy constructors for them have been overloaded to not return a new 267# clone, but the input object itself. 268# 269# The bool operator is deliberately not overloaded to avoid confusion with 270# "should it mean if the object merely exists, or also is non-empty?". 271# 272# WHY CERTAIN DESIGN DECISIONS WERE MADE 273# 274# This program needs to be able to run under miniperl. Therefore, it uses a 275# minimum of other modules, and hence implements some things itself that could 276# be gotten from CPAN 277# 278# This program uses inputs published by the Unicode Consortium. These can 279# change incompatibly between releases without the Perl maintainers realizing 280# it. Therefore this program is now designed to try to flag these. It looks 281# at the directories where the inputs are, and flags any unrecognized files. 282# It keeps track of all the properties in the files it handles, and flags any 283# that it doesn't know how to handle. It also flags any input lines that 284# don't match the expected syntax, among other checks. 285# 286# It is also designed so if a new input file matches one of the known 287# templates, one hopefully just needs to add it to a list to have it 288# processed. 289# 290# As mentioned earlier, some properties are given in more than one file. In 291# particular, the files in the extracted directory are supposedly just 292# reformattings of the others. But they contain information not easily 293# derivable from the other files, including results for Unihan, which this 294# program doesn't ordinarily look at, and for unassigned code points. They 295# also have historically had errors or been incomplete. In an attempt to 296# create the best possible data, this program thus processes them first to 297# glean information missing from the other files; then processes those other 298# files to override any errors in the extracted ones. Much of the design was 299# driven by this need to store things and then possibly override them. 300# 301# It tries to keep fatal errors to a minimum, to generate something usable for 302# testing purposes. It always looks for files that could be inputs, and will 303# warn about any that it doesn't know how to handle (the -q option suppresses 304# the warning). 305# 306# Why is there more than one type of range? 307# This simplified things. There are some very specialized code points that 308# have to be handled specially for output, such as Hangul syllable names. 309# By creating a range type (done late in the development process), it 310# allowed this to be stored with the range, and overridden by other input. 311# Originally these were stored in another data structure, and it became a 312# mess trying to decide if a second file that was for the same property was 313# overriding the earlier one or not. 314# 315# Why are there two kinds of tables, match and map? 316# (And there is a base class shared by the two as well.) As stated above, 317# they actually are for different things. Development proceeded much more 318# smoothly when I (khw) realized the distinction. Map tables are used to 319# give the property value for every code point (actually every code point 320# that doesn't map to a default value). Match tables are used for regular 321# expression matches, and are essentially the inverse mapping. Separating 322# the two allows more specialized methods, and error checks so that one 323# can't just take the intersection of two map tables, for example, as that 324# is nonsensical. 325# 326# What about 'fate' and 'status'. The concept of a table's fate was created 327# late when it became clear that something more was needed. The difference 328# between this and 'status' is unclean, and could be improved if someone 329# wanted to spend the effort. 330# 331# DEBUGGING 332# 333# This program is written so it will run under miniperl. Occasionally changes 334# will cause an error where the backtrace doesn't work well under miniperl. 335# To diagnose the problem, you can instead run it under regular perl, if you 336# have one compiled. 337# 338# There is a good trace facility. To enable it, first sub DEBUG must be set 339# to return true. Then a line like 340# 341# local $to_trace = 1 if main::DEBUG; 342# 343# can be added to enable tracing in its lexical scope (plus dynamic) or until 344# you insert another line: 345# 346# local $to_trace = 0 if main::DEBUG; 347# 348# To actually trace, use a line like "trace $a, @b, %c, ...; 349# 350# Some of the more complex subroutines already have trace statements in them. 351# Permanent trace statements should be like: 352# 353# trace ... if main::DEBUG && $to_trace; 354# 355# If there is just one or a few files that you're debugging, you can easily 356# cause most everything else to be skipped. Change the line 357# 358# my $debug_skip = 0; 359# 360# to 1, and every file whose object is in @input_file_objects and doesn't have 361# a, 'non_skip => 1,' in its constructor will be skipped. However, skipping 362# Jamo.txt or UnicodeData.txt will likely cause fatal errors. 363# 364# To compare the output tables, it may be useful to specify the -annotate 365# flag. This causes the tables to expand so there is one entry for each 366# non-algorithmically named code point giving, currently its name, and its 367# graphic representation if printable (and you have a font that knows about 368# it). This makes it easier to see what the particular code points are in 369# each output table. The tables are usable, but because they don't have 370# ranges (for the most part), a Perl using them will run slower. Non-named 371# code points are annotated with a description of their status, and contiguous 372# ones with the same description will be output as a range rather than 373# individually. Algorithmically named characters are also output as ranges, 374# except when there are just a few contiguous ones. 375# 376# FUTURE ISSUES 377# 378# The program would break if Unicode were to change its names so that 379# interior white space, underscores, or dashes differences were significant 380# within property and property value names. 381# 382# It might be easier to use the xml versions of the UCD if this program ever 383# would need heavy revision, and the ability to handle old versions was not 384# required. 385# 386# There is the potential for name collisions, in that Perl has chosen names 387# that Unicode could decide it also likes. There have been such collisions in 388# the past, with mostly Perl deciding to adopt the Unicode definition of the 389# name. However in the 5.2 Unicode beta testing, there were a number of such 390# collisions, which were withdrawn before the final release, because of Perl's 391# and other's protests. These all involved new properties which began with 392# 'Is'. Based on the protests, Unicode is unlikely to try that again. Also, 393# many of the Perl-defined synonyms, like Any, Word, etc, are listed in a 394# Unicode document, so they are unlikely to be used by Unicode for another 395# purpose. However, they might try something beginning with 'In', or use any 396# of the other Perl-defined properties. This program will warn you of name 397# collisions, and refuse to generate tables with them, but manual intervention 398# will be required in this event. One scheme that could be implemented, if 399# necessary, would be to have this program generate another file, or add a 400# field to mktables.lst that gives the date of first definition of a property. 401# Each new release of Unicode would use that file as a basis for the next 402# iteration. And the Perl synonym addition code could sort based on the age 403# of the property, so older properties get priority, and newer ones that clash 404# would be refused; hence existing code would not be impacted, and some other 405# synonym would have to be used for the new property. This is ugly, and 406# manual intervention would certainly be easier to do in the short run; lets 407# hope it never comes to this. 408# 409# A NOTE ON UNIHAN 410# 411# This program can generate tables from the Unihan database. But it doesn't 412# by default, letting the CPAN module Unicode::Unihan handle them. Prior to 413# version 5.2, this database was in a single file, Unihan.txt. In 5.2 the 414# database was split into 8 different files, all beginning with the letters 415# 'Unihan'. This program will read those file(s) if present, but it needs to 416# know which of the many properties in the file(s) should have tables created 417# for them. It will create tables for any properties listed in 418# PropertyAliases.txt and PropValueAliases.txt, plus any listed in the 419# @cjk_properties array and the @cjk_property_values array. Thus, if a 420# property you want is not in those files of the release you are building 421# against, you must add it to those two arrays. Starting in 4.0, the 422# Unicode_Radical_Stroke was listed in those files, so if the Unihan database 423# is present in the directory, a table will be generated for that property. 424# In 5.2, several more properties were added. For your convenience, the two 425# arrays are initialized with all the 6.0 listed properties that are also in 426# earlier releases. But these are commented out. You can just uncomment the 427# ones you want, or use them as a template for adding entries for other 428# properties. 429# 430# You may need to adjust the entries to suit your purposes. setup_unihan(), 431# and filter_unihan_line() are the functions where this is done. This program 432# already does some adjusting to make the lines look more like the rest of the 433# Unicode DB; You can see what that is in filter_unihan_line() 434# 435# There is a bug in the 3.2 data file in which some values for the 436# kPrimaryNumeric property have commas and an unexpected comment. A filter 437# could be added for these; or for a particular installation, the Unihan.txt 438# file could be edited to fix them. 439# 440# HOW TO ADD A FILE TO BE PROCESSED 441# 442# A new file from Unicode needs to have an object constructed for it in 443# @input_file_objects, probably at the end or at the end of the extracted 444# ones. The program should warn you if its name will clash with others on 445# restrictive file systems, like DOS. If so, figure out a better name, and 446# add lines to the README.perl file giving that. If the file is a character 447# property, it should be in the format that Unicode has implicitly 448# standardized for such files for the more recently introduced ones. 449# If so, the Input_file constructor for @input_file_objects can just be the 450# file name and release it first appeared in. If not, then it should be 451# possible to construct an each_line_handler() to massage the line into the 452# standardized form. 453# 454# For non-character properties, more code will be needed. You can look at 455# the existing entries for clues. 456# 457# UNICODE VERSIONS NOTES 458# 459# The Unicode UCD has had a number of errors in it over the versions. And 460# these remain, by policy, in the standard for that version. Therefore it is 461# risky to correct them, because code may be expecting the error. So this 462# program doesn't generally make changes, unless the error breaks the Perl 463# core. As an example, some versions of 2.1.x Jamo.txt have the wrong value 464# for U+1105, which causes real problems for the algorithms for Jamo 465# calculations, so it is changed here. 466# 467# But it isn't so clear cut as to what to do about concepts that are 468# introduced in a later release; should they extend back to earlier releases 469# where the concept just didn't exist? It was easier to do this than to not, 470# so that's what was done. For example, the default value for code points not 471# in the files for various properties was probably undefined until changed by 472# some version. No_Block for blocks is such an example. This program will 473# assign No_Block even in Unicode versions that didn't have it. This has the 474# benefit that code being written doesn't have to special case earlier 475# versions; and the detriment that it doesn't match the Standard precisely for 476# the affected versions. 477# 478# Here are some observations about some of the issues in early versions: 479# 480# Prior to version 3.0, there were 3 character decompositions. These are not 481# handled by Unicode::Normalize, nor will it compile when presented a version 482# that has them. However, you can trivially get it to compile by simply 483# ignoring those decompositions, by changing the croak to a carp. At the time 484# of this writing, the line (in cpan/Unicode-Normalize/mkheader) reads 485# 486# croak("Weird Canonical Decomposition of U+$h"); 487# 488# Simply change to a carp. It will compile, but will not know about any three 489# character decomposition. 490 491# The number of code points in \p{alpha=True} halved in 2.1.9. It turns out 492# that the reason is that the CJK block starting at 4E00 was removed from 493# PropList, and was not put back in until 3.1.0. The Perl extension (the 494# single property name \p{alpha}) has the correct values. But the compound 495# form is simply not generated until 3.1, as it can be argued that prior to 496# this release, this was not an official property. The comments for 497# filter_old_style_proplist() give more details. 498# 499# Unicode introduced the synonym Space for White_Space in 4.1. Perl has 500# always had a \p{Space}. In release 3.2 only, they are not synonymous. The 501# reason is that 3.2 introduced U+205F=medium math space, which was not 502# classed as white space, but Perl figured out that it should have been. 4.0 503# reclassified it correctly. 504# 505# Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2 506# this was erroneously a synonym for 202 (it should be 200). In 4.0, ATB 507# became 202, and ATBL was left with no code points, as all the ones that 508# mapped to 202 stayed mapped to 202. Thus if your program used the numeric 509# name for the class, it would not have been affected, but if it used the 510# mnemonic, it would have been. 511# 512# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that code 513# points which eventually came to have this script property value, instead 514# mapped to "Unknown". But in the next release all these code points were 515# moved to \p{sc=common} instead. 516# 517# The default for missing code points for BidiClass is complicated. Starting 518# in 3.1.1, the derived file DBidiClass.txt handles this, but this program 519# tries to do the best it can for earlier releases. It is done in 520# process_PropertyAliases() 521# 522# In version 2.1.2, the entry in UnicodeData.txt: 523# 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F; 524# should instead be 525# 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F 526# Without this change, there are casing problems for this character. 527# 528############################################################################## 529 530my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing 531 # and errors 532my $MAX_LINE_WIDTH = 78; 533 534# Debugging aid to skip most files so as to not be distracted by them when 535# concentrating on the ones being debugged. Add 536# non_skip => 1, 537# to the constructor for those files you want processed when you set this. 538# Files with a first version number of 0 are special: they are always 539# processed regardless of the state of this flag. Generally, Jamo.txt and 540# UnicodeData.txt must not be skipped if you want this program to not die 541# before normal completion. 542my $debug_skip = 0; 543 544 545# Normally these are suppressed. 546my $write_Unicode_deprecated_tables = 0; 547 548# Set to 1 to enable tracing. 549our $to_trace = 0; 550 551{ # Closure for trace: debugging aid 552 my $print_caller = 1; # ? Include calling subroutine name 553 my $main_with_colon = 'main::'; 554 my $main_colon_length = length($main_with_colon); 555 556 sub trace { 557 return unless $to_trace; # Do nothing if global flag not set 558 559 my @input = @_; 560 561 local $DB::trace = 0; 562 $DB::trace = 0; # Quiet 'used only once' message 563 564 my $line_number; 565 566 # Loop looking up the stack to get the first non-trace caller 567 my $caller_line; 568 my $caller_name; 569 my $i = 0; 570 do { 571 $line_number = $caller_line; 572 (my $pkg, my $file, $caller_line, my $caller) = caller $i++; 573 $caller = $main_with_colon unless defined $caller; 574 575 $caller_name = $caller; 576 577 # get rid of pkg 578 $caller_name =~ s/.*:://; 579 if (substr($caller_name, 0, $main_colon_length) 580 eq $main_with_colon) 581 { 582 $caller_name = substr($caller_name, $main_colon_length); 583 } 584 585 } until ($caller_name ne 'trace'); 586 587 # If the stack was empty, we were called from the top level 588 $caller_name = 'main' if ($caller_name eq "" 589 || $caller_name eq 'trace'); 590 591 my $output = ""; 592 foreach my $string (@input) { 593 #print STDERR __LINE__, ": ", join ", ", @input, "\n"; 594 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') { 595 $output .= simple_dumper($string); 596 } 597 else { 598 $string = "$string" if ref $string; 599 $string = $UNDEF unless defined $string; 600 chomp $string; 601 $string = '""' if $string eq ""; 602 $output .= " " if $output ne "" 603 && $string ne "" 604 && substr($output, -1, 1) ne " " 605 && substr($string, 0, 1) ne " "; 606 $output .= $string; 607 } 608 } 609 610 print STDERR sprintf "%4d: ", $line_number if defined $line_number; 611 print STDERR "$caller_name: " if $print_caller; 612 print STDERR $output, "\n"; 613 return; 614 } 615} 616 617# This is for a rarely used development feature that allows you to compare two 618# versions of the Unicode standard without having to deal with changes caused 619# by the code points introduced in the later version. Change the 0 to a 620# string containing a SINGLE dotted Unicode release number (e.g. "2.1"). Only 621# code points introduced in that release and earlier will be used; later ones 622# are thrown away. You use the version number of the earliest one you want to 623# compare; then run this program on directory structures containing each 624# release, and compare the outputs. These outputs will therefore include only 625# the code points common to both releases, and you can see the changes caused 626# just by the underlying release semantic changes. For versions earlier than 627# 3.2, you must copy a version of DAge.txt into the directory. 628my $string_compare_versions = DEBUG && 0; # e.g., "2.1"; 629my $compare_versions = DEBUG 630 && $string_compare_versions 631 && pack "C*", split /\./, $string_compare_versions; 632 633sub uniques { 634 # Returns non-duplicated input values. From "Perl Best Practices: 635 # Encapsulated Cleverness". p. 455 in first edition. 636 637 my %seen; 638 # Arguably this breaks encapsulation, if the goal is to permit multiple 639 # distinct objects to stringify to the same value, and be interchangeable. 640 # However, for this program, no two objects stringify identically, and all 641 # lists passed to this function are either objects or strings. So this 642 # doesn't affect correctness, but it does give a couple of percent speedup. 643 no overloading; 644 return grep { ! $seen{$_}++ } @_; 645} 646 647$0 = File::Spec->canonpath($0); 648 649my $make_test_script = 0; # ? Should we output a test script 650my $make_norm_test_script = 0; # ? Should we output a normalization test script 651my $write_unchanged_files = 0; # ? Should we update the output files even if 652 # we don't think they have changed 653my $use_directory = ""; # ? Should we chdir somewhere. 654my $pod_directory; # input directory to store the pod file. 655my $pod_file = 'perluniprops'; 656my $t_path; # Path to the .t test file 657my $file_list = 'mktables.lst'; # File to store input and output file names. 658 # This is used to speed up the build, by not 659 # executing the main body of the program if 660 # nothing on the list has changed since the 661 # previous build 662my $make_list = 1; # ? Should we write $file_list. Set to always 663 # make a list so that when the pumpking is 664 # preparing a release, s/he won't have to do 665 # special things 666my $glob_list = 0; # ? Should we try to include unknown .txt files 667 # in the input. 668my $output_range_counts = $debugging_build; # ? Should we include the number 669 # of code points in ranges in 670 # the output 671my $annotate = 0; # ? Should character names be in the output 672 673# Verbosity levels; 0 is quiet 674my $NORMAL_VERBOSITY = 1; 675my $PROGRESS = 2; 676my $VERBOSE = 3; 677 678my $verbosity = $NORMAL_VERBOSITY; 679 680# Process arguments 681while (@ARGV) { 682 my $arg = shift @ARGV; 683 if ($arg eq '-v') { 684 $verbosity = $VERBOSE; 685 } 686 elsif ($arg eq '-p') { 687 $verbosity = $PROGRESS; 688 $| = 1; # Flush buffers as we go. 689 } 690 elsif ($arg eq '-q') { 691 $verbosity = 0; 692 } 693 elsif ($arg eq '-w') { 694 $write_unchanged_files = 1; # update the files even if havent changed 695 } 696 elsif ($arg eq '-check') { 697 my $this = shift @ARGV; 698 my $ok = shift @ARGV; 699 if ($this ne $ok) { 700 print "Skipping as check params are not the same.\n"; 701 exit(0); 702 } 703 } 704 elsif ($arg eq '-P' && defined ($pod_directory = shift)) { 705 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist"; 706 } 707 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift))) 708 { 709 $make_test_script = 1; 710 } 711 elsif ($arg eq '-makenormtest') 712 { 713 $make_norm_test_script = 1; 714 } 715 elsif ($arg eq '-makelist') { 716 $make_list = 1; 717 } 718 elsif ($arg eq '-C' && defined ($use_directory = shift)) { 719 -d $use_directory or croak "Unknown directory '$use_directory'"; 720 } 721 elsif ($arg eq '-L') { 722 723 # Existence not tested until have chdir'd 724 $file_list = shift; 725 } 726 elsif ($arg eq '-globlist') { 727 $glob_list = 1; 728 } 729 elsif ($arg eq '-c') { 730 $output_range_counts = ! $output_range_counts 731 } 732 elsif ($arg eq '-annotate') { 733 $annotate = 1; 734 $debugging_build = 1; 735 $output_range_counts = 1; 736 } 737 else { 738 my $with_c = 'with'; 739 $with_c .= 'out' if $output_range_counts; # Complements the state 740 croak <<END; 741usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ] 742 [ -T test_file_path ] [-globlist] [-makelist] [-maketest] 743 [-check A B ] 744 -c : Output comments $with_c number of code points in ranges 745 -q : Quiet Mode: Only output serious warnings. 746 -p : Set verbosity level to normal plus show progress. 747 -v : Set Verbosity level high: Show progress and non-serious 748 warnings 749 -w : Write files regardless 750 -C dir : Change to this directory before proceeding. All relative paths 751 except those specified by the -P and -T options will be done 752 with respect to this directory. 753 -P dir : Output $pod_file file to directory 'dir'. 754 -T path : Create a test script as 'path'; overrides -maketest 755 -L filelist : Use alternate 'filelist' instead of standard one 756 -globlist : Take as input all non-Test *.txt files in current and sub 757 directories 758 -maketest : Make test script 'TestProp.pl' in current (or -C directory), 759 overrides -T 760 -makelist : Rewrite the file list $file_list based on current setup 761 -annotate : Output an annotation for each character in the table files; 762 useful for debugging mktables, looking at diffs; but is slow, 763 memory intensive; resulting tables are usable but are slow and 764 very large (and currently fail the Unicode::UCD.t tests). 765 -check A B : Executes $0 only if A and B are the same 766END 767 } 768} 769 770# Stores the most-recently changed file. If none have changed, can skip the 771# build 772my $most_recent = (stat $0)[9]; # Do this before the chdir! 773 774# Change directories now, because need to read 'version' early. 775if ($use_directory) { 776 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) { 777 $pod_directory = File::Spec->rel2abs($pod_directory); 778 } 779 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) { 780 $t_path = File::Spec->rel2abs($t_path); 781 } 782 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!"; 783 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) { 784 $pod_directory = File::Spec->abs2rel($pod_directory); 785 } 786 if ($t_path && File::Spec->file_name_is_absolute($t_path)) { 787 $t_path = File::Spec->abs2rel($t_path); 788 } 789} 790 791# Get Unicode version into regular and v-string. This is done now because 792# various tables below get populated based on it. These tables are populated 793# here to be near the top of the file, and so easily seeable by those needing 794# to modify things. 795open my $VERSION, "<", "version" 796 or croak "$0: can't open required file 'version': $!\n"; 797my $string_version = <$VERSION>; 798close $VERSION; 799chomp $string_version; 800my $v_version = pack "C*", split /\./, $string_version; # v string 801 802# The following are the complete names of properties with property values that 803# are known to not match any code points in some versions of Unicode, but that 804# may change in the future so they should be matchable, hence an empty file is 805# generated for them. 806my @tables_that_may_be_empty = ( 807 'Joining_Type=Left_Joining', 808 ); 809push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1; 810push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0; 811push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana' 812 if $v_version ge v4.1.0; 813push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana' 814 if $v_version ge v6.0.0; 815push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend' 816 if $v_version ge v6.1.0; 817push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133' 818 if $v_version ge v6.2.0; 819 820# The lists below are hashes, so the key is the item in the list, and the 821# value is the reason why it is in the list. This makes generation of 822# documentation easier. 823 824my %why_suppressed; # No file generated for these. 825 826# Files aren't generated for empty extraneous properties. This is arguable. 827# Extraneous properties generally come about because a property is no longer 828# used in a newer version of Unicode. If we generated a file without code 829# points, programs that used to work on that property will still execute 830# without errors. It just won't ever match (or will always match, with \P{}). 831# This means that the logic is now likely wrong. I (khw) think its better to 832# find this out by getting an error message. Just move them to the table 833# above to change this behavior 834my %why_suppress_if_empty_warn_if_not = ( 835 836 # It is the only property that has ever officially been removed from the 837 # Standard. The database never contained any code points for it. 838 'Special_Case_Condition' => 'Obsolete', 839 840 # Apparently never official, but there were code points in some versions of 841 # old-style PropList.txt 842 'Non_Break' => 'Obsolete', 843); 844 845# These would normally go in the warn table just above, but they were changed 846# a long time before this program was written, so warnings about them are 847# moot. 848if ($v_version gt v3.2.0) { 849 push @tables_that_may_be_empty, 850 'Canonical_Combining_Class=Attached_Below_Left' 851} 852 853# These are listed in the Property aliases file in 6.0, but Unihan is ignored 854# unless explicitly added. 855if ($v_version ge v5.2.0) { 856 my $unihan = 'Unihan; remove from list if using Unihan'; 857 foreach my $table (qw ( 858 kAccountingNumeric 859 kOtherNumeric 860 kPrimaryNumeric 861 kCompatibilityVariant 862 kIICore 863 kIRG_GSource 864 kIRG_HSource 865 kIRG_JSource 866 kIRG_KPSource 867 kIRG_MSource 868 kIRG_KSource 869 kIRG_TSource 870 kIRG_USource 871 kIRG_VSource 872 kRSUnicode 873 )) 874 { 875 $why_suppress_if_empty_warn_if_not{$table} = $unihan; 876 } 877} 878 879# Enum values for to_output_map() method in the Map_Table package. 880my $EXTERNAL_MAP = 1; 881my $INTERNAL_MAP = 2; 882my $OUTPUT_ADJUSTED = 3; 883 884# To override computed values for writing the map tables for these properties. 885# The default for enum map tables is to write them out, so that the Unicode 886# .txt files can be removed, but all the data to compute any property value 887# for any code point is available in a more compact form. 888my %global_to_output_map = ( 889 # Needed by UCD.pm, but don't want to publicize that it exists, so won't 890 # get stuck supporting it if things change. Since it is a STRING 891 # property, it normally would be listed in the pod, but INTERNAL_MAP 892 # suppresses that. 893 Unicode_1_Name => $INTERNAL_MAP, 894 895 Present_In => 0, # Suppress, as easily computed from Age 896 Block => 0, # Suppress, as Blocks.txt is retained. 897 898 # Suppress, as mapping can be found instead from the 899 # Perl_Decomposition_Mapping file 900 Decomposition_Type => 0, 901); 902 903# Properties that this program ignores. 904my @unimplemented_properties; 905 906# With this release, it is automatically handled if the Unihan db is 907# downloaded 908push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version le v5.2.0; 909 910# There are several types of obsolete properties defined by Unicode. These 911# must be hand-edited for every new Unicode release. 912my %why_deprecated; # Generates a deprecated warning message if used. 913my %why_stabilized; # Documentation only 914my %why_obsolete; # Documentation only 915 916{ # Closure 917 my $simple = 'Perl uses the more complete version of this property'; 918 my $unihan = 'Unihan properties are by default not enabled in the Perl core. Instead use CPAN: Unicode::Unihan'; 919 920 my $other_properties = 'other properties'; 921 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone"; 922 my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character."; 923 924 %why_deprecated = ( 925 'Grapheme_Link' => 'Deprecated by Unicode: Duplicates ccc=vr (Canonical_Combining_Class=Virama)', 926 'Jamo_Short_Name' => $contributory, 927 'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking', 928 'Other_Alphabetic' => $contributory, 929 'Other_Default_Ignorable_Code_Point' => $contributory, 930 'Other_Grapheme_Extend' => $contributory, 931 'Other_ID_Continue' => $contributory, 932 'Other_ID_Start' => $contributory, 933 'Other_Lowercase' => $contributory, 934 'Other_Math' => $contributory, 935 'Other_Uppercase' => $contributory, 936 'Expands_On_NFC' => $why_no_expand, 937 'Expands_On_NFD' => $why_no_expand, 938 'Expands_On_NFKC' => $why_no_expand, 939 'Expands_On_NFKD' => $why_no_expand, 940 ); 941 942 %why_suppressed = ( 943 # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which 944 # contains the same information, but without the algorithmically 945 # determinable Hangul syllables'. This file is not published, so it's 946 # existence is not noted in the comment. 947 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()', 948 949 'Indic_Matra_Category' => "Provisional", 950 'Indic_Syllabic_Category' => "Provisional", 951 952 # Don't suppress ISO_Comment, as otherwise special handling is needed 953 # to differentiate between it and gc=c, which can be written as 'isc', 954 # which is the same characters as ISO_Comment's short name. 955 956 'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()", 957 958 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()", 959 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()", 960 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()", 961 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()", 962 963 FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful', 964 ); 965 966 foreach my $property ( 967 968 # The following are suppressed because they were made contributory 969 # or deprecated by Unicode before Perl ever thought about 970 # supporting them. 971 'Jamo_Short_Name', 972 'Grapheme_Link', 973 'Expands_On_NFC', 974 'Expands_On_NFD', 975 'Expands_On_NFKC', 976 'Expands_On_NFKD', 977 978 # The following are suppressed because they have been marked 979 # as deprecated for a sufficient amount of time 980 'Other_Alphabetic', 981 'Other_Default_Ignorable_Code_Point', 982 'Other_Grapheme_Extend', 983 'Other_ID_Continue', 984 'Other_ID_Start', 985 'Other_Lowercase', 986 'Other_Math', 987 'Other_Uppercase', 988 ) { 989 $why_suppressed{$property} = $why_deprecated{$property}; 990 } 991 992 # Customize the message for all the 'Other_' properties 993 foreach my $property (keys %why_deprecated) { 994 next if (my $main_property = $property) !~ s/^Other_//; 995 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/; 996 } 997} 998 999if ($write_Unicode_deprecated_tables) { 1000 foreach my $property (keys %why_suppressed) { 1001 delete $why_suppressed{$property} if $property =~ 1002 / ^ Other | Grapheme /x; 1003 } 1004} 1005 1006if ($v_version ge 4.0.0) { 1007 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14'; 1008 if ($v_version ge 6.0.0) { 1009 $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14'; 1010 } 1011} 1012if ($v_version ge 5.2.0 && $v_version lt 6.0.0) { 1013 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed'; 1014 if ($v_version ge 6.0.0) { 1015 $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed'; 1016 } 1017} 1018 1019# Probably obsolete forever 1020if ($v_version ge v4.1.0) { 1021 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common".'; 1022} 1023if ($v_version ge v6.0.0) { 1024 $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana" (or both)'; 1025 $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"'; 1026} 1027 1028# This program can create files for enumerated-like properties, such as 1029# 'Numeric_Type'. This file would be the same format as for a string 1030# property, with a mapping from code point to its value, so you could look up, 1031# for example, the script a code point is in. But no one so far wants this 1032# mapping, or they have found another way to get it since this is a new 1033# feature. So no file is generated except if it is in this list. 1034my @output_mapped_properties = split "\n", <<END; 1035END 1036 1037# If you are using the Unihan database in a Unicode version before 5.2, you 1038# need to add the properties that you want to extract from it to this table. 1039# For your convenience, the properties in the 6.0 PropertyAliases.txt file are 1040# listed, commented out 1041my @cjk_properties = split "\n", <<'END'; 1042#cjkAccountingNumeric; kAccountingNumeric 1043#cjkOtherNumeric; kOtherNumeric 1044#cjkPrimaryNumeric; kPrimaryNumeric 1045#cjkCompatibilityVariant; kCompatibilityVariant 1046#cjkIICore ; kIICore 1047#cjkIRG_GSource; kIRG_GSource 1048#cjkIRG_HSource; kIRG_HSource 1049#cjkIRG_JSource; kIRG_JSource 1050#cjkIRG_KPSource; kIRG_KPSource 1051#cjkIRG_KSource; kIRG_KSource 1052#cjkIRG_TSource; kIRG_TSource 1053#cjkIRG_USource; kIRG_USource 1054#cjkIRG_VSource; kIRG_VSource 1055#cjkRSUnicode; kRSUnicode ; Unicode_Radical_Stroke; URS 1056END 1057 1058# Similarly for the property values. For your convenience, the lines in the 1059# 6.0 PropertyAliases.txt file are listed. Just remove the first BUT NOT both 1060# '#' marks (for Unicode versions before 5.2) 1061my @cjk_property_values = split "\n", <<'END'; 1062## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN 1063## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point> 1064## @missing: 0000..10FFFF; cjkIICore; <none> 1065## @missing: 0000..10FFFF; cjkIRG_GSource; <none> 1066## @missing: 0000..10FFFF; cjkIRG_HSource; <none> 1067## @missing: 0000..10FFFF; cjkIRG_JSource; <none> 1068## @missing: 0000..10FFFF; cjkIRG_KPSource; <none> 1069## @missing: 0000..10FFFF; cjkIRG_KSource; <none> 1070## @missing: 0000..10FFFF; cjkIRG_TSource; <none> 1071## @missing: 0000..10FFFF; cjkIRG_USource; <none> 1072## @missing: 0000..10FFFF; cjkIRG_VSource; <none> 1073## @missing: 0000..10FFFF; cjkOtherNumeric; NaN 1074## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN 1075## @missing: 0000..10FFFF; cjkRSUnicode; <none> 1076END 1077 1078# The input files don't list every code point. Those not listed are to be 1079# defaulted to some value. Below are hard-coded what those values are for 1080# non-binary properties as of 5.1. Starting in 5.0, there are 1081# machine-parsable comment lines in the files that give the defaults; so this 1082# list shouldn't have to be extended. The claim is that all missing entries 1083# for binary properties will default to 'N'. Unicode tried to change that in 1084# 5.2, but the beta period produced enough protest that they backed off. 1085# 1086# The defaults for the fields that appear in UnicodeData.txt in this hash must 1087# be in the form that it expects. The others may be synonyms. 1088my $CODE_POINT = '<code point>'; 1089my %default_mapping = ( 1090 Age => "Unassigned", 1091 # Bidi_Class => Complicated; set in code 1092 Bidi_Mirroring_Glyph => "", 1093 Block => 'No_Block', 1094 Canonical_Combining_Class => 0, 1095 Case_Folding => $CODE_POINT, 1096 Decomposition_Mapping => $CODE_POINT, 1097 Decomposition_Type => 'None', 1098 East_Asian_Width => "Neutral", 1099 FC_NFKC_Closure => $CODE_POINT, 1100 General_Category => 'Cn', 1101 Grapheme_Cluster_Break => 'Other', 1102 Hangul_Syllable_Type => 'NA', 1103 ISO_Comment => "", 1104 Jamo_Short_Name => "", 1105 Joining_Group => "No_Joining_Group", 1106 # Joining_Type => Complicated; set in code 1107 kIICore => 'N', # Is converted to binary 1108 #Line_Break => Complicated; set in code 1109 Lowercase_Mapping => $CODE_POINT, 1110 Name => "", 1111 Name_Alias => "", 1112 NFC_QC => 'Yes', 1113 NFD_QC => 'Yes', 1114 NFKC_QC => 'Yes', 1115 NFKD_QC => 'Yes', 1116 Numeric_Type => 'None', 1117 Numeric_Value => 'NaN', 1118 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown', 1119 Sentence_Break => 'Other', 1120 Simple_Case_Folding => $CODE_POINT, 1121 Simple_Lowercase_Mapping => $CODE_POINT, 1122 Simple_Titlecase_Mapping => $CODE_POINT, 1123 Simple_Uppercase_Mapping => $CODE_POINT, 1124 Titlecase_Mapping => $CODE_POINT, 1125 Unicode_1_Name => "", 1126 Unicode_Radical_Stroke => "", 1127 Uppercase_Mapping => $CODE_POINT, 1128 Word_Break => 'Other', 1129); 1130 1131# Below are files that Unicode furnishes, but this program ignores, and why. 1132# NormalizationCorrections.txt requires some more explanation. It documents 1133# the cumulative fixes to erroneous normalizations in earlier Unicode 1134# versions. Its main purpose is so that someone running on an earlier version 1135# can use this file to override what got published in that earlier release. 1136# It would be easy for mktables to read and handle this file. But all the 1137# corrections in it should already be in the other files for the release it 1138# is. To get it to actually mean something useful, someone would have to be 1139# using an earlier Unicode release, and copy it to the files for that release 1140# and recomplile. So far there has been no demand to do that, so this hasn't 1141# been implemented. 1142my %ignored_files = ( 1143 'CJKRadicals.txt' => 'Maps the kRSUnicode property values to corresponding code points', 1144 'Index.txt' => 'Alphabetical index of Unicode characters', 1145 'NamedSqProv.txt' => 'Named sequences proposed for inclusion in a later version of the Unicode Standard; if you need them now, you can append this file to F<NamedSequences.txt> and recompile perl', 1146 'NamesList.txt' => 'Annotated list of characters', 1147 'NormalizationCorrections.txt' => 'Documentation of corrections already incorporated into the Unicode data base', 1148 'Props.txt' => 'Only in very early releases; is a subset of F<PropList.txt> (which is used instead)', 1149 'ReadMe.txt' => 'Documentation', 1150 'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized. This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate Unicode data base L<http://www.unicode.org/ivd>', 1151 'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values', 1152 'USourceData.txt' => 'Documentation of status and cross reference of proposals for encoding by Unicode of Unihan characters', 1153 'USourceData.pdf' => 'Documentation of status and cross reference of proposals for encoding by Unicode of Unihan characters', 1154 'auxiliary/WordBreakTest.html' => 'Documentation of validation tests', 1155 'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests', 1156 'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests', 1157 'auxiliary/LineBreakTest.html' => 'Documentation of validation tests', 1158); 1159 1160my %skipped_files; # List of files that we skip 1161 1162### End of externally interesting definitions, except for @input_file_objects 1163 1164my $HEADER=<<"EOF"; 1165# !!!!!!! DO NOT EDIT THIS FILE !!!!!!! 1166# This file is machine-generated by $0 from the Unicode 1167# database, Version $string_version. Any changes made here will be lost! 1168EOF 1169 1170my $INTERNAL_ONLY_HEADER = <<"EOF"; 1171 1172# !!!!!!! INTERNAL PERL USE ONLY !!!!!!! 1173# This file is for internal use by core Perl only. The format and even the 1174# name or existence of this file are subject to change without notice. Don't 1175# use it directly. 1176EOF 1177 1178my $DEVELOPMENT_ONLY=<<"EOF"; 1179# !!!!!!! DEVELOPMENT USE ONLY !!!!!!! 1180# This file contains information artificially constrained to code points 1181# present in Unicode release $string_compare_versions. 1182# IT CANNOT BE RELIED ON. It is for use during development only and should 1183# not be used for production. 1184 1185EOF 1186 1187my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF"; 1188my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING; 1189my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1; 1190 1191# Matches legal code point. 4-6 hex numbers, If there are 6, the first 1192# two must be 10; if there are 5, the first must not be a 0. Written this way 1193# to decrease backtracking. The first regex allows the code point to be at 1194# the end of a word, but to work properly, the word shouldn't end with a valid 1195# hex character. The second one won't match a code point at the end of a 1196# word, and doesn't have the run-on issue 1197my $run_on_code_point_re = 1198 qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x; 1199my $code_point_re = qr/\b$run_on_code_point_re/; 1200 1201# This matches the beginning of the line in the Unicode db files that give the 1202# defaults for code points not listed (i.e., missing) in the file. The code 1203# depends on this ending with a semi-colon, so it can assume it is a valid 1204# field when the line is split() by semi-colons 1205my $missing_defaults_prefix = 1206 qr/^#\s+\@missing:\s+0000\.\.$MAX_UNICODE_CODEPOINT_STRING\s*;/; 1207 1208# Property types. Unicode has more types, but these are sufficient for our 1209# purposes. 1210my $UNKNOWN = -1; # initialized to illegal value 1211my $NON_STRING = 1; # Either binary or enum 1212my $BINARY = 2; 1213my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal 1214 # tables, additional true and false tables are 1215 # generated so that false is anything matching the 1216 # default value, and true is everything else. 1217my $ENUM = 4; # Include catalog 1218my $STRING = 5; # Anything else: string or misc 1219 1220# Some input files have lines that give default values for code points not 1221# contained in the file. Sometimes these should be ignored. 1222my $NO_DEFAULTS = 0; # Must evaluate to false 1223my $NOT_IGNORED = 1; 1224my $IGNORED = 2; 1225 1226# Range types. Each range has a type. Most ranges are type 0, for normal, 1227# and will appear in the main body of the tables in the output files, but 1228# there are other types of ranges as well, listed below, that are specially 1229# handled. There are pseudo-types as well that will never be stored as a 1230# type, but will affect the calculation of the type. 1231 1232# 0 is for normal, non-specials 1233my $MULTI_CP = 1; # Sequence of more than code point 1234my $HANGUL_SYLLABLE = 2; 1235my $CP_IN_NAME = 3; # The NAME contains the code point appended to it. 1236my $NULL = 4; # The map is to the null string; utf8.c can't 1237 # handle these, nor is there an accepted syntax 1238 # for them in \p{} constructs 1239my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would 1240 # otherwise be $MULTI_CP type are instead type 0 1241 1242# process_generic_property_file() can accept certain overrides in its input. 1243# Each of these must begin AND end with $CMD_DELIM. 1244my $CMD_DELIM = "\a"; 1245my $REPLACE_CMD = 'replace'; # Override the Replace 1246my $MAP_TYPE_CMD = 'map_type'; # Override the Type 1247 1248my $NO = 0; 1249my $YES = 1; 1250 1251# Values for the Replace argument to add_range. 1252# $NO # Don't replace; add only the code points not 1253 # already present. 1254my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in 1255 # the comments at the subroutine definition. 1256my $UNCONDITIONALLY = 2; # Replace without conditions. 1257my $MULTIPLE_BEFORE = 4; # Don't replace, but add a duplicate record if 1258 # already there 1259my $MULTIPLE_AFTER = 5; # Don't replace, but add a duplicate record if 1260 # already there 1261my $CROAK = 6; # Die with an error if is already there 1262 1263# Flags to give property statuses. The phrases are to remind maintainers that 1264# if the flag is changed, the indefinite article referring to it in the 1265# documentation may need to be as well. 1266my $NORMAL = ""; 1267my $DEPRECATED = 'D'; 1268my $a_bold_deprecated = "a 'B<$DEPRECATED>'"; 1269my $A_bold_deprecated = "A 'B<$DEPRECATED>'"; 1270my $DISCOURAGED = 'X'; 1271my $a_bold_discouraged = "an 'B<$DISCOURAGED>'"; 1272my $A_bold_discouraged = "An 'B<$DISCOURAGED>'"; 1273my $STRICTER = 'T'; 1274my $a_bold_stricter = "a 'B<$STRICTER>'"; 1275my $A_bold_stricter = "A 'B<$STRICTER>'"; 1276my $STABILIZED = 'S'; 1277my $a_bold_stabilized = "an 'B<$STABILIZED>'"; 1278my $A_bold_stabilized = "An 'B<$STABILIZED>'"; 1279my $OBSOLETE = 'O'; 1280my $a_bold_obsolete = "an 'B<$OBSOLETE>'"; 1281my $A_bold_obsolete = "An 'B<$OBSOLETE>'"; 1282 1283my %status_past_participles = ( 1284 $DISCOURAGED => 'discouraged', 1285 $STABILIZED => 'stabilized', 1286 $OBSOLETE => 'obsolete', 1287 $DEPRECATED => 'deprecated', 1288); 1289 1290# Table fates. These are somewhat ordered, so that fates < $MAP_PROXIED should be 1291# externally documented. 1292my $ORDINARY = 0; # The normal fate. 1293my $MAP_PROXIED = 1; # The map table for the property isn't written out, 1294 # but there is a file written that can be used to 1295 # reconstruct this table 1296my $INTERNAL_ONLY = 2; # The file for this table is written out, but it is 1297 # for Perl's internal use only 1298my $SUPPRESSED = 3; # The file for this table is not written out, and as a 1299 # result, we don't bother to do many computations on 1300 # it. 1301my $PLACEHOLDER = 4; # Like $SUPPRESSED, but we go through all the 1302 # computations anyway, as the values are needed for 1303 # things to work. This happens when we have Perl 1304 # extensions that depend on Unicode tables that 1305 # wouldn't normally be in a given Unicode version. 1306 1307# The format of the values of the tables: 1308my $EMPTY_FORMAT = ""; 1309my $BINARY_FORMAT = 'b'; 1310my $DECIMAL_FORMAT = 'd'; 1311my $FLOAT_FORMAT = 'f'; 1312my $INTEGER_FORMAT = 'i'; 1313my $HEX_FORMAT = 'x'; 1314my $RATIONAL_FORMAT = 'r'; 1315my $STRING_FORMAT = 's'; 1316my $ADJUST_FORMAT = 'a'; 1317my $DECOMP_STRING_FORMAT = 'c'; 1318my $STRING_WHITE_SPACE_LIST = 'sw'; 1319 1320my %map_table_formats = ( 1321 $BINARY_FORMAT => 'binary', 1322 $DECIMAL_FORMAT => 'single decimal digit', 1323 $FLOAT_FORMAT => 'floating point number', 1324 $INTEGER_FORMAT => 'integer', 1325 $HEX_FORMAT => 'non-negative hex whole number; a code point', 1326 $RATIONAL_FORMAT => 'rational: an integer or a fraction', 1327 $STRING_FORMAT => 'string', 1328 $ADJUST_FORMAT => 'some entries need adjustment', 1329 $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping', 1330 $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators' 1331); 1332 1333# Unicode didn't put such derived files in a separate directory at first. 1334my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : ""; 1335my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : ""; 1336my $AUXILIARY = 'auxiliary'; 1337 1338# Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl 1339# and into UCD.pl for the use of UCD.pm 1340my %loose_to_file_of; # loosely maps table names to their respective 1341 # files 1342my %stricter_to_file_of; # same; but for stricter mapping. 1343my %loose_property_to_file_of; # Maps a loose property name to its map file 1344my %file_to_swash_name; # Maps the file name to its corresponding key name 1345 # in the hash %utf8::SwashInfo 1346my %nv_floating_to_rational; # maps numeric values floating point numbers to 1347 # their rational equivalent 1348my %loose_property_name_of; # Loosely maps (non_string) property names to 1349 # standard form 1350my %string_property_loose_to_name; # Same, for string properties. 1351my %loose_defaults; # keys are of form "prop=value", where 'prop' is 1352 # the property name in standard loose form, and 1353 # 'value' is the default value for that property, 1354 # also in standard loose form. 1355my %loose_to_standard_value; # loosely maps table names to the canonical 1356 # alias for them 1357my %ambiguous_names; # keys are alias names (in standard form) that 1358 # have more than one possible meaning. 1359my %prop_aliases; # Keys are standard property name; values are each 1360 # one's aliases 1361my %prop_value_aliases; # Keys of top level are standard property name; 1362 # values are keys to another hash, Each one is 1363 # one of the property's values, in standard form. 1364 # The values are that prop-val's aliases. 1365my %ucd_pod; # Holds entries that will go into the UCD section of the pod 1366 1367# Most properties are immune to caseless matching, otherwise you would get 1368# nonsensical results, as properties are a function of a code point, not 1369# everything that is caselessly equivalent to that code point. For example, 1370# Changes_When_Case_Folded('s') should be false, whereas caselessly it would 1371# be true because 's' and 'S' are equivalent caselessly. However, 1372# traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we 1373# extend that concept to those very few properties that are like this. Each 1374# such property will match the full range caselessly. They are hard-coded in 1375# the program; it's not worth trying to make it general as it's extremely 1376# unlikely that they will ever change. 1377my %caseless_equivalent_to; 1378 1379# These constants names and values were taken from the Unicode standard, 1380# version 5.1, section 3.12. They are used in conjunction with Hangul 1381# syllables. The '_string' versions are so generated tables can retain the 1382# hex format, which is the more familiar value 1383my $SBase_string = "0xAC00"; 1384my $SBase = CORE::hex $SBase_string; 1385my $LBase_string = "0x1100"; 1386my $LBase = CORE::hex $LBase_string; 1387my $VBase_string = "0x1161"; 1388my $VBase = CORE::hex $VBase_string; 1389my $TBase_string = "0x11A7"; 1390my $TBase = CORE::hex $TBase_string; 1391my $SCount = 11172; 1392my $LCount = 19; 1393my $VCount = 21; 1394my $TCount = 28; 1395my $NCount = $VCount * $TCount; 1396 1397# For Hangul syllables; These store the numbers from Jamo.txt in conjunction 1398# with the above published constants. 1399my %Jamo; 1400my %Jamo_L; # Leading consonants 1401my %Jamo_V; # Vowels 1402my %Jamo_T; # Trailing consonants 1403 1404# For code points whose name contains its ordinal as a '-ABCD' suffix. 1405# The key is the base name of the code point, and the value is an 1406# array giving all the ranges that use this base name. Each range 1407# is actually a hash giving the 'low' and 'high' values of it. 1408my %names_ending_in_code_point; 1409my %loose_names_ending_in_code_point; # Same as above, but has blanks, dashes 1410 # removed from the names 1411# Inverse mapping. The list of ranges that have these kinds of 1412# names. Each element contains the low, high, and base names in an 1413# anonymous hash. 1414my @code_points_ending_in_code_point; 1415 1416# To hold Unicode's normalization test suite 1417my @normalization_tests; 1418 1419# Boolean: does this Unicode version have the hangul syllables, and are we 1420# writing out a table for them? 1421my $has_hangul_syllables = 0; 1422 1423# Does this Unicode version have code points whose names end in their 1424# respective code points, and are we writing out a table for them? 0 for no; 1425# otherwise points to first property that a table is needed for them, so that 1426# if multiple tables are needed, we don't create duplicates 1427my $needing_code_points_ending_in_code_point = 0; 1428 1429my @backslash_X_tests; # List of tests read in for testing \X 1430my @unhandled_properties; # Will contain a list of properties found in 1431 # the input that we didn't process. 1432my @match_properties; # Properties that have match tables, to be 1433 # listed in the pod 1434my @map_properties; # Properties that get map files written 1435my @named_sequences; # NamedSequences.txt contents. 1436my %potential_files; # Generated list of all .txt files in the directory 1437 # structure so we can warn if something is being 1438 # ignored. 1439my @files_actually_output; # List of files we generated. 1440my @more_Names; # Some code point names are compound; this is used 1441 # to store the extra components of them. 1442my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at 1443 # the minimum before we consider it equivalent to a 1444 # candidate rational 1445my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms 1446 1447# These store references to certain commonly used property objects 1448my $gc; 1449my $perl; 1450my $block; 1451my $perl_charname; 1452my $print; 1453my $Any; 1454my $script; 1455 1456# Are there conflicting names because of beginning with 'In_', or 'Is_' 1457my $has_In_conflicts = 0; 1458my $has_Is_conflicts = 0; 1459 1460sub internal_file_to_platform ($) { 1461 # Convert our file paths which have '/' separators to those of the 1462 # platform. 1463 1464 my $file = shift; 1465 return undef unless defined $file; 1466 1467 return File::Spec->join(split '/', $file); 1468} 1469 1470sub file_exists ($) { # platform independent '-e'. This program internally 1471 # uses slash as a path separator. 1472 my $file = shift; 1473 return 0 if ! defined $file; 1474 return -e internal_file_to_platform($file); 1475} 1476 1477sub objaddr($) { 1478 # Returns the address of the blessed input object. 1479 # It doesn't check for blessedness because that would do a string eval 1480 # every call, and the program is structured so that this is never called 1481 # for a non-blessed object. 1482 1483 no overloading; # If overloaded, numifying below won't work. 1484 1485 # Numifying a ref gives its address. 1486 return pack 'J', $_[0]; 1487} 1488 1489# These are used only if $annotate is true. 1490# The entire range of Unicode characters is examined to populate these 1491# after all the input has been processed. But most can be skipped, as they 1492# have the same descriptive phrases, such as being unassigned 1493my @viacode; # Contains the 1 million character names 1494my @printable; # boolean: And are those characters printable? 1495my @annotate_char_type; # Contains a type of those characters, specifically 1496 # for the purposes of annotation. 1497my $annotate_ranges; # A map of ranges of code points that have the same 1498 # name for the purposes of annotation. They map to the 1499 # upper edge of the range, so that the end point can 1500 # be immediately found. This is used to skip ahead to 1501 # the end of a range, and avoid processing each 1502 # individual code point in it. 1503my $unassigned_sans_noncharacters; # A Range_List of the unassigned 1504 # characters, but excluding those which are 1505 # also noncharacter code points 1506 1507# The annotation types are an extension of the regular range types, though 1508# some of the latter are folded into one. Make the new types negative to 1509# avoid conflicting with the regular types 1510my $SURROGATE_TYPE = -1; 1511my $UNASSIGNED_TYPE = -2; 1512my $PRIVATE_USE_TYPE = -3; 1513my $NONCHARACTER_TYPE = -4; 1514my $CONTROL_TYPE = -5; 1515my $UNKNOWN_TYPE = -6; # Used only if there is a bug in this program 1516 1517sub populate_char_info ($) { 1518 # Used only with the $annotate option. Populates the arrays with the 1519 # input code point's info that are needed for outputting more detailed 1520 # comments. If calling context wants a return, it is the end point of 1521 # any contiguous range of characters that share essentially the same info 1522 1523 my $i = shift; 1524 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 1525 1526 $viacode[$i] = $perl_charname->value_of($i) || ""; 1527 1528 # A character is generally printable if Unicode says it is, 1529 # but below we make sure that most Unicode general category 'C' types 1530 # aren't. 1531 $printable[$i] = $print->contains($i); 1532 1533 $annotate_char_type[$i] = $perl_charname->type_of($i) || 0; 1534 1535 # Only these two regular types are treated specially for annotations 1536 # purposes 1537 $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME 1538 && $annotate_char_type[$i] != $HANGUL_SYLLABLE; 1539 1540 # Give a generic name to all code points that don't have a real name. 1541 # We output ranges, if applicable, for these. Also calculate the end 1542 # point of the range. 1543 my $end; 1544 if (! $viacode[$i]) { 1545 my $nonchar; 1546 if ($gc-> table('Private_use')->contains($i)) { 1547 $viacode[$i] = 'Private Use'; 1548 $annotate_char_type[$i] = $PRIVATE_USE_TYPE; 1549 $printable[$i] = 0; 1550 $end = $gc->table('Private_Use')->containing_range($i)->end; 1551 } 1552 elsif ((defined ($nonchar = 1553 Property::property_ref('Noncharacter_Code_Point')) 1554 && $nonchar->table('Y')->contains($i))) 1555 { 1556 $viacode[$i] = 'Noncharacter'; 1557 $annotate_char_type[$i] = $NONCHARACTER_TYPE; 1558 $printable[$i] = 0; 1559 $end = property_ref('Noncharacter_Code_Point')->table('Y')-> 1560 containing_range($i)->end; 1561 } 1562 elsif ($gc-> table('Control')->contains($i)) { 1563 $viacode[$i] = property_ref('Name_Alias')->value_of($i) || 'Control'; 1564 $annotate_char_type[$i] = $CONTROL_TYPE; 1565 $printable[$i] = 0; 1566 } 1567 elsif ($gc-> table('Unassigned')->contains($i)) { 1568 $annotate_char_type[$i] = $UNASSIGNED_TYPE; 1569 $printable[$i] = 0; 1570 if ($v_version lt v2.0.0) { # No blocks in earliest releases 1571 $viacode[$i] = 'Unassigned'; 1572 $end = $gc-> table('Unassigned')->containing_range($i)->end; 1573 } 1574 else { 1575 $viacode[$i] = 'Unassigned, block=' . $block-> value_of($i); 1576 1577 # Because we name the unassigned by the blocks they are in, it 1578 # can't go past the end of that block, and it also can't go 1579 # past the unassigned range it is in. The special table makes 1580 # sure that the non-characters, which are unassigned, are 1581 # separated out. 1582 $end = min($block->containing_range($i)->end, 1583 $unassigned_sans_noncharacters-> 1584 containing_range($i)->end); 1585 } 1586 } 1587 elsif ($v_version lt v2.0.0) { # No surrogates in earliest releases 1588 $viacode[$i] = $gc->value_of($i); 1589 $annotate_char_type[$i] = $UNKNOWN_TYPE; 1590 $printable[$i] = 0; 1591 } 1592 elsif ($gc-> table('Surrogate')->contains($i)) { 1593 $viacode[$i] = 'Surrogate'; 1594 $annotate_char_type[$i] = $SURROGATE_TYPE; 1595 $printable[$i] = 0; 1596 $end = $gc->table('Surrogate')->containing_range($i)->end; 1597 } 1598 else { 1599 Carp::my_carp_bug("Can't figure out how to annotate " 1600 . sprintf("U+%04X", $i) 1601 . ". Proceeding anyway."); 1602 $viacode[$i] = 'UNKNOWN'; 1603 $annotate_char_type[$i] = $UNKNOWN_TYPE; 1604 $printable[$i] = 0; 1605 } 1606 } 1607 1608 # Here, has a name, but if it's one in which the code point number is 1609 # appended to the name, do that. 1610 elsif ($annotate_char_type[$i] == $CP_IN_NAME) { 1611 $viacode[$i] .= sprintf("-%04X", $i); 1612 $end = $perl_charname->containing_range($i)->end; 1613 } 1614 1615 # And here, has a name, but if it's a hangul syllable one, replace it with 1616 # the correct name from the Unicode algorithm 1617 elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) { 1618 use integer; 1619 my $SIndex = $i - $SBase; 1620 my $L = $LBase + $SIndex / $NCount; 1621 my $V = $VBase + ($SIndex % $NCount) / $TCount; 1622 my $T = $TBase + $SIndex % $TCount; 1623 $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}"; 1624 $viacode[$i] .= $Jamo{$T} if $T != $TBase; 1625 $end = $perl_charname->containing_range($i)->end; 1626 } 1627 1628 return if ! defined wantarray; 1629 return $i if ! defined $end; # If not a range, return the input 1630 1631 # Save this whole range so can find the end point quickly 1632 $annotate_ranges->add_map($i, $end, $end); 1633 1634 return $end; 1635} 1636 1637# Commented code below should work on Perl 5.8. 1638## This 'require' doesn't necessarily work in miniperl, and even if it does, 1639## the native perl version of it (which is what would operate under miniperl) 1640## is extremely slow, as it does a string eval every call. 1641#my $has_fast_scalar_util = $ !~ /miniperl/ 1642# && defined eval "require Scalar::Util"; 1643# 1644#sub objaddr($) { 1645# # Returns the address of the blessed input object. Uses the XS version if 1646# # available. It doesn't check for blessedness because that would do a 1647# # string eval every call, and the program is structured so that this is 1648# # never called for a non-blessed object. 1649# 1650# return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util; 1651# 1652# # Check at least that is a ref. 1653# my $pkg = ref($_[0]) or return undef; 1654# 1655# # Change to a fake package to defeat any overloaded stringify 1656# bless $_[0], 'main::Fake'; 1657# 1658# # Numifying a ref gives its address. 1659# my $addr = pack 'J', $_[0]; 1660# 1661# # Return to original class 1662# bless $_[0], $pkg; 1663# return $addr; 1664#} 1665 1666sub max ($$) { 1667 my $a = shift; 1668 my $b = shift; 1669 return $a if $a >= $b; 1670 return $b; 1671} 1672 1673sub min ($$) { 1674 my $a = shift; 1675 my $b = shift; 1676 return $a if $a <= $b; 1677 return $b; 1678} 1679 1680sub clarify_number ($) { 1681 # This returns the input number with underscores inserted every 3 digits 1682 # in large (5 digits or more) numbers. Input must be entirely digits, not 1683 # checked. 1684 1685 my $number = shift; 1686 my $pos = length($number) - 3; 1687 return $number if $pos <= 1; 1688 while ($pos > 0) { 1689 substr($number, $pos, 0) = '_'; 1690 $pos -= 3; 1691 } 1692 return $number; 1693} 1694 1695 1696package Carp; 1697 1698# These routines give a uniform treatment of messages in this program. They 1699# are placed in the Carp package to cause the stack trace to not include them, 1700# although an alternative would be to use another package and set @CARP_NOT 1701# for it. 1702 1703our $Verbose = 1 if main::DEBUG; # Useful info when debugging 1704 1705# This is a work-around suggested by Nicholas Clark to fix a problem with Carp 1706# and overload trying to load Scalar:Util under miniperl. See 1707# http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html 1708undef $overload::VERSION; 1709 1710sub my_carp { 1711 my $message = shift || ""; 1712 my $nofold = shift || 0; 1713 1714 if ($message) { 1715 $message = main::join_lines($message); 1716 $message =~ s/^$0: *//; # Remove initial program name 1717 $message =~ s/[.;,]+$//; # Remove certain ending punctuation 1718 $message = "\n$0: $message;"; 1719 1720 # Fold the message with program name, semi-colon end punctuation 1721 # (which looks good with the message that carp appends to it), and a 1722 # hanging indent for continuation lines. 1723 $message = main::simple_fold($message, "", 4) unless $nofold; 1724 $message =~ s/\n$//; # Remove the trailing nl so what carp 1725 # appends is to the same line 1726 } 1727 1728 return $message if defined wantarray; # If a caller just wants the msg 1729 1730 carp $message; 1731 return; 1732} 1733 1734sub my_carp_bug { 1735 # This is called when it is clear that the problem is caused by a bug in 1736 # this program. 1737 1738 my $message = shift; 1739 $message =~ s/^$0: *//; 1740 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message"); 1741 carp $message; 1742 return; 1743} 1744 1745sub carp_too_few_args { 1746 if (@_ != 2) { 1747 my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'. No action taken."); 1748 return; 1749 } 1750 1751 my $args_ref = shift; 1752 my $count = shift; 1753 1754 my_carp_bug("Need at least $count arguments to " 1755 . (caller 1)[3] 1756 . ". Instead got: '" 1757 . join ', ', @$args_ref 1758 . "'. No action taken."); 1759 return; 1760} 1761 1762sub carp_extra_args { 1763 my $args_ref = shift; 1764 my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . "); Extras ignored.") if @_; 1765 1766 unless (ref $args_ref) { 1767 my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments."); 1768 return; 1769 } 1770 my ($package, $file, $line) = caller; 1771 my $subroutine = (caller 1)[3]; 1772 1773 my $list; 1774 if (ref $args_ref eq 'HASH') { 1775 foreach my $key (keys %$args_ref) { 1776 $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key}; 1777 } 1778 $list = join ', ', each %{$args_ref}; 1779 } 1780 elsif (ref $args_ref eq 'ARRAY') { 1781 foreach my $arg (@$args_ref) { 1782 $arg = $UNDEF unless defined $arg; 1783 } 1784 $list = join ', ', @$args_ref; 1785 } 1786 else { 1787 my_carp_bug("Can't cope with ref " 1788 . ref($args_ref) 1789 . " . argument to 'carp_extra_args'. Not checking arguments."); 1790 return; 1791 } 1792 1793 my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped."); 1794 return; 1795} 1796 1797package main; 1798 1799{ # Closure 1800 1801 # This program uses the inside-out method for objects, as recommended in 1802 # "Perl Best Practices". This closure aids in generating those. There 1803 # are two routines. setup_package() is called once per package to set 1804 # things up, and then set_access() is called for each hash representing a 1805 # field in the object. These routines arrange for the object to be 1806 # properly destroyed when no longer used, and for standard accessor 1807 # functions to be generated. If you need more complex accessors, just 1808 # write your own and leave those accesses out of the call to set_access(). 1809 # More details below. 1810 1811 my %constructor_fields; # fields that are to be used in constructors; see 1812 # below 1813 1814 # The values of this hash will be the package names as keys to other 1815 # hashes containing the name of each field in the package as keys, and 1816 # references to their respective hashes as values. 1817 my %package_fields; 1818 1819 sub setup_package { 1820 # Sets up the package, creating standard DESTROY and dump methods 1821 # (unless already defined). The dump method is used in debugging by 1822 # simple_dumper(). 1823 # The optional parameters are: 1824 # a) a reference to a hash, that gets populated by later 1825 # set_access() calls with one of the accesses being 1826 # 'constructor'. The caller can then refer to this, but it is 1827 # not otherwise used by these two routines. 1828 # b) a reference to a callback routine to call during destruction 1829 # of the object, before any fields are actually destroyed 1830 1831 my %args = @_; 1832 my $constructor_ref = delete $args{'Constructor_Fields'}; 1833 my $destroy_callback = delete $args{'Destroy_Callback'}; 1834 Carp::carp_extra_args(\@_) if main::DEBUG && %args; 1835 1836 my %fields; 1837 my $package = (caller)[0]; 1838 1839 $package_fields{$package} = \%fields; 1840 $constructor_fields{$package} = $constructor_ref; 1841 1842 unless ($package->can('DESTROY')) { 1843 my $destroy_name = "${package}::DESTROY"; 1844 no strict "refs"; 1845 1846 # Use typeglob to give the anonymous subroutine the name we want 1847 *$destroy_name = sub { 1848 my $self = shift; 1849 my $addr = do { no overloading; pack 'J', $self; }; 1850 1851 $self->$destroy_callback if $destroy_callback; 1852 foreach my $field (keys %{$package_fields{$package}}) { 1853 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n"; 1854 delete $package_fields{$package}{$field}{$addr}; 1855 } 1856 return; 1857 } 1858 } 1859 1860 unless ($package->can('dump')) { 1861 my $dump_name = "${package}::dump"; 1862 no strict "refs"; 1863 *$dump_name = sub { 1864 my $self = shift; 1865 return dump_inside_out($self, $package_fields{$package}, @_); 1866 } 1867 } 1868 return; 1869 } 1870 1871 sub set_access { 1872 # Arrange for the input field to be garbage collected when no longer 1873 # needed. Also, creates standard accessor functions for the field 1874 # based on the optional parameters-- none if none of these parameters: 1875 # 'addable' creates an 'add_NAME()' accessor function. 1876 # 'readable' or 'readable_array' creates a 'NAME()' accessor 1877 # function. 1878 # 'settable' creates a 'set_NAME()' accessor function. 1879 # 'constructor' doesn't create an accessor function, but adds the 1880 # field to the hash that was previously passed to 1881 # setup_package(); 1882 # Any of the accesses can be abbreviated down, so that 'a', 'ad', 1883 # 'add' etc. all mean 'addable'. 1884 # The read accessor function will work on both array and scalar 1885 # values. If another accessor in the parameter list is 'a', the read 1886 # access assumes an array. You can also force it to be array access 1887 # by specifying 'readable_array' instead of 'readable' 1888 # 1889 # A sort-of 'protected' access can be set-up by preceding the addable, 1890 # readable or settable with some initial portion of 'protected_' (but, 1891 # the underscore is required), like 'p_a', 'pro_set', etc. The 1892 # "protection" is only by convention. All that happens is that the 1893 # accessor functions' names begin with an underscore. So instead of 1894 # calling set_foo, the call is _set_foo. (Real protection could be 1895 # accomplished by having a new subroutine, end_package, called at the 1896 # end of each package, and then storing the __LINE__ ranges and 1897 # checking them on every accessor. But that is way overkill.) 1898 1899 # We create anonymous subroutines as the accessors and then use 1900 # typeglobs to assign them to the proper package and name 1901 1902 my $name = shift; # Name of the field 1903 my $field = shift; # Reference to the inside-out hash containing the 1904 # field 1905 1906 my $package = (caller)[0]; 1907 1908 if (! exists $package_fields{$package}) { 1909 croak "$0: Must call 'setup_package' before 'set_access'"; 1910 } 1911 1912 # Stash the field so DESTROY can get it. 1913 $package_fields{$package}{$name} = $field; 1914 1915 # Remaining arguments are the accessors. For each... 1916 foreach my $access (@_) { 1917 my $access = lc $access; 1918 1919 my $protected = ""; 1920 1921 # Match the input as far as it goes. 1922 if ($access =~ /^(p[^_]*)_/) { 1923 $protected = $1; 1924 if (substr('protected_', 0, length $protected) 1925 eq $protected) 1926 { 1927 1928 # Add 1 for the underscore not included in $protected 1929 $access = substr($access, length($protected) + 1); 1930 $protected = '_'; 1931 } 1932 else { 1933 $protected = ""; 1934 } 1935 } 1936 1937 if (substr('addable', 0, length $access) eq $access) { 1938 my $subname = "${package}::${protected}add_$name"; 1939 no strict "refs"; 1940 1941 # add_ accessor. Don't add if already there, which we 1942 # determine using 'eq' for scalars and '==' otherwise. 1943 *$subname = sub { 1944 use strict "refs"; 1945 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2; 1946 my $self = shift; 1947 my $value = shift; 1948 my $addr = do { no overloading; pack 'J', $self; }; 1949 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 1950 if (ref $value) { 1951 return if grep { $value == $_ } @{$field->{$addr}}; 1952 } 1953 else { 1954 return if grep { $value eq $_ } @{$field->{$addr}}; 1955 } 1956 push @{$field->{$addr}}, $value; 1957 return; 1958 } 1959 } 1960 elsif (substr('constructor', 0, length $access) eq $access) { 1961 if ($protected) { 1962 Carp::my_carp_bug("Can't set-up 'protected' constructors") 1963 } 1964 else { 1965 $constructor_fields{$package}{$name} = $field; 1966 } 1967 } 1968 elsif (substr('readable_array', 0, length $access) eq $access) { 1969 1970 # Here has read access. If one of the other parameters for 1971 # access is array, or this one specifies array (by being more 1972 # than just 'readable_'), then create a subroutine that 1973 # assumes the data is an array. Otherwise just a scalar 1974 my $subname = "${package}::${protected}$name"; 1975 if (grep { /^a/i } @_ 1976 or length($access) > length('readable_')) 1977 { 1978 no strict "refs"; 1979 *$subname = sub { 1980 use strict "refs"; 1981 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1; 1982 my $addr = do { no overloading; pack 'J', $_[0]; }; 1983 if (ref $field->{$addr} ne 'ARRAY') { 1984 my $type = ref $field->{$addr}; 1985 $type = 'scalar' unless $type; 1986 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems."); 1987 return; 1988 } 1989 return scalar @{$field->{$addr}} unless wantarray; 1990 1991 # Make a copy; had problems with caller modifying the 1992 # original otherwise 1993 my @return = @{$field->{$addr}}; 1994 return @return; 1995 } 1996 } 1997 else { 1998 1999 # Here not an array value, a simpler function. 2000 no strict "refs"; 2001 *$subname = sub { 2002 use strict "refs"; 2003 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1; 2004 no overloading; 2005 return $field->{pack 'J', $_[0]}; 2006 } 2007 } 2008 } 2009 elsif (substr('settable', 0, length $access) eq $access) { 2010 my $subname = "${package}::${protected}set_$name"; 2011 no strict "refs"; 2012 *$subname = sub { 2013 use strict "refs"; 2014 if (main::DEBUG) { 2015 return Carp::carp_too_few_args(\@_, 2) if @_ < 2; 2016 Carp::carp_extra_args(\@_) if @_ > 2; 2017 } 2018 # $self is $_[0]; $value is $_[1] 2019 no overloading; 2020 $field->{pack 'J', $_[0]} = $_[1]; 2021 return; 2022 } 2023 } 2024 else { 2025 Carp::my_carp_bug("Unknown accessor type $access. No accessor set."); 2026 } 2027 } 2028 return; 2029 } 2030} 2031 2032package Input_file; 2033 2034# All input files use this object, which stores various attributes about them, 2035# and provides for convenient, uniform handling. The run method wraps the 2036# processing. It handles all the bookkeeping of opening, reading, and closing 2037# the file, returning only significant input lines. 2038# 2039# Each object gets a handler which processes the body of the file, and is 2040# called by run(). Most should use the generic, default handler, which has 2041# code scrubbed to handle things you might not expect. A handler should 2042# basically be a while(next_line()) {...} loop. 2043# 2044# You can also set up handlers to 2045# 1) call before the first line is read, for pre processing 2046# 2) call to adjust each line of the input before the main handler gets them 2047# 3) call upon EOF before the main handler exits its loop 2048# 4) call at the end, for post processing 2049# 2050# $_ is used to store the input line, and is to be filtered by the 2051# each_line_handler()s. So, if the format of the line is not in the desired 2052# format for the main handler, these are used to do that adjusting. They can 2053# be stacked (by enclosing them in an [ anonymous array ] in the constructor, 2054# so the $_ output of one is used as the input to the next. None of the other 2055# handlers are stackable, but could easily be changed to be so. 2056# 2057# Most of the handlers can call insert_lines() or insert_adjusted_lines() 2058# which insert the parameters as lines to be processed before the next input 2059# file line is read. This allows the EOF handler to flush buffers, for 2060# example. The difference between the two routines is that the lines inserted 2061# by insert_lines() are subjected to the each_line_handler()s. (So if you 2062# called it from such a handler, you would get infinite recursion.) Lines 2063# inserted by insert_adjusted_lines() go directly to the main handler without 2064# any adjustments. If the post-processing handler calls any of these, there 2065# will be no effect. Some error checking for these conditions could be added, 2066# but it hasn't been done. 2067# 2068# carp_bad_line() should be called to warn of bad input lines, which clears $_ 2069# to prevent further processing of the line. This routine will output the 2070# message as a warning once, and then keep a count of the lines that have the 2071# same message, and output that count at the end of the file's processing. 2072# This keeps the number of messages down to a manageable amount. 2073# 2074# get_missings() should be called to retrieve any @missing input lines. 2075# Messages will be raised if this isn't done if the options aren't to ignore 2076# missings. 2077 2078sub trace { return main::trace(@_); } 2079 2080{ # Closure 2081 # Keep track of fields that are to be put into the constructor. 2082 my %constructor_fields; 2083 2084 main::setup_package(Constructor_Fields => \%constructor_fields); 2085 2086 my %file; # Input file name, required 2087 main::set_access('file', \%file, qw{ c r }); 2088 2089 my %first_released; # Unicode version file was first released in, required 2090 main::set_access('first_released', \%first_released, qw{ c r }); 2091 2092 my %handler; # Subroutine to process the input file, defaults to 2093 # 'process_generic_property_file' 2094 main::set_access('handler', \%handler, qw{ c }); 2095 2096 my %property; 2097 # name of property this file is for. defaults to none, meaning not 2098 # applicable, or is otherwise determinable, for example, from each line. 2099 main::set_access('property', \%property, qw{ c r }); 2100 2101 my %optional; 2102 # If this is true, the file is optional. If not present, no warning is 2103 # output. If it is present, the string given by this parameter is 2104 # evaluated, and if false the file is not processed. 2105 main::set_access('optional', \%optional, 'c', 'r'); 2106 2107 my %non_skip; 2108 # This is used for debugging, to skip processing of all but a few input 2109 # files. Add 'non_skip => 1' to the constructor for those files you want 2110 # processed when you set the $debug_skip global. 2111 main::set_access('non_skip', \%non_skip, 'c'); 2112 2113 my %skip; 2114 # This is used to skip processing of this input file semi-permanently, 2115 # when it evaluates to true. The value should be the reason the file is 2116 # being skipped. It is used for files that we aren't planning to process 2117 # anytime soon, but want to allow to be in the directory and not raise a 2118 # message that we are not handling. Mostly for test files. This is in 2119 # contrast to the non_skip element, which is supposed to be used very 2120 # temporarily for debugging. Sets 'optional' to 1. Also, files that we 2121 # pretty much will never look at can be placed in the global 2122 # %ignored_files instead. Ones used here will be added to %skipped files 2123 main::set_access('skip', \%skip, 'c'); 2124 2125 my %each_line_handler; 2126 # list of subroutines to look at and filter each non-comment line in the 2127 # file. defaults to none. The subroutines are called in order, each is 2128 # to adjust $_ for the next one, and the final one adjusts it for 2129 # 'handler' 2130 main::set_access('each_line_handler', \%each_line_handler, 'c'); 2131 2132 my %has_missings_defaults; 2133 # ? Are there lines in the file giving default values for code points 2134 # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is 2135 # the norm, but IGNORED means it has such lines, but the handler doesn't 2136 # use them. Having these three states allows us to catch changes to the 2137 # UCD that this program should track 2138 main::set_access('has_missings_defaults', 2139 \%has_missings_defaults, qw{ c r }); 2140 2141 my %pre_handler; 2142 # Subroutine to call before doing anything else in the file. If undef, no 2143 # such handler is called. 2144 main::set_access('pre_handler', \%pre_handler, qw{ c }); 2145 2146 my %eof_handler; 2147 # Subroutine to call upon getting an EOF on the input file, but before 2148 # that is returned to the main handler. This is to allow buffers to be 2149 # flushed. The handler is expected to call insert_lines() or 2150 # insert_adjusted() with the buffered material 2151 main::set_access('eof_handler', \%eof_handler, qw{ c r }); 2152 2153 my %post_handler; 2154 # Subroutine to call after all the lines of the file are read in and 2155 # processed. If undef, no such handler is called. 2156 main::set_access('post_handler', \%post_handler, qw{ c }); 2157 2158 my %progress_message; 2159 # Message to print to display progress in lieu of the standard one 2160 main::set_access('progress_message', \%progress_message, qw{ c }); 2161 2162 my %handle; 2163 # cache open file handle, internal. Is undef if file hasn't been 2164 # processed at all, empty if has; 2165 main::set_access('handle', \%handle); 2166 2167 my %added_lines; 2168 # cache of lines added virtually to the file, internal 2169 main::set_access('added_lines', \%added_lines); 2170 2171 my %errors; 2172 # cache of errors found, internal 2173 main::set_access('errors', \%errors); 2174 2175 my %missings; 2176 # storage of '@missing' defaults lines 2177 main::set_access('missings', \%missings); 2178 2179 sub new { 2180 my $class = shift; 2181 2182 my $self = bless \do{ my $anonymous_scalar }, $class; 2183 my $addr = do { no overloading; pack 'J', $self; }; 2184 2185 # Set defaults 2186 $handler{$addr} = \&main::process_generic_property_file; 2187 $non_skip{$addr} = 0; 2188 $skip{$addr} = 0; 2189 $has_missings_defaults{$addr} = $NO_DEFAULTS; 2190 $handle{$addr} = undef; 2191 $added_lines{$addr} = [ ]; 2192 $each_line_handler{$addr} = [ ]; 2193 $errors{$addr} = { }; 2194 $missings{$addr} = [ ]; 2195 2196 # Two positional parameters. 2197 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2; 2198 $file{$addr} = main::internal_file_to_platform(shift); 2199 $first_released{$addr} = shift; 2200 2201 # The rest of the arguments are key => value pairs 2202 # %constructor_fields has been set up earlier to list all possible 2203 # ones. Either set or push, depending on how the default has been set 2204 # up just above. 2205 my %args = @_; 2206 foreach my $key (keys %args) { 2207 my $argument = $args{$key}; 2208 2209 # Note that the fields are the lower case of the constructor keys 2210 my $hash = $constructor_fields{lc $key}; 2211 if (! defined $hash) { 2212 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped"); 2213 next; 2214 } 2215 if (ref $hash->{$addr} eq 'ARRAY') { 2216 if (ref $argument eq 'ARRAY') { 2217 foreach my $argument (@{$argument}) { 2218 next if ! defined $argument; 2219 push @{$hash->{$addr}}, $argument; 2220 } 2221 } 2222 else { 2223 push @{$hash->{$addr}}, $argument if defined $argument; 2224 } 2225 } 2226 else { 2227 $hash->{$addr} = $argument; 2228 } 2229 delete $args{$key}; 2230 }; 2231 2232 # If the file has a property for it, it means that the property is not 2233 # listed in the file's entries. So add a handler to the list of line 2234 # handlers to insert the property name into the lines, to provide a 2235 # uniform interface to the final processing subroutine. 2236 # the final code doesn't have to worry about that. 2237 if ($property{$addr}) { 2238 push @{$each_line_handler{$addr}}, \&_insert_property_into_line; 2239 } 2240 2241 if ($non_skip{$addr} && ! $debug_skip && $verbosity) { 2242 print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n"; 2243 } 2244 2245 # If skipping, set to optional, and add to list of ignored files, 2246 # including its reason 2247 if ($skip{$addr}) { 2248 $optional{$addr} = 1; 2249 $skipped_files{$file{$addr}} = $skip{$addr} 2250 } 2251 2252 return $self; 2253 } 2254 2255 2256 use overload 2257 fallback => 0, 2258 qw("") => "_operator_stringify", 2259 "." => \&main::_operator_dot, 2260 ".=" => \&main::_operator_dot_equal, 2261 ; 2262 2263 sub _operator_stringify { 2264 my $self = shift; 2265 2266 return __PACKAGE__ . " object for " . $self->file; 2267 } 2268 2269 # flag to make sure extracted files are processed early 2270 my $seen_non_extracted_non_age = 0; 2271 2272 sub run { 2273 # Process the input object $self. This opens and closes the file and 2274 # calls all the handlers for it. Currently, this can only be called 2275 # once per file, as it destroy's the EOF handler 2276 2277 my $self = shift; 2278 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 2279 2280 my $addr = do { no overloading; pack 'J', $self; }; 2281 2282 my $file = $file{$addr}; 2283 2284 # Don't process if not expecting this file (because released later 2285 # than this Unicode version), and isn't there. This means if someone 2286 # copies it into an earlier version's directory, we will go ahead and 2287 # process it. 2288 return if $first_released{$addr} gt $v_version && ! -e $file; 2289 2290 # If in debugging mode and this file doesn't have the non-skip 2291 # flag set, and isn't one of the critical files, skip it. 2292 if ($debug_skip 2293 && $first_released{$addr} ne v0 2294 && ! $non_skip{$addr}) 2295 { 2296 print "Skipping $file in debugging\n" if $verbosity; 2297 return; 2298 } 2299 2300 # File could be optional 2301 if ($optional{$addr}) { 2302 return unless -e $file; 2303 my $result = eval $optional{$addr}; 2304 if (! defined $result) { 2305 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}. $file Skipped."); 2306 return; 2307 } 2308 if (! $result) { 2309 if ($verbosity) { 2310 print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n"; 2311 } 2312 return; 2313 } 2314 } 2315 2316 if (! defined $file || ! -e $file) { 2317 2318 # If the file doesn't exist, see if have internal data for it 2319 # (based on first_released being 0). 2320 if ($first_released{$addr} eq v0) { 2321 $handle{$addr} = 'pretend_is_open'; 2322 } 2323 else { 2324 if (! $optional{$addr} # File could be optional 2325 && $v_version ge $first_released{$addr}) 2326 { 2327 print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr}; 2328 } 2329 return; 2330 } 2331 } 2332 else { 2333 2334 # Here, the file exists. Some platforms may change the case of 2335 # its name 2336 if ($seen_non_extracted_non_age) { 2337 if ($file =~ /$EXTRACTED/i) { 2338 Carp::my_carp_bug(main::join_lines(<<END 2339$file should be processed just after the 'Prop...Alias' files, and before 2340anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may 2341have subtle problems 2342END 2343 )); 2344 } 2345 } 2346 elsif ($EXTRACTED_DIR 2347 && $first_released{$addr} ne v0 2348 && $file !~ /$EXTRACTED/i 2349 && lc($file) ne 'dage.txt') 2350 { 2351 # We don't set this (by the 'if' above) if we have no 2352 # extracted directory, so if running on an early version, 2353 # this test won't work. Not worth worrying about. 2354 $seen_non_extracted_non_age = 1; 2355 } 2356 2357 # And mark the file as having being processed, and warn if it 2358 # isn't a file we are expecting. As we process the files, 2359 # they are deleted from the hash, so any that remain at the 2360 # end of the program are files that we didn't process. 2361 my $fkey = File::Spec->rel2abs($file); 2362 my $expecting = delete $potential_files{lc($fkey)}; 2363 2364 Carp::my_carp("Was not expecting '$file'.") if 2365 ! $expecting 2366 && ! defined $handle{$addr}; 2367 2368 # Having deleted from expected files, we can quit if not to do 2369 # anything. Don't print progress unless really want verbosity 2370 if ($skip{$addr}) { 2371 print "Skipping $file.\n" if $verbosity >= $VERBOSE; 2372 return; 2373 } 2374 2375 # Open the file, converting the slashes used in this program 2376 # into the proper form for the OS 2377 my $file_handle; 2378 if (not open $file_handle, "<", $file) { 2379 Carp::my_carp("Can't open $file. Skipping: $!"); 2380 return 0; 2381 } 2382 $handle{$addr} = $file_handle; # Cache the open file handle 2383 } 2384 2385 if ($verbosity >= $PROGRESS) { 2386 if ($progress_message{$addr}) { 2387 print "$progress_message{$addr}\n"; 2388 } 2389 else { 2390 # If using a virtual file, say so. 2391 print "Processing ", (-e $file) 2392 ? $file 2393 : "substitute $file", 2394 "\n"; 2395 } 2396 } 2397 2398 2399 # Call any special handler for before the file. 2400 &{$pre_handler{$addr}}($self) if $pre_handler{$addr}; 2401 2402 # Then the main handler 2403 &{$handler{$addr}}($self); 2404 2405 # Then any special post-file handler. 2406 &{$post_handler{$addr}}($self) if $post_handler{$addr}; 2407 2408 # If any errors have been accumulated, output the counts (as the first 2409 # error message in each class was output when it was encountered). 2410 if ($errors{$addr}) { 2411 my $total = 0; 2412 my $types = 0; 2413 foreach my $error (keys %{$errors{$addr}}) { 2414 $total += $errors{$addr}->{$error}; 2415 delete $errors{$addr}->{$error}; 2416 $types++; 2417 } 2418 if ($total > 1) { 2419 my $message 2420 = "A total of $total lines had errors in $file. "; 2421 2422 $message .= ($types == 1) 2423 ? '(Only the first one was displayed.)' 2424 : '(Only the first of each type was displayed.)'; 2425 Carp::my_carp($message); 2426 } 2427 } 2428 2429 if (@{$missings{$addr}}) { 2430 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong"); 2431 } 2432 2433 # If a real file handle, close it. 2434 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if 2435 ref $handle{$addr}; 2436 $handle{$addr} = ""; # Uses empty to indicate that has already seen 2437 # the file, as opposed to undef 2438 return; 2439 } 2440 2441 sub next_line { 2442 # Sets $_ to be the next logical input line, if any. Returns non-zero 2443 # if such a line exists. 'logical' means that any lines that have 2444 # been added via insert_lines() will be returned in $_ before the file 2445 # is read again. 2446 2447 my $self = shift; 2448 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 2449 2450 my $addr = do { no overloading; pack 'J', $self; }; 2451 2452 # Here the file is open (or if the handle is not a ref, is an open 2453 # 'virtual' file). Get the next line; any inserted lines get priority 2454 # over the file itself. 2455 my $adjusted; 2456 2457 LINE: 2458 while (1) { # Loop until find non-comment, non-empty line 2459 #local $to_trace = 1 if main::DEBUG; 2460 my $inserted_ref = shift @{$added_lines{$addr}}; 2461 if (defined $inserted_ref) { 2462 ($adjusted, $_) = @{$inserted_ref}; 2463 trace $adjusted, $_ if main::DEBUG && $to_trace; 2464 return 1 if $adjusted; 2465 } 2466 else { 2467 last if ! ref $handle{$addr}; # Don't read unless is real file 2468 last if ! defined ($_ = readline $handle{$addr}); 2469 } 2470 chomp; 2471 trace $_ if main::DEBUG && $to_trace; 2472 2473 # See if this line is the comment line that defines what property 2474 # value that code points that are not listed in the file should 2475 # have. The format or existence of these lines is not guaranteed 2476 # by Unicode since they are comments, but the documentation says 2477 # that this was added for machine-readability, so probably won't 2478 # change. This works starting in Unicode Version 5.0. They look 2479 # like: 2480 # 2481 # @missing: 0000..10FFFF; Not_Reordered 2482 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point> 2483 # @missing: 0000..10FFFF; ; NaN 2484 # 2485 # Save the line for a later get_missings() call. 2486 if (/$missing_defaults_prefix/) { 2487 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) { 2488 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries"); 2489 } 2490 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) { 2491 my @defaults = split /\s* ; \s*/x, $_; 2492 2493 # The first field is the @missing, which ends in a 2494 # semi-colon, so can safely shift. 2495 shift @defaults; 2496 2497 # Some of these lines may have empty field placeholders 2498 # which get in the way. An example is: 2499 # @missing: 0000..10FFFF; ; NaN 2500 # Remove them. Process starting from the top so the 2501 # splice doesn't affect things still to be looked at. 2502 for (my $i = @defaults - 1; $i >= 0; $i--) { 2503 next if $defaults[$i] ne ""; 2504 splice @defaults, $i, 1; 2505 } 2506 2507 # What's left should be just the property (maybe) and the 2508 # default. Having only one element means it doesn't have 2509 # the property. 2510 my $default; 2511 my $property; 2512 if (@defaults >= 1) { 2513 if (@defaults == 1) { 2514 $default = $defaults[0]; 2515 } 2516 else { 2517 $property = $defaults[0]; 2518 $default = $defaults[1]; 2519 } 2520 } 2521 2522 if (@defaults < 1 2523 || @defaults > 2 2524 || ($default =~ /^</ 2525 && $default !~ /^<code *point>$/i 2526 && $default !~ /^<none>$/i 2527 && $default !~ /^<script>$/i)) 2528 { 2529 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries"); 2530 } 2531 else { 2532 2533 # If the property is missing from the line, it should 2534 # be the one for the whole file 2535 $property = $property{$addr} if ! defined $property; 2536 2537 # Change <none> to the null string, which is what it 2538 # really means. If the default is the code point 2539 # itself, set it to <code point>, which is what 2540 # Unicode uses (but sometimes they've forgotten the 2541 # space) 2542 if ($default =~ /^<none>$/i) { 2543 $default = ""; 2544 } 2545 elsif ($default =~ /^<code *point>$/i) { 2546 $default = $CODE_POINT; 2547 } 2548 elsif ($default =~ /^<script>$/i) { 2549 2550 # Special case this one. Currently is from 2551 # ScriptExtensions.txt, and means for all unlisted 2552 # code points, use their Script property values. 2553 # For the code points not listed in that file, the 2554 # default value is 'Unknown'. 2555 $default = "Unknown"; 2556 } 2557 2558 # Store them as a sub-arrays with both components. 2559 push @{$missings{$addr}}, [ $default, $property ]; 2560 } 2561 } 2562 2563 # There is nothing for the caller to process on this comment 2564 # line. 2565 next; 2566 } 2567 2568 # Remove comments and trailing space, and skip this line if the 2569 # result is empty 2570 s/#.*//; 2571 s/\s+$//; 2572 next if /^$/; 2573 2574 # Call any handlers for this line, and skip further processing of 2575 # the line if the handler sets the line to null. 2576 foreach my $sub_ref (@{$each_line_handler{$addr}}) { 2577 &{$sub_ref}($self); 2578 next LINE if /^$/; 2579 } 2580 2581 # Here the line is ok. return success. 2582 return 1; 2583 } # End of looping through lines. 2584 2585 # If there is an EOF handler, call it (only once) and if it generates 2586 # more lines to process go back in the loop to handle them. 2587 if ($eof_handler{$addr}) { 2588 &{$eof_handler{$addr}}($self); 2589 $eof_handler{$addr} = ""; # Currently only get one shot at it. 2590 goto LINE if $added_lines{$addr}; 2591 } 2592 2593 # Return failure -- no more lines. 2594 return 0; 2595 2596 } 2597 2598# Not currently used, not fully tested. 2599# sub peek { 2600# # Non-destructive look-ahead one non-adjusted, non-comment, non-blank 2601# # record. Not callable from an each_line_handler(), nor does it call 2602# # an each_line_handler() on the line. 2603# 2604# my $self = shift; 2605# my $addr = do { no overloading; pack 'J', $self; }; 2606# 2607# foreach my $inserted_ref (@{$added_lines{$addr}}) { 2608# my ($adjusted, $line) = @{$inserted_ref}; 2609# next if $adjusted; 2610# 2611# # Remove comments and trailing space, and return a non-empty 2612# # resulting line 2613# $line =~ s/#.*//; 2614# $line =~ s/\s+$//; 2615# return $line if $line ne ""; 2616# } 2617# 2618# return if ! ref $handle{$addr}; # Don't read unless is real file 2619# while (1) { # Loop until find non-comment, non-empty line 2620# local $to_trace = 1 if main::DEBUG; 2621# trace $_ if main::DEBUG && $to_trace; 2622# return if ! defined (my $line = readline $handle{$addr}); 2623# chomp $line; 2624# push @{$added_lines{$addr}}, [ 0, $line ]; 2625# 2626# $line =~ s/#.*//; 2627# $line =~ s/\s+$//; 2628# return $line if $line ne ""; 2629# } 2630# 2631# return; 2632# } 2633 2634 2635 sub insert_lines { 2636 # Lines can be inserted so that it looks like they were in the input 2637 # file at the place it was when this routine is called. See also 2638 # insert_adjusted_lines(). Lines inserted via this routine go through 2639 # any each_line_handler() 2640 2641 my $self = shift; 2642 2643 # Each inserted line is an array, with the first element being 0 to 2644 # indicate that this line hasn't been adjusted, and needs to be 2645 # processed. 2646 no overloading; 2647 push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_; 2648 return; 2649 } 2650 2651 sub insert_adjusted_lines { 2652 # Lines can be inserted so that it looks like they were in the input 2653 # file at the place it was when this routine is called. See also 2654 # insert_lines(). Lines inserted via this routine are already fully 2655 # adjusted, ready to be processed; each_line_handler()s handlers will 2656 # not be called. This means this is not a completely general 2657 # facility, as only the last each_line_handler on the stack should 2658 # call this. It could be made more general, by passing to each of the 2659 # line_handlers their position on the stack, which they would pass on 2660 # to this routine, and that would replace the boolean first element in 2661 # the anonymous array pushed here, so that the next_line routine could 2662 # use that to call only those handlers whose index is after it on the 2663 # stack. But this is overkill for what is needed now. 2664 2665 my $self = shift; 2666 trace $_[0] if main::DEBUG && $to_trace; 2667 2668 # Each inserted line is an array, with the first element being 1 to 2669 # indicate that this line has been adjusted 2670 no overloading; 2671 push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_; 2672 return; 2673 } 2674 2675 sub get_missings { 2676 # Returns the stored up @missings lines' values, and clears the list. 2677 # The values are in an array, consisting of the default in the first 2678 # element, and the property in the 2nd. However, since these lines 2679 # can be stacked up, the return is an array of all these arrays. 2680 2681 my $self = shift; 2682 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 2683 2684 my $addr = do { no overloading; pack 'J', $self; }; 2685 2686 # If not accepting a list return, just return the first one. 2687 return shift @{$missings{$addr}} unless wantarray; 2688 2689 my @return = @{$missings{$addr}}; 2690 undef @{$missings{$addr}}; 2691 return @return; 2692 } 2693 2694 sub _insert_property_into_line { 2695 # Add a property field to $_, if this file requires it. 2696 2697 my $self = shift; 2698 my $addr = do { no overloading; pack 'J', $self; }; 2699 my $property = $property{$addr}; 2700 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 2701 2702 $_ =~ s/(;|$)/; $property$1/; 2703 return; 2704 } 2705 2706 sub carp_bad_line { 2707 # Output consistent error messages, using either a generic one, or the 2708 # one given by the optional parameter. To avoid gazillions of the 2709 # same message in case the syntax of a file is way off, this routine 2710 # only outputs the first instance of each message, incrementing a 2711 # count so the totals can be output at the end of the file. 2712 2713 my $self = shift; 2714 my $message = shift; 2715 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 2716 2717 my $addr = do { no overloading; pack 'J', $self; }; 2718 2719 $message = 'Unexpected line' unless $message; 2720 2721 # No trailing punctuation so as to fit with our addenda. 2722 $message =~ s/[.:;,]$//; 2723 2724 # If haven't seen this exact message before, output it now. Otherwise 2725 # increment the count of how many times it has occurred 2726 unless ($errors{$addr}->{$message}) { 2727 Carp::my_carp("$message in '$_' in " 2728 . $file{$addr} 2729 . " at line $.. Skipping this line;"); 2730 $errors{$addr}->{$message} = 1; 2731 } 2732 else { 2733 $errors{$addr}->{$message}++; 2734 } 2735 2736 # Clear the line to prevent any further (meaningful) processing of it. 2737 $_ = ""; 2738 2739 return; 2740 } 2741} # End closure 2742 2743package Multi_Default; 2744 2745# Certain properties in early versions of Unicode had more than one possible 2746# default for code points missing from the files. In these cases, one 2747# default applies to everything left over after all the others are applied, 2748# and for each of the others, there is a description of which class of code 2749# points applies to it. This object helps implement this by storing the 2750# defaults, and for all but that final default, an eval string that generates 2751# the class that it applies to. 2752 2753 2754{ # Closure 2755 2756 main::setup_package(); 2757 2758 my %class_defaults; 2759 # The defaults structure for the classes 2760 main::set_access('class_defaults', \%class_defaults); 2761 2762 my %other_default; 2763 # The default that applies to everything left over. 2764 main::set_access('other_default', \%other_default, 'r'); 2765 2766 2767 sub new { 2768 # The constructor is called with default => eval pairs, terminated by 2769 # the left-over default. e.g. 2770 # Multi_Default->new( 2771 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C 2772 # - 0x200D', 2773 # 'R' => 'some other expression that evaluates to code points', 2774 # . 2775 # . 2776 # . 2777 # 'U')); 2778 2779 my $class = shift; 2780 2781 my $self = bless \do{my $anonymous_scalar}, $class; 2782 my $addr = do { no overloading; pack 'J', $self; }; 2783 2784 while (@_ > 1) { 2785 my $default = shift; 2786 my $eval = shift; 2787 $class_defaults{$addr}->{$default} = $eval; 2788 } 2789 2790 $other_default{$addr} = shift; 2791 2792 return $self; 2793 } 2794 2795 sub get_next_defaults { 2796 # Iterates and returns the next class of defaults. 2797 my $self = shift; 2798 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 2799 2800 my $addr = do { no overloading; pack 'J', $self; }; 2801 2802 return each %{$class_defaults{$addr}}; 2803 } 2804} 2805 2806package Alias; 2807 2808# An alias is one of the names that a table goes by. This class defines them 2809# including some attributes. Everything is currently setup in the 2810# constructor. 2811 2812 2813{ # Closure 2814 2815 main::setup_package(); 2816 2817 my %name; 2818 main::set_access('name', \%name, 'r'); 2819 2820 my %loose_match; 2821 # Should this name match loosely or not. 2822 main::set_access('loose_match', \%loose_match, 'r'); 2823 2824 my %make_re_pod_entry; 2825 # Some aliases should not get their own entries in the re section of the 2826 # pod, because they are covered by a wild-card, and some we want to 2827 # discourage use of. Binary 2828 main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's'); 2829 2830 my %ucd; 2831 # Is this documented to be accessible via Unicode::UCD 2832 main::set_access('ucd', \%ucd, 'r', 's'); 2833 2834 my %status; 2835 # Aliases have a status, like deprecated, or even suppressed (which means 2836 # they don't appear in documentation). Enum 2837 main::set_access('status', \%status, 'r'); 2838 2839 my %ok_as_filename; 2840 # Similarly, some aliases should not be considered as usable ones for 2841 # external use, such as file names, or we don't want documentation to 2842 # recommend them. Boolean 2843 main::set_access('ok_as_filename', \%ok_as_filename, 'r'); 2844 2845 sub new { 2846 my $class = shift; 2847 2848 my $self = bless \do { my $anonymous_scalar }, $class; 2849 my $addr = do { no overloading; pack 'J', $self; }; 2850 2851 $name{$addr} = shift; 2852 $loose_match{$addr} = shift; 2853 $make_re_pod_entry{$addr} = shift; 2854 $ok_as_filename{$addr} = shift; 2855 $status{$addr} = shift; 2856 $ucd{$addr} = shift; 2857 2858 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 2859 2860 # Null names are never ok externally 2861 $ok_as_filename{$addr} = 0 if $name{$addr} eq ""; 2862 2863 return $self; 2864 } 2865} 2866 2867package Range; 2868 2869# A range is the basic unit for storing code points, and is described in the 2870# comments at the beginning of the program. Each range has a starting code 2871# point; an ending code point (not less than the starting one); a value 2872# that applies to every code point in between the two end-points, inclusive; 2873# and an enum type that applies to the value. The type is for the user's 2874# convenience, and has no meaning here, except that a non-zero type is 2875# considered to not obey the normal Unicode rules for having standard forms. 2876# 2877# The same structure is used for both map and match tables, even though in the 2878# latter, the value (and hence type) is irrelevant and could be used as a 2879# comment. In map tables, the value is what all the code points in the range 2880# map to. Type 0 values have the standardized version of the value stored as 2881# well, so as to not have to recalculate it a lot. 2882 2883sub trace { return main::trace(@_); } 2884 2885{ # Closure 2886 2887 main::setup_package(); 2888 2889 my %start; 2890 main::set_access('start', \%start, 'r', 's'); 2891 2892 my %end; 2893 main::set_access('end', \%end, 'r', 's'); 2894 2895 my %value; 2896 main::set_access('value', \%value, 'r'); 2897 2898 my %type; 2899 main::set_access('type', \%type, 'r'); 2900 2901 my %standard_form; 2902 # The value in internal standard form. Defined only if the type is 0. 2903 main::set_access('standard_form', \%standard_form); 2904 2905 # Note that if these fields change, the dump() method should as well 2906 2907 sub new { 2908 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3; 2909 my $class = shift; 2910 2911 my $self = bless \do { my $anonymous_scalar }, $class; 2912 my $addr = do { no overloading; pack 'J', $self; }; 2913 2914 $start{$addr} = shift; 2915 $end{$addr} = shift; 2916 2917 my %args = @_; 2918 2919 my $value = delete $args{'Value'}; # Can be 0 2920 $value = "" unless defined $value; 2921 $value{$addr} = $value; 2922 2923 $type{$addr} = delete $args{'Type'} || 0; 2924 2925 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 2926 2927 return $self; 2928 } 2929 2930 use overload 2931 fallback => 0, 2932 qw("") => "_operator_stringify", 2933 "." => \&main::_operator_dot, 2934 ".=" => \&main::_operator_dot_equal, 2935 ; 2936 2937 sub _operator_stringify { 2938 my $self = shift; 2939 my $addr = do { no overloading; pack 'J', $self; }; 2940 2941 # Output it like '0041..0065 (value)' 2942 my $return = sprintf("%04X", $start{$addr}) 2943 . '..' 2944 . sprintf("%04X", $end{$addr}); 2945 my $value = $value{$addr}; 2946 my $type = $type{$addr}; 2947 $return .= ' ('; 2948 $return .= "$value"; 2949 $return .= ", Type=$type" if $type != 0; 2950 $return .= ')'; 2951 2952 return $return; 2953 } 2954 2955 sub standard_form { 2956 # Calculate the standard form only if needed, and cache the result. 2957 # The standard form is the value itself if the type is special. 2958 # This represents a considerable CPU and memory saving - at the time 2959 # of writing there are 368676 non-special objects, but the standard 2960 # form is only requested for 22047 of them - ie about 6%. 2961 2962 my $self = shift; 2963 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 2964 2965 my $addr = do { no overloading; pack 'J', $self; }; 2966 2967 return $standard_form{$addr} if defined $standard_form{$addr}; 2968 2969 my $value = $value{$addr}; 2970 return $value if $type{$addr}; 2971 return $standard_form{$addr} = main::standardize($value); 2972 } 2973 2974 sub dump { 2975 # Human, not machine readable. For machine readable, comment out this 2976 # entire routine and let the standard one take effect. 2977 my $self = shift; 2978 my $indent = shift; 2979 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 2980 2981 my $addr = do { no overloading; pack 'J', $self; }; 2982 2983 my $return = $indent 2984 . sprintf("%04X", $start{$addr}) 2985 . '..' 2986 . sprintf("%04X", $end{$addr}) 2987 . " '$value{$addr}';"; 2988 if (! defined $standard_form{$addr}) { 2989 $return .= "(type=$type{$addr})"; 2990 } 2991 elsif ($standard_form{$addr} ne $value{$addr}) { 2992 $return .= "(standard '$standard_form{$addr}')"; 2993 } 2994 return $return; 2995 } 2996} # End closure 2997 2998package _Range_List_Base; 2999 3000# Base class for range lists. A range list is simply an ordered list of 3001# ranges, so that the ranges with the lowest starting numbers are first in it. 3002# 3003# When a new range is added that is adjacent to an existing range that has the 3004# same value and type, it merges with it to form a larger range. 3005# 3006# Ranges generally do not overlap, except that there can be multiple entries 3007# of single code point ranges. This is because of NameAliases.txt. 3008# 3009# In this program, there is a standard value such that if two different 3010# values, have the same standard value, they are considered equivalent. This 3011# value was chosen so that it gives correct results on Unicode data 3012 3013# There are a number of methods to manipulate range lists, and some operators 3014# are overloaded to handle them. 3015 3016sub trace { return main::trace(@_); } 3017 3018{ # Closure 3019 3020 our $addr; 3021 3022 # Max is initialized to a negative value that isn't adjacent to 0, for 3023 # simpler tests 3024 my $max_init = -2; 3025 3026 main::setup_package(); 3027 3028 my %ranges; 3029 # The list of ranges 3030 main::set_access('ranges', \%ranges, 'readable_array'); 3031 3032 my %max; 3033 # The highest code point in the list. This was originally a method, but 3034 # actual measurements said it was used a lot. 3035 main::set_access('max', \%max, 'r'); 3036 3037 my %each_range_iterator; 3038 # Iterator position for each_range() 3039 main::set_access('each_range_iterator', \%each_range_iterator); 3040 3041 my %owner_name_of; 3042 # Name of parent this is attached to, if any. Solely for better error 3043 # messages. 3044 main::set_access('owner_name_of', \%owner_name_of, 'p_r'); 3045 3046 my %_search_ranges_cache; 3047 # A cache of the previous result from _search_ranges(), for better 3048 # performance 3049 main::set_access('_search_ranges_cache', \%_search_ranges_cache); 3050 3051 sub new { 3052 my $class = shift; 3053 my %args = @_; 3054 3055 # Optional initialization data for the range list. 3056 my $initialize = delete $args{'Initialize'}; 3057 3058 my $self; 3059 3060 # Use _union() to initialize. _union() returns an object of this 3061 # class, which means that it will call this constructor recursively. 3062 # But it won't have this $initialize parameter so that it won't 3063 # infinitely loop on this. 3064 return _union($class, $initialize, %args) if defined $initialize; 3065 3066 $self = bless \do { my $anonymous_scalar }, $class; 3067 my $addr = do { no overloading; pack 'J', $self; }; 3068 3069 # Optional parent object, only for debug info. 3070 $owner_name_of{$addr} = delete $args{'Owner'}; 3071 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr}; 3072 3073 # Stringify, in case it is an object. 3074 $owner_name_of{$addr} = "$owner_name_of{$addr}"; 3075 3076 # This is used only for error messages, and so a colon is added 3077 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne ""; 3078 3079 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 3080 3081 $max{$addr} = $max_init; 3082 3083 $_search_ranges_cache{$addr} = 0; 3084 $ranges{$addr} = []; 3085 3086 return $self; 3087 } 3088 3089 use overload 3090 fallback => 0, 3091 qw("") => "_operator_stringify", 3092 "." => \&main::_operator_dot, 3093 ".=" => \&main::_operator_dot_equal, 3094 ; 3095 3096 sub _operator_stringify { 3097 my $self = shift; 3098 my $addr = do { no overloading; pack 'J', $self; }; 3099 3100 return "Range_List attached to '$owner_name_of{$addr}'" 3101 if $owner_name_of{$addr}; 3102 return "anonymous Range_List " . \$self; 3103 } 3104 3105 sub _union { 3106 # Returns the union of the input code points. It can be called as 3107 # either a constructor or a method. If called as a method, the result 3108 # will be a new() instance of the calling object, containing the union 3109 # of that object with the other parameter's code points; if called as 3110 # a constructor, the first parameter gives the class that the new object 3111 # should be, and the second parameter gives the code points to go into 3112 # it. 3113 # In either case, there are two parameters looked at by this routine; 3114 # any additional parameters are passed to the new() constructor. 3115 # 3116 # The code points can come in the form of some object that contains 3117 # ranges, and has a conventionally named method to access them; or 3118 # they can be an array of individual code points (as integers); or 3119 # just a single code point. 3120 # 3121 # If they are ranges, this routine doesn't make any effort to preserve 3122 # the range values and types of one input over the other. Therefore 3123 # this base class should not allow _union to be called from other than 3124 # initialization code, so as to prevent two tables from being added 3125 # together where the range values matter. The general form of this 3126 # routine therefore belongs in a derived class, but it was moved here 3127 # to avoid duplication of code. The failure to overload this in this 3128 # class keeps it safe. 3129 # 3130 # It does make the effort during initialization to accept tables with 3131 # multiple values for the same code point, and to preserve the order 3132 # of these. If there is only one input range or range set, it doesn't 3133 # sort (as it should already be sorted to the desired order), and will 3134 # accept multiple values per code point. Otherwise it will merge 3135 # multiple values into a single one. 3136 3137 my $self; 3138 my @args; # Arguments to pass to the constructor 3139 3140 my $class = shift; 3141 3142 # If a method call, will start the union with the object itself, and 3143 # the class of the new object will be the same as self. 3144 if (ref $class) { 3145 $self = $class; 3146 $class = ref $self; 3147 push @args, $self; 3148 } 3149 3150 # Add the other required parameter. 3151 push @args, shift; 3152 # Rest of parameters are passed on to the constructor 3153 3154 # Accumulate all records from both lists. 3155 my @records; 3156 my $input_count = 0; 3157 for my $arg (@args) { 3158 #local $to_trace = 0 if main::DEBUG; 3159 trace "argument = $arg" if main::DEBUG && $to_trace; 3160 if (! defined $arg) { 3161 my $message = ""; 3162 if (defined $self) { 3163 no overloading; 3164 $message .= $owner_name_of{pack 'J', $self}; 3165 } 3166 Carp::my_carp_bug($message . "Undefined argument to _union. No union done."); 3167 return; 3168 } 3169 3170 $arg = [ $arg ] if ! ref $arg; 3171 my $type = ref $arg; 3172 if ($type eq 'ARRAY') { 3173 foreach my $element (@$arg) { 3174 push @records, Range->new($element, $element); 3175 $input_count++; 3176 } 3177 } 3178 elsif ($arg->isa('Range')) { 3179 push @records, $arg; 3180 $input_count++; 3181 } 3182 elsif ($arg->can('ranges')) { 3183 push @records, $arg->ranges; 3184 $input_count++; 3185 } 3186 else { 3187 my $message = ""; 3188 if (defined $self) { 3189 no overloading; 3190 $message .= $owner_name_of{pack 'J', $self}; 3191 } 3192 Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done."); 3193 return; 3194 } 3195 } 3196 3197 # Sort with the range containing the lowest ordinal first, but if 3198 # two ranges start at the same code point, sort with the bigger range 3199 # of the two first, because it takes fewer cycles. 3200 if ($input_count > 1) { 3201 @records = sort { ($a->start <=> $b->start) 3202 or 3203 # if b is shorter than a, b->end will be 3204 # less than a->end, and we want to select 3205 # a, so want to return -1 3206 ($b->end <=> $a->end) 3207 } @records; 3208 } 3209 3210 my $new = $class->new(@_); 3211 3212 # Fold in records so long as they add new information. 3213 for my $set (@records) { 3214 my $start = $set->start; 3215 my $end = $set->end; 3216 my $value = $set->value; 3217 my $type = $set->type; 3218 if ($start > $new->max) { 3219 $new->_add_delete('+', $start, $end, $value, Type => $type); 3220 } 3221 elsif ($end > $new->max) { 3222 $new->_add_delete('+', $new->max +1, $end, $value, 3223 Type => $type); 3224 } 3225 elsif ($input_count == 1) { 3226 # Here, overlaps existing range, but is from a single input, 3227 # so preserve the multiple values from that input. 3228 $new->_add_delete('+', $start, $end, $value, Type => $type, 3229 Replace => $MULTIPLE_AFTER); 3230 } 3231 } 3232 3233 return $new; 3234 } 3235 3236 sub range_count { # Return the number of ranges in the range list 3237 my $self = shift; 3238 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3239 3240 no overloading; 3241 return scalar @{$ranges{pack 'J', $self}}; 3242 } 3243 3244 sub min { 3245 # Returns the minimum code point currently in the range list, or if 3246 # the range list is empty, 2 beyond the max possible. This is a 3247 # method because used so rarely, that not worth saving between calls, 3248 # and having to worry about changing it as ranges are added and 3249 # deleted. 3250 3251 my $self = shift; 3252 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3253 3254 my $addr = do { no overloading; pack 'J', $self; }; 3255 3256 # If the range list is empty, return a large value that isn't adjacent 3257 # to any that could be in the range list, for simpler tests 3258 return $MAX_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}}; 3259 return $ranges{$addr}->[0]->start; 3260 } 3261 3262 sub contains { 3263 # Boolean: Is argument in the range list? If so returns $i such that: 3264 # range[$i]->end < $codepoint <= range[$i+1]->end 3265 # which is one beyond what you want; this is so that the 0th range 3266 # doesn't return false 3267 my $self = shift; 3268 my $codepoint = shift; 3269 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3270 3271 my $i = $self->_search_ranges($codepoint); 3272 return 0 unless defined $i; 3273 3274 # The search returns $i, such that 3275 # range[$i-1]->end < $codepoint <= range[$i]->end 3276 # So is in the table if and only iff it is at least the start position 3277 # of range $i. 3278 no overloading; 3279 return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint; 3280 return $i + 1; 3281 } 3282 3283 sub containing_range { 3284 # Returns the range object that contains the code point, undef if none 3285 3286 my $self = shift; 3287 my $codepoint = shift; 3288 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3289 3290 my $i = $self->contains($codepoint); 3291 return unless $i; 3292 3293 # contains() returns 1 beyond where we should look 3294 no overloading; 3295 return $ranges{pack 'J', $self}->[$i-1]; 3296 } 3297 3298 sub value_of { 3299 # Returns the value associated with the code point, undef if none 3300 3301 my $self = shift; 3302 my $codepoint = shift; 3303 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3304 3305 my $range = $self->containing_range($codepoint); 3306 return unless defined $range; 3307 3308 return $range->value; 3309 } 3310 3311 sub type_of { 3312 # Returns the type of the range containing the code point, undef if 3313 # the code point is not in the table 3314 3315 my $self = shift; 3316 my $codepoint = shift; 3317 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3318 3319 my $range = $self->containing_range($codepoint); 3320 return unless defined $range; 3321 3322 return $range->type; 3323 } 3324 3325 sub _search_ranges { 3326 # Find the range in the list which contains a code point, or where it 3327 # should go if were to add it. That is, it returns $i, such that: 3328 # range[$i-1]->end < $codepoint <= range[$i]->end 3329 # Returns undef if no such $i is possible (e.g. at end of table), or 3330 # if there is an error. 3331 3332 my $self = shift; 3333 my $code_point = shift; 3334 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3335 3336 my $addr = do { no overloading; pack 'J', $self; }; 3337 3338 return if $code_point > $max{$addr}; 3339 my $r = $ranges{$addr}; # The current list of ranges 3340 my $range_list_size = scalar @$r; 3341 my $i; 3342 3343 use integer; # want integer division 3344 3345 # Use the cached result as the starting guess for this one, because, 3346 # an experiment on 5.1 showed that 90% of the time the cache was the 3347 # same as the result on the next call (and 7% it was one less). 3348 $i = $_search_ranges_cache{$addr}; 3349 $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob. 3350 # from an intervening deletion 3351 #local $to_trace = 1 if main::DEBUG; 3352 trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point); 3353 return $i if $code_point <= $r->[$i]->end 3354 && ($i == 0 || $r->[$i-1]->end < $code_point); 3355 3356 # Here the cache doesn't yield the correct $i. Try adding 1. 3357 if ($i < $range_list_size - 1 3358 && $r->[$i]->end < $code_point && 3359 $code_point <= $r->[$i+1]->end) 3360 { 3361 $i++; 3362 trace "next \$i is correct: $i" if main::DEBUG && $to_trace; 3363 $_search_ranges_cache{$addr} = $i; 3364 return $i; 3365 } 3366 3367 # Here, adding 1 also didn't work. We do a binary search to 3368 # find the correct position, starting with current $i 3369 my $lower = 0; 3370 my $upper = $range_list_size - 1; 3371 while (1) { 3372 trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace; 3373 3374 if ($code_point <= $r->[$i]->end) { 3375 3376 # Here we have met the upper constraint. We can quit if we 3377 # also meet the lower one. 3378 last if $i == 0 || $r->[$i-1]->end < $code_point; 3379 3380 $upper = $i; # Still too high. 3381 3382 } 3383 else { 3384 3385 # Here, $r[$i]->end < $code_point, so look higher up. 3386 $lower = $i; 3387 } 3388 3389 # Split search domain in half to try again. 3390 my $temp = ($upper + $lower) / 2; 3391 3392 # No point in continuing unless $i changes for next time 3393 # in the loop. 3394 if ($temp == $i) { 3395 3396 # We can't reach the highest element because of the averaging. 3397 # So if one below the upper edge, force it there and try one 3398 # more time. 3399 if ($i == $range_list_size - 2) { 3400 3401 trace "Forcing to upper edge" if main::DEBUG && $to_trace; 3402 $i = $range_list_size - 1; 3403 3404 # Change $lower as well so if fails next time through, 3405 # taking the average will yield the same $i, and we will 3406 # quit with the error message just below. 3407 $lower = $i; 3408 next; 3409 } 3410 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken."); 3411 return; 3412 } 3413 $i = $temp; 3414 } # End of while loop 3415 3416 if (main::DEBUG && $to_trace) { 3417 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i; 3418 trace "i= [ $i ]", $r->[$i]; 3419 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1; 3420 } 3421 3422 # Here we have found the offset. Cache it as a starting point for the 3423 # next call. 3424 $_search_ranges_cache{$addr} = $i; 3425 return $i; 3426 } 3427 3428 sub _add_delete { 3429 # Add, replace or delete ranges to or from a list. The $type 3430 # parameter gives which: 3431 # '+' => insert or replace a range, returning a list of any changed 3432 # ranges. 3433 # '-' => delete a range, returning a list of any deleted ranges. 3434 # 3435 # The next three parameters give respectively the start, end, and 3436 # value associated with the range. 'value' should be null unless the 3437 # operation is '+'; 3438 # 3439 # The range list is kept sorted so that the range with the lowest 3440 # starting position is first in the list, and generally, adjacent 3441 # ranges with the same values are merged into a single larger one (see 3442 # exceptions below). 3443 # 3444 # There are more parameters; all are key => value pairs: 3445 # Type gives the type of the value. It is only valid for '+'. 3446 # All ranges have types; if this parameter is omitted, 0 is 3447 # assumed. Ranges with type 0 are assumed to obey the 3448 # Unicode rules for casing, etc; ranges with other types are 3449 # not. Otherwise, the type is arbitrary, for the caller's 3450 # convenience, and looked at only by this routine to keep 3451 # adjacent ranges of different types from being merged into 3452 # a single larger range, and when Replace => 3453 # $IF_NOT_EQUIVALENT is specified (see just below). 3454 # Replace determines what to do if the range list already contains 3455 # ranges which coincide with all or portions of the input 3456 # range. It is only valid for '+': 3457 # => $NO means that the new value is not to replace 3458 # any existing ones, but any empty gaps of the 3459 # range list coinciding with the input range 3460 # will be filled in with the new value. 3461 # => $UNCONDITIONALLY means to replace the existing values with 3462 # this one unconditionally. However, if the 3463 # new and old values are identical, the 3464 # replacement is skipped to save cycles 3465 # => $IF_NOT_EQUIVALENT means to replace the existing values 3466 # (the default) with this one if they are not equivalent. 3467 # Ranges are equivalent if their types are the 3468 # same, and they are the same string; or if 3469 # both are type 0 ranges, if their Unicode 3470 # standard forms are identical. In this last 3471 # case, the routine chooses the more "modern" 3472 # one to use. This is because some of the 3473 # older files are formatted with values that 3474 # are, for example, ALL CAPs, whereas the 3475 # derived files have a more modern style, 3476 # which looks better. By looking for this 3477 # style when the pre-existing and replacement 3478 # standard forms are the same, we can move to 3479 # the modern style 3480 # => $MULTIPLE_BEFORE means that if this range duplicates an 3481 # existing one, but has a different value, 3482 # don't replace the existing one, but insert 3483 # this, one so that the same range can occur 3484 # multiple times. They are stored LIFO, so 3485 # that the final one inserted is the first one 3486 # returned in an ordered search of the table. 3487 # If this is an exact duplicate, including the 3488 # value, the original will be moved to be 3489 # first, before any other duplicate ranges 3490 # with different values. 3491 # => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored 3492 # FIFO, so that this one is inserted after all 3493 # others that currently exist. If this is an 3494 # exact duplicate, including value, of an 3495 # existing range, this one is discarded 3496 # (leaving the existing one in its original, 3497 # higher priority position 3498 # => anything else is the same as => $IF_NOT_EQUIVALENT 3499 # 3500 # "same value" means identical for non-type-0 ranges, and it means 3501 # having the same standard forms for type-0 ranges. 3502 3503 return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5; 3504 3505 my $self = shift; 3506 my $operation = shift; # '+' for add/replace; '-' for delete; 3507 my $start = shift; 3508 my $end = shift; 3509 my $value = shift; 3510 3511 my %args = @_; 3512 3513 $value = "" if not defined $value; # warning: $value can be "0" 3514 3515 my $replace = delete $args{'Replace'}; 3516 $replace = $IF_NOT_EQUIVALENT unless defined $replace; 3517 3518 my $type = delete $args{'Type'}; 3519 $type = 0 unless defined $type; 3520 3521 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 3522 3523 my $addr = do { no overloading; pack 'J', $self; }; 3524 3525 if ($operation ne '+' && $operation ne '-') { 3526 Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken."); 3527 return; 3528 } 3529 unless (defined $start && defined $end) { 3530 Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken."); 3531 return; 3532 } 3533 unless ($end >= $start) { 3534 Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken."); 3535 return; 3536 } 3537 if ($end > $MAX_UNICODE_CODEPOINT && $operation eq '+') { 3538 Carp::my_carp("$owner_name_of{$addr}Warning: Range '" . sprintf("%04X..%04X", $start, $end) . ") is above the Unicode maximum of " . sprintf("%04X", $MAX_UNICODE_CODEPOINT) . ". Adding it anyway"); 3539 } 3540 #local $to_trace = 1 if main::DEBUG; 3541 3542 if ($operation eq '-') { 3543 if ($replace != $IF_NOT_EQUIVALENT) { 3544 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT."); 3545 $replace = $IF_NOT_EQUIVALENT; 3546 } 3547 if ($type) { 3548 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0."); 3549 $type = 0; 3550 } 3551 if ($value ne "") { 3552 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\"."); 3553 $value = ""; 3554 } 3555 } 3556 3557 my $r = $ranges{$addr}; # The current list of ranges 3558 my $range_list_size = scalar @$r; # And its size 3559 my $max = $max{$addr}; # The current high code point in 3560 # the list of ranges 3561 3562 # Do a special case requiring fewer machine cycles when the new range 3563 # starts after the current highest point. The Unicode input data is 3564 # structured so this is common. 3565 if ($start > $max) { 3566 3567 trace "$owner_name_of{$addr} $operation", sprintf("%04X..%04X (%s) type=%d; prev max=%04X", $start, $end, $value, $type, $max) if main::DEBUG && $to_trace; 3568 return if $operation eq '-'; # Deleting a non-existing range is a 3569 # no-op 3570 3571 # If the new range doesn't logically extend the current final one 3572 # in the range list, create a new range at the end of the range 3573 # list. (max cleverly is initialized to a negative number not 3574 # adjacent to 0 if the range list is empty, so even adding a range 3575 # to an empty range list starting at 0 will have this 'if' 3576 # succeed.) 3577 if ($start > $max + 1 # non-adjacent means can't extend. 3578 || @{$r}[-1]->value ne $value # values differ, can't extend. 3579 || @{$r}[-1]->type != $type # types differ, can't extend. 3580 ) { 3581 push @$r, Range->new($start, $end, 3582 Value => $value, 3583 Type => $type); 3584 } 3585 else { 3586 3587 # Here, the new range starts just after the current highest in 3588 # the range list, and they have the same type and value. 3589 # Extend the current range to incorporate the new one. 3590 @{$r}[-1]->set_end($end); 3591 } 3592 3593 # This becomes the new maximum. 3594 $max{$addr} = $end; 3595 3596 return; 3597 } 3598 #local $to_trace = 0 if main::DEBUG; 3599 3600 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace; 3601 3602 # Here, the input range isn't after the whole rest of the range list. 3603 # Most likely 'splice' will be needed. The rest of the routine finds 3604 # the needed splice parameters, and if necessary, does the splice. 3605 # First, find the offset parameter needed by the splice function for 3606 # the input range. Note that the input range may span multiple 3607 # existing ones, but we'll worry about that later. For now, just find 3608 # the beginning. If the input range is to be inserted starting in a 3609 # position not currently in the range list, it must (obviously) come 3610 # just after the range below it, and just before the range above it. 3611 # Slightly less obviously, it will occupy the position currently 3612 # occupied by the range that is to come after it. More formally, we 3613 # are looking for the position, $i, in the array of ranges, such that: 3614 # 3615 # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end 3616 # 3617 # (The ordered relationships within existing ranges are also shown in 3618 # the equation above). However, if the start of the input range is 3619 # within an existing range, the splice offset should point to that 3620 # existing range's position in the list; that is $i satisfies a 3621 # somewhat different equation, namely: 3622 # 3623 #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end 3624 # 3625 # More briefly, $start can come before or after r[$i]->start, and at 3626 # this point, we don't know which it will be. However, these 3627 # two equations share these constraints: 3628 # 3629 # r[$i-1]->end < $start <= r[$i]->end 3630 # 3631 # And that is good enough to find $i. 3632 3633 my $i = $self->_search_ranges($start); 3634 if (! defined $i) { 3635 Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed"); 3636 return; 3637 } 3638 3639 # The search function returns $i such that: 3640 # 3641 # r[$i-1]->end < $start <= r[$i]->end 3642 # 3643 # That means that $i points to the first range in the range list 3644 # that could possibly be affected by this operation. We still don't 3645 # know if the start of the input range is within r[$i], or if it 3646 # points to empty space between r[$i-1] and r[$i]. 3647 trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace; 3648 3649 # Special case the insertion of data that is not to replace any 3650 # existing data. 3651 if ($replace == $NO) { # If $NO, has to be operation '+' 3652 #local $to_trace = 1 if main::DEBUG; 3653 trace "Doesn't replace" if main::DEBUG && $to_trace; 3654 3655 # Here, the new range is to take effect only on those code points 3656 # that aren't already in an existing range. This can be done by 3657 # looking through the existing range list and finding the gaps in 3658 # the ranges that this new range affects, and then calling this 3659 # function recursively on each of those gaps, leaving untouched 3660 # anything already in the list. Gather up a list of the changed 3661 # gaps first so that changes to the internal state as new ranges 3662 # are added won't be a problem. 3663 my @gap_list; 3664 3665 # First, if the starting point of the input range is outside an 3666 # existing one, there is a gap from there to the beginning of the 3667 # existing range -- add a span to fill the part that this new 3668 # range occupies 3669 if ($start < $r->[$i]->start) { 3670 push @gap_list, Range->new($start, 3671 main::min($end, 3672 $r->[$i]->start - 1), 3673 Type => $type); 3674 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace; 3675 } 3676 3677 # Then look through the range list for other gaps until we reach 3678 # the highest range affected by the input one. 3679 my $j; 3680 for ($j = $i+1; $j < $range_list_size; $j++) { 3681 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace; 3682 last if $end < $r->[$j]->start; 3683 3684 # If there is a gap between when this range starts and the 3685 # previous one ends, add a span to fill it. Note that just 3686 # because there are two ranges doesn't mean there is a 3687 # non-zero gap between them. It could be that they have 3688 # different values or types 3689 if ($r->[$j-1]->end + 1 != $r->[$j]->start) { 3690 push @gap_list, 3691 Range->new($r->[$j-1]->end + 1, 3692 $r->[$j]->start - 1, 3693 Type => $type); 3694 trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace; 3695 } 3696 } 3697 3698 # Here, we have either found an existing range in the range list, 3699 # beyond the area affected by the input one, or we fell off the 3700 # end of the loop because the input range affects the whole rest 3701 # of the range list. In either case, $j is 1 higher than the 3702 # highest affected range. If $j == $i, it means that there are no 3703 # affected ranges, that the entire insertion is in the gap between 3704 # r[$i-1], and r[$i], which we already have taken care of before 3705 # the loop. 3706 # On the other hand, if there are affected ranges, it might be 3707 # that there is a gap that needs filling after the final such 3708 # range to the end of the input range 3709 if ($r->[$j-1]->end < $end) { 3710 push @gap_list, Range->new(main::max($start, 3711 $r->[$j-1]->end + 1), 3712 $end, 3713 Type => $type); 3714 trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace; 3715 } 3716 3717 # Call recursively to fill in all the gaps. 3718 foreach my $gap (@gap_list) { 3719 $self->_add_delete($operation, 3720 $gap->start, 3721 $gap->end, 3722 $value, 3723 Type => $type); 3724 } 3725 3726 return; 3727 } 3728 3729 # Here, we have taken care of the case where $replace is $NO. 3730 # Remember that here, r[$i-1]->end < $start <= r[$i]->end 3731 # If inserting a multiple record, this is where it goes, before the 3732 # first (if any) existing one if inserting LIFO. (If this is to go 3733 # afterwards, FIFO, we below move the pointer to there.) These imply 3734 # an insertion, and no change to any existing ranges. Note that $i 3735 # can be -1 if this new range doesn't actually duplicate any existing, 3736 # and comes at the beginning of the list. 3737 if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) { 3738 3739 if ($start != $end) { 3740 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point. No action taken."); 3741 return; 3742 } 3743 3744 # If the new code point is within a current range ... 3745 if ($end >= $r->[$i]->start) { 3746 3747 # Don't add an exact duplicate, as it isn't really a multiple 3748 my $existing_value = $r->[$i]->value; 3749 my $existing_type = $r->[$i]->type; 3750 return if $value eq $existing_value && $type eq $existing_type; 3751 3752 # If the multiple value is part of an existing range, we want 3753 # to split up that range, so that only the single code point 3754 # is affected. To do this, we first call ourselves 3755 # recursively to delete that code point from the table, having 3756 # preserved its current data above. Then we call ourselves 3757 # recursively again to add the new multiple, which we know by 3758 # the test just above is different than the current code 3759 # point's value, so it will become a range containing a single 3760 # code point: just itself. Finally, we add back in the 3761 # pre-existing code point, which will again be a single code 3762 # point range. Because 'i' likely will have changed as a 3763 # result of these operations, we can't just continue on, but 3764 # do this operation recursively as well. If we are inserting 3765 # LIFO, the pre-existing code point needs to go after the new 3766 # one, so use MULTIPLE_AFTER; and vice versa. 3767 if ($r->[$i]->start != $r->[$i]->end) { 3768 $self->_add_delete('-', $start, $end, ""); 3769 $self->_add_delete('+', $start, $end, $value, Type => $type); 3770 return $self->_add_delete('+', 3771 $start, $end, 3772 $existing_value, 3773 Type => $existing_type, 3774 Replace => ($replace == $MULTIPLE_BEFORE) 3775 ? $MULTIPLE_AFTER 3776 : $MULTIPLE_BEFORE); 3777 } 3778 } 3779 3780 # If to place this new record after, move to beyond all existing 3781 # ones; but don't add this one if identical to any of them, as it 3782 # isn't really a multiple. This leaves the original order, so 3783 # that the current request is ignored. The reasoning is that the 3784 # previous request that wanted this record to have high priority 3785 # should have precedence. 3786 if ($replace == $MULTIPLE_AFTER) { 3787 while ($i < @$r && $r->[$i]->start == $start) { 3788 return if $value eq $r->[$i]->value 3789 && $type eq $r->[$i]->type; 3790 $i++; 3791 } 3792 } 3793 else { 3794 # If instead we are to place this new record before any 3795 # existing ones, remove any identical ones that come after it. 3796 # This changes the existing order so that the new one is 3797 # first, as is being requested. 3798 for (my $j = $i + 1; 3799 $j < @$r && $r->[$j]->start == $start; 3800 $j++) 3801 { 3802 if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) { 3803 splice @$r, $j, 1; 3804 last; # There should only be one instance, so no 3805 # need to keep looking 3806 } 3807 } 3808 } 3809 3810 trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace; 3811 my @return = splice @$r, 3812 $i, 3813 0, 3814 Range->new($start, 3815 $end, 3816 Value => $value, 3817 Type => $type); 3818 if (main::DEBUG && $to_trace) { 3819 trace "After splice:"; 3820 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2; 3821 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1; 3822 trace "i =[", $i, "]", $r->[$i] if $i >= 0; 3823 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1; 3824 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2; 3825 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3; 3826 } 3827 return @return; 3828 } 3829 3830 # Here, we have taken care of $NO and $MULTIPLE_foo replaces. This 3831 # leaves delete, insert, and replace either unconditionally or if not 3832 # equivalent. $i still points to the first potential affected range. 3833 # Now find the highest range affected, which will determine the length 3834 # parameter to splice. (The input range can span multiple existing 3835 # ones.) If this isn't a deletion, while we are looking through the 3836 # range list, see also if this is a replacement rather than a clean 3837 # insertion; that is if it will change the values of at least one 3838 # existing range. Start off assuming it is an insert, until find it 3839 # isn't. 3840 my $clean_insert = $operation eq '+'; 3841 my $j; # This will point to the highest affected range 3842 3843 # For non-zero types, the standard form is the value itself; 3844 my $standard_form = ($type) ? $value : main::standardize($value); 3845 3846 for ($j = $i; $j < $range_list_size; $j++) { 3847 trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace; 3848 3849 # If find a range that it doesn't overlap into, we can stop 3850 # searching 3851 last if $end < $r->[$j]->start; 3852 3853 # Here, overlaps the range at $j. If the values don't match, 3854 # and so far we think this is a clean insertion, it becomes a 3855 # non-clean insertion, i.e., a 'change' or 'replace' instead. 3856 if ($clean_insert) { 3857 if ($r->[$j]->standard_form ne $standard_form) { 3858 $clean_insert = 0; 3859 if ($replace == $CROAK) { 3860 main::croak("The range to add " 3861 . sprintf("%04X", $start) 3862 . '-' 3863 . sprintf("%04X", $end) 3864 . " with value '$value' overlaps an existing range $r->[$j]"); 3865 } 3866 } 3867 else { 3868 3869 # Here, the two values are essentially the same. If the 3870 # two are actually identical, replacing wouldn't change 3871 # anything so skip it. 3872 my $pre_existing = $r->[$j]->value; 3873 if ($pre_existing ne $value) { 3874 3875 # Here the new and old standardized values are the 3876 # same, but the non-standardized values aren't. If 3877 # replacing unconditionally, then replace 3878 if( $replace == $UNCONDITIONALLY) { 3879 $clean_insert = 0; 3880 } 3881 else { 3882 3883 # Here, are replacing conditionally. Decide to 3884 # replace or not based on which appears to look 3885 # the "nicest". If one is mixed case and the 3886 # other isn't, choose the mixed case one. 3887 my $new_mixed = $value =~ /[A-Z]/ 3888 && $value =~ /[a-z]/; 3889 my $old_mixed = $pre_existing =~ /[A-Z]/ 3890 && $pre_existing =~ /[a-z]/; 3891 3892 if ($old_mixed != $new_mixed) { 3893 $clean_insert = 0 if $new_mixed; 3894 if (main::DEBUG && $to_trace) { 3895 if ($clean_insert) { 3896 trace "Retaining $pre_existing over $value"; 3897 } 3898 else { 3899 trace "Replacing $pre_existing with $value"; 3900 } 3901 } 3902 } 3903 else { 3904 3905 # Here casing wasn't different between the two. 3906 # If one has hyphens or underscores and the 3907 # other doesn't, choose the one with the 3908 # punctuation. 3909 my $new_punct = $value =~ /[-_]/; 3910 my $old_punct = $pre_existing =~ /[-_]/; 3911 3912 if ($old_punct != $new_punct) { 3913 $clean_insert = 0 if $new_punct; 3914 if (main::DEBUG && $to_trace) { 3915 if ($clean_insert) { 3916 trace "Retaining $pre_existing over $value"; 3917 } 3918 else { 3919 trace "Replacing $pre_existing with $value"; 3920 } 3921 } 3922 } # else existing one is just as "good"; 3923 # retain it to save cycles. 3924 } 3925 } 3926 } 3927 } 3928 } 3929 } # End of loop looking for highest affected range. 3930 3931 # Here, $j points to one beyond the highest range that this insertion 3932 # affects (hence to beyond the range list if that range is the final 3933 # one in the range list). 3934 3935 # The splice length is all the affected ranges. Get it before 3936 # subtracting, for efficiency, so we don't have to later add 1. 3937 my $length = $j - $i; 3938 3939 $j--; # $j now points to the highest affected range. 3940 trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace; 3941 3942 # Here, have taken care of $NO and $MULTIPLE_foo replaces. 3943 # $j points to the highest affected range. But it can be < $i or even 3944 # -1. These happen only if the insertion is entirely in the gap 3945 # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop 3946 # above exited first time through with $end < $r->[$i]->start. (And 3947 # then we subtracted one from j) This implies also that $start < 3948 # $r->[$i]->start, but we know from above that $r->[$i-1]->end < 3949 # $start, so the entire input range is in the gap. 3950 if ($j < $i) { 3951 3952 # Here the entire input range is in the gap before $i. 3953 3954 if (main::DEBUG && $to_trace) { 3955 if ($i) { 3956 trace "Entire range is between $r->[$i-1] and $r->[$i]"; 3957 } 3958 else { 3959 trace "Entire range is before $r->[$i]"; 3960 } 3961 } 3962 return if $operation ne '+'; # Deletion of a non-existent range is 3963 # a no-op 3964 } 3965 else { 3966 3967 # Here part of the input range is not in the gap before $i. Thus, 3968 # there is at least one affected one, and $j points to the highest 3969 # such one. 3970 3971 # At this point, here is the situation: 3972 # This is not an insertion of a multiple, nor of tentative ($NO) 3973 # data. 3974 # $i points to the first element in the current range list that 3975 # may be affected by this operation. In fact, we know 3976 # that the range at $i is affected because we are in 3977 # the else branch of this 'if' 3978 # $j points to the highest affected range. 3979 # In other words, 3980 # r[$i-1]->end < $start <= r[$i]->end 3981 # And: 3982 # r[$i-1]->end < $start <= $end <= r[$j]->end 3983 # 3984 # Also: 3985 # $clean_insert is a boolean which is set true if and only if 3986 # this is a "clean insertion", i.e., not a change nor a 3987 # deletion (multiple was handled above). 3988 3989 # We now have enough information to decide if this call is a no-op 3990 # or not. It is a no-op if this is an insertion of already 3991 # existing data. 3992 3993 if (main::DEBUG && $to_trace && $clean_insert 3994 && $i == $j 3995 && $start >= $r->[$i]->start) 3996 { 3997 trace "no-op"; 3998 } 3999 return if $clean_insert 4000 && $i == $j # more than one affected range => not no-op 4001 4002 # Here, r[$i-1]->end < $start <= $end <= r[$i]->end 4003 # Further, $start and/or $end is >= r[$i]->start 4004 # The test below hence guarantees that 4005 # r[$i]->start < $start <= $end <= r[$i]->end 4006 # This means the input range is contained entirely in 4007 # the one at $i, so is a no-op 4008 && $start >= $r->[$i]->start; 4009 } 4010 4011 # Here, we know that some action will have to be taken. We have 4012 # calculated the offset and length (though adjustments may be needed) 4013 # for the splice. Now start constructing the replacement list. 4014 my @replacement; 4015 my $splice_start = $i; 4016 4017 my $extends_below; 4018 my $extends_above; 4019 4020 # See if should extend any adjacent ranges. 4021 if ($operation eq '-') { # Don't extend deletions 4022 $extends_below = $extends_above = 0; 4023 } 4024 else { # Here, should extend any adjacent ranges. See if there are 4025 # any. 4026 $extends_below = ($i > 0 4027 # can't extend unless adjacent 4028 && $r->[$i-1]->end == $start -1 4029 # can't extend unless are same standard value 4030 && $r->[$i-1]->standard_form eq $standard_form 4031 # can't extend unless share type 4032 && $r->[$i-1]->type == $type); 4033 $extends_above = ($j+1 < $range_list_size 4034 && $r->[$j+1]->start == $end +1 4035 && $r->[$j+1]->standard_form eq $standard_form 4036 && $r->[$j+1]->type == $type); 4037 } 4038 if ($extends_below && $extends_above) { # Adds to both 4039 $splice_start--; # start replace at element below 4040 $length += 2; # will replace on both sides 4041 trace "Extends both below and above ranges" if main::DEBUG && $to_trace; 4042 4043 # The result will fill in any gap, replacing both sides, and 4044 # create one large range. 4045 @replacement = Range->new($r->[$i-1]->start, 4046 $r->[$j+1]->end, 4047 Value => $value, 4048 Type => $type); 4049 } 4050 else { 4051 4052 # Here we know that the result won't just be the conglomeration of 4053 # a new range with both its adjacent neighbors. But it could 4054 # extend one of them. 4055 4056 if ($extends_below) { 4057 4058 # Here the new element adds to the one below, but not to the 4059 # one above. If inserting, and only to that one range, can 4060 # just change its ending to include the new one. 4061 if ($length == 0 && $clean_insert) { 4062 $r->[$i-1]->set_end($end); 4063 trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace; 4064 return; 4065 } 4066 else { 4067 trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace; 4068 $splice_start--; # start replace at element below 4069 $length++; # will replace the element below 4070 $start = $r->[$i-1]->start; 4071 } 4072 } 4073 elsif ($extends_above) { 4074 4075 # Here the new element adds to the one above, but not below. 4076 # Mirror the code above 4077 if ($length == 0 && $clean_insert) { 4078 $r->[$j+1]->set_start($start); 4079 trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace; 4080 return; 4081 } 4082 else { 4083 trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace; 4084 $length++; # will replace the element above 4085 $end = $r->[$j+1]->end; 4086 } 4087 } 4088 4089 trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace; 4090 4091 # Finally, here we know there will have to be a splice. 4092 # If the change or delete affects only the highest portion of the 4093 # first affected range, the range will have to be split. The 4094 # splice will remove the whole range, but will replace it by a new 4095 # range containing just the unaffected part. So, in this case, 4096 # add to the replacement list just this unaffected portion. 4097 if (! $extends_below 4098 && $start > $r->[$i]->start && $start <= $r->[$i]->end) 4099 { 4100 push @replacement, 4101 Range->new($r->[$i]->start, 4102 $start - 1, 4103 Value => $r->[$i]->value, 4104 Type => $r->[$i]->type); 4105 } 4106 4107 # In the case of an insert or change, but not a delete, we have to 4108 # put in the new stuff; this comes next. 4109 if ($operation eq '+') { 4110 push @replacement, Range->new($start, 4111 $end, 4112 Value => $value, 4113 Type => $type); 4114 } 4115 4116 trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i; 4117 #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace; 4118 4119 # And finally, if we're changing or deleting only a portion of the 4120 # highest affected range, it must be split, as the lowest one was. 4121 if (! $extends_above 4122 && $j >= 0 # Remember that j can be -1 if before first 4123 # current element 4124 && $end >= $r->[$j]->start 4125 && $end < $r->[$j]->end) 4126 { 4127 push @replacement, 4128 Range->new($end + 1, 4129 $r->[$j]->end, 4130 Value => $r->[$j]->value, 4131 Type => $r->[$j]->type); 4132 } 4133 } 4134 4135 # And do the splice, as calculated above 4136 if (main::DEBUG && $to_trace) { 4137 trace "replacing $length element(s) at $i with "; 4138 foreach my $replacement (@replacement) { 4139 trace " $replacement"; 4140 } 4141 trace "Before splice:"; 4142 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2; 4143 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1; 4144 trace "i =[", $i, "]", $r->[$i]; 4145 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1; 4146 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2; 4147 } 4148 4149 my @return = splice @$r, $splice_start, $length, @replacement; 4150 4151 if (main::DEBUG && $to_trace) { 4152 trace "After splice:"; 4153 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2; 4154 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1; 4155 trace "i =[", $i, "]", $r->[$i]; 4156 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1; 4157 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2; 4158 trace "removed ", @return if @return; 4159 } 4160 4161 # An actual deletion could have changed the maximum in the list. 4162 # There was no deletion if the splice didn't return something, but 4163 # otherwise recalculate it. This is done too rarely to worry about 4164 # performance. 4165 if ($operation eq '-' && @return) { 4166 if (@$r) { 4167 $max{$addr} = $r->[-1]->end; 4168 } 4169 else { # Now empty 4170 $max{$addr} = $max_init; 4171 } 4172 } 4173 return @return; 4174 } 4175 4176 sub reset_each_range { # reset the iterator for each_range(); 4177 my $self = shift; 4178 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4179 4180 no overloading; 4181 undef $each_range_iterator{pack 'J', $self}; 4182 return; 4183 } 4184 4185 sub each_range { 4186 # Iterate over each range in a range list. Results are undefined if 4187 # the range list is changed during the iteration. 4188 4189 my $self = shift; 4190 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4191 4192 my $addr = do { no overloading; pack 'J', $self; }; 4193 4194 return if $self->is_empty; 4195 4196 $each_range_iterator{$addr} = -1 4197 if ! defined $each_range_iterator{$addr}; 4198 $each_range_iterator{$addr}++; 4199 return $ranges{$addr}->[$each_range_iterator{$addr}] 4200 if $each_range_iterator{$addr} < @{$ranges{$addr}}; 4201 undef $each_range_iterator{$addr}; 4202 return; 4203 } 4204 4205 sub count { # Returns count of code points in range list 4206 my $self = shift; 4207 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4208 4209 my $addr = do { no overloading; pack 'J', $self; }; 4210 4211 my $count = 0; 4212 foreach my $range (@{$ranges{$addr}}) { 4213 $count += $range->end - $range->start + 1; 4214 } 4215 return $count; 4216 } 4217 4218 sub delete_range { # Delete a range 4219 my $self = shift; 4220 my $start = shift; 4221 my $end = shift; 4222 4223 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4224 4225 return $self->_add_delete('-', $start, $end, ""); 4226 } 4227 4228 sub is_empty { # Returns boolean as to if a range list is empty 4229 my $self = shift; 4230 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4231 4232 no overloading; 4233 return scalar @{$ranges{pack 'J', $self}} == 0; 4234 } 4235 4236 sub hash { 4237 # Quickly returns a scalar suitable for separating tables into 4238 # buckets, i.e. it is a hash function of the contents of a table, so 4239 # there are relatively few conflicts. 4240 4241 my $self = shift; 4242 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4243 4244 my $addr = do { no overloading; pack 'J', $self; }; 4245 4246 # These are quickly computable. Return looks like 'min..max;count' 4247 return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}}; 4248 } 4249} # End closure for _Range_List_Base 4250 4251package Range_List; 4252use base '_Range_List_Base'; 4253 4254# A Range_List is a range list for match tables; i.e. the range values are 4255# not significant. Thus a number of operations can be safely added to it, 4256# such as inversion, intersection. Note that union is also an unsafe 4257# operation when range values are cared about, and that method is in the base 4258# class, not here. But things are set up so that that method is callable only 4259# during initialization. Only in this derived class, is there an operation 4260# that combines two tables. A Range_Map can thus be used to initialize a 4261# Range_List, and its mappings will be in the list, but are not significant to 4262# this class. 4263 4264sub trace { return main::trace(@_); } 4265 4266{ # Closure 4267 4268 use overload 4269 fallback => 0, 4270 '+' => sub { my $self = shift; 4271 my $other = shift; 4272 4273 return $self->_union($other) 4274 }, 4275 '+=' => sub { my $self = shift; 4276 my $other = shift; 4277 my $reversed = shift; 4278 4279 if ($reversed) { 4280 Carp::my_carp_bug("Bad news. Can't cope with '" 4281 . ref($other) 4282 . ' += ' 4283 . ref($self) 4284 . "'. undef returned."); 4285 return; 4286 } 4287 4288 return $self->_union($other) 4289 }, 4290 '&' => sub { my $self = shift; 4291 my $other = shift; 4292 4293 return $self->_intersect($other, 0); 4294 }, 4295 '&=' => sub { my $self = shift; 4296 my $other = shift; 4297 my $reversed = shift; 4298 4299 if ($reversed) { 4300 Carp::my_carp_bug("Bad news. Can't cope with '" 4301 . ref($other) 4302 . ' &= ' 4303 . ref($self) 4304 . "'. undef returned."); 4305 return; 4306 } 4307 4308 return $self->_intersect($other, 0); 4309 }, 4310 '~' => "_invert", 4311 '-' => "_subtract", 4312 ; 4313 4314 sub _invert { 4315 # Returns a new Range_List that gives all code points not in $self. 4316 4317 my $self = shift; 4318 4319 my $new = Range_List->new; 4320 4321 # Go through each range in the table, finding the gaps between them 4322 my $max = -1; # Set so no gap before range beginning at 0 4323 for my $range ($self->ranges) { 4324 my $start = $range->start; 4325 my $end = $range->end; 4326 4327 # If there is a gap before this range, the inverse will contain 4328 # that gap. 4329 if ($start > $max + 1) { 4330 $new->add_range($max + 1, $start - 1); 4331 } 4332 $max = $end; 4333 } 4334 4335 # And finally, add the gap from the end of the table to the max 4336 # possible code point 4337 if ($max < $MAX_UNICODE_CODEPOINT) { 4338 $new->add_range($max + 1, $MAX_UNICODE_CODEPOINT); 4339 } 4340 return $new; 4341 } 4342 4343 sub _subtract { 4344 # Returns a new Range_List with the argument deleted from it. The 4345 # argument can be a single code point, a range, or something that has 4346 # a range, with the _range_list() method on it returning them 4347 4348 my $self = shift; 4349 my $other = shift; 4350 my $reversed = shift; 4351 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4352 4353 if ($reversed) { 4354 Carp::my_carp_bug("Bad news. Can't cope with '" 4355 . ref($other) 4356 . ' - ' 4357 . ref($self) 4358 . "'. undef returned."); 4359 return; 4360 } 4361 4362 my $new = Range_List->new(Initialize => $self); 4363 4364 if (! ref $other) { # Single code point 4365 $new->delete_range($other, $other); 4366 } 4367 elsif ($other->isa('Range')) { 4368 $new->delete_range($other->start, $other->end); 4369 } 4370 elsif ($other->can('_range_list')) { 4371 foreach my $range ($other->_range_list->ranges) { 4372 $new->delete_range($range->start, $range->end); 4373 } 4374 } 4375 else { 4376 Carp::my_carp_bug("Can't cope with a " 4377 . ref($other) 4378 . " argument to '-'. Subtraction ignored." 4379 ); 4380 return $self; 4381 } 4382 4383 return $new; 4384 } 4385 4386 sub _intersect { 4387 # Returns either a boolean giving whether the two inputs' range lists 4388 # intersect (overlap), or a new Range_List containing the intersection 4389 # of the two lists. The optional final parameter being true indicates 4390 # to do the check instead of the intersection. 4391 4392 my $a_object = shift; 4393 my $b_object = shift; 4394 my $check_if_overlapping = shift; 4395 $check_if_overlapping = 0 unless defined $check_if_overlapping; 4396 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4397 4398 if (! defined $b_object) { 4399 my $message = ""; 4400 $message .= $a_object->_owner_name_of if defined $a_object; 4401 Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done."); 4402 return; 4403 } 4404 4405 # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b ) 4406 # Thus the intersection could be much more simply be written: 4407 # return ~(~$a_object + ~$b_object); 4408 # But, this is slower, and when taking the inverse of a large 4409 # range_size_1 table, back when such tables were always stored that 4410 # way, it became prohibitively slow, hence the code was changed to the 4411 # below 4412 4413 if ($b_object->isa('Range')) { 4414 $b_object = Range_List->new(Initialize => $b_object, 4415 Owner => $a_object->_owner_name_of); 4416 } 4417 $b_object = $b_object->_range_list if $b_object->can('_range_list'); 4418 4419 my @a_ranges = $a_object->ranges; 4420 my @b_ranges = $b_object->ranges; 4421 4422 #local $to_trace = 1 if main::DEBUG; 4423 trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace; 4424 4425 # Start with the first range in each list 4426 my $a_i = 0; 4427 my $range_a = $a_ranges[$a_i]; 4428 my $b_i = 0; 4429 my $range_b = $b_ranges[$b_i]; 4430 4431 my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of) 4432 if ! $check_if_overlapping; 4433 4434 # If either list is empty, there is no intersection and no overlap 4435 if (! defined $range_a || ! defined $range_b) { 4436 return $check_if_overlapping ? 0 : $new; 4437 } 4438 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace; 4439 4440 # Otherwise, must calculate the intersection/overlap. Start with the 4441 # very first code point in each list 4442 my $a = $range_a->start; 4443 my $b = $range_b->start; 4444 4445 # Loop through all the ranges of each list; in each iteration, $a and 4446 # $b are the current code points in their respective lists 4447 while (1) { 4448 4449 # If $a and $b are the same code point, ... 4450 if ($a == $b) { 4451 4452 # it means the lists overlap. If just checking for overlap 4453 # know the answer now, 4454 return 1 if $check_if_overlapping; 4455 4456 # The intersection includes this code point plus anything else 4457 # common to both current ranges. 4458 my $start = $a; 4459 my $end = main::min($range_a->end, $range_b->end); 4460 if (! $check_if_overlapping) { 4461 trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace; 4462 $new->add_range($start, $end); 4463 } 4464 4465 # Skip ahead to the end of the current intersect 4466 $a = $b = $end; 4467 4468 # If the current intersect ends at the end of either range (as 4469 # it must for at least one of them), the next possible one 4470 # will be the beginning code point in it's list's next range. 4471 if ($a == $range_a->end) { 4472 $range_a = $a_ranges[++$a_i]; 4473 last unless defined $range_a; 4474 $a = $range_a->start; 4475 } 4476 if ($b == $range_b->end) { 4477 $range_b = $b_ranges[++$b_i]; 4478 last unless defined $range_b; 4479 $b = $range_b->start; 4480 } 4481 4482 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace; 4483 } 4484 elsif ($a < $b) { 4485 4486 # Not equal, but if the range containing $a encompasses $b, 4487 # change $a to be the middle of the range where it does equal 4488 # $b, so the next iteration will get the intersection 4489 if ($range_a->end >= $b) { 4490 $a = $b; 4491 } 4492 else { 4493 4494 # Here, the current range containing $a is entirely below 4495 # $b. Go try to find a range that could contain $b. 4496 $a_i = $a_object->_search_ranges($b); 4497 4498 # If no range found, quit. 4499 last unless defined $a_i; 4500 4501 # The search returns $a_i, such that 4502 # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end 4503 # Set $a to the beginning of this new range, and repeat. 4504 $range_a = $a_ranges[$a_i]; 4505 $a = $range_a->start; 4506 } 4507 } 4508 else { # Here, $b < $a. 4509 4510 # Mirror image code to the leg just above 4511 if ($range_b->end >= $a) { 4512 $b = $a; 4513 } 4514 else { 4515 $b_i = $b_object->_search_ranges($a); 4516 last unless defined $b_i; 4517 $range_b = $b_ranges[$b_i]; 4518 $b = $range_b->start; 4519 } 4520 } 4521 } # End of looping through ranges. 4522 4523 # Intersection fully computed, or now know that there is no overlap 4524 return $check_if_overlapping ? 0 : $new; 4525 } 4526 4527 sub overlaps { 4528 # Returns boolean giving whether the two arguments overlap somewhere 4529 4530 my $self = shift; 4531 my $other = shift; 4532 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4533 4534 return $self->_intersect($other, 1); 4535 } 4536 4537 sub add_range { 4538 # Add a range to the list. 4539 4540 my $self = shift; 4541 my $start = shift; 4542 my $end = shift; 4543 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4544 4545 return $self->_add_delete('+', $start, $end, ""); 4546 } 4547 4548 sub matches_identically_to { 4549 # Return a boolean as to whether or not two Range_Lists match identical 4550 # sets of code points. 4551 4552 my $self = shift; 4553 my $other = shift; 4554 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4555 4556 # These are ordered in increasing real time to figure out (at least 4557 # until a patch changes that and doesn't change this) 4558 return 0 if $self->max != $other->max; 4559 return 0 if $self->min != $other->min; 4560 return 0 if $self->range_count != $other->range_count; 4561 return 0 if $self->count != $other->count; 4562 4563 # Here they could be identical because all the tests above passed. 4564 # The loop below is somewhat simpler since we know they have the same 4565 # number of elements. Compare range by range, until reach the end or 4566 # find something that differs. 4567 my @a_ranges = $self->ranges; 4568 my @b_ranges = $other->ranges; 4569 for my $i (0 .. @a_ranges - 1) { 4570 my $a = $a_ranges[$i]; 4571 my $b = $b_ranges[$i]; 4572 trace "self $a; other $b" if main::DEBUG && $to_trace; 4573 return 0 if ! defined $b 4574 || $a->start != $b->start 4575 || $a->end != $b->end; 4576 } 4577 return 1; 4578 } 4579 4580 sub is_code_point_usable { 4581 # This used only for making the test script. See if the input 4582 # proposed trial code point is one that Perl will handle. If second 4583 # parameter is 0, it won't select some code points for various 4584 # reasons, noted below. 4585 4586 my $code = shift; 4587 my $try_hard = shift; 4588 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4589 4590 return 0 if $code < 0; # Never use a negative 4591 4592 # shun null. I'm (khw) not sure why this was done, but NULL would be 4593 # the character very frequently used. 4594 return $try_hard if $code == 0x0000; 4595 4596 # shun non-character code points. 4597 return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF; 4598 return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF 4599 4600 return $try_hard if $code > $MAX_UNICODE_CODEPOINT; # keep in range 4601 return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate 4602 4603 return 1; 4604 } 4605 4606 sub get_valid_code_point { 4607 # Return a code point that's part of the range list. Returns nothing 4608 # if the table is empty or we can't find a suitable code point. This 4609 # used only for making the test script. 4610 4611 my $self = shift; 4612 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4613 4614 my $addr = do { no overloading; pack 'J', $self; }; 4615 4616 # On first pass, don't choose less desirable code points; if no good 4617 # one is found, repeat, allowing a less desirable one to be selected. 4618 for my $try_hard (0, 1) { 4619 4620 # Look through all the ranges for a usable code point. 4621 for my $set (reverse $self->ranges) { 4622 4623 # Try the edge cases first, starting with the end point of the 4624 # range. 4625 my $end = $set->end; 4626 return $end if is_code_point_usable($end, $try_hard); 4627 4628 # End point didn't, work. Start at the beginning and try 4629 # every one until find one that does work. 4630 for my $trial ($set->start .. $end - 1) { 4631 return $trial if is_code_point_usable($trial, $try_hard); 4632 } 4633 } 4634 } 4635 return (); # If none found, give up. 4636 } 4637 4638 sub get_invalid_code_point { 4639 # Return a code point that's not part of the table. Returns nothing 4640 # if the table covers all code points or a suitable code point can't 4641 # be found. This used only for making the test script. 4642 4643 my $self = shift; 4644 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4645 4646 # Just find a valid code point of the inverse, if any. 4647 return Range_List->new(Initialize => ~ $self)->get_valid_code_point; 4648 } 4649} # end closure for Range_List 4650 4651package Range_Map; 4652use base '_Range_List_Base'; 4653 4654# A Range_Map is a range list in which the range values (called maps) are 4655# significant, and hence shouldn't be manipulated by our other code, which 4656# could be ambiguous or lose things. For example, in taking the union of two 4657# lists, which share code points, but which have differing values, which one 4658# has precedence in the union? 4659# It turns out that these operations aren't really necessary for map tables, 4660# and so this class was created to make sure they aren't accidentally 4661# applied to them. 4662 4663{ # Closure 4664 4665 sub add_map { 4666 # Add a range containing a mapping value to the list 4667 4668 my $self = shift; 4669 # Rest of parameters passed on 4670 4671 return $self->_add_delete('+', @_); 4672 } 4673 4674 sub add_duplicate { 4675 # Adds entry to a range list which can duplicate an existing entry 4676 4677 my $self = shift; 4678 my $code_point = shift; 4679 my $value = shift; 4680 my %args = @_; 4681 my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE; 4682 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 4683 4684 return $self->add_map($code_point, $code_point, 4685 $value, Replace => $replace); 4686 } 4687} # End of closure for package Range_Map 4688 4689package _Base_Table; 4690 4691# A table is the basic data structure that gets written out into a file for 4692# use by the Perl core. This is the abstract base class implementing the 4693# common elements from the derived ones. A list of the methods to be 4694# furnished by an implementing class is just after the constructor. 4695 4696sub standardize { return main::standardize($_[0]); } 4697sub trace { return main::trace(@_); } 4698 4699{ # Closure 4700 4701 main::setup_package(); 4702 4703 my %range_list; 4704 # Object containing the ranges of the table. 4705 main::set_access('range_list', \%range_list, 'p_r', 'p_s'); 4706 4707 my %full_name; 4708 # The full table name. 4709 main::set_access('full_name', \%full_name, 'r'); 4710 4711 my %name; 4712 # The table name, almost always shorter 4713 main::set_access('name', \%name, 'r'); 4714 4715 my %short_name; 4716 # The shortest of all the aliases for this table, with underscores removed 4717 main::set_access('short_name', \%short_name); 4718 4719 my %nominal_short_name_length; 4720 # The length of short_name before removing underscores 4721 main::set_access('nominal_short_name_length', 4722 \%nominal_short_name_length); 4723 4724 my %complete_name; 4725 # The complete name, including property. 4726 main::set_access('complete_name', \%complete_name, 'r'); 4727 4728 my %property; 4729 # Parent property this table is attached to. 4730 main::set_access('property', \%property, 'r'); 4731 4732 my %aliases; 4733 # Ordered list of alias objects of the table's name. The first ones in 4734 # the list are output first in comments 4735 main::set_access('aliases', \%aliases, 'readable_array'); 4736 4737 my %comment; 4738 # A comment associated with the table for human readers of the files 4739 main::set_access('comment', \%comment, 's'); 4740 4741 my %description; 4742 # A comment giving a short description of the table's meaning for human 4743 # readers of the files. 4744 main::set_access('description', \%description, 'readable_array'); 4745 4746 my %note; 4747 # A comment giving a short note about the table for human readers of the 4748 # files. 4749 main::set_access('note', \%note, 'readable_array'); 4750 4751 my %fate; 4752 # Enum; there are a number of possibilities for what happens to this 4753 # table: it could be normal, or suppressed, or not for external use. See 4754 # values at definition for $SUPPRESSED. 4755 main::set_access('fate', \%fate, 'r'); 4756 4757 my %find_table_from_alias; 4758 # The parent property passes this pointer to a hash which this class adds 4759 # all its aliases to, so that the parent can quickly take an alias and 4760 # find this table. 4761 main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r'); 4762 4763 my %locked; 4764 # After this table is made equivalent to another one; we shouldn't go 4765 # changing the contents because that could mean it's no longer equivalent 4766 main::set_access('locked', \%locked, 'r'); 4767 4768 my %file_path; 4769 # This gives the final path to the file containing the table. Each 4770 # directory in the path is an element in the array 4771 main::set_access('file_path', \%file_path, 'readable_array'); 4772 4773 my %status; 4774 # What is the table's status, normal, $OBSOLETE, etc. Enum 4775 main::set_access('status', \%status, 'r'); 4776 4777 my %status_info; 4778 # A comment about its being obsolete, or whatever non normal status it has 4779 main::set_access('status_info', \%status_info, 'r'); 4780 4781 my %caseless_equivalent; 4782 # The table this is equivalent to under /i matching, if any. 4783 main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's'); 4784 4785 my %range_size_1; 4786 # Is the table to be output with each range only a single code point? 4787 # This is done to avoid breaking existing code that may have come to rely 4788 # on this behavior in previous versions of this program.) 4789 main::set_access('range_size_1', \%range_size_1, 'r', 's'); 4790 4791 my %perl_extension; 4792 # A boolean set iff this table is a Perl extension to the Unicode 4793 # standard. 4794 main::set_access('perl_extension', \%perl_extension, 'r'); 4795 4796 my %output_range_counts; 4797 # A boolean set iff this table is to have comments written in the 4798 # output file that contain the number of code points in the range. 4799 # The constructor can override the global flag of the same name. 4800 main::set_access('output_range_counts', \%output_range_counts, 'r'); 4801 4802 my %format; 4803 # The format of the entries of the table. This is calculated from the 4804 # data in the table (or passed in the constructor). This is an enum e.g., 4805 # $STRING_FORMAT. It is marked protected as it should not be generally 4806 # used to override calculations. 4807 main::set_access('format', \%format, 'r', 'p_s'); 4808 4809 sub new { 4810 # All arguments are key => value pairs, which you can see below, most 4811 # of which match fields documented above. Otherwise: Re_Pod_Entry, 4812 # OK_as_Filename, and Fuzzy apply to the names of the table, and are 4813 # documented in the Alias package 4814 4815 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2; 4816 4817 my $class = shift; 4818 4819 my $self = bless \do { my $anonymous_scalar }, $class; 4820 my $addr = do { no overloading; pack 'J', $self; }; 4821 4822 my %args = @_; 4823 4824 $name{$addr} = delete $args{'Name'}; 4825 $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'}; 4826 $full_name{$addr} = delete $args{'Full_Name'}; 4827 my $complete_name = $complete_name{$addr} 4828 = delete $args{'Complete_Name'}; 4829 $format{$addr} = delete $args{'Format'}; 4830 $output_range_counts{$addr} = delete $args{'Output_Range_Counts'}; 4831 $property{$addr} = delete $args{'_Property'}; 4832 $range_list{$addr} = delete $args{'_Range_List'}; 4833 $status{$addr} = delete $args{'Status'} || $NORMAL; 4834 $status_info{$addr} = delete $args{'_Status_Info'} || ""; 4835 $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0; 4836 $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0; 4837 $fate{$addr} = delete $args{'Fate'} || $ORDINARY; 4838 my $ucd = delete $args{'UCD'}; 4839 4840 my $description = delete $args{'Description'}; 4841 my $ok_as_filename = delete $args{'OK_as_Filename'}; 4842 my $loose_match = delete $args{'Fuzzy'}; 4843 my $note = delete $args{'Note'}; 4844 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'}; 4845 my $perl_extension = delete $args{'Perl_Extension'}; 4846 4847 # Shouldn't have any left over 4848 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 4849 4850 # Can't use || above because conceivably the name could be 0, and 4851 # can't use // operator in case this program gets used in Perl 5.8 4852 $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr}; 4853 $output_range_counts{$addr} = $output_range_counts if 4854 ! defined $output_range_counts{$addr}; 4855 4856 $aliases{$addr} = [ ]; 4857 $comment{$addr} = [ ]; 4858 $description{$addr} = [ ]; 4859 $note{$addr} = [ ]; 4860 $file_path{$addr} = [ ]; 4861 $locked{$addr} = ""; 4862 4863 push @{$description{$addr}}, $description if $description; 4864 push @{$note{$addr}}, $note if $note; 4865 4866 if ($fate{$addr} == $PLACEHOLDER) { 4867 4868 # A placeholder table doesn't get documented, is a perl extension, 4869 # and quite likely will be empty 4870 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry; 4871 $perl_extension = 1 if ! defined $perl_extension; 4872 $ucd = 0 if ! defined $ucd; 4873 push @tables_that_may_be_empty, $complete_name{$addr}; 4874 $self->add_comment(<<END); 4875This is a placeholder because it is not in Version $string_version of Unicode, 4876but is needed by the Perl core to work gracefully. Because it is not in this 4877version of Unicode, it will not be listed in $pod_file.pod 4878END 4879 } 4880 elsif (exists $why_suppressed{$complete_name} 4881 # Don't suppress if overridden 4882 && ! grep { $_ eq $complete_name{$addr} } 4883 @output_mapped_properties) 4884 { 4885 $fate{$addr} = $SUPPRESSED; 4886 } 4887 elsif ($fate{$addr} == $SUPPRESSED 4888 && ! exists $why_suppressed{$property{$addr}->complete_name}) 4889 { 4890 Carp::my_carp_bug("There is no current capability to set the reason for suppressing."); 4891 # perhaps Fate => [ $SUPPRESSED, "reason" ] 4892 } 4893 4894 # If hasn't set its status already, see if it is on one of the 4895 # lists of properties or tables that have particular statuses; if 4896 # not, is normal. The lists are prioritized so the most serious 4897 # ones are checked first 4898 if (! $status{$addr}) { 4899 if (exists $why_deprecated{$complete_name}) { 4900 $status{$addr} = $DEPRECATED; 4901 } 4902 elsif (exists $why_stabilized{$complete_name}) { 4903 $status{$addr} = $STABILIZED; 4904 } 4905 elsif (exists $why_obsolete{$complete_name}) { 4906 $status{$addr} = $OBSOLETE; 4907 } 4908 4909 # Existence above doesn't necessarily mean there is a message 4910 # associated with it. Use the most serious message. 4911 if ($status{$addr}) { 4912 if ($why_deprecated{$complete_name}) { 4913 $status_info{$addr} 4914 = $why_deprecated{$complete_name}; 4915 } 4916 elsif ($why_stabilized{$complete_name}) { 4917 $status_info{$addr} 4918 = $why_stabilized{$complete_name}; 4919 } 4920 elsif ($why_obsolete{$complete_name}) { 4921 $status_info{$addr} 4922 = $why_obsolete{$complete_name}; 4923 } 4924 } 4925 } 4926 4927 $perl_extension{$addr} = $perl_extension || 0; 4928 4929 # Don't list a property by default that is internal only 4930 if ($fate{$addr} > $MAP_PROXIED) { 4931 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry; 4932 $ucd = 0 if ! defined $ucd; 4933 } 4934 else { 4935 $ucd = 1 if ! defined $ucd; 4936 } 4937 4938 # By convention what typically gets printed only or first is what's 4939 # first in the list, so put the full name there for good output 4940 # clarity. Other routines rely on the full name being first on the 4941 # list 4942 $self->add_alias($full_name{$addr}, 4943 OK_as_Filename => $ok_as_filename, 4944 Fuzzy => $loose_match, 4945 Re_Pod_Entry => $make_re_pod_entry, 4946 Status => $status{$addr}, 4947 UCD => $ucd, 4948 ); 4949 4950 # Then comes the other name, if meaningfully different. 4951 if (standardize($full_name{$addr}) ne standardize($name{$addr})) { 4952 $self->add_alias($name{$addr}, 4953 OK_as_Filename => $ok_as_filename, 4954 Fuzzy => $loose_match, 4955 Re_Pod_Entry => $make_re_pod_entry, 4956 Status => $status{$addr}, 4957 UCD => $ucd, 4958 ); 4959 } 4960 4961 return $self; 4962 } 4963 4964 # Here are the methods that are required to be defined by any derived 4965 # class 4966 for my $sub (qw( 4967 handle_special_range 4968 append_to_body 4969 pre_body 4970 )) 4971 # write() knows how to write out normal ranges, but it calls 4972 # handle_special_range() when it encounters a non-normal one. 4973 # append_to_body() is called by it after it has handled all 4974 # ranges to add anything after the main portion of the table. 4975 # And finally, pre_body() is called after all this to build up 4976 # anything that should appear before the main portion of the 4977 # table. Doing it this way allows things in the middle to 4978 # affect what should appear before the main portion of the 4979 # table. 4980 { 4981 no strict "refs"; 4982 *$sub = sub { 4983 Carp::my_carp_bug( __LINE__ 4984 . ": Must create method '$sub()' for " 4985 . ref shift); 4986 return; 4987 } 4988 } 4989 4990 use overload 4991 fallback => 0, 4992 "." => \&main::_operator_dot, 4993 ".=" => \&main::_operator_dot_equal, 4994 '!=' => \&main::_operator_not_equal, 4995 '==' => \&main::_operator_equal, 4996 ; 4997 4998 sub ranges { 4999 # Returns the array of ranges associated with this table. 5000 5001 no overloading; 5002 return $range_list{pack 'J', shift}->ranges; 5003 } 5004 5005 sub add_alias { 5006 # Add a synonym for this table. 5007 5008 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3; 5009 5010 my $self = shift; 5011 my $name = shift; # The name to add. 5012 my $pointer = shift; # What the alias hash should point to. For 5013 # map tables, this is the parent property; 5014 # for match tables, it is the table itself. 5015 5016 my %args = @_; 5017 my $loose_match = delete $args{'Fuzzy'}; 5018 5019 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'}; 5020 $make_re_pod_entry = $YES unless defined $make_re_pod_entry; 5021 5022 my $ok_as_filename = delete $args{'OK_as_Filename'}; 5023 $ok_as_filename = 1 unless defined $ok_as_filename; 5024 5025 my $status = delete $args{'Status'}; 5026 $status = $NORMAL unless defined $status; 5027 5028 # An internal name does not get documented, unless overridden by the 5029 # input. 5030 my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1); 5031 5032 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 5033 5034 # Capitalize the first letter of the alias unless it is one of the CJK 5035 # ones which specifically begins with a lower 'k'. Do this because 5036 # Unicode has varied whether they capitalize first letters or not, and 5037 # have later changed their minds and capitalized them, but not the 5038 # other way around. So do it always and avoid changes from release to 5039 # release 5040 $name = ucfirst($name) unless $name =~ /^k[A-Z]/; 5041 5042 my $addr = do { no overloading; pack 'J', $self; }; 5043 5044 # Figure out if should be loosely matched if not already specified. 5045 if (! defined $loose_match) { 5046 5047 # Is a loose_match if isn't null, and doesn't begin with an 5048 # underscore and isn't just a number 5049 if ($name ne "" 5050 && substr($name, 0, 1) ne '_' 5051 && $name !~ qr{^[0-9_.+-/]+$}) 5052 { 5053 $loose_match = 1; 5054 } 5055 else { 5056 $loose_match = 0; 5057 } 5058 } 5059 5060 # If this alias has already been defined, do nothing. 5061 return if defined $find_table_from_alias{$addr}->{$name}; 5062 5063 # That includes if it is standardly equivalent to an existing alias, 5064 # in which case, add this name to the list, so won't have to search 5065 # for it again. 5066 my $standard_name = main::standardize($name); 5067 if (defined $find_table_from_alias{$addr}->{$standard_name}) { 5068 $find_table_from_alias{$addr}->{$name} 5069 = $find_table_from_alias{$addr}->{$standard_name}; 5070 return; 5071 } 5072 5073 # Set the index hash for this alias for future quick reference. 5074 $find_table_from_alias{$addr}->{$name} = $pointer; 5075 $find_table_from_alias{$addr}->{$standard_name} = $pointer; 5076 local $to_trace = 0 if main::DEBUG; 5077 trace "adding alias $name to $pointer" if main::DEBUG && $to_trace; 5078 trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace; 5079 5080 5081 # Put the new alias at the end of the list of aliases unless the final 5082 # element begins with an underscore (meaning it is for internal perl 5083 # use) or is all numeric, in which case, put the new one before that 5084 # one. This floats any all-numeric or underscore-beginning aliases to 5085 # the end. This is done so that they are listed last in output lists, 5086 # to encourage the user to use a better name (either more descriptive 5087 # or not an internal-only one) instead. This ordering is relied on 5088 # implicitly elsewhere in this program, like in short_name() 5089 my $list = $aliases{$addr}; 5090 my $insert_position = (@$list == 0 5091 || (substr($list->[-1]->name, 0, 1) ne '_' 5092 && $list->[-1]->name =~ /\D/)) 5093 ? @$list 5094 : @$list - 1; 5095 splice @$list, 5096 $insert_position, 5097 0, 5098 Alias->new($name, $loose_match, $make_re_pod_entry, 5099 $ok_as_filename, $status, $ucd); 5100 5101 # This name may be shorter than any existing ones, so clear the cache 5102 # of the shortest, so will have to be recalculated. 5103 no overloading; 5104 undef $short_name{pack 'J', $self}; 5105 return; 5106 } 5107 5108 sub short_name { 5109 # Returns a name suitable for use as the base part of a file name. 5110 # That is, shorter wins. It can return undef if there is no suitable 5111 # name. The name has all non-essential underscores removed. 5112 5113 # The optional second parameter is a reference to a scalar in which 5114 # this routine will store the length the returned name had before the 5115 # underscores were removed, or undef if the return is undef. 5116 5117 # The shortest name can change if new aliases are added. So using 5118 # this should be deferred until after all these are added. The code 5119 # that does that should clear this one's cache. 5120 # Any name with alphabetics is preferred over an all numeric one, even 5121 # if longer. 5122 5123 my $self = shift; 5124 my $nominal_length_ptr = shift; 5125 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5126 5127 my $addr = do { no overloading; pack 'J', $self; }; 5128 5129 # For efficiency, don't recalculate, but this means that adding new 5130 # aliases could change what the shortest is, so the code that does 5131 # that needs to undef this. 5132 if (defined $short_name{$addr}) { 5133 if ($nominal_length_ptr) { 5134 $$nominal_length_ptr = $nominal_short_name_length{$addr}; 5135 } 5136 return $short_name{$addr}; 5137 } 5138 5139 # Look at each alias 5140 foreach my $alias ($self->aliases()) { 5141 5142 # Don't use an alias that isn't ok to use for an external name. 5143 next if ! $alias->ok_as_filename; 5144 5145 my $name = main::Standardize($alias->name); 5146 trace $self, $name if main::DEBUG && $to_trace; 5147 5148 # Take the first one, or a shorter one that isn't numeric. This 5149 # relies on numeric aliases always being last in the array 5150 # returned by aliases(). Any alpha one will have precedence. 5151 if (! defined $short_name{$addr} 5152 || ($name =~ /\D/ 5153 && length($name) < length($short_name{$addr}))) 5154 { 5155 # Remove interior underscores. 5156 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg; 5157 5158 $nominal_short_name_length{$addr} = length $name; 5159 } 5160 } 5161 5162 # If the short name isn't a nice one, perhaps an equivalent table has 5163 # a better one. 5164 if (! defined $short_name{$addr} 5165 || $short_name{$addr} eq "" 5166 || $short_name{$addr} eq "_") 5167 { 5168 my $return; 5169 foreach my $follower ($self->children) { # All equivalents 5170 my $follower_name = $follower->short_name; 5171 next unless defined $follower_name; 5172 5173 # Anything (except undefined) is better than underscore or 5174 # empty 5175 if (! defined $return || $return eq "_") { 5176 $return = $follower_name; 5177 next; 5178 } 5179 5180 # If the new follower name isn't "_" and is shorter than the 5181 # current best one, prefer the new one. 5182 next if $follower_name eq "_"; 5183 next if length $follower_name > length $return; 5184 $return = $follower_name; 5185 } 5186 $short_name{$addr} = $return if defined $return; 5187 } 5188 5189 # If no suitable external name return undef 5190 if (! defined $short_name{$addr}) { 5191 $$nominal_length_ptr = undef if $nominal_length_ptr; 5192 return; 5193 } 5194 5195 # Don't allow a null short name. 5196 if ($short_name{$addr} eq "") { 5197 $short_name{$addr} = '_'; 5198 $nominal_short_name_length{$addr} = 1; 5199 } 5200 5201 trace $self, $short_name{$addr} if main::DEBUG && $to_trace; 5202 5203 if ($nominal_length_ptr) { 5204 $$nominal_length_ptr = $nominal_short_name_length{$addr}; 5205 } 5206 return $short_name{$addr}; 5207 } 5208 5209 sub external_name { 5210 # Returns the external name that this table should be known by. This 5211 # is usually the short_name, but not if the short_name is undefined, 5212 # in which case the external_name is arbitrarily set to the 5213 # underscore. 5214 5215 my $self = shift; 5216 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5217 5218 my $short = $self->short_name; 5219 return $short if defined $short; 5220 5221 return '_'; 5222 } 5223 5224 sub add_description { # Adds the parameter as a short description. 5225 5226 my $self = shift; 5227 my $description = shift; 5228 chomp $description; 5229 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5230 5231 no overloading; 5232 push @{$description{pack 'J', $self}}, $description; 5233 5234 return; 5235 } 5236 5237 sub add_note { # Adds the parameter as a short note. 5238 5239 my $self = shift; 5240 my $note = shift; 5241 chomp $note; 5242 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5243 5244 no overloading; 5245 push @{$note{pack 'J', $self}}, $note; 5246 5247 return; 5248 } 5249 5250 sub add_comment { # Adds the parameter as a comment. 5251 5252 return unless $debugging_build; 5253 5254 my $self = shift; 5255 my $comment = shift; 5256 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5257 5258 chomp $comment; 5259 5260 no overloading; 5261 push @{$comment{pack 'J', $self}}, $comment; 5262 5263 return; 5264 } 5265 5266 sub comment { 5267 # Return the current comment for this table. If called in list 5268 # context, returns the array of comments. In scalar, returns a string 5269 # of each element joined together with a period ending each. 5270 5271 my $self = shift; 5272 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5273 5274 my $addr = do { no overloading; pack 'J', $self; }; 5275 my @list = @{$comment{$addr}}; 5276 return @list if wantarray; 5277 my $return = ""; 5278 foreach my $sentence (@list) { 5279 $return .= '. ' if $return; 5280 $return .= $sentence; 5281 $return =~ s/\.$//; 5282 } 5283 $return .= '.' if $return; 5284 return $return; 5285 } 5286 5287 sub initialize { 5288 # Initialize the table with the argument which is any valid 5289 # initialization for range lists. 5290 5291 my $self = shift; 5292 my $addr = do { no overloading; pack 'J', $self; }; 5293 my $initialization = shift; 5294 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5295 5296 # Replace the current range list with a new one of the same exact 5297 # type. 5298 my $class = ref $range_list{$addr}; 5299 $range_list{$addr} = $class->new(Owner => $self, 5300 Initialize => $initialization); 5301 return; 5302 5303 } 5304 5305 sub header { 5306 # The header that is output for the table in the file it is written 5307 # in. 5308 5309 my $self = shift; 5310 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5311 5312 my $return = ""; 5313 $return .= $DEVELOPMENT_ONLY if $compare_versions; 5314 $return .= $HEADER; 5315 return $return; 5316 } 5317 5318 sub write { 5319 # Write a representation of the table to its file. It calls several 5320 # functions furnished by sub-classes of this abstract base class to 5321 # handle non-normal ranges, to add stuff before the table, and at its 5322 # end. If the table is to be written so that adjustments are 5323 # required, this does that conversion. 5324 5325 my $self = shift; 5326 my $use_adjustments = shift; # ? output in adjusted format or not 5327 my $tab_stops = shift; # The number of tab stops over to put any 5328 # comment. 5329 my $suppress_value = shift; # Optional, if the value associated with 5330 # a range equals this one, don't write 5331 # the range 5332 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5333 5334 my $addr = do { no overloading; pack 'J', $self; }; 5335 5336 # Start with the header 5337 my @HEADER = $self->header; 5338 5339 # Then the comments 5340 push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n" 5341 if $comment{$addr}; 5342 5343 # Things discovered processing the main body of the document may 5344 # affect what gets output before it, therefore pre_body() isn't called 5345 # until after all other processing of the table is done. 5346 5347 # The main body looks like a 'here' document. If annotating, get rid 5348 # of the comments before passing to the caller, as some callers, such 5349 # as charnames.pm, can't cope with them. (Outputting range counts 5350 # also introduces comments, but these don't show up in the tables that 5351 # can't cope with comments, and there aren't that many of them that 5352 # it's worth the extra real time to get rid of them). 5353 my @OUT; 5354 if ($annotate) { 5355 # Use the line below in Perls that don't have /r 5356 #push @OUT, 'return join "\n", map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n"; 5357 push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n"; 5358 } else { 5359 push @OUT, "return <<'END';\n"; 5360 } 5361 5362 if ($range_list{$addr}->is_empty) { 5363 5364 # This is a kludge for empty tables to silence a warning in 5365 # utf8.c, which can't really deal with empty tables, but it can 5366 # deal with a table that matches nothing, as the inverse of 'Any' 5367 # does. 5368 push @OUT, "!utf8::Any\n"; 5369 } 5370 elsif ($self->name eq 'N' 5371 5372 # To save disk space and table cache space, avoid putting out 5373 # binary N tables, but instead create a file which just inverts 5374 # the Y table. Since the file will still exist and occupy a 5375 # certain number of blocks, might as well output the whole 5376 # thing if it all will fit in one block. The number of 5377 # ranges below is an approximate number for that. 5378 && ($self->property->type == $BINARY 5379 || $self->property->type == $FORCED_BINARY) 5380 # && $self->property->tables == 2 Can't do this because the 5381 # non-binary properties, like NFDQC aren't specifiable 5382 # by the notation 5383 && $range_list{$addr}->ranges > 15 5384 && ! $annotate) # Under --annotate, want to see everything 5385 { 5386 push @OUT, "!utf8::" . $self->property->name . "\n"; 5387 } 5388 else { 5389 my $range_size_1 = $range_size_1{$addr}; 5390 my $format; # Used only in $annotate option 5391 my $include_name; # Used only in $annotate option 5392 5393 if ($annotate) { 5394 5395 # If annotating each code point, must print 1 per line. 5396 # The variable could point to a subroutine, and we don't want 5397 # to lose that fact, so only set if not set already 5398 $range_size_1 = 1 if ! $range_size_1; 5399 5400 $format = $self->format; 5401 5402 # The name of the character is output only for tables that 5403 # don't already include the name in the output. 5404 my $property = $self->property; 5405 $include_name = 5406 ! ($property == $perl_charname 5407 || $property == main::property_ref('Unicode_1_Name') 5408 || $property == main::property_ref('Name') 5409 || $property == main::property_ref('Name_Alias') 5410 ); 5411 } 5412 5413 # Values for previous time through the loop. Initialize to 5414 # something that won't be adjacent to the first iteration; 5415 # only $previous_end matters for that. 5416 my $previous_start; 5417 my $previous_end = -2; 5418 my $previous_value; 5419 5420 # Values for next time through the portion of the loop that splits 5421 # the range. 0 in $next_start means there is no remaining portion 5422 # to deal with. 5423 my $next_start = 0; 5424 my $next_end; 5425 my $next_value; 5426 my $offset = 0; 5427 5428 # Output each range as part of the here document. 5429 RANGE: 5430 for my $set ($range_list{$addr}->ranges) { 5431 if ($set->type != 0) { 5432 $self->handle_special_range($set); 5433 next RANGE; 5434 } 5435 my $start = $set->start; 5436 my $end = $set->end; 5437 my $value = $set->value; 5438 5439 # Don't output ranges whose value is the one to suppress 5440 next RANGE if defined $suppress_value 5441 && $value eq $suppress_value; 5442 5443 { # This bare block encloses the scope where we may need to 5444 # split a range (when outputting adjusteds), and each time 5445 # through we handle the next portion of the original by 5446 # ending the block with a 'redo'. The values to use for 5447 # that next time through are set up just below in the 5448 # scalars whose names begin with '$next_'. 5449 5450 if ($use_adjustments) { 5451 5452 # When converting to use adjustments, we can handle 5453 # only single element ranges. Set up so that this 5454 # time through the loop, we look at the first element, 5455 # and the next time through, we start off with the 5456 # remainder. Thus each time through we look at the 5457 # first element of the range 5458 if ($end != $start) { 5459 $next_start = $start + 1; 5460 $next_end = $end; 5461 $next_value = $value; 5462 $end = $start; 5463 } 5464 5465 # The values for some of these tables are stored as 5466 # hex strings. Convert those to decimal 5467 $value = hex($value) 5468 if $self->default_map eq $CODE_POINT 5469 && $value =~ / ^ [A-Fa-f0-9]+ $ /x; 5470 5471 # If this range is adjacent to the previous one, and 5472 # the values in each are integers that are also 5473 # adjacent (differ by 1), then this range really 5474 # extends the previous one that is already in element 5475 # $OUT[-1]. So we pop that element, and pretend that 5476 # the range starts with whatever it started with. 5477 # $offset is incremented by 1 each time so that it 5478 # gives the current offset from the first element in 5479 # the accumulating range, and we keep in $value the 5480 # value of that first element. 5481 if ($start == $previous_end + 1 5482 && $value =~ /^ -? \d+ $/xa 5483 && $previous_value =~ /^ -? \d+ $/xa 5484 && ($value == ($previous_value + ++$offset))) 5485 { 5486 pop @OUT; 5487 $start = $previous_start; 5488 $value = $previous_value; 5489 } 5490 else { 5491 $offset = 0; 5492 } 5493 5494 # Save the current values for the next time through 5495 # the loop. 5496 $previous_start = $start; 5497 $previous_end = $end; 5498 $previous_value = $value; 5499 } 5500 5501 # If there is a range and doesn't need a single point range 5502 # output 5503 if ($start != $end && ! $range_size_1) { 5504 push @OUT, sprintf "%04X\t%04X", $start, $end; 5505 $OUT[-1] .= "\t$value" if $value ne ""; 5506 5507 # Add a comment with the size of the range, if 5508 # requested. Expand Tabs to make sure they all start 5509 # in the same column, and then unexpand to use mostly 5510 # tabs. 5511 if (! $output_range_counts{$addr}) { 5512 $OUT[-1] .= "\n"; 5513 } 5514 else { 5515 $OUT[-1] = Text::Tabs::expand($OUT[-1]); 5516 my $count = main::clarify_number($end - $start + 1); 5517 use integer; 5518 5519 my $width = $tab_stops * 8 - 1; 5520 $OUT[-1] = sprintf("%-*s # [%s]\n", 5521 $width, 5522 $OUT[-1], 5523 $count); 5524 $OUT[-1] = Text::Tabs::unexpand($OUT[-1]); 5525 } 5526 } 5527 5528 # Here to output a single code point per line. 5529 # If not to annotate, use the simple formats 5530 elsif (! $annotate) { 5531 5532 # Use any passed in subroutine to output. 5533 if (ref $range_size_1 eq 'CODE') { 5534 for my $i ($start .. $end) { 5535 push @OUT, &{$range_size_1}($i, $value); 5536 } 5537 } 5538 else { 5539 5540 # Here, caller is ok with default output. 5541 for (my $i = $start; $i <= $end; $i++) { 5542 push @OUT, sprintf "%04X\t\t%s\n", $i, $value; 5543 } 5544 } 5545 } 5546 else { 5547 5548 # Here, wants annotation. 5549 for (my $i = $start; $i <= $end; $i++) { 5550 5551 # Get character information if don't have it already 5552 main::populate_char_info($i) 5553 if ! defined $viacode[$i]; 5554 my $type = $annotate_char_type[$i]; 5555 5556 # Figure out if should output the next code points 5557 # as part of a range or not. If this is not in an 5558 # annotation range, then won't output as a range, 5559 # so returns $i. Otherwise use the end of the 5560 # annotation range, but no further than the 5561 # maximum possible end point of the loop. 5562 my $range_end = main::min( 5563 $annotate_ranges->value_of($i) || $i, 5564 $end); 5565 5566 # Use a range if it is a range, and either is one 5567 # of the special annotation ranges, or the range 5568 # is at most 3 long. This last case causes the 5569 # algorithmically named code points to be output 5570 # individually in spans of at most 3, as they are 5571 # the ones whose $type is > 0. 5572 if ($range_end != $i 5573 && ( $type < 0 || $range_end - $i > 2)) 5574 { 5575 # Here is to output a range. We don't allow a 5576 # caller-specified output format--just use the 5577 # standard one. 5578 push @OUT, sprintf "%04X\t%04X\t%s\t#", $i, 5579 $range_end, 5580 $value; 5581 my $range_name = $viacode[$i]; 5582 5583 # For the code points which end in their hex 5584 # value, we eliminate that from the output 5585 # annotation, and capitalize only the first 5586 # letter of each word. 5587 if ($type == $CP_IN_NAME) { 5588 my $hex = sprintf "%04X", $i; 5589 $range_name =~ s/-$hex$//; 5590 my @words = split " ", $range_name; 5591 for my $word (@words) { 5592 $word = 5593 ucfirst(lc($word)) if $word ne 'CJK'; 5594 } 5595 $range_name = join " ", @words; 5596 } 5597 elsif ($type == $HANGUL_SYLLABLE) { 5598 $range_name = "Hangul Syllable"; 5599 } 5600 5601 $OUT[-1] .= " $range_name" if $range_name; 5602 5603 # Include the number of code points in the 5604 # range 5605 my $count = 5606 main::clarify_number($range_end - $i + 1); 5607 $OUT[-1] .= " [$count]\n"; 5608 5609 # Skip to the end of the range 5610 $i = $range_end; 5611 } 5612 else { # Not in a range. 5613 my $comment = ""; 5614 5615 # When outputting the names of each character, 5616 # use the character itself if printable 5617 $comment .= "'" . chr($i) . "' " 5618 if $printable[$i]; 5619 5620 # To make it more readable, use a minimum 5621 # indentation 5622 my $comment_indent; 5623 5624 # Determine the annotation 5625 if ($format eq $DECOMP_STRING_FORMAT) { 5626 5627 # This is very specialized, with the type 5628 # of decomposition beginning the line 5629 # enclosed in <...>, and the code points 5630 # that the code point decomposes to 5631 # separated by blanks. Create two 5632 # strings, one of the printable 5633 # characters, and one of their official 5634 # names. 5635 (my $map = $value) =~ s/ \ * < .*? > \ +//x; 5636 my $tostr = ""; 5637 my $to_name = ""; 5638 my $to_chr = ""; 5639 foreach my $to (split " ", $map) { 5640 $to = CORE::hex $to; 5641 $to_name .= " + " if $to_name; 5642 $to_chr .= chr($to); 5643 main::populate_char_info($to) 5644 if ! defined $viacode[$to]; 5645 $to_name .= $viacode[$to]; 5646 } 5647 5648 $comment .= 5649 "=> '$to_chr'; $viacode[$i] => $to_name"; 5650 $comment_indent = 25; # Determined by 5651 # experiment 5652 } 5653 else { 5654 5655 # Assume that any table that has hex 5656 # format is a mapping of one code point to 5657 # another. 5658 if ($format eq $HEX_FORMAT) { 5659 my $decimal_value = CORE::hex $value; 5660 main::populate_char_info($decimal_value) 5661 if ! defined $viacode[$decimal_value]; 5662 $comment .= "=> '" 5663 . chr($decimal_value) 5664 . "'; " if $printable[$decimal_value]; 5665 } 5666 $comment .= $viacode[$i] if $include_name 5667 && $viacode[$i]; 5668 if ($format eq $HEX_FORMAT) { 5669 my $decimal_value = CORE::hex $value; 5670 $comment .= 5671 " => $viacode[$decimal_value]" 5672 if $viacode[$decimal_value]; 5673 } 5674 5675 # If including the name, no need to 5676 # indent, as the name will already be way 5677 # across the line. 5678 $comment_indent = ($include_name) ? 0 : 60; 5679 } 5680 5681 # Use any passed in routine to output the base 5682 # part of the line. 5683 if (ref $range_size_1 eq 'CODE') { 5684 my $base_part=&{$range_size_1}($i, $value); 5685 chomp $base_part; 5686 push @OUT, $base_part; 5687 } 5688 else { 5689 push @OUT, sprintf "%04X\t\t%s", $i, $value; 5690 } 5691 5692 # And add the annotation. 5693 $OUT[-1] = sprintf "%-*s\t# %s", 5694 $comment_indent, 5695 $OUT[-1], 5696 $comment 5697 if $comment; 5698 $OUT[-1] .= "\n"; 5699 } 5700 } 5701 } 5702 5703 # If we split the range, set up so the next time through 5704 # we get the remainder, and redo. 5705 if ($next_start) { 5706 $start = $next_start; 5707 $end = $next_end; 5708 $value = $next_value; 5709 $next_start = 0; 5710 redo; 5711 } 5712 } 5713 } # End of loop through all the table's ranges 5714 } 5715 5716 # Add anything that goes after the main body, but within the here 5717 # document, 5718 my $append_to_body = $self->append_to_body; 5719 push @OUT, $append_to_body if $append_to_body; 5720 5721 # And finish the here document. 5722 push @OUT, "END\n"; 5723 5724 # Done with the main portion of the body. Can now figure out what 5725 # should appear before it in the file. 5726 my $pre_body = $self->pre_body; 5727 push @HEADER, $pre_body, "\n" if $pre_body; 5728 5729 # All these files should have a .pl suffix added to them. 5730 my @file_with_pl = @{$file_path{$addr}}; 5731 $file_with_pl[-1] .= '.pl'; 5732 5733 main::write(\@file_with_pl, 5734 $annotate, # utf8 iff annotating 5735 \@HEADER, 5736 \@OUT); 5737 return; 5738 } 5739 5740 sub set_status { # Set the table's status 5741 my $self = shift; 5742 my $status = shift; # The status enum value 5743 my $info = shift; # Any message associated with it. 5744 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5745 5746 my $addr = do { no overloading; pack 'J', $self; }; 5747 5748 $status{$addr} = $status; 5749 $status_info{$addr} = $info; 5750 return; 5751 } 5752 5753 sub set_fate { # Set the fate of a table 5754 my $self = shift; 5755 my $fate = shift; 5756 my $reason = shift; 5757 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5758 5759 my $addr = do { no overloading; pack 'J', $self; }; 5760 5761 return if $fate{$addr} == $fate; # If no-op 5762 5763 # Can only change the ordinary fate, except if going to $MAP_PROXIED 5764 return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED; 5765 5766 $fate{$addr} = $fate; 5767 5768 # Don't document anything to do with a non-normal fated table 5769 if ($fate != $ORDINARY) { 5770 my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0; 5771 foreach my $alias ($self->aliases) { 5772 $alias->set_ucd($put_in_pod); 5773 5774 # MAP_PROXIED doesn't affect the match tables 5775 next if $fate == $MAP_PROXIED; 5776 $alias->set_make_re_pod_entry($put_in_pod); 5777 } 5778 } 5779 5780 # Save the reason for suppression for output 5781 if ($fate == $SUPPRESSED && defined $reason) { 5782 $why_suppressed{$complete_name{$addr}} = $reason; 5783 } 5784 5785 return; 5786 } 5787 5788 sub lock { 5789 # Don't allow changes to the table from now on. This stores a stack 5790 # trace of where it was called, so that later attempts to modify it 5791 # can immediately show where it got locked. 5792 5793 my $self = shift; 5794 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5795 5796 my $addr = do { no overloading; pack 'J', $self; }; 5797 5798 $locked{$addr} = ""; 5799 5800 my $line = (caller(0))[2]; 5801 my $i = 1; 5802 5803 # Accumulate the stack trace 5804 while (1) { 5805 my ($pkg, $file, $caller_line, $caller) = caller $i++; 5806 5807 last unless defined $caller; 5808 5809 $locked{$addr} .= " called from $caller() at line $line\n"; 5810 $line = $caller_line; 5811 } 5812 $locked{$addr} .= " called from main at line $line\n"; 5813 5814 return; 5815 } 5816 5817 sub carp_if_locked { 5818 # Return whether a table is locked or not, and, by the way, complain 5819 # if is locked 5820 5821 my $self = shift; 5822 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5823 5824 my $addr = do { no overloading; pack 'J', $self; }; 5825 5826 return 0 if ! $locked{$addr}; 5827 Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n"); 5828 return 1; 5829 } 5830 5831 sub set_file_path { # Set the final directory path for this table 5832 my $self = shift; 5833 # Rest of parameters passed on 5834 5835 no overloading; 5836 @{$file_path{pack 'J', $self}} = @_; 5837 return 5838 } 5839 5840 # Accessors for the range list stored in this table. First for 5841 # unconditional 5842 for my $sub (qw( 5843 containing_range 5844 contains 5845 count 5846 each_range 5847 hash 5848 is_empty 5849 matches_identically_to 5850 max 5851 min 5852 range_count 5853 reset_each_range 5854 type_of 5855 value_of 5856 )) 5857 { 5858 no strict "refs"; 5859 *$sub = sub { 5860 use strict "refs"; 5861 my $self = shift; 5862 return $self->_range_list->$sub(@_); 5863 } 5864 } 5865 5866 # Then for ones that should fail if locked 5867 for my $sub (qw( 5868 delete_range 5869 )) 5870 { 5871 no strict "refs"; 5872 *$sub = sub { 5873 use strict "refs"; 5874 my $self = shift; 5875 5876 return if $self->carp_if_locked; 5877 no overloading; 5878 return $self->_range_list->$sub(@_); 5879 } 5880 } 5881 5882} # End closure 5883 5884package Map_Table; 5885use base '_Base_Table'; 5886 5887# A Map Table is a table that contains the mappings from code points to 5888# values. There are two weird cases: 5889# 1) Anomalous entries are ones that aren't maps of ranges of code points, but 5890# are written in the table's file at the end of the table nonetheless. It 5891# requires specially constructed code to handle these; utf8.c can not read 5892# these in, so they should not go in $map_directory. As of this writing, 5893# the only case that these happen is for named sequences used in 5894# charnames.pm. But this code doesn't enforce any syntax on these, so 5895# something else could come along that uses it. 5896# 2) Specials are anything that doesn't fit syntactically into the body of the 5897# table. The ranges for these have a map type of non-zero. The code below 5898# knows about and handles each possible type. In most cases, these are 5899# written as part of the header. 5900# 5901# A map table deliberately can't be manipulated at will unlike match tables. 5902# This is because of the ambiguities having to do with what to do with 5903# overlapping code points. And there just isn't a need for those things; 5904# what one wants to do is just query, add, replace, or delete mappings, plus 5905# write the final result. 5906# However, there is a method to get the list of possible ranges that aren't in 5907# this table to use for defaulting missing code point mappings. And, 5908# map_add_or_replace_non_nulls() does allow one to add another table to this 5909# one, but it is clearly very specialized, and defined that the other's 5910# non-null values replace this one's if there is any overlap. 5911 5912sub trace { return main::trace(@_); } 5913 5914{ # Closure 5915 5916 main::setup_package(); 5917 5918 my %default_map; 5919 # Many input files omit some entries; this gives what the mapping for the 5920 # missing entries should be 5921 main::set_access('default_map', \%default_map, 'r'); 5922 5923 my %anomalous_entries; 5924 # Things that go in the body of the table which don't fit the normal 5925 # scheme of things, like having a range. Not much can be done with these 5926 # once there except to output them. This was created to handle named 5927 # sequences. 5928 main::set_access('anomalous_entry', \%anomalous_entries, 'a'); 5929 main::set_access('anomalous_entries', # Append singular, read plural 5930 \%anomalous_entries, 5931 'readable_array'); 5932 5933 my %to_output_map; 5934 # Enum as to whether or not to write out this map table, and how: 5935 # 0 don't output 5936 # $EXTERNAL_MAP means its existence is noted in the documentation, and 5937 # it should not be removed nor its format changed. This 5938 # is done for those files that have traditionally been 5939 # output. 5940 # $INTERNAL_MAP means Perl reserves the right to do anything it wants 5941 # with this file 5942 # $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of 5943 # outputting the actual mappings as-is, we adjust things 5944 # to create a much more compact table. Only those few 5945 # tables where the mapping is convertible at least to an 5946 # integer and compacting makes a big difference should 5947 # have this. Hence, the default is to not do this 5948 # unless the table's default mapping is to $CODE_POINT, 5949 # and the range size is not 1. 5950 main::set_access('to_output_map', \%to_output_map, 's'); 5951 5952 sub new { 5953 my $class = shift; 5954 my $name = shift; 5955 5956 my %args = @_; 5957 5958 # Optional initialization data for the table. 5959 my $initialize = delete $args{'Initialize'}; 5960 5961 my $default_map = delete $args{'Default_Map'}; 5962 my $property = delete $args{'_Property'}; 5963 my $full_name = delete $args{'Full_Name'}; 5964 my $to_output_map = delete $args{'To_Output_Map'}; 5965 5966 # Rest of parameters passed on 5967 5968 my $range_list = Range_Map->new(Owner => $property); 5969 5970 my $self = $class->SUPER::new( 5971 Name => $name, 5972 Complete_Name => $full_name, 5973 Full_Name => $full_name, 5974 _Property => $property, 5975 _Range_List => $range_list, 5976 %args); 5977 5978 my $addr = do { no overloading; pack 'J', $self; }; 5979 5980 $anomalous_entries{$addr} = []; 5981 $default_map{$addr} = $default_map; 5982 $to_output_map{$addr} = $to_output_map; 5983 5984 $self->initialize($initialize) if defined $initialize; 5985 5986 return $self; 5987 } 5988 5989 use overload 5990 fallback => 0, 5991 qw("") => "_operator_stringify", 5992 ; 5993 5994 sub _operator_stringify { 5995 my $self = shift; 5996 5997 my $name = $self->property->full_name; 5998 $name = '""' if $name eq ""; 5999 return "Map table for Property '$name'"; 6000 } 6001 6002 sub add_alias { 6003 # Add a synonym for this table (which means the property itself) 6004 my $self = shift; 6005 my $name = shift; 6006 # Rest of parameters passed on. 6007 6008 $self->SUPER::add_alias($name, $self->property, @_); 6009 return; 6010 } 6011 6012 sub add_map { 6013 # Add a range of code points to the list of specially-handled code 6014 # points. $MULTI_CP is assumed if the type of special is not passed 6015 # in. 6016 6017 my $self = shift; 6018 my $lower = shift; 6019 my $upper = shift; 6020 my $string = shift; 6021 my %args = @_; 6022 6023 my $type = delete $args{'Type'} || 0; 6024 # Rest of parameters passed on 6025 6026 # Can't change the table if locked. 6027 return if $self->carp_if_locked; 6028 6029 my $addr = do { no overloading; pack 'J', $self; }; 6030 6031 $self->_range_list->add_map($lower, $upper, 6032 $string, 6033 @_, 6034 Type => $type); 6035 return; 6036 } 6037 6038 sub append_to_body { 6039 # Adds to the written HERE document of the table's body any anomalous 6040 # entries in the table.. 6041 6042 my $self = shift; 6043 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 6044 6045 my $addr = do { no overloading; pack 'J', $self; }; 6046 6047 return "" unless @{$anomalous_entries{$addr}}; 6048 return join("\n", @{$anomalous_entries{$addr}}) . "\n"; 6049 } 6050 6051 sub map_add_or_replace_non_nulls { 6052 # This adds the mappings in the table $other to $self. Non-null 6053 # mappings from $other override those in $self. It essentially merges 6054 # the two tables, with the second having priority except for null 6055 # mappings. 6056 6057 my $self = shift; 6058 my $other = shift; 6059 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 6060 6061 return if $self->carp_if_locked; 6062 6063 if (! $other->isa(__PACKAGE__)) { 6064 Carp::my_carp_bug("$other should be a " 6065 . __PACKAGE__ 6066 . ". Not a '" 6067 . ref($other) 6068 . "'. Not added;"); 6069 return; 6070 } 6071 6072 my $addr = do { no overloading; pack 'J', $self; }; 6073 my $other_addr = do { no overloading; pack 'J', $other; }; 6074 6075 local $to_trace = 0 if main::DEBUG; 6076 6077 my $self_range_list = $self->_range_list; 6078 my $other_range_list = $other->_range_list; 6079 foreach my $range ($other_range_list->ranges) { 6080 my $value = $range->value; 6081 next if $value eq ""; 6082 $self_range_list->_add_delete('+', 6083 $range->start, 6084 $range->end, 6085 $value, 6086 Type => $range->type, 6087 Replace => $UNCONDITIONALLY); 6088 } 6089 6090 return; 6091 } 6092 6093 sub set_default_map { 6094 # Define what code points that are missing from the input files should 6095 # map to 6096 6097 my $self = shift; 6098 my $map = shift; 6099 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 6100 6101 my $addr = do { no overloading; pack 'J', $self; }; 6102 6103 # Convert the input to the standard equivalent, if any (won't have any 6104 # for $STRING properties) 6105 my $standard = $self->_find_table_from_alias->{$map}; 6106 $map = $standard->name if defined $standard; 6107 6108 # Warn if there already is a non-equivalent default map for this 6109 # property. Note that a default map can be a ref, which means that 6110 # what it actually means is delayed until later in the program, and it 6111 # IS permissible to override it here without a message. 6112 my $default_map = $default_map{$addr}; 6113 if (defined $default_map 6114 && ! ref($default_map) 6115 && $default_map ne $map 6116 && main::Standardize($map) ne $default_map) 6117 { 6118 my $property = $self->property; 6119 my $map_table = $property->table($map); 6120 my $default_table = $property->table($default_map); 6121 if (defined $map_table 6122 && defined $default_table 6123 && $map_table != $default_table) 6124 { 6125 Carp::my_carp("Changing the default mapping for " 6126 . $property 6127 . " from $default_map to $map'"); 6128 } 6129 } 6130 6131 $default_map{$addr} = $map; 6132 6133 # Don't also create any missing table for this map at this point, 6134 # because if we did, it could get done before the main table add is 6135 # done for PropValueAliases.txt; instead the caller will have to make 6136 # sure it exists, if desired. 6137 return; 6138 } 6139 6140 sub to_output_map { 6141 # Returns boolean: should we write this map table? 6142 6143 my $self = shift; 6144 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 6145 6146 my $addr = do { no overloading; pack 'J', $self; }; 6147 6148 # If overridden, use that 6149 return $to_output_map{$addr} if defined $to_output_map{$addr}; 6150 6151 my $full_name = $self->full_name; 6152 return $global_to_output_map{$full_name} 6153 if defined $global_to_output_map{$full_name}; 6154 6155 # If table says to output, do so; if says to suppress it, do so. 6156 my $fate = $self->fate; 6157 return $INTERNAL_MAP if $fate == $INTERNAL_ONLY; 6158 return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties; 6159 return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED; 6160 6161 my $type = $self->property->type; 6162 6163 # Don't want to output binary map tables even for debugging. 6164 return 0 if $type == $BINARY; 6165 6166 # But do want to output string ones. All the ones that remain to 6167 # be dealt with (i.e. which haven't explicitly been set to external) 6168 # are for internal Perl use only. The default for those that map to 6169 # $CODE_POINT and haven't been restricted to a single element range 6170 # is to use the adjusted form. 6171 if ($type == $STRING) { 6172 return $INTERNAL_MAP if $self->range_size_1 6173 || $default_map{$addr} ne $CODE_POINT; 6174 return $OUTPUT_ADJUSTED; 6175 } 6176 6177 # Otherwise is an $ENUM, do output it, for Perl's purposes 6178 return $INTERNAL_MAP; 6179 } 6180 6181 sub inverse_list { 6182 # Returns a Range_List that is gaps of the current table. That is, 6183 # the inversion 6184 6185 my $self = shift; 6186 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 6187 6188 my $current = Range_List->new(Initialize => $self->_range_list, 6189 Owner => $self->property); 6190 return ~ $current; 6191 } 6192 6193 sub header { 6194 my $self = shift; 6195 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 6196 6197 my $return = $self->SUPER::header(); 6198 6199 if ($self->to_output_map >= $INTERNAL_MAP) { 6200 $return .= $INTERNAL_ONLY_HEADER; 6201 } 6202 else { 6203 my $property_name = $self->property->full_name =~ s/Legacy_//r; 6204 $return .= <<END; 6205 6206# !!!!!!! IT IS DEPRECATED TO USE THIS FILE !!!!!!! 6207 6208# This file is for internal use by core Perl only. It is retained for 6209# backwards compatibility with applications that may have come to rely on it, 6210# but its format and even its name or existence are subject to change without 6211# notice in a future Perl version. Don't use it directly. Instead, its 6212# contents are now retrievable through a stable API in the Unicode::UCD 6213# module: Unicode::UCD::prop_invmap('$property_name'). 6214END 6215 } 6216 return $return; 6217 } 6218 6219 sub set_final_comment { 6220 # Just before output, create the comment that heads the file 6221 # containing this table. 6222 6223 return unless $debugging_build; 6224 6225 my $self = shift; 6226 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 6227 6228 # No sense generating a comment if aren't going to write it out. 6229 return if ! $self->to_output_map; 6230 6231 my $addr = do { no overloading; pack 'J', $self; }; 6232 6233 my $property = $self->property; 6234 6235 # Get all the possible names for this property. Don't use any that 6236 # aren't ok for use in a file name, etc. This is perhaps causing that 6237 # flag to do double duty, and may have to be changed in the future to 6238 # have our own flag for just this purpose; but it works now to exclude 6239 # Perl generated synonyms from the lists for properties, where the 6240 # name is always the proper Unicode one. 6241 my @property_aliases = grep { $_->ok_as_filename } $self->aliases; 6242 6243 my $count = $self->count; 6244 my $default_map = $default_map{$addr}; 6245 6246 # The ranges that map to the default aren't output, so subtract that 6247 # to get those actually output. A property with matching tables 6248 # already has the information calculated. 6249 if ($property->type != $STRING) { 6250 $count -= $property->table($default_map)->count; 6251 } 6252 elsif (defined $default_map) { 6253 6254 # But for $STRING properties, must calculate now. Subtract the 6255 # count from each range that maps to the default. 6256 foreach my $range ($self->_range_list->ranges) { 6257 if ($range->value eq $default_map) { 6258 $count -= $range->end +1 - $range->start; 6259 } 6260 } 6261 6262 } 6263 6264 # Get a string version of $count with underscores in large numbers, 6265 # for clarity. 6266 my $string_count = main::clarify_number($count); 6267 6268 my $code_points = ($count == 1) 6269 ? 'single code point' 6270 : "$string_count code points"; 6271 6272 my $mapping; 6273 my $these_mappings; 6274 my $are; 6275 if (@property_aliases <= 1) { 6276 $mapping = 'mapping'; 6277 $these_mappings = 'this mapping'; 6278 $are = 'is' 6279 } 6280 else { 6281 $mapping = 'synonymous mappings'; 6282 $these_mappings = 'these mappings'; 6283 $are = 'are' 6284 } 6285 my $cp; 6286 if ($count >= $MAX_UNICODE_CODEPOINTS) { 6287 $cp = "any code point in Unicode Version $string_version"; 6288 } 6289 else { 6290 my $map_to; 6291 if ($default_map eq "") { 6292 $map_to = 'the null string'; 6293 } 6294 elsif ($default_map eq $CODE_POINT) { 6295 $map_to = "itself"; 6296 } 6297 else { 6298 $map_to = "'$default_map'"; 6299 } 6300 if ($count == 1) { 6301 $cp = "the single code point"; 6302 } 6303 else { 6304 $cp = "one of the $code_points"; 6305 } 6306 $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to"; 6307 } 6308 6309 my $comment = ""; 6310 6311 my $status = $self->status; 6312 if ($status && $status ne $PLACEHOLDER) { 6313 my $warn = uc $status_past_participles{$status}; 6314 $comment .= <<END; 6315 6316!!!!!!! $warn !!!!!!!!!!!!!!!!!!! 6317 All property or property=value combinations contained in this file are $warn. 6318 See $unicode_reference_url for what this means. 6319 6320END 6321 } 6322 $comment .= "This file returns the $mapping:\n"; 6323 6324 my $ucd_accessible_name = ""; 6325 my $full_name = $self->property->full_name; 6326 for my $i (0 .. @property_aliases - 1) { 6327 my $name = $property_aliases[$i]->name; 6328 $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)'); 6329 if ($property_aliases[$i]->ucd) { 6330 if ($name eq $full_name) { 6331 $ucd_accessible_name = $full_name; 6332 } 6333 elsif (! $ucd_accessible_name) { 6334 $ucd_accessible_name = $name; 6335 } 6336 } 6337 } 6338 $comment .= "\nwhere 'cp' is $cp."; 6339 if ($ucd_accessible_name) { 6340 $comment .= " Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD"; 6341 } 6342 6343 # And append any commentary already set from the actual property. 6344 $comment .= "\n\n" . $self->comment if $self->comment; 6345 if ($self->description) { 6346 $comment .= "\n\n" . join " ", $self->description; 6347 } 6348 if ($self->note) { 6349 $comment .= "\n\n" . join " ", $self->note; 6350 } 6351 $comment .= "\n"; 6352 6353 if (! $self->perl_extension) { 6354 $comment .= <<END; 6355 6356For information about what this property really means, see: 6357$unicode_reference_url 6358END 6359 } 6360 6361 if ($count) { # Format differs for empty table 6362 $comment.= "\nThe format of the "; 6363 if ($self->range_size_1) { 6364 $comment.= <<END; 6365main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT 6366is in hex; MAPPING is what CODE_POINT maps to. 6367END 6368 } 6369 else { 6370 6371 # There are tables which end up only having one element per 6372 # range, but it is not worth keeping track of for making just 6373 # this comment a little better. 6374 $comment.= <<END; 6375non-comment portions of the main body of lines of this file is: 6376START\\tSTOP\\tMAPPING where START is the starting code point of the 6377range, in hex; STOP is the ending point, or if omitted, the range has just one 6378code point; MAPPING is what each code point between START and STOP maps to. 6379END 6380 if ($self->output_range_counts) { 6381 $comment .= <<END; 6382Numbers in comments in [brackets] indicate how many code points are in the 6383range (omitted when the range is a single code point or if the mapping is to 6384the null string). 6385END 6386 } 6387 } 6388 } 6389 $self->set_comment(main::join_lines($comment)); 6390 return; 6391 } 6392 6393 my %swash_keys; # Makes sure don't duplicate swash names. 6394 6395 # The remaining variables are temporaries used while writing each table, 6396 # to output special ranges. 6397 my @multi_code_point_maps; # Map is to more than one code point. 6398 6399 sub handle_special_range { 6400 # Called in the middle of write when it finds a range it doesn't know 6401 # how to handle. 6402 6403 my $self = shift; 6404 my $range = shift; 6405 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 6406 6407 my $addr = do { no overloading; pack 'J', $self; }; 6408 6409 my $type = $range->type; 6410 6411 my $low = $range->start; 6412 my $high = $range->end; 6413 my $map = $range->value; 6414 6415 # No need to output the range if it maps to the default. 6416 return if $map eq $default_map{$addr}; 6417 6418 my $property = $self->property; 6419 6420 # Switch based on the map type... 6421 if ($type == $HANGUL_SYLLABLE) { 6422 6423 # These are entirely algorithmically determinable based on 6424 # some constants furnished by Unicode; for now, just set a 6425 # flag to indicate that have them. After everything is figured 6426 # out, we will output the code that does the algorithm. (Don't 6427 # output them if not needed because we are suppressing this 6428 # property.) 6429 $has_hangul_syllables = 1 if $property->to_output_map; 6430 } 6431 elsif ($type == $CP_IN_NAME) { 6432 6433 # Code points whose name ends in their code point are also 6434 # algorithmically determinable, but need information about the map 6435 # to do so. Both the map and its inverse are stored in data 6436 # structures output in the file. They are stored in the mean time 6437 # in global lists The lists will be written out later into Name.pm, 6438 # which is created only if needed. In order to prevent duplicates 6439 # in the list, only add to them for one property, should multiple 6440 # ones need them. 6441 if ($needing_code_points_ending_in_code_point == 0) { 6442 $needing_code_points_ending_in_code_point = $property; 6443 } 6444 if ($property == $needing_code_points_ending_in_code_point) { 6445 push @{$names_ending_in_code_point{$map}->{'low'}}, $low; 6446 push @{$names_ending_in_code_point{$map}->{'high'}}, $high; 6447 6448 my $squeezed = $map =~ s/[-\s]+//gr; 6449 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}}, 6450 $low; 6451 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}}, 6452 $high; 6453 6454 push @code_points_ending_in_code_point, { low => $low, 6455 high => $high, 6456 name => $map 6457 }; 6458 } 6459 } 6460 elsif ($range->type == $MULTI_CP || $range->type == $NULL) { 6461 6462 # Multi-code point maps and null string maps have an entry 6463 # for each code point in the range. They use the same 6464 # output format. 6465 for my $code_point ($low .. $high) { 6466 6467 # The pack() below can't cope with surrogates. XXX This may 6468 # no longer be true 6469 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) { 6470 Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self. No map created"); 6471 next; 6472 } 6473 6474 # Generate the hash entries for these in the form that 6475 # utf8.c understands. 6476 my $tostr = ""; 6477 my $to_name = ""; 6478 my $to_chr = ""; 6479 foreach my $to (split " ", $map) { 6480 if ($to !~ /^$code_point_re$/) { 6481 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created"); 6482 next; 6483 } 6484 $tostr .= sprintf "\\x{%s}", $to; 6485 $to = CORE::hex $to; 6486 if ($annotate) { 6487 $to_name .= " + " if $to_name; 6488 $to_chr .= chr($to); 6489 main::populate_char_info($to) 6490 if ! defined $viacode[$to]; 6491 $to_name .= $viacode[$to]; 6492 } 6493 } 6494 6495 # I (khw) have never waded through this line to 6496 # understand it well enough to comment it. 6497 my $utf8 = sprintf(qq["%s" => "$tostr",], 6498 join("", map { sprintf "\\x%02X", $_ } 6499 unpack("U0C*", pack("U", $code_point)))); 6500 6501 # Add a comment so that a human reader can more easily 6502 # see what's going on. 6503 push @multi_code_point_maps, 6504 sprintf("%-45s # U+%04X", $utf8, $code_point); 6505 if (! $annotate) { 6506 $multi_code_point_maps[-1] .= " => $map"; 6507 } 6508 else { 6509 main::populate_char_info($code_point) 6510 if ! defined $viacode[$code_point]; 6511 $multi_code_point_maps[-1] .= " '" 6512 . chr($code_point) 6513 . "' => '$to_chr'; $viacode[$code_point] => $to_name"; 6514 } 6515 } 6516 } 6517 else { 6518 Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Not written"); 6519 } 6520 6521 return; 6522 } 6523 6524 sub pre_body { 6525 # Returns the string that should be output in the file before the main 6526 # body of this table. It isn't called until the main body is 6527 # calculated, saving a pass. The string includes some hash entries 6528 # identifying the format of the body, and what the single value should 6529 # be for all ranges missing from it. It also includes any code points 6530 # which have map_types that don't go in the main table. 6531 6532 my $self = shift; 6533 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 6534 6535 my $addr = do { no overloading; pack 'J', $self; }; 6536 6537 my $name = $self->property->swash_name; 6538 6539 # Currently there is nothing in the pre_body unless a swash is being 6540 # generated. 6541 return unless defined $name; 6542 6543 if (defined $swash_keys{$name}) { 6544 Carp::my_carp(main::join_lines(<<END 6545Already created a swash name '$name' for $swash_keys{$name}. This means that 6546the same name desired for $self shouldn't be used. Bad News. This must be 6547fixed before production use, but proceeding anyway 6548END 6549 )); 6550 } 6551 $swash_keys{$name} = "$self"; 6552 6553 my $pre_body = ""; 6554 6555 # Here we assume we were called after have gone through the whole 6556 # file. If we actually generated anything for each map type, add its 6557 # respective header and trailer 6558 my $specials_name = ""; 6559 if (@multi_code_point_maps) { 6560 $specials_name = "utf8::ToSpec$name"; 6561 $pre_body .= <<END; 6562 6563# Some code points require special handling because their mappings are each to 6564# multiple code points. These do not appear in the main body, but are defined 6565# in the hash below. 6566 6567# Each key is the string of N bytes that together make up the UTF-8 encoding 6568# for the code point. (i.e. the same as looking at the code point's UTF-8 6569# under "use bytes"). Each value is the UTF-8 of the translation, for speed. 6570\%$specials_name = ( 6571END 6572 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n"; 6573 } 6574 6575 my $format = $self->format; 6576 6577 my $return = ""; 6578 6579 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED); 6580 if ($output_adjusted) { 6581 if ($specials_name) { 6582 $return .= <<END; 6583# The mappings in the non-hash portion of this file must be modified to get the 6584# correct values by adding the code point ordinal number to each one that is 6585# numeric. 6586END 6587 } 6588 else { 6589 $return .= <<END; 6590# The mappings must be modified to get the correct values by adding the code 6591# point ordinal number to each one that is numeric. 6592END 6593 } 6594 } 6595 6596 $return .= <<END; 6597 6598# The name this swash is to be known by, with the format of the mappings in 6599# the main body of the table, and what all code points missing from this file 6600# map to. 6601\$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format} 6602END 6603 if ($specials_name) { 6604 $return .= <<END; 6605\$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings 6606END 6607 } 6608 my $default_map = $default_map{$addr}; 6609 6610 # For $CODE_POINT default maps and using adjustments, instead the default 6611 # becomes zero. 6612 $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '" 6613 . (($output_adjusted && $default_map eq $CODE_POINT) 6614 ? "0" 6615 : $default_map) 6616 . "';"; 6617 6618 if ($default_map eq $CODE_POINT) { 6619 $return .= ' # code point maps to itself'; 6620 } 6621 elsif ($default_map eq "") { 6622 $return .= ' # code point maps to the null string'; 6623 } 6624 $return .= "\n"; 6625 6626 $return .= $pre_body; 6627 6628 return $return; 6629 } 6630 6631 sub write { 6632 # Write the table to the file. 6633 6634 my $self = shift; 6635 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 6636 6637 my $addr = do { no overloading; pack 'J', $self; }; 6638 6639 # Clear the temporaries 6640 undef @multi_code_point_maps; 6641 6642 # Calculate the format of the table if not already done. 6643 my $format = $self->format; 6644 my $type = $self->property->type; 6645 my $default_map = $self->default_map; 6646 if (! defined $format) { 6647 if ($type == $BINARY) { 6648 6649 # Don't bother checking the values, because we elsewhere 6650 # verify that a binary table has only 2 values. 6651 $format = $BINARY_FORMAT; 6652 } 6653 else { 6654 my @ranges = $self->_range_list->ranges; 6655 6656 # default an empty table based on its type and default map 6657 if (! @ranges) { 6658 6659 # But it turns out that the only one we can say is a 6660 # non-string (besides binary, handled above) is when the 6661 # table is a string and the default map is to a code point 6662 if ($type == $STRING && $default_map eq $CODE_POINT) { 6663 $format = $HEX_FORMAT; 6664 } 6665 else { 6666 $format = $STRING_FORMAT; 6667 } 6668 } 6669 else { 6670 6671 # Start with the most restrictive format, and as we find 6672 # something that doesn't fit with that, change to the next 6673 # most restrictive, and so on. 6674 $format = $DECIMAL_FORMAT; 6675 foreach my $range (@ranges) { 6676 next if $range->type != 0; # Non-normal ranges don't 6677 # affect the main body 6678 my $map = $range->value; 6679 if ($map ne $default_map) { 6680 last if $format eq $STRING_FORMAT; # already at 6681 # least 6682 # restrictive 6683 $format = $INTEGER_FORMAT 6684 if $format eq $DECIMAL_FORMAT 6685 && $map !~ / ^ [0-9] $ /x; 6686 $format = $FLOAT_FORMAT 6687 if $format eq $INTEGER_FORMAT 6688 && $map !~ / ^ -? [0-9]+ $ /x; 6689 $format = $RATIONAL_FORMAT 6690 if $format eq $FLOAT_FORMAT 6691 && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x; 6692 $format = $HEX_FORMAT 6693 if ($format eq $RATIONAL_FORMAT 6694 && $map !~ 6695 m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x) 6696 # Assume a leading zero means hex, 6697 # even if all digits are 0-9 6698 || ($format eq $INTEGER_FORMAT 6699 && $map =~ /^0[0-9A-F]/); 6700 $format = $STRING_FORMAT if $format eq $HEX_FORMAT 6701 && $map =~ /[^0-9A-F]/; 6702 } 6703 } 6704 } 6705 } 6706 } # end of calculating format 6707 6708 if ($default_map eq $CODE_POINT 6709 && $format ne $HEX_FORMAT 6710 && ! defined $self->format) # manual settings are always 6711 # considered ok 6712 { 6713 Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'") 6714 } 6715 6716 # If the output is to be adjusted, the format of the table that gets 6717 # output is actually 'a' instead of whatever it is stored internally 6718 # as. 6719 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED); 6720 if ($output_adjusted) { 6721 $format = $ADJUST_FORMAT; 6722 } 6723 6724 $self->_set_format($format); 6725 6726 return $self->SUPER::write( 6727 $output_adjusted, 6728 ($self->property == $block) 6729 ? 7 # block file needs more tab stops 6730 : 3, 6731 $default_map); # don't write defaulteds 6732 } 6733 6734 # Accessors for the underlying list that should fail if locked. 6735 for my $sub (qw( 6736 add_duplicate 6737 )) 6738 { 6739 no strict "refs"; 6740 *$sub = sub { 6741 use strict "refs"; 6742 my $self = shift; 6743 6744 return if $self->carp_if_locked; 6745 return $self->_range_list->$sub(@_); 6746 } 6747 } 6748} # End closure for Map_Table 6749 6750package Match_Table; 6751use base '_Base_Table'; 6752 6753# A Match table is one which is a list of all the code points that have 6754# the same property and property value, for use in \p{property=value} 6755# constructs in regular expressions. It adds very little data to the base 6756# structure, but many methods, as these lists can be combined in many ways to 6757# form new ones. 6758# There are only a few concepts added: 6759# 1) Equivalents and Relatedness. 6760# Two tables can match the identical code points, but have different names. 6761# This always happens when there is a perl single form extension 6762# \p{IsProperty} for the Unicode compound form \P{Property=True}. The two 6763# tables are set to be related, with the Perl extension being a child, and 6764# the Unicode property being the parent. 6765# 6766# It may be that two tables match the identical code points and we don't 6767# know if they are related or not. This happens most frequently when the 6768# Block and Script properties have the exact range. But note that a 6769# revision to Unicode could add new code points to the script, which would 6770# now have to be in a different block (as the block was filled, or there 6771# would have been 'Unknown' script code points in it and they wouldn't have 6772# been identical). So we can't rely on any two properties from Unicode 6773# always matching the same code points from release to release, and thus 6774# these tables are considered coincidentally equivalent--not related. When 6775# two tables are unrelated but equivalent, one is arbitrarily chosen as the 6776# 'leader', and the others are 'equivalents'. This concept is useful 6777# to minimize the number of tables written out. Only one file is used for 6778# any identical set of code points, with entries in Heavy.pl mapping all 6779# the involved tables to it. 6780# 6781# Related tables will always be identical; we set them up to be so. Thus 6782# if the Unicode one is deprecated, the Perl one will be too. Not so for 6783# unrelated tables. Relatedness makes generating the documentation easier. 6784# 6785# 2) Complement. 6786# Like equivalents, two tables may be the inverses of each other, the 6787# intersection between them is null, and the union is every Unicode code 6788# point. The two tables that occupy a binary property are necessarily like 6789# this. By specifying one table as the complement of another, we can avoid 6790# storing it on disk (using the other table and performing a fast 6791# transform), and some memory and calculations. 6792# 6793# 3) Conflicting. It may be that there will eventually be name clashes, with 6794# the same name meaning different things. For a while, there actually were 6795# conflicts, but they have so far been resolved by changing Perl's or 6796# Unicode's definitions to match the other, but when this code was written, 6797# it wasn't clear that that was what was going to happen. (Unicode changed 6798# because of protests during their beta period.) Name clashes are warned 6799# about during compilation, and the documentation. The generated tables 6800# are sane, free of name clashes, because the code suppresses the Perl 6801# version. But manual intervention to decide what the actual behavior 6802# should be may be required should this happen. The introductory comments 6803# have more to say about this. 6804 6805sub standardize { return main::standardize($_[0]); } 6806sub trace { return main::trace(@_); } 6807 6808 6809{ # Closure 6810 6811 main::setup_package(); 6812 6813 my %leader; 6814 # The leader table of this one; initially $self. 6815 main::set_access('leader', \%leader, 'r'); 6816 6817 my %equivalents; 6818 # An array of any tables that have this one as their leader 6819 main::set_access('equivalents', \%equivalents, 'readable_array'); 6820 6821 my %parent; 6822 # The parent table to this one, initially $self. This allows us to 6823 # distinguish between equivalent tables that are related (for which this 6824 # is set to), and those which may not be, but share the same output file 6825 # because they match the exact same set of code points in the current 6826 # Unicode release. 6827 main::set_access('parent', \%parent, 'r'); 6828 6829 my %children; 6830 # An array of any tables that have this one as their parent 6831 main::set_access('children', \%children, 'readable_array'); 6832 6833 my %conflicting; 6834 # Array of any tables that would have the same name as this one with 6835 # a different meaning. This is used for the generated documentation. 6836 main::set_access('conflicting', \%conflicting, 'readable_array'); 6837 6838 my %matches_all; 6839 # Set in the constructor for tables that are expected to match all code 6840 # points. 6841 main::set_access('matches_all', \%matches_all, 'r'); 6842 6843 my %complement; 6844 # Points to the complement that this table is expressed in terms of; 0 if 6845 # none. 6846 main::set_access('complement', \%complement, 'r'); 6847 6848 sub new { 6849 my $class = shift; 6850 6851 my %args = @_; 6852 6853 # The property for which this table is a listing of property values. 6854 my $property = delete $args{'_Property'}; 6855 6856 my $name = delete $args{'Name'}; 6857 my $full_name = delete $args{'Full_Name'}; 6858 $full_name = $name if ! defined $full_name; 6859 6860 # Optional 6861 my $initialize = delete $args{'Initialize'}; 6862 my $matches_all = delete $args{'Matches_All'} || 0; 6863 my $format = delete $args{'Format'}; 6864 # Rest of parameters passed on. 6865 6866 my $range_list = Range_List->new(Initialize => $initialize, 6867 Owner => $property); 6868 6869 my $complete = $full_name; 6870 $complete = '""' if $complete eq ""; # A null name shouldn't happen, 6871 # but this helps debug if it 6872 # does 6873 # The complete name for a match table includes it's property in a 6874 # compound form 'property=table', except if the property is the 6875 # pseudo-property, perl, in which case it is just the single form, 6876 # 'table' (If you change the '=' must also change the ':' in lots of 6877 # places in this program that assume an equal sign) 6878 $complete = $property->full_name . "=$complete" if $property != $perl; 6879 6880 my $self = $class->SUPER::new(%args, 6881 Name => $name, 6882 Complete_Name => $complete, 6883 Full_Name => $full_name, 6884 _Property => $property, 6885 _Range_List => $range_list, 6886 Format => $EMPTY_FORMAT, 6887 ); 6888 my $addr = do { no overloading; pack 'J', $self; }; 6889 6890 $conflicting{$addr} = [ ]; 6891 $equivalents{$addr} = [ ]; 6892 $children{$addr} = [ ]; 6893 $matches_all{$addr} = $matches_all; 6894 $leader{$addr} = $self; 6895 $parent{$addr} = $self; 6896 $complement{$addr} = 0; 6897 6898 if (defined $format && $format ne $EMPTY_FORMAT) { 6899 Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'. Using '$EMPTY_FORMAT'"); 6900 } 6901 6902 return $self; 6903 } 6904 6905 # See this program's beginning comment block about overloading these. 6906 use overload 6907 fallback => 0, 6908 qw("") => "_operator_stringify", 6909 '=' => sub { 6910 my $self = shift; 6911 6912 return if $self->carp_if_locked; 6913 return $self; 6914 }, 6915 6916 '+' => sub { 6917 my $self = shift; 6918 my $other = shift; 6919 6920 return $self->_range_list + $other; 6921 }, 6922 '&' => sub { 6923 my $self = shift; 6924 my $other = shift; 6925 6926 return $self->_range_list & $other; 6927 }, 6928 '+=' => sub { 6929 my $self = shift; 6930 my $other = shift; 6931 my $reversed = shift; 6932 6933 if ($reversed) { 6934 Carp::my_carp_bug("Bad news. Can't cope with '" 6935 . ref($other) 6936 . ' += ' 6937 . ref($self) 6938 . "'. undef returned."); 6939 return; 6940 } 6941 6942 return if $self->carp_if_locked; 6943 6944 my $addr = do { no overloading; pack 'J', $self; }; 6945 6946 if (ref $other) { 6947 6948 # Change the range list of this table to be the 6949 # union of the two. 6950 $self->_set_range_list($self->_range_list 6951 + $other); 6952 } 6953 else { # $other is just a simple value 6954 $self->add_range($other, $other); 6955 } 6956 return $self; 6957 }, 6958 '&=' => sub { 6959 my $self = shift; 6960 my $other = shift; 6961 my $reversed = shift; 6962 6963 if ($reversed) { 6964 Carp::my_carp_bug("Bad news. Can't cope with '" 6965 . ref($other) 6966 . ' &= ' 6967 . ref($self) 6968 . "'. undef returned."); 6969 return; 6970 } 6971 6972 return if $self->carp_if_locked; 6973 $self->_set_range_list($self->_range_list & $other); 6974 return $self; 6975 }, 6976 '-' => sub { my $self = shift; 6977 my $other = shift; 6978 my $reversed = shift; 6979 if ($reversed) { 6980 Carp::my_carp_bug("Bad news. Can't cope with '" 6981 . ref($other) 6982 . ' - ' 6983 . ref($self) 6984 . "'. undef returned."); 6985 return; 6986 } 6987 6988 return $self->_range_list - $other; 6989 }, 6990 '~' => sub { my $self = shift; 6991 return ~ $self->_range_list; 6992 }, 6993 ; 6994 6995 sub _operator_stringify { 6996 my $self = shift; 6997 6998 my $name = $self->complete_name; 6999 return "Table '$name'"; 7000 } 7001 7002 sub _range_list { 7003 # Returns the range list associated with this table, which will be the 7004 # complement's if it has one. 7005 7006 my $self = shift; 7007 my $complement; 7008 if (($complement = $self->complement) != 0) { 7009 return ~ $complement->_range_list; 7010 } 7011 else { 7012 return $self->SUPER::_range_list; 7013 } 7014 } 7015 7016 sub add_alias { 7017 # Add a synonym for this table. See the comments in the base class 7018 7019 my $self = shift; 7020 my $name = shift; 7021 # Rest of parameters passed on. 7022 7023 $self->SUPER::add_alias($name, $self, @_); 7024 return; 7025 } 7026 7027 sub add_conflicting { 7028 # Add the name of some other object to the list of ones that name 7029 # clash with this match table. 7030 7031 my $self = shift; 7032 my $conflicting_name = shift; # The name of the conflicting object 7033 my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ? 7034 my $conflicting_object = shift; # Optional, the conflicting object 7035 # itself. This is used to 7036 # disambiguate the text if the input 7037 # name is identical to any of the 7038 # aliases $self is known by. 7039 # Sometimes the conflicting object is 7040 # merely hypothetical, so this has to 7041 # be an optional parameter. 7042 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7043 7044 my $addr = do { no overloading; pack 'J', $self; }; 7045 7046 # Check if the conflicting name is exactly the same as any existing 7047 # alias in this table (as long as there is a real object there to 7048 # disambiguate with). 7049 if (defined $conflicting_object) { 7050 foreach my $alias ($self->aliases) { 7051 if ($alias->name eq $conflicting_name) { 7052 7053 # Here, there is an exact match. This results in 7054 # ambiguous comments, so disambiguate by changing the 7055 # conflicting name to its object's complete equivalent. 7056 $conflicting_name = $conflicting_object->complete_name; 7057 last; 7058 } 7059 } 7060 } 7061 7062 # Convert to the \p{...} final name 7063 $conflicting_name = "\\$p" . "{$conflicting_name}"; 7064 7065 # Only add once 7066 return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}}; 7067 7068 push @{$conflicting{$addr}}, $conflicting_name; 7069 7070 return; 7071 } 7072 7073 sub is_set_equivalent_to { 7074 # Return boolean of whether or not the other object is a table of this 7075 # type and has been marked equivalent to this one. 7076 7077 my $self = shift; 7078 my $other = shift; 7079 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7080 7081 return 0 if ! defined $other; # Can happen for incomplete early 7082 # releases 7083 unless ($other->isa(__PACKAGE__)) { 7084 my $ref_other = ref $other; 7085 my $ref_self = ref $self; 7086 Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self."); 7087 return 0; 7088 } 7089 7090 # Two tables are equivalent if they have the same leader. 7091 no overloading; 7092 return $leader{pack 'J', $self} == $leader{pack 'J', $other}; 7093 return; 7094 } 7095 7096 sub set_equivalent_to { 7097 # Set $self equivalent to the parameter table. 7098 # The required Related => 'x' parameter is a boolean indicating 7099 # whether these tables are related or not. If related, $other becomes 7100 # the 'parent' of $self; if unrelated it becomes the 'leader' 7101 # 7102 # Related tables share all characteristics except names; equivalents 7103 # not quite so many. 7104 # If they are related, one must be a perl extension. This is because 7105 # we can't guarantee that Unicode won't change one or the other in a 7106 # later release even if they are identical now. 7107 7108 my $self = shift; 7109 my $other = shift; 7110 7111 my %args = @_; 7112 my $related = delete $args{'Related'}; 7113 7114 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 7115 7116 return if ! defined $other; # Keep on going; happens in some early 7117 # Unicode releases. 7118 7119 if (! defined $related) { 7120 Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other"); 7121 $related = 0; 7122 } 7123 7124 # If already are equivalent, no need to re-do it; if subroutine 7125 # returns null, it found an error, also do nothing 7126 my $are_equivalent = $self->is_set_equivalent_to($other); 7127 return if ! defined $are_equivalent || $are_equivalent; 7128 7129 my $addr = do { no overloading; pack 'J', $self; }; 7130 my $current_leader = ($related) ? $parent{$addr} : $leader{$addr}; 7131 7132 if ($related) { 7133 if ($current_leader->perl_extension) { 7134 if ($other->perl_extension) { 7135 Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent."); 7136 return; 7137 } 7138 } elsif ($self->property != $other->property # Depending on 7139 # situation, might 7140 # be better to use 7141 # add_alias() 7142 # instead for same 7143 # property 7144 && ! $other->perl_extension) 7145 { 7146 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other"); 7147 $related = 0; 7148 } 7149 } 7150 7151 if (! $self->is_empty && ! $self->matches_identically_to($other)) { 7152 Carp::my_carp_bug("$self should be empty or match identically to $other. Not setting equivalent"); 7153 return; 7154 } 7155 7156 my $leader = do { no overloading; pack 'J', $current_leader; }; 7157 my $other_addr = do { no overloading; pack 'J', $other; }; 7158 7159 # Any tables that are equivalent to or children of this table must now 7160 # instead be equivalent to or (children) to the new leader (parent), 7161 # still equivalent. The equivalency includes their matches_all info, 7162 # and for related tables, their fate and status. 7163 # All related tables are of necessity equivalent, but the converse 7164 # isn't necessarily true 7165 my $status = $other->status; 7166 my $status_info = $other->status_info; 7167 my $fate = $other->fate; 7168 my $matches_all = $matches_all{other_addr}; 7169 my $caseless_equivalent = $other->caseless_equivalent; 7170 foreach my $table ($current_leader, @{$equivalents{$leader}}) { 7171 next if $table == $other; 7172 trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace; 7173 7174 my $table_addr = do { no overloading; pack 'J', $table; }; 7175 $leader{$table_addr} = $other; 7176 $matches_all{$table_addr} = $matches_all; 7177 $self->_set_range_list($other->_range_list); 7178 push @{$equivalents{$other_addr}}, $table; 7179 if ($related) { 7180 $parent{$table_addr} = $other; 7181 push @{$children{$other_addr}}, $table; 7182 $table->set_status($status, $status_info); 7183 7184 # This reason currently doesn't get exposed outside; otherwise 7185 # would have to look up the parent's reason and use it instead. 7186 $table->set_fate($fate, "Parent's fate"); 7187 7188 $self->set_caseless_equivalent($caseless_equivalent); 7189 } 7190 } 7191 7192 # Now that we've declared these to be equivalent, any changes to one 7193 # of the tables would invalidate that equivalency. 7194 $self->lock; 7195 $other->lock; 7196 return; 7197 } 7198 7199 sub set_complement { 7200 # Set $self to be the complement of the parameter table. $self is 7201 # locked, as what it contains should all come from the other table. 7202 7203 my $self = shift; 7204 my $other = shift; 7205 7206 my %args = @_; 7207 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 7208 7209 if ($other->complement != 0) { 7210 Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement); 7211 return; 7212 } 7213 my $addr = do { no overloading; pack 'J', $self; }; 7214 $complement{$addr} = $other; 7215 $self->lock; 7216 return; 7217 } 7218 7219 sub add_range { # Add a range to the list for this table. 7220 my $self = shift; 7221 # Rest of parameters passed on 7222 7223 return if $self->carp_if_locked; 7224 return $self->_range_list->add_range(@_); 7225 } 7226 7227 sub header { 7228 my $self = shift; 7229 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7230 7231 # All match tables are to be used only by the Perl core. 7232 return $self->SUPER::header() . $INTERNAL_ONLY_HEADER; 7233 } 7234 7235 sub pre_body { # Does nothing for match tables. 7236 return 7237 } 7238 7239 sub append_to_body { # Does nothing for match tables. 7240 return 7241 } 7242 7243 sub set_fate { 7244 my $self = shift; 7245 my $fate = shift; 7246 my $reason = shift; 7247 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7248 7249 $self->SUPER::set_fate($fate, $reason); 7250 7251 # All children share this fate 7252 foreach my $child ($self->children) { 7253 $child->set_fate($fate, $reason); 7254 } 7255 return; 7256 } 7257 7258 sub write { 7259 my $self = shift; 7260 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7261 7262 return $self->SUPER::write(0, 2); # No adjustments; 2 tab stops 7263 } 7264 7265 sub set_final_comment { 7266 # This creates a comment for the file that is to hold the match table 7267 # $self. It is somewhat convoluted to make the English read nicely, 7268 # but, heh, it's just a comment. 7269 # This should be called only with the leader match table of all the 7270 # ones that share the same file. It lists all such tables, ordered so 7271 # that related ones are together. 7272 7273 return unless $debugging_build; 7274 7275 my $leader = shift; # Should only be called on the leader table of 7276 # an equivalent group 7277 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7278 7279 my $addr = do { no overloading; pack 'J', $leader; }; 7280 7281 if ($leader{$addr} != $leader) { 7282 Carp::my_carp_bug(<<END 7283set_final_comment() must be called on a leader table, which $leader is not. 7284It is equivalent to $leader{$addr}. No comment created 7285END 7286 ); 7287 return; 7288 } 7289 7290 # Get the number of code points matched by each of the tables in this 7291 # file, and add underscores for clarity. 7292 my $count = $leader->count; 7293 my $string_count = main::clarify_number($count); 7294 7295 my $loose_count = 0; # how many aliases loosely matched 7296 my $compound_name = ""; # ? Are any names compound?, and if so, an 7297 # example 7298 my $properties_with_compound_names = 0; # count of these 7299 7300 7301 my %flags; # The status flags used in the file 7302 my $total_entries = 0; # number of entries written in the comment 7303 my $matches_comment = ""; # The portion of the comment about the 7304 # \p{}'s 7305 my @global_comments; # List of all the tables' comments that are 7306 # there before this routine was called. 7307 my $has_ucd_alias = 0; # If there is an alias that is accessible via 7308 # Unicode::UCD. If not, then don't say it is 7309 # in the comment 7310 7311 # Get list of all the parent tables that are equivalent to this one 7312 # (including itself). 7313 my @parents = grep { $parent{main::objaddr $_} == $_ } 7314 main::uniques($leader, @{$equivalents{$addr}}); 7315 my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated 7316 # tables 7317 7318 for my $parent (@parents) { 7319 7320 my $property = $parent->property; 7321 7322 # Special case 'N' tables in properties with two match tables when 7323 # the other is a 'Y' one. These are likely to be binary tables, 7324 # but not necessarily. In either case, \P{} will match the 7325 # complement of \p{}, and so if something is a synonym of \p, the 7326 # complement of that something will be the synonym of \P. This 7327 # would be true of any property with just two match tables, not 7328 # just those whose values are Y and N; but that would require a 7329 # little extra work, and there are none such so far in Unicode. 7330 my $perl_p = 'p'; # which is it? \p{} or \P{} 7331 my @yes_perl_synonyms; # list of any synonyms for the 'Y' table 7332 7333 if (scalar $property->tables == 2 7334 && $parent == $property->table('N') 7335 && defined (my $yes = $property->table('Y'))) 7336 { 7337 my $yes_addr = do { no overloading; pack 'J', $yes; }; 7338 @yes_perl_synonyms 7339 = grep { $_->property == $perl } 7340 main::uniques($yes, 7341 $parent{$yes_addr}, 7342 $parent{$yes_addr}->children); 7343 7344 # But these synonyms are \P{} ,not \p{} 7345 $perl_p = 'P'; 7346 } 7347 7348 my @description; # Will hold the table description 7349 my @note; # Will hold the table notes. 7350 my @conflicting; # Will hold the table conflicts. 7351 7352 # Look at the parent, any yes synonyms, and all the children 7353 my $parent_addr = do { no overloading; pack 'J', $parent; }; 7354 for my $table ($parent, 7355 @yes_perl_synonyms, 7356 @{$children{$parent_addr}}) 7357 { 7358 my $table_addr = do { no overloading; pack 'J', $table; }; 7359 my $table_property = $table->property; 7360 7361 # Tables are separated by a blank line to create a grouping. 7362 $matches_comment .= "\n" if $matches_comment; 7363 7364 # The table is named based on the property and value 7365 # combination it is for, like script=greek. But there may be 7366 # a number of synonyms for each side, like 'sc' for 'script', 7367 # and 'grek' for 'greek'. Any combination of these is a valid 7368 # name for this table. In this case, there are three more, 7369 # 'sc=grek', 'sc=greek', and 'script='grek'. Rather than 7370 # listing all possible combinations in the comment, we make 7371 # sure that each synonym occurs at least once, and add 7372 # commentary that the other combinations are possible. 7373 # Because regular expressions don't recognize things like 7374 # \p{jsn=}, only look at non-null right-hand-sides 7375 my @property_aliases = $table_property->aliases; 7376 my @table_aliases = grep { $_->name ne "" } $table->aliases; 7377 7378 # The alias lists above are already ordered in the order we 7379 # want to output them. To ensure that each synonym is listed, 7380 # we must use the max of the two numbers. But if there are no 7381 # legal synonyms (nothing in @table_aliases), then we don't 7382 # list anything. 7383 my $listed_combos = (@table_aliases) 7384 ? main::max(scalar @table_aliases, 7385 scalar @property_aliases) 7386 : 0; 7387 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG; 7388 7389 7390 my $property_had_compound_name = 0; 7391 7392 for my $i (0 .. $listed_combos - 1) { 7393 $total_entries++; 7394 7395 # The current alias for the property is the next one on 7396 # the list, or if beyond the end, start over. Similarly 7397 # for the table (\p{prop=table}) 7398 my $property_alias = $property_aliases 7399 [$i % @property_aliases]->name; 7400 my $table_alias_object = $table_aliases 7401 [$i % @table_aliases]; 7402 my $table_alias = $table_alias_object->name; 7403 my $loose_match = $table_alias_object->loose_match; 7404 $has_ucd_alias |= $table_alias_object->ucd; 7405 7406 if ($table_alias !~ /\D/) { # Clarify large numbers. 7407 $table_alias = main::clarify_number($table_alias) 7408 } 7409 7410 # Add a comment for this alias combination 7411 my $current_match_comment; 7412 if ($table_property == $perl) { 7413 $current_match_comment = "\\$perl_p" 7414 . "{$table_alias}"; 7415 } 7416 else { 7417 $current_match_comment 7418 = "\\p{$property_alias=$table_alias}"; 7419 $property_had_compound_name = 1; 7420 } 7421 7422 # Flag any abnormal status for this table. 7423 my $flag = $property->status 7424 || $table->status 7425 || $table_alias_object->status; 7426 if ($flag && $flag ne $PLACEHOLDER) { 7427 $flags{$flag} = $status_past_participles{$flag}; 7428 } 7429 7430 $loose_count++; 7431 7432 # Pretty up the comment. Note the \b; it says don't make 7433 # this line a continuation. 7434 $matches_comment .= sprintf("\b%-1s%-s%s\n", 7435 $flag, 7436 " " x 7, 7437 $current_match_comment); 7438 } # End of generating the entries for this table. 7439 7440 # Save these for output after this group of related tables. 7441 push @description, $table->description; 7442 push @note, $table->note; 7443 push @conflicting, $table->conflicting; 7444 7445 # And this for output after all the tables. 7446 push @global_comments, $table->comment; 7447 7448 # Compute an alternate compound name using the final property 7449 # synonym and the first table synonym with a colon instead of 7450 # the equal sign used elsewhere. 7451 if ($property_had_compound_name) { 7452 $properties_with_compound_names ++; 7453 if (! $compound_name || @property_aliases > 1) { 7454 $compound_name = $property_aliases[-1]->name 7455 . ': ' 7456 . $table_aliases[0]->name; 7457 } 7458 } 7459 } # End of looping through all children of this table 7460 7461 # Here have assembled in $matches_comment all the related tables 7462 # to the current parent (preceded by the same info for all the 7463 # previous parents). Put out information that applies to all of 7464 # the current family. 7465 if (@conflicting) { 7466 7467 # But output the conflicting information now, as it applies to 7468 # just this table. 7469 my $conflicting = join ", ", @conflicting; 7470 if ($conflicting) { 7471 $matches_comment .= <<END; 7472 7473 Note that contrary to what you might expect, the above is NOT the same as 7474END 7475 $matches_comment .= "any of: " if @conflicting > 1; 7476 $matches_comment .= "$conflicting\n"; 7477 } 7478 } 7479 if (@description) { 7480 $matches_comment .= "\n Meaning: " 7481 . join('; ', @description) 7482 . "\n"; 7483 } 7484 if (@note) { 7485 $matches_comment .= "\n Note: " 7486 . join("\n ", @note) 7487 . "\n"; 7488 } 7489 } # End of looping through all tables 7490 7491 7492 my $code_points; 7493 my $match; 7494 my $any_of_these; 7495 if ($count == 1) { 7496 $match = 'matches'; 7497 $code_points = 'single code point'; 7498 } 7499 else { 7500 $match = 'match'; 7501 $code_points = "$string_count code points"; 7502 } 7503 7504 my $synonyms; 7505 my $entries; 7506 if ($total_entries == 1) { 7507 $synonyms = ""; 7508 $entries = 'entry'; 7509 $any_of_these = 'this' 7510 } 7511 else { 7512 $synonyms = " any of the following regular expression constructs"; 7513 $entries = 'entries'; 7514 $any_of_these = 'any of these' 7515 } 7516 7517 my $comment = ""; 7518 if ($has_ucd_alias) { 7519 $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n"; 7520 } 7521 if ($has_unrelated) { 7522 $comment .= <<END; 7523This file is for tables that are not necessarily related: To conserve 7524resources, every table that matches the identical set of code points in this 7525version of Unicode uses this file. Each one is listed in a separate group 7526below. It could be that the tables will match the same set of code points in 7527other Unicode releases, or it could be purely coincidence that they happen to 7528be the same in Unicode $string_version, and hence may not in other versions. 7529 7530END 7531 } 7532 7533 if (%flags) { 7534 foreach my $flag (sort keys %flags) { 7535 $comment .= <<END; 7536'$flag' below means that this form is $flags{$flag}. 7537Consult $pod_file.pod 7538END 7539 } 7540 $comment .= "\n"; 7541 } 7542 7543 if ($total_entries == 0) { 7544 Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string. Creating file anyway."); 7545 $comment .= <<END; 7546This file returns the $code_points in Unicode Version $string_version for 7547$leader, but it is inaccessible through Perl regular expressions, as 7548"\\p{prop=}" is not recognized. 7549END 7550 7551 } else { 7552 $comment .= <<END; 7553This file returns the $code_points in Unicode Version $string_version that 7554$match$synonyms: 7555 7556$matches_comment 7557$pod_file.pod should be consulted for the syntax rules for $any_of_these, 7558including if adding or subtracting white space, underscore, and hyphen 7559characters matters or doesn't matter, and other permissible syntactic 7560variants. Upper/lower case distinctions never matter. 7561END 7562 7563 } 7564 if ($compound_name) { 7565 $comment .= <<END; 7566 7567A colon can be substituted for the equals sign, and 7568END 7569 if ($properties_with_compound_names > 1) { 7570 $comment .= <<END; 7571within each group above, 7572END 7573 } 7574 $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name); 7575 7576 # Note the \b below, it says don't make that line a continuation. 7577 $comment .= <<END; 7578anything to the left of the equals (or colon) can be combined with anything to 7579the right. Thus, for example, 7580$compound_name 7581\bis also valid. 7582END 7583 } 7584 7585 # And append any comment(s) from the actual tables. They are all 7586 # gathered here, so may not read all that well. 7587 if (@global_comments) { 7588 $comment .= "\n" . join("\n\n", @global_comments) . "\n"; 7589 } 7590 7591 if ($count) { # The format differs if no code points, and needs no 7592 # explanation in that case 7593 $comment.= <<END; 7594 7595The format of the lines of this file is: 7596END 7597 $comment.= <<END; 7598START\\tSTOP\\twhere START is the starting code point of the range, in hex; 7599STOP is the ending point, or if omitted, the range has just one code point. 7600END 7601 if ($leader->output_range_counts) { 7602 $comment .= <<END; 7603Numbers in comments in [brackets] indicate how many code points are in the 7604range. 7605END 7606 } 7607 } 7608 7609 $leader->set_comment(main::join_lines($comment)); 7610 return; 7611 } 7612 7613 # Accessors for the underlying list 7614 for my $sub (qw( 7615 get_valid_code_point 7616 get_invalid_code_point 7617 )) 7618 { 7619 no strict "refs"; 7620 *$sub = sub { 7621 use strict "refs"; 7622 my $self = shift; 7623 7624 return $self->_range_list->$sub(@_); 7625 } 7626 } 7627} # End closure for Match_Table 7628 7629package Property; 7630 7631# The Property class represents a Unicode property, or the $perl 7632# pseudo-property. It contains a map table initialized empty at construction 7633# time, and for properties accessible through regular expressions, various 7634# match tables, created through the add_match_table() method, and referenced 7635# by the table('NAME') or tables() methods, the latter returning a list of all 7636# of the match tables. Otherwise table operations implicitly are for the map 7637# table. 7638# 7639# Most of the data in the property is actually about its map table, so it 7640# mostly just uses that table's accessors for most methods. The two could 7641# have been combined into one object, but for clarity because of their 7642# differing semantics, they have been kept separate. It could be argued that 7643# the 'file' and 'directory' fields should be kept with the map table. 7644# 7645# Each property has a type. This can be set in the constructor, or in the 7646# set_type accessor, but mostly it is figured out by the data. Every property 7647# starts with unknown type, overridden by a parameter to the constructor, or 7648# as match tables are added, or ranges added to the map table, the data is 7649# inspected, and the type changed. After the table is mostly or entirely 7650# filled, compute_type() should be called to finalize they analysis. 7651# 7652# There are very few operations defined. One can safely remove a range from 7653# the map table, and property_add_or_replace_non_nulls() adds the maps from another 7654# table to this one, replacing any in the intersection of the two. 7655 7656sub standardize { return main::standardize($_[0]); } 7657sub trace { return main::trace(@_) if main::DEBUG && $to_trace } 7658 7659{ # Closure 7660 7661 # This hash will contain as keys, all the aliases of all properties, and 7662 # as values, pointers to their respective property objects. This allows 7663 # quick look-up of a property from any of its names. 7664 my %alias_to_property_of; 7665 7666 sub dump_alias_to_property_of { 7667 # For debugging 7668 7669 print "\n", main::simple_dumper (\%alias_to_property_of), "\n"; 7670 return; 7671 } 7672 7673 sub property_ref { 7674 # This is a package subroutine, not called as a method. 7675 # If the single parameter is a literal '*' it returns a list of all 7676 # defined properties. 7677 # Otherwise, the single parameter is a name, and it returns a pointer 7678 # to the corresponding property object, or undef if none. 7679 # 7680 # Properties can have several different names. The 'standard' form of 7681 # each of them is stored in %alias_to_property_of as they are defined. 7682 # But it's possible that this subroutine will be called with some 7683 # variant, so if the initial lookup fails, it is repeated with the 7684 # standardized form of the input name. If found, besides returning the 7685 # result, the input name is added to the list so future calls won't 7686 # have to do the conversion again. 7687 7688 my $name = shift; 7689 7690 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7691 7692 if (! defined $name) { 7693 Carp::my_carp_bug("Undefined input property. No action taken."); 7694 return; 7695 } 7696 7697 return main::uniques(values %alias_to_property_of) if $name eq '*'; 7698 7699 # Return cached result if have it. 7700 my $result = $alias_to_property_of{$name}; 7701 return $result if defined $result; 7702 7703 # Convert the input to standard form. 7704 my $standard_name = standardize($name); 7705 7706 $result = $alias_to_property_of{$standard_name}; 7707 return unless defined $result; # Don't cache undefs 7708 7709 # Cache the result before returning it. 7710 $alias_to_property_of{$name} = $result; 7711 return $result; 7712 } 7713 7714 7715 main::setup_package(); 7716 7717 my %map; 7718 # A pointer to the map table object for this property 7719 main::set_access('map', \%map); 7720 7721 my %full_name; 7722 # The property's full name. This is a duplicate of the copy kept in the 7723 # map table, but is needed because stringify needs it during 7724 # construction of the map table, and then would have a chicken before egg 7725 # problem. 7726 main::set_access('full_name', \%full_name, 'r'); 7727 7728 my %table_ref; 7729 # This hash will contain as keys, all the aliases of any match tables 7730 # attached to this property, and as values, the pointers to their 7731 # respective tables. This allows quick look-up of a table from any of its 7732 # names. 7733 main::set_access('table_ref', \%table_ref); 7734 7735 my %type; 7736 # The type of the property, $ENUM, $BINARY, etc 7737 main::set_access('type', \%type, 'r'); 7738 7739 my %file; 7740 # The filename where the map table will go (if actually written). 7741 # Normally defaulted, but can be overridden. 7742 main::set_access('file', \%file, 'r', 's'); 7743 7744 my %directory; 7745 # The directory where the map table will go (if actually written). 7746 # Normally defaulted, but can be overridden. 7747 main::set_access('directory', \%directory, 's'); 7748 7749 my %pseudo_map_type; 7750 # This is used to affect the calculation of the map types for all the 7751 # ranges in the table. It should be set to one of the values that signify 7752 # to alter the calculation. 7753 main::set_access('pseudo_map_type', \%pseudo_map_type, 'r'); 7754 7755 my %has_only_code_point_maps; 7756 # A boolean used to help in computing the type of data in the map table. 7757 main::set_access('has_only_code_point_maps', \%has_only_code_point_maps); 7758 7759 my %unique_maps; 7760 # A list of the first few distinct mappings this property has. This is 7761 # used to disambiguate between binary and enum property types, so don't 7762 # have to keep more than three. 7763 main::set_access('unique_maps', \%unique_maps); 7764 7765 my %pre_declared_maps; 7766 # A boolean that gives whether the input data should declare all the 7767 # tables used, or not. If the former, unknown ones raise a warning. 7768 main::set_access('pre_declared_maps', 7769 \%pre_declared_maps, 'r', 's'); 7770 7771 sub new { 7772 # The only required parameter is the positionally first, name. All 7773 # other parameters are key => value pairs. See the documentation just 7774 # above for the meanings of the ones not passed directly on to the map 7775 # table constructor. 7776 7777 my $class = shift; 7778 my $name = shift || ""; 7779 7780 my $self = property_ref($name); 7781 if (defined $self) { 7782 my $options_string = join ", ", @_; 7783 $options_string = ". Ignoring options $options_string" if $options_string; 7784 Carp::my_carp("$self is already in use. Using existing one$options_string;"); 7785 return $self; 7786 } 7787 7788 my %args = @_; 7789 7790 $self = bless \do { my $anonymous_scalar }, $class; 7791 my $addr = do { no overloading; pack 'J', $self; }; 7792 7793 $directory{$addr} = delete $args{'Directory'}; 7794 $file{$addr} = delete $args{'File'}; 7795 $full_name{$addr} = delete $args{'Full_Name'} || $name; 7796 $type{$addr} = delete $args{'Type'} || $UNKNOWN; 7797 $pseudo_map_type{$addr} = delete $args{'Map_Type'}; 7798 $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'} 7799 # Starting in this release, property 7800 # values should be defined for all 7801 # properties, except those overriding this 7802 // $v_version ge v5.1.0; 7803 7804 # Rest of parameters passed on. 7805 7806 $has_only_code_point_maps{$addr} = 1; 7807 $table_ref{$addr} = { }; 7808 $unique_maps{$addr} = { }; 7809 7810 $map{$addr} = Map_Table->new($name, 7811 Full_Name => $full_name{$addr}, 7812 _Alias_Hash => \%alias_to_property_of, 7813 _Property => $self, 7814 %args); 7815 return $self; 7816 } 7817 7818 # See this program's beginning comment block about overloading the copy 7819 # constructor. Few operations are defined on properties, but a couple are 7820 # useful. It is safe to take the inverse of a property, and to remove a 7821 # single code point from it. 7822 use overload 7823 fallback => 0, 7824 qw("") => "_operator_stringify", 7825 "." => \&main::_operator_dot, 7826 ".=" => \&main::_operator_dot_equal, 7827 '==' => \&main::_operator_equal, 7828 '!=' => \&main::_operator_not_equal, 7829 '=' => sub { return shift }, 7830 '-=' => "_minus_and_equal", 7831 ; 7832 7833 sub _operator_stringify { 7834 return "Property '" . shift->full_name . "'"; 7835 } 7836 7837 sub _minus_and_equal { 7838 # Remove a single code point from the map table of a property. 7839 7840 my $self = shift; 7841 my $other = shift; 7842 my $reversed = shift; 7843 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7844 7845 if (ref $other) { 7846 Carp::my_carp_bug("Bad news. Can't cope with a " 7847 . ref($other) 7848 . " argument to '-='. Subtraction ignored."); 7849 return $self; 7850 } 7851 elsif ($reversed) { # Shouldn't happen in a -=, but just in case 7852 Carp::my_carp_bug("Bad news. Can't cope with subtracting a " 7853 . ref $self 7854 . " from a non-object. undef returned."); 7855 return; 7856 } 7857 else { 7858 no overloading; 7859 $map{pack 'J', $self}->delete_range($other, $other); 7860 } 7861 return $self; 7862 } 7863 7864 sub add_match_table { 7865 # Add a new match table for this property, with name given by the 7866 # parameter. It returns a pointer to the table. 7867 7868 my $self = shift; 7869 my $name = shift; 7870 my %args = @_; 7871 7872 my $addr = do { no overloading; pack 'J', $self; }; 7873 7874 my $table = $table_ref{$addr}{$name}; 7875 my $standard_name = main::standardize($name); 7876 if (defined $table 7877 || (defined ($table = $table_ref{$addr}{$standard_name}))) 7878 { 7879 Carp::my_carp("Table '$name' in $self is already in use. Using existing one"); 7880 $table_ref{$addr}{$name} = $table; 7881 return $table; 7882 } 7883 else { 7884 7885 # See if this is a perl extension, if not passed in. 7886 my $perl_extension = delete $args{'Perl_Extension'}; 7887 $perl_extension 7888 = $self->perl_extension if ! defined $perl_extension; 7889 7890 $table = Match_Table->new( 7891 Name => $name, 7892 Perl_Extension => $perl_extension, 7893 _Alias_Hash => $table_ref{$addr}, 7894 _Property => $self, 7895 7896 # gets property's fate and status by default 7897 Fate => $self->fate, 7898 Status => $self->status, 7899 _Status_Info => $self->status_info, 7900 %args); 7901 return unless defined $table; 7902 } 7903 7904 # Save the names for quick look up 7905 $table_ref{$addr}{$standard_name} = $table; 7906 $table_ref{$addr}{$name} = $table; 7907 7908 # Perhaps we can figure out the type of this property based on the 7909 # fact of adding this match table. First, string properties don't 7910 # have match tables; second, a binary property can't have 3 match 7911 # tables 7912 if ($type{$addr} == $UNKNOWN) { 7913 $type{$addr} = $NON_STRING; 7914 } 7915 elsif ($type{$addr} == $STRING) { 7916 Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News."); 7917 $type{$addr} = $NON_STRING; 7918 } 7919 elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) { 7920 if (scalar main::uniques(values %{$table_ref{$addr}}) > 2 7921 && $type{$addr} == $BINARY) 7922 { 7923 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News."); 7924 $type{$addr} = $ENUM; 7925 } 7926 } 7927 7928 return $table; 7929 } 7930 7931 sub delete_match_table { 7932 # Delete the table referred to by $2 from the property $1. 7933 7934 my $self = shift; 7935 my $table_to_remove = shift; 7936 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7937 7938 my $addr = do { no overloading; pack 'J', $self; }; 7939 7940 # Remove all names that refer to it. 7941 foreach my $key (keys %{$table_ref{$addr}}) { 7942 delete $table_ref{$addr}{$key} 7943 if $table_ref{$addr}{$key} == $table_to_remove; 7944 } 7945 7946 $table_to_remove->DESTROY; 7947 return; 7948 } 7949 7950 sub table { 7951 # Return a pointer to the match table (with name given by the 7952 # parameter) associated with this property; undef if none. 7953 7954 my $self = shift; 7955 my $name = shift; 7956 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7957 7958 my $addr = do { no overloading; pack 'J', $self; }; 7959 7960 return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name}; 7961 7962 # If quick look-up failed, try again using the standard form of the 7963 # input name. If that succeeds, cache the result before returning so 7964 # won't have to standardize this input name again. 7965 my $standard_name = main::standardize($name); 7966 return unless defined $table_ref{$addr}{$standard_name}; 7967 7968 $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name}; 7969 return $table_ref{$addr}{$name}; 7970 } 7971 7972 sub tables { 7973 # Return a list of pointers to all the match tables attached to this 7974 # property 7975 7976 no overloading; 7977 return main::uniques(values %{$table_ref{pack 'J', shift}}); 7978 } 7979 7980 sub directory { 7981 # Returns the directory the map table for this property should be 7982 # output in. If a specific directory has been specified, that has 7983 # priority; 'undef' is returned if the type isn't defined; 7984 # or $map_directory for everything else. 7985 7986 my $addr = do { no overloading; pack 'J', shift; }; 7987 7988 return $directory{$addr} if defined $directory{$addr}; 7989 return undef if $type{$addr} == $UNKNOWN; 7990 return $map_directory; 7991 } 7992 7993 sub swash_name { 7994 # Return the name that is used to both: 7995 # 1) Name the file that the map table is written to. 7996 # 2) The name of swash related stuff inside that file. 7997 # The reason for this is that the Perl core historically has used 7998 # certain names that aren't the same as the Unicode property names. 7999 # To continue using these, $file is hard-coded in this file for those, 8000 # but otherwise the standard name is used. This is different from the 8001 # external_name, so that the rest of the files, like in lib can use 8002 # the standard name always, without regard to historical precedent. 8003 8004 my $self = shift; 8005 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 8006 8007 my $addr = do { no overloading; pack 'J', $self; }; 8008 8009 # Swash names are used only on regular map tables; otherwise there 8010 # should be no access to the property map table from other parts of 8011 # Perl. 8012 return if $map{$addr}->fate != $ORDINARY; 8013 8014 return $file{$addr} if defined $file{$addr}; 8015 return $map{$addr}->external_name; 8016 } 8017 8018 sub to_create_match_tables { 8019 # Returns a boolean as to whether or not match tables should be 8020 # created for this property. 8021 8022 my $self = shift; 8023 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 8024 8025 # The whole point of this pseudo property is match tables. 8026 return 1 if $self == $perl; 8027 8028 my $addr = do { no overloading; pack 'J', $self; }; 8029 8030 # Don't generate tables of code points that match the property values 8031 # of a string property. Such a list would most likely have many 8032 # property values, each with just one or very few code points mapping 8033 # to it. 8034 return 0 if $type{$addr} == $STRING; 8035 8036 # Don't generate anything for unimplemented properties. 8037 return 0 if grep { $self->complete_name eq $_ } 8038 @unimplemented_properties; 8039 # Otherwise, do. 8040 return 1; 8041 } 8042 8043 sub property_add_or_replace_non_nulls { 8044 # This adds the mappings in the property $other to $self. Non-null 8045 # mappings from $other override those in $self. It essentially merges 8046 # the two properties, with the second having priority except for null 8047 # mappings. 8048 8049 my $self = shift; 8050 my $other = shift; 8051 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 8052 8053 if (! $other->isa(__PACKAGE__)) { 8054 Carp::my_carp_bug("$other should be a " 8055 . __PACKAGE__ 8056 . ". Not a '" 8057 . ref($other) 8058 . "'. Not added;"); 8059 return; 8060 } 8061 8062 no overloading; 8063 return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other}); 8064 } 8065 8066 sub set_proxy_for { 8067 # Certain tables are not generally written out to files, but 8068 # Unicode::UCD has the intelligence to know that the file for $self 8069 # can be used to reconstruct those tables. This routine just changes 8070 # things so that UCD pod entries for those suppressed tables are 8071 # generated, so the fact that a proxy is used is invisible to the 8072 # user. 8073 8074 my $self = shift; 8075 8076 foreach my $property_name (@_) { 8077 my $ref = property_ref($property_name); 8078 next if $ref->to_output_map; 8079 $ref->set_fate($MAP_PROXIED); 8080 } 8081 } 8082 8083 sub set_type { 8084 # Set the type of the property. Mostly this is figured out by the 8085 # data in the table. But this is used to set it explicitly. The 8086 # reason it is not a standard accessor is that when setting a binary 8087 # property, we need to make sure that all the true/false aliases are 8088 # present, as they were omitted in early Unicode releases. 8089 8090 my $self = shift; 8091 my $type = shift; 8092 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 8093 8094 if ($type != $ENUM 8095 && $type != $BINARY 8096 && $type != $FORCED_BINARY 8097 && $type != $STRING) 8098 { 8099 Carp::my_carp("Unrecognized type '$type'. Type not set"); 8100 return; 8101 } 8102 8103 { no overloading; $type{pack 'J', $self} = $type; } 8104 return if $type != $BINARY && $type != $FORCED_BINARY; 8105 8106 my $yes = $self->table('Y'); 8107 $yes = $self->table('Yes') if ! defined $yes; 8108 $yes = $self->add_match_table('Y', Full_Name => 'Yes') 8109 if ! defined $yes; 8110 8111 # Add aliases in order wanted, duplicates will be ignored. We use a 8112 # binary property present in all releases for its ordered lists of 8113 # true/false aliases. Note, that could run into problems in 8114 # outputting things in that we don't distinguish between the name and 8115 # full name of these. Hopefully, if the table was already created 8116 # before this code is executed, it was done with these set properly. 8117 my $bm = property_ref("Bidi_Mirrored"); 8118 foreach my $alias ($bm->table("Y")->aliases) { 8119 $yes->add_alias($alias->name); 8120 } 8121 my $no = $self->table('N'); 8122 $no = $self->table('No') if ! defined $no; 8123 $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no; 8124 foreach my $alias ($bm->table("N")->aliases) { 8125 $no->add_alias($alias->name); 8126 } 8127 8128 return; 8129 } 8130 8131 sub add_map { 8132 # Add a map to the property's map table. This also keeps 8133 # track of the maps so that the property type can be determined from 8134 # its data. 8135 8136 my $self = shift; 8137 my $start = shift; # First code point in range 8138 my $end = shift; # Final code point in range 8139 my $map = shift; # What the range maps to. 8140 # Rest of parameters passed on. 8141 8142 my $addr = do { no overloading; pack 'J', $self; }; 8143 8144 # If haven't the type of the property, gather information to figure it 8145 # out. 8146 if ($type{$addr} == $UNKNOWN) { 8147 8148 # If the map contains an interior blank or dash, or most other 8149 # nonword characters, it will be a string property. This 8150 # heuristic may actually miss some string properties. If so, they 8151 # may need to have explicit set_types called for them. This 8152 # happens in the Unihan properties. 8153 if ($map =~ / (?<= . ) [ -] (?= . ) /x 8154 || $map =~ / [^\w.\/\ -] /x) 8155 { 8156 $self->set_type($STRING); 8157 8158 # $unique_maps is used for disambiguating between ENUM and 8159 # BINARY later; since we know the property is not going to be 8160 # one of those, no point in keeping the data around 8161 undef $unique_maps{$addr}; 8162 } 8163 else { 8164 8165 # Not necessarily a string. The final decision has to be 8166 # deferred until all the data are in. We keep track of if all 8167 # the values are code points for that eventual decision. 8168 $has_only_code_point_maps{$addr} &= 8169 $map =~ / ^ $code_point_re $/x; 8170 8171 # For the purposes of disambiguating between binary and other 8172 # enumerations at the end, we keep track of the first three 8173 # distinct property values. Once we get to three, we know 8174 # it's not going to be binary, so no need to track more. 8175 if (scalar keys %{$unique_maps{$addr}} < 3) { 8176 $unique_maps{$addr}{main::standardize($map)} = 1; 8177 } 8178 } 8179 } 8180 8181 # Add the mapping by calling our map table's method 8182 return $map{$addr}->add_map($start, $end, $map, @_); 8183 } 8184 8185 sub compute_type { 8186 # Compute the type of the property: $ENUM, $STRING, or $BINARY. This 8187 # should be called after the property is mostly filled with its maps. 8188 # We have been keeping track of what the property values have been, 8189 # and now have the necessary information to figure out the type. 8190 8191 my $self = shift; 8192 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 8193 8194 my $addr = do { no overloading; pack 'J', $self; }; 8195 8196 my $type = $type{$addr}; 8197 8198 # If already have figured these out, no need to do so again, but we do 8199 # a double check on ENUMS to make sure that a string property hasn't 8200 # improperly been classified as an ENUM, so continue on with those. 8201 return if $type == $STRING 8202 || $type == $BINARY 8203 || $type == $FORCED_BINARY; 8204 8205 # If every map is to a code point, is a string property. 8206 if ($type == $UNKNOWN 8207 && ($has_only_code_point_maps{$addr} 8208 || (defined $map{$addr}->default_map 8209 && $map{$addr}->default_map eq ""))) 8210 { 8211 $self->set_type($STRING); 8212 } 8213 else { 8214 8215 # Otherwise, it is to some sort of enumeration. (The case where 8216 # it is a Unicode miscellaneous property, and treated like a 8217 # string in this program is handled in add_map()). Distinguish 8218 # between binary and some other enumeration type. Of course, if 8219 # there are more than two values, it's not binary. But more 8220 # subtle is the test that the default mapping is defined means it 8221 # isn't binary. This in fact may change in the future if Unicode 8222 # changes the way its data is structured. But so far, no binary 8223 # properties ever have @missing lines for them, so the default map 8224 # isn't defined for them. The few properties that are two-valued 8225 # and aren't considered binary have the default map defined 8226 # starting in Unicode 5.0, when the @missing lines appeared; and 8227 # this program has special code to put in a default map for them 8228 # for earlier than 5.0 releases. 8229 if ($type == $ENUM 8230 || scalar keys %{$unique_maps{$addr}} > 2 8231 || defined $self->default_map) 8232 { 8233 my $tables = $self->tables; 8234 my $count = $self->count; 8235 if ($verbosity && $count > 500 && $tables/$count > .1) { 8236 Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $count\n"); 8237 } 8238 $self->set_type($ENUM); 8239 } 8240 else { 8241 $self->set_type($BINARY); 8242 } 8243 } 8244 undef $unique_maps{$addr}; # Garbage collect 8245 return; 8246 } 8247 8248 sub set_fate { 8249 my $self = shift; 8250 my $fate = shift; 8251 my $reason = shift; # Ignored unless suppressing 8252 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 8253 8254 my $addr = do { no overloading; pack 'J', $self; }; 8255 if ($fate == $SUPPRESSED) { 8256 $why_suppressed{$self->complete_name} = $reason; 8257 } 8258 8259 # Each table shares the property's fate, except that MAP_PROXIED 8260 # doesn't affect match tables 8261 $map{$addr}->set_fate($fate, $reason); 8262 if ($fate != $MAP_PROXIED) { 8263 foreach my $table ($map{$addr}, $self->tables) { 8264 $table->set_fate($fate, $reason); 8265 } 8266 } 8267 return; 8268 } 8269 8270 8271 # Most of the accessors for a property actually apply to its map table. 8272 # Setup up accessor functions for those, referring to %map 8273 for my $sub (qw( 8274 add_alias 8275 add_anomalous_entry 8276 add_comment 8277 add_conflicting 8278 add_description 8279 add_duplicate 8280 add_note 8281 aliases 8282 comment 8283 complete_name 8284 containing_range 8285 count 8286 default_map 8287 delete_range 8288 description 8289 each_range 8290 external_name 8291 fate 8292 file_path 8293 format 8294 initialize 8295 inverse_list 8296 is_empty 8297 name 8298 note 8299 perl_extension 8300 property 8301 range_count 8302 ranges 8303 range_size_1 8304 reset_each_range 8305 set_comment 8306 set_default_map 8307 set_file_path 8308 set_final_comment 8309 _set_format 8310 set_range_size_1 8311 set_status 8312 set_to_output_map 8313 short_name 8314 status 8315 status_info 8316 to_output_map 8317 type_of 8318 value_of 8319 write 8320 )) 8321 # 'property' above is for symmetry, so that one can take 8322 # the property of a property and get itself, and so don't 8323 # have to distinguish between properties and tables in 8324 # calling code 8325 { 8326 no strict "refs"; 8327 *$sub = sub { 8328 use strict "refs"; 8329 my $self = shift; 8330 no overloading; 8331 return $map{pack 'J', $self}->$sub(@_); 8332 } 8333 } 8334 8335 8336} # End closure 8337 8338package main; 8339 8340sub join_lines($) { 8341 # Returns lines of the input joined together, so that they can be folded 8342 # properly. 8343 # This causes continuation lines to be joined together into one long line 8344 # for folding. A continuation line is any line that doesn't begin with a 8345 # space or "\b" (the latter is stripped from the output). This is so 8346 # lines can be be in a HERE document so as to fit nicely in the terminal 8347 # width, but be joined together in one long line, and then folded with 8348 # indents, '#' prefixes, etc, properly handled. 8349 # A blank separates the joined lines except if there is a break; an extra 8350 # blank is inserted after a period ending a line. 8351 8352 # Initialize the return with the first line. 8353 my ($return, @lines) = split "\n", shift; 8354 8355 # If the first line is null, it was an empty line, add the \n back in 8356 $return = "\n" if $return eq ""; 8357 8358 # Now join the remainder of the physical lines. 8359 for my $line (@lines) { 8360 8361 # An empty line means wanted a blank line, so add two \n's to get that 8362 # effect, and go to the next line. 8363 if (length $line == 0) { 8364 $return .= "\n\n"; 8365 next; 8366 } 8367 8368 # Look at the last character of what we have so far. 8369 my $previous_char = substr($return, -1, 1); 8370 8371 # And at the next char to be output. 8372 my $next_char = substr($line, 0, 1); 8373 8374 if ($previous_char ne "\n") { 8375 8376 # Here didn't end wth a nl. If the next char a blank or \b, it 8377 # means that here there is a break anyway. So add a nl to the 8378 # output. 8379 if ($next_char eq " " || $next_char eq "\b") { 8380 $previous_char = "\n"; 8381 $return .= $previous_char; 8382 } 8383 8384 # Add an extra space after periods. 8385 $return .= " " if $previous_char eq '.'; 8386 } 8387 8388 # Here $previous_char is still the latest character to be output. If 8389 # it isn't a nl, it means that the next line is to be a continuation 8390 # line, with a blank inserted between them. 8391 $return .= " " if $previous_char ne "\n"; 8392 8393 # Get rid of any \b 8394 substr($line, 0, 1) = "" if $next_char eq "\b"; 8395 8396 # And append this next line. 8397 $return .= $line; 8398 } 8399 8400 return $return; 8401} 8402 8403sub simple_fold($;$$$) { 8404 # Returns a string of the input (string or an array of strings) folded 8405 # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus 8406 # a \n 8407 # This is tailored for the kind of text written by this program, 8408 # especially the pod file, which can have very long names with 8409 # underscores in the middle, or words like AbcDefgHij.... We allow 8410 # breaking in the middle of such constructs if the line won't fit 8411 # otherwise. The break in such cases will come either just after an 8412 # underscore, or just before one of the Capital letters. 8413 8414 local $to_trace = 0 if main::DEBUG; 8415 8416 my $line = shift; 8417 my $prefix = shift; # Optional string to prepend to each output 8418 # line 8419 $prefix = "" unless defined $prefix; 8420 8421 my $hanging_indent = shift; # Optional number of spaces to indent 8422 # continuation lines 8423 $hanging_indent = 0 unless $hanging_indent; 8424 8425 my $right_margin = shift; # Optional number of spaces to narrow the 8426 # total width by. 8427 $right_margin = 0 unless defined $right_margin; 8428 8429 # Call carp with the 'nofold' option to avoid it from trying to call us 8430 # recursively 8431 Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_; 8432 8433 # The space available doesn't include what's automatically prepended 8434 # to each line, or what's reserved on the right. 8435 my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin; 8436 # XXX Instead of using the 'nofold' perhaps better to look up the stack 8437 8438 if (DEBUG && $hanging_indent >= $max) { 8439 Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold'); 8440 $hanging_indent = 0; 8441 } 8442 8443 # First, split into the current physical lines. 8444 my @line; 8445 if (ref $line) { # Better be an array, because not bothering to 8446 # test 8447 foreach my $line (@{$line}) { 8448 push @line, split /\n/, $line; 8449 } 8450 } 8451 else { 8452 @line = split /\n/, $line; 8453 } 8454 8455 #local $to_trace = 1 if main::DEBUG; 8456 trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace; 8457 8458 # Look at each current physical line. 8459 for (my $i = 0; $i < @line; $i++) { 8460 Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/; 8461 #local $to_trace = 1 if main::DEBUG; 8462 trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace; 8463 8464 # Remove prefix, because will be added back anyway, don't want 8465 # doubled prefix 8466 $line[$i] =~ s/^$prefix//; 8467 8468 # Remove trailing space 8469 $line[$i] =~ s/\s+\Z//; 8470 8471 # If the line is too long, fold it. 8472 if (length $line[$i] > $max) { 8473 my $remainder; 8474 8475 # Here needs to fold. Save the leading space in the line for 8476 # later. 8477 $line[$i] =~ /^ ( \s* )/x; 8478 my $leading_space = $1; 8479 trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace; 8480 8481 # If character at final permissible position is white space, 8482 # fold there, which will delete that white space 8483 if (substr($line[$i], $max - 1, 1) =~ /\s/) { 8484 $remainder = substr($line[$i], $max); 8485 $line[$i] = substr($line[$i], 0, $max - 1); 8486 } 8487 else { 8488 8489 # Otherwise fold at an acceptable break char closest to 8490 # the max length. Look at just the maximal initial 8491 # segment of the line 8492 my $segment = substr($line[$i], 0, $max - 1); 8493 if ($segment =~ 8494 /^ ( .{$hanging_indent} # Don't look before the 8495 # indent. 8496 \ * # Don't look in leading 8497 # blanks past the indent 8498 [^ ] .* # Find the right-most 8499 (?: # acceptable break: 8500 [ \s = ] # space or equal 8501 | - (?! [.0-9] ) # or non-unary minus. 8502 ) # $1 includes the character 8503 )/x) 8504 { 8505 # Split into the initial part that fits, and remaining 8506 # part of the input 8507 $remainder = substr($line[$i], length $1); 8508 $line[$i] = $1; 8509 trace $line[$i] if DEBUG && $to_trace; 8510 trace $remainder if DEBUG && $to_trace; 8511 } 8512 8513 # If didn't find a good breaking spot, see if there is a 8514 # not-so-good breaking spot. These are just after 8515 # underscores or where the case changes from lower to 8516 # upper. Use \a as a soft hyphen, but give up 8517 # and don't break the line if there is actually a \a 8518 # already in the input. We use an ascii character for the 8519 # soft-hyphen to avoid any attempt by miniperl to try to 8520 # access the files that this program is creating. 8521 elsif ($segment !~ /\a/ 8522 && ($segment =~ s/_/_\a/g 8523 || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg)) 8524 { 8525 # Here were able to find at least one place to insert 8526 # our substitute soft hyphen. Find the right-most one 8527 # and replace it by a real hyphen. 8528 trace $segment if DEBUG && $to_trace; 8529 substr($segment, 8530 rindex($segment, "\a"), 8531 1) = '-'; 8532 8533 # Then remove the soft hyphen substitutes. 8534 $segment =~ s/\a//g; 8535 trace $segment if DEBUG && $to_trace; 8536 8537 # And split into the initial part that fits, and 8538 # remainder of the line 8539 my $pos = rindex($segment, '-'); 8540 $remainder = substr($line[$i], $pos); 8541 trace $remainder if DEBUG && $to_trace; 8542 $line[$i] = substr($segment, 0, $pos + 1); 8543 } 8544 } 8545 8546 # Here we know if we can fold or not. If we can, $remainder 8547 # is what remains to be processed in the next iteration. 8548 if (defined $remainder) { 8549 trace "folded='$line[$i]'" if main::DEBUG && $to_trace; 8550 8551 # Insert the folded remainder of the line as a new element 8552 # of the array. (It may still be too long, but we will 8553 # deal with that next time through the loop.) Omit any 8554 # leading space in the remainder. 8555 $remainder =~ s/^\s+//; 8556 trace "remainder='$remainder'" if main::DEBUG && $to_trace; 8557 8558 # But then indent by whichever is larger of: 8559 # 1) the leading space on the input line; 8560 # 2) the hanging indent. 8561 # This preserves indentation in the original line. 8562 my $lead = ($leading_space) 8563 ? length $leading_space 8564 : $hanging_indent; 8565 $lead = max($lead, $hanging_indent); 8566 splice @line, $i+1, 0, (" " x $lead) . $remainder; 8567 } 8568 } 8569 8570 # Ready to output the line. Get rid of any trailing space 8571 # And prefix by the required $prefix passed in. 8572 $line[$i] =~ s/\s+$//; 8573 $line[$i] = "$prefix$line[$i]\n"; 8574 } # End of looping through all the lines. 8575 8576 return join "", @line; 8577} 8578 8579sub property_ref { # Returns a reference to a property object. 8580 return Property::property_ref(@_); 8581} 8582 8583sub force_unlink ($) { 8584 my $filename = shift; 8585 return unless file_exists($filename); 8586 return if CORE::unlink($filename); 8587 8588 # We might need write permission 8589 chmod 0777, $filename; 8590 CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!"); 8591 return; 8592} 8593 8594sub write ($$@) { 8595 # Given a filename and references to arrays of lines, write the lines of 8596 # each array to the file 8597 # Filename can be given as an arrayref of directory names 8598 8599 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3; 8600 8601 my $file = shift; 8602 my $use_utf8 = shift; 8603 8604 # Get into a single string if an array, and get rid of, in Unix terms, any 8605 # leading '.' 8606 $file= File::Spec->join(@$file) if ref $file eq 'ARRAY'; 8607 $file = File::Spec->canonpath($file); 8608 8609 # If has directories, make sure that they all exist 8610 (undef, my $directories, undef) = File::Spec->splitpath($file); 8611 File::Path::mkpath($directories) if $directories && ! -d $directories; 8612 8613 push @files_actually_output, $file; 8614 8615 force_unlink ($file); 8616 8617 my $OUT; 8618 if (not open $OUT, ">", $file) { 8619 Carp::my_carp("can't open $file for output. Skipping this file: $!"); 8620 return; 8621 } 8622 8623 binmode $OUT, ":utf8" if $use_utf8; 8624 8625 while (defined (my $lines_ref = shift)) { 8626 unless (@$lines_ref) { 8627 Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;"); 8628 } 8629 8630 print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!"); 8631 } 8632 close $OUT or die Carp::my_carp("close '$file' failed: $!"); 8633 8634 print "$file written.\n" if $verbosity >= $VERBOSE; 8635 8636 return; 8637} 8638 8639 8640sub Standardize($) { 8641 # This converts the input name string into a standardized equivalent to 8642 # use internally. 8643 8644 my $name = shift; 8645 unless (defined $name) { 8646 Carp::my_carp_bug("Standardize() called with undef. Returning undef."); 8647 return; 8648 } 8649 8650 # Remove any leading or trailing white space 8651 $name =~ s/^\s+//g; 8652 $name =~ s/\s+$//g; 8653 8654 # Convert interior white space and hyphens into underscores. 8655 $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg; 8656 8657 # Capitalize the letter following an underscore, and convert a sequence of 8658 # multiple underscores to a single one 8659 $name =~ s/ (?<= .) _+ (.) /_\u$1/xg; 8660 8661 # And capitalize the first letter, but not for the special cjk ones. 8662 $name = ucfirst($name) unless $name =~ /^k[A-Z]/; 8663 return $name; 8664} 8665 8666sub standardize ($) { 8667 # Returns a lower-cased standardized name, without underscores. This form 8668 # is chosen so that it can distinguish between any real versus superficial 8669 # Unicode name differences. It relies on the fact that Unicode doesn't 8670 # have interior underscores, white space, nor dashes in any 8671 # stricter-matched name. It should not be used on Unicode code point 8672 # names (the Name property), as they mostly, but not always follow these 8673 # rules. 8674 8675 my $name = Standardize(shift); 8676 return if !defined $name; 8677 8678 $name =~ s/ (?<= .) _ (?= . ) //xg; 8679 return lc $name; 8680} 8681 8682sub utf8_heavy_name ($$) { 8683 # Returns the name that utf8_heavy.pl will use to find a table. XXX 8684 # perhaps this function should be placed somewhere, like Heavy.pl so that 8685 # utf8_heavy can use it directly without duplicating code that can get 8686 # out-of sync. 8687 8688 my $table = shift; 8689 my $alias = shift; 8690 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 8691 8692 my $property = $table->property; 8693 $property = ($property == $perl) 8694 ? "" # 'perl' is never explicitly stated 8695 : standardize($property->name) . '='; 8696 if ($alias->loose_match) { 8697 return $property . standardize($alias->name); 8698 } 8699 else { 8700 return lc ($property . $alias->name); 8701 } 8702 8703 return; 8704} 8705 8706{ # Closure 8707 8708 my $indent_increment = " " x (($debugging_build) ? 2 : 0); 8709 my %already_output; 8710 8711 $main::simple_dumper_nesting = 0; 8712 8713 sub simple_dumper { 8714 # Like Simple Data::Dumper. Good enough for our needs. We can't use 8715 # the real thing as we have to run under miniperl. 8716 8717 # It is designed so that on input it is at the beginning of a line, 8718 # and the final thing output in any call is a trailing ",\n". 8719 8720 my $item = shift; 8721 my $indent = shift; 8722 $indent = "" if ! $debugging_build || ! defined $indent; 8723 8724 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 8725 8726 # nesting level is localized, so that as the call stack pops, it goes 8727 # back to the prior value. 8728 local $main::simple_dumper_nesting = $main::simple_dumper_nesting; 8729 undef %already_output if $main::simple_dumper_nesting == 0; 8730 $main::simple_dumper_nesting++; 8731 #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n"; 8732 8733 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 8734 8735 # Determine the indent for recursive calls. 8736 my $next_indent = $indent . $indent_increment; 8737 8738 my $output; 8739 if (! ref $item) { 8740 8741 # Dump of scalar: just output it in quotes if not a number. To do 8742 # so we must escape certain characters, and therefore need to 8743 # operate on a copy to avoid changing the original 8744 my $copy = $item; 8745 $copy = $UNDEF unless defined $copy; 8746 8747 # Quote non-integers (integers also have optional leading '-') 8748 if ($copy eq "" || $copy !~ /^ -? \d+ $/x) { 8749 8750 # Escape apostrophe and backslash 8751 $copy =~ s/ ( ['\\] ) /\\$1/xg; 8752 $copy = "'$copy'"; 8753 } 8754 $output = "$indent$copy,\n"; 8755 } 8756 else { 8757 8758 # Keep track of cycles in the input, and refuse to infinitely loop 8759 my $addr = do { no overloading; pack 'J', $item; }; 8760 if (defined $already_output{$addr}) { 8761 return "${indent}ALREADY OUTPUT: $item\n"; 8762 } 8763 $already_output{$addr} = $item; 8764 8765 if (ref $item eq 'ARRAY') { 8766 my $using_brackets; 8767 $output = $indent; 8768 if ($main::simple_dumper_nesting > 1) { 8769 $output .= '['; 8770 $using_brackets = 1; 8771 } 8772 else { 8773 $using_brackets = 0; 8774 } 8775 8776 # If the array is empty, put the closing bracket on the same 8777 # line. Otherwise, recursively add each array element 8778 if (@$item == 0) { 8779 $output .= " "; 8780 } 8781 else { 8782 $output .= "\n"; 8783 for (my $i = 0; $i < @$item; $i++) { 8784 8785 # Indent array elements one level 8786 $output .= &simple_dumper($item->[$i], $next_indent); 8787 next if ! $debugging_build; 8788 $output =~ s/\n$//; # Remove any trailing nl so 8789 $output .= " # [$i]\n"; # as to add a comment giving 8790 # the array index 8791 } 8792 $output .= $indent; # Indent closing ']' to orig level 8793 } 8794 $output .= ']' if $using_brackets; 8795 $output .= ",\n"; 8796 } 8797 elsif (ref $item eq 'HASH') { 8798 my $is_first_line; 8799 my $using_braces; 8800 my $body_indent; 8801 8802 # No surrounding braces at top level 8803 $output .= $indent; 8804 if ($main::simple_dumper_nesting > 1) { 8805 $output .= "{\n"; 8806 $is_first_line = 0; 8807 $body_indent = $next_indent; 8808 $next_indent .= $indent_increment; 8809 $using_braces = 1; 8810 } 8811 else { 8812 $is_first_line = 1; 8813 $body_indent = $indent; 8814 $using_braces = 0; 8815 } 8816 8817 # Output hashes sorted alphabetically instead of apparently 8818 # random. Use caseless alphabetic sort 8819 foreach my $key (sort { lc $a cmp lc $b } keys %$item) 8820 { 8821 if ($is_first_line) { 8822 $is_first_line = 0; 8823 } 8824 else { 8825 $output .= "$body_indent"; 8826 } 8827 8828 # The key must be a scalar, but this recursive call quotes 8829 # it 8830 $output .= &simple_dumper($key); 8831 8832 # And change the trailing comma and nl to the hash fat 8833 # comma for clarity, and so the value can be on the same 8834 # line 8835 $output =~ s/,\n$/ => /; 8836 8837 # Recursively call to get the value's dump. 8838 my $next = &simple_dumper($item->{$key}, $next_indent); 8839 8840 # If the value is all on one line, remove its indent, so 8841 # will follow the => immediately. If it takes more than 8842 # one line, start it on a new line. 8843 if ($next !~ /\n.*\n/) { 8844 $next =~ s/^ *//; 8845 } 8846 else { 8847 $output .= "\n"; 8848 } 8849 $output .= $next; 8850 } 8851 8852 $output .= "$indent},\n" if $using_braces; 8853 } 8854 elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') { 8855 $output = $indent . ref($item) . "\n"; 8856 # XXX see if blessed 8857 } 8858 elsif ($item->can('dump')) { 8859 8860 # By convention in this program, objects furnish a 'dump' 8861 # method. Since not doing any output at this level, just pass 8862 # on the input indent 8863 $output = $item->dump($indent); 8864 } 8865 else { 8866 Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping."); 8867 } 8868 } 8869 return $output; 8870 } 8871} 8872 8873sub dump_inside_out { 8874 # Dump inside-out hashes in an object's state by converting them to a 8875 # regular hash and then calling simple_dumper on that. 8876 8877 my $object = shift; 8878 my $fields_ref = shift; 8879 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 8880 8881 my $addr = do { no overloading; pack 'J', $object; }; 8882 8883 my %hash; 8884 foreach my $key (keys %$fields_ref) { 8885 $hash{$key} = $fields_ref->{$key}{$addr}; 8886 } 8887 8888 return simple_dumper(\%hash, @_); 8889} 8890 8891sub _operator_dot { 8892 # Overloaded '.' method that is common to all packages. It uses the 8893 # package's stringify method. 8894 8895 my $self = shift; 8896 my $other = shift; 8897 my $reversed = shift; 8898 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 8899 8900 $other = "" unless defined $other; 8901 8902 foreach my $which (\$self, \$other) { 8903 next unless ref $$which; 8904 if ($$which->can('_operator_stringify')) { 8905 $$which = $$which->_operator_stringify; 8906 } 8907 else { 8908 my $ref = ref $$which; 8909 my $addr = do { no overloading; pack 'J', $$which; }; 8910 $$which = "$ref ($addr)"; 8911 } 8912 } 8913 return ($reversed) 8914 ? "$other$self" 8915 : "$self$other"; 8916} 8917 8918sub _operator_dot_equal { 8919 # Overloaded '.=' method that is common to all packages. 8920 8921 my $self = shift; 8922 my $other = shift; 8923 my $reversed = shift; 8924 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 8925 8926 $other = "" unless defined $other; 8927 8928 if ($reversed) { 8929 return $other .= "$self"; 8930 } 8931 else { 8932 return "$self" . "$other"; 8933 } 8934} 8935 8936sub _operator_equal { 8937 # Generic overloaded '==' routine. To be equal, they must be the exact 8938 # same object 8939 8940 my $self = shift; 8941 my $other = shift; 8942 8943 return 0 unless defined $other; 8944 return 0 unless ref $other; 8945 no overloading; 8946 return $self == $other; 8947} 8948 8949sub _operator_not_equal { 8950 my $self = shift; 8951 my $other = shift; 8952 8953 return ! _operator_equal($self, $other); 8954} 8955 8956sub process_PropertyAliases($) { 8957 # This reads in the PropertyAliases.txt file, which contains almost all 8958 # the character properties in Unicode and their equivalent aliases: 8959 # scf ; Simple_Case_Folding ; sfc 8960 # 8961 # Field 0 is the preferred short name for the property. 8962 # Field 1 is the full name. 8963 # Any succeeding ones are other accepted names. 8964 8965 my $file= shift; 8966 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 8967 8968 # This whole file was non-existent in early releases, so use our own 8969 # internal one. 8970 $file->insert_lines(get_old_property_aliases()) 8971 if ! -e 'PropertyAliases.txt'; 8972 8973 # Add any cjk properties that may have been defined. 8974 $file->insert_lines(@cjk_properties); 8975 8976 while ($file->next_line) { 8977 8978 my @data = split /\s*;\s*/; 8979 8980 my $full = $data[1]; 8981 8982 my $this = Property->new($data[0], Full_Name => $full); 8983 8984 # Start looking for more aliases after these two. 8985 for my $i (2 .. @data - 1) { 8986 $this->add_alias($data[$i]); 8987 } 8988 8989 } 8990 8991 my $scf = property_ref("Simple_Case_Folding"); 8992 $scf->add_alias("scf"); 8993 $scf->add_alias("sfc"); 8994 8995 return; 8996} 8997 8998sub finish_property_setup { 8999 # Finishes setting up after PropertyAliases. 9000 9001 my $file = shift; 9002 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 9003 9004 # This entry was missing from this file in earlier Unicode versions 9005 if (-e 'Jamo.txt' && ! defined property_ref('JSN')) { 9006 Property->new('JSN', Full_Name => 'Jamo_Short_Name'); 9007 } 9008 9009 # These two properties must be defined in all releases so we can generate 9010 # the tables from them to make regex \X work, but suppress their output so 9011 # aren't application visible prior to releases where they should be 9012 if (! defined property_ref('GCB')) { 9013 Property->new('GCB', Full_Name => 'Grapheme_Cluster_Break', 9014 Fate => $PLACEHOLDER); 9015 } 9016 if (! defined property_ref('hst')) { 9017 Property->new('hst', Full_Name => 'Hangul_Syllable_Type', 9018 Fate => $PLACEHOLDER); 9019 } 9020 9021 # These are used so much, that we set globals for them. 9022 $gc = property_ref('General_Category'); 9023 $block = property_ref('Block'); 9024 $script = property_ref('Script'); 9025 9026 # Perl adds this alias. 9027 $gc->add_alias('Category'); 9028 9029 # Unicode::Normalize expects this file with this name and directory. 9030 my $ccc = property_ref('Canonical_Combining_Class'); 9031 if (defined $ccc) { 9032 $ccc->set_file('CombiningClass'); 9033 $ccc->set_directory(File::Spec->curdir()); 9034 } 9035 9036 # These two properties aren't actually used in the core, but unfortunately 9037 # the names just above that are in the core interfere with these, so 9038 # choose different names. These aren't a problem unless the map tables 9039 # for these files get written out. 9040 my $lowercase = property_ref('Lowercase'); 9041 $lowercase->set_file('IsLower') if defined $lowercase; 9042 my $uppercase = property_ref('Uppercase'); 9043 $uppercase->set_file('IsUpper') if defined $uppercase; 9044 9045 # Set up the hard-coded default mappings, but only on properties defined 9046 # for this release 9047 foreach my $property (keys %default_mapping) { 9048 my $property_object = property_ref($property); 9049 next if ! defined $property_object; 9050 my $default_map = $default_mapping{$property}; 9051 $property_object->set_default_map($default_map); 9052 9053 # A map of <code point> implies the property is string. 9054 if ($property_object->type == $UNKNOWN 9055 && $default_map eq $CODE_POINT) 9056 { 9057 $property_object->set_type($STRING); 9058 } 9059 } 9060 9061 # The following use the Multi_Default class to create objects for 9062 # defaults. 9063 9064 # Bidi class has a complicated default, but the derived file takes care of 9065 # the complications, leaving just 'L'. 9066 if (file_exists("${EXTRACTED}DBidiClass.txt")) { 9067 property_ref('Bidi_Class')->set_default_map('L'); 9068 } 9069 else { 9070 my $default; 9071 9072 # The derived file was introduced in 3.1.1. The values below are 9073 # taken from table 3-8, TUS 3.0 9074 my $default_R = 9075 'my $default = Range_List->new; 9076 $default->add_range(0x0590, 0x05FF); 9077 $default->add_range(0xFB1D, 0xFB4F);' 9078 ; 9079 9080 # The defaults apply only to unassigned characters 9081 $default_R .= '$gc->table("Unassigned") & $default;'; 9082 9083 if ($v_version lt v3.0.0) { 9084 $default = Multi_Default->new(R => $default_R, 'L'); 9085 } 9086 else { 9087 9088 # AL apparently not introduced until 3.0: TUS 2.x references are 9089 # not on-line to check it out 9090 my $default_AL = 9091 'my $default = Range_List->new; 9092 $default->add_range(0x0600, 0x07BF); 9093 $default->add_range(0xFB50, 0xFDFF); 9094 $default->add_range(0xFE70, 0xFEFF);' 9095 ; 9096 9097 # Non-character code points introduced in this release; aren't AL 9098 if ($v_version ge 3.1.0) { 9099 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);'; 9100 } 9101 $default_AL .= '$gc->table("Unassigned") & $default'; 9102 $default = Multi_Default->new(AL => $default_AL, 9103 R => $default_R, 9104 'L'); 9105 } 9106 property_ref('Bidi_Class')->set_default_map($default); 9107 } 9108 9109 # Joining type has a complicated default, but the derived file takes care 9110 # of the complications, leaving just 'U' (or Non_Joining), except the file 9111 # is bad in 3.1.0 9112 if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') { 9113 if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) { 9114 property_ref('Joining_Type')->set_default_map('Non_Joining'); 9115 } 9116 else { 9117 9118 # Otherwise, there are not one, but two possibilities for the 9119 # missing defaults: T and U. 9120 # The missing defaults that evaluate to T are given by: 9121 # T = Mn + Cf - ZWNJ - ZWJ 9122 # where Mn and Cf are the general category values. In other words, 9123 # any non-spacing mark or any format control character, except 9124 # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO 9125 # WIDTH JOINER (joining type C). 9126 my $default = Multi_Default->new( 9127 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D', 9128 'Non_Joining'); 9129 property_ref('Joining_Type')->set_default_map($default); 9130 } 9131 } 9132 9133 # Line break has a complicated default in early releases. It is 'Unknown' 9134 # for non-assigned code points; 'AL' for assigned. 9135 if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') { 9136 my $lb = property_ref('Line_Break'); 9137 if ($v_version gt 3.2.0) { 9138 $lb->set_default_map('Unknown'); 9139 } 9140 else { 9141 my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")', 9142 'AL'); 9143 $lb->set_default_map($default); 9144 } 9145 9146 # If has the URS property, make sure that the standard aliases are in 9147 # it, since not in the input tables in some versions. 9148 my $urs = property_ref('Unicode_Radical_Stroke'); 9149 if (defined $urs) { 9150 $urs->add_alias('cjkRSUnicode'); 9151 $urs->add_alias('kRSUnicode'); 9152 } 9153 } 9154 9155 # For backwards compatibility with applications that may read the mapping 9156 # file directly (it was documented in 5.12 and 5.14 as being thusly 9157 # usable), keep it from being adjusted. (range_size_1 is 9158 # used to force the traditional format.) 9159 if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) { 9160 $nfkc_cf->set_to_output_map($EXTERNAL_MAP); 9161 $nfkc_cf->set_range_size_1(1); 9162 } 9163 if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) { 9164 $bmg->set_to_output_map($EXTERNAL_MAP); 9165 $bmg->set_range_size_1(1); 9166 } 9167 9168 property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED); 9169 9170 return; 9171} 9172 9173sub get_old_property_aliases() { 9174 # Returns what would be in PropertyAliases.txt if it existed in very old 9175 # versions of Unicode. It was derived from the one in 3.2, and pared 9176 # down based on the data that was actually in the older releases. 9177 # An attempt was made to use the existence of files to mean inclusion or 9178 # not of various aliases, but if this was not sufficient, using version 9179 # numbers was resorted to. 9180 9181 my @return; 9182 9183 # These are to be used in all versions (though some are constructed by 9184 # this program if missing) 9185 push @return, split /\n/, <<'END'; 9186bc ; Bidi_Class 9187Bidi_M ; Bidi_Mirrored 9188cf ; Case_Folding 9189ccc ; Canonical_Combining_Class 9190dm ; Decomposition_Mapping 9191dt ; Decomposition_Type 9192gc ; General_Category 9193isc ; ISO_Comment 9194lc ; Lowercase_Mapping 9195na ; Name 9196na1 ; Unicode_1_Name 9197nt ; Numeric_Type 9198nv ; Numeric_Value 9199scf ; Simple_Case_Folding 9200slc ; Simple_Lowercase_Mapping 9201stc ; Simple_Titlecase_Mapping 9202suc ; Simple_Uppercase_Mapping 9203tc ; Titlecase_Mapping 9204uc ; Uppercase_Mapping 9205END 9206 9207 if (-e 'Blocks.txt') { 9208 push @return, "blk ; Block\n"; 9209 } 9210 if (-e 'ArabicShaping.txt') { 9211 push @return, split /\n/, <<'END'; 9212jg ; Joining_Group 9213jt ; Joining_Type 9214END 9215 } 9216 if (-e 'PropList.txt') { 9217 9218 # This first set is in the original old-style proplist. 9219 push @return, split /\n/, <<'END'; 9220Bidi_C ; Bidi_Control 9221Dash ; Dash 9222Dia ; Diacritic 9223Ext ; Extender 9224Hex ; Hex_Digit 9225Hyphen ; Hyphen 9226IDC ; ID_Continue 9227Ideo ; Ideographic 9228Join_C ; Join_Control 9229Math ; Math 9230QMark ; Quotation_Mark 9231Term ; Terminal_Punctuation 9232WSpace ; White_Space 9233END 9234 # The next sets were added later 9235 if ($v_version ge v3.0.0) { 9236 push @return, split /\n/, <<'END'; 9237Upper ; Uppercase 9238Lower ; Lowercase 9239END 9240 } 9241 if ($v_version ge v3.0.1) { 9242 push @return, split /\n/, <<'END'; 9243NChar ; Noncharacter_Code_Point 9244END 9245 } 9246 # The next sets were added in the new-style 9247 if ($v_version ge v3.1.0) { 9248 push @return, split /\n/, <<'END'; 9249OAlpha ; Other_Alphabetic 9250OLower ; Other_Lowercase 9251OMath ; Other_Math 9252OUpper ; Other_Uppercase 9253END 9254 } 9255 if ($v_version ge v3.1.1) { 9256 push @return, "AHex ; ASCII_Hex_Digit\n"; 9257 } 9258 } 9259 if (-e 'EastAsianWidth.txt') { 9260 push @return, "ea ; East_Asian_Width\n"; 9261 } 9262 if (-e 'CompositionExclusions.txt') { 9263 push @return, "CE ; Composition_Exclusion\n"; 9264 } 9265 if (-e 'LineBreak.txt') { 9266 push @return, "lb ; Line_Break\n"; 9267 } 9268 if (-e 'BidiMirroring.txt') { 9269 push @return, "bmg ; Bidi_Mirroring_Glyph\n"; 9270 } 9271 if (-e 'Scripts.txt') { 9272 push @return, "sc ; Script\n"; 9273 } 9274 if (-e 'DNormalizationProps.txt') { 9275 push @return, split /\n/, <<'END'; 9276Comp_Ex ; Full_Composition_Exclusion 9277FC_NFKC ; FC_NFKC_Closure 9278NFC_QC ; NFC_Quick_Check 9279NFD_QC ; NFD_Quick_Check 9280NFKC_QC ; NFKC_Quick_Check 9281NFKD_QC ; NFKD_Quick_Check 9282XO_NFC ; Expands_On_NFC 9283XO_NFD ; Expands_On_NFD 9284XO_NFKC ; Expands_On_NFKC 9285XO_NFKD ; Expands_On_NFKD 9286END 9287 } 9288 if (-e 'DCoreProperties.txt') { 9289 push @return, split /\n/, <<'END'; 9290Alpha ; Alphabetic 9291IDS ; ID_Start 9292XIDC ; XID_Continue 9293XIDS ; XID_Start 9294END 9295 # These can also appear in some versions of PropList.txt 9296 push @return, "Lower ; Lowercase\n" 9297 unless grep { $_ =~ /^Lower\b/} @return; 9298 push @return, "Upper ; Uppercase\n" 9299 unless grep { $_ =~ /^Upper\b/} @return; 9300 } 9301 9302 # This flag requires the DAge.txt file to be copied into the directory. 9303 if (DEBUG && $compare_versions) { 9304 push @return, 'age ; Age'; 9305 } 9306 9307 return @return; 9308} 9309 9310sub process_PropValueAliases { 9311 # This file contains values that properties look like: 9312 # bc ; AL ; Arabic_Letter 9313 # blk; n/a ; Greek_And_Coptic ; Greek 9314 # 9315 # Field 0 is the property. 9316 # Field 1 is the short name of a property value or 'n/a' if no 9317 # short name exists; 9318 # Field 2 is the full property value name; 9319 # Any other fields are more synonyms for the property value. 9320 # Purely numeric property values are omitted from the file; as are some 9321 # others, fewer and fewer in later releases 9322 9323 # Entries for the ccc property have an extra field before the 9324 # abbreviation: 9325 # ccc; 0; NR ; Not_Reordered 9326 # It is the numeric value that the names are synonyms for. 9327 9328 # There are comment entries for values missing from this file: 9329 # # @missing: 0000..10FFFF; ISO_Comment; <none> 9330 # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point> 9331 9332 my $file= shift; 9333 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 9334 9335 # This whole file was non-existent in early releases, so use our own 9336 # internal one if necessary. 9337 if (! -e 'PropValueAliases.txt') { 9338 $file->insert_lines(get_old_property_value_aliases()); 9339 } 9340 9341 if ($v_version lt 4.0.0) { 9342 $file->insert_lines(split /\n/, <<'END' 9343hst; L ; Leading_Jamo 9344hst; LV ; LV_Syllable 9345hst; LVT ; LVT_Syllable 9346hst; NA ; Not_Applicable 9347hst; T ; Trailing_Jamo 9348hst; V ; Vowel_Jamo 9349END 9350 ); 9351 } 9352 if ($v_version lt 4.1.0) { 9353 $file->insert_lines(split /\n/, <<'END' 9354GCB; CN ; Control 9355GCB; CR ; CR 9356GCB; EX ; Extend 9357GCB; L ; L 9358GCB; LF ; LF 9359GCB; LV ; LV 9360GCB; LVT ; LVT 9361GCB; T ; T 9362GCB; V ; V 9363GCB; XX ; Other 9364END 9365 ); 9366 } 9367 9368 9369 # Add any explicit cjk values 9370 $file->insert_lines(@cjk_property_values); 9371 9372 # This line is used only for testing the code that checks for name 9373 # conflicts. There is a script Inherited, and when this line is executed 9374 # it causes there to be a name conflict with the 'Inherited' that this 9375 # program generates for this block property value 9376 #$file->insert_lines('blk; n/a; Herited'); 9377 9378 9379 # Process each line of the file ... 9380 while ($file->next_line) { 9381 9382 # Fix typo in input file 9383 s/CCC133/CCC132/g if $v_version eq v6.1.0; 9384 9385 my ($property, @data) = split /\s*;\s*/; 9386 9387 # The ccc property has an extra field at the beginning, which is the 9388 # numeric value. Move it to be after the other two, mnemonic, fields, 9389 # so that those will be used as the property value's names, and the 9390 # number will be an extra alias. (Rightmost splice removes field 1-2, 9391 # returning them in a slice; left splice inserts that before anything, 9392 # thus shifting the former field 0 to after them.) 9393 splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc'; 9394 9395 # Field 0 is a short name unless "n/a"; field 1 is the full name. If 9396 # there is no short name, use the full one in element 1 9397 if ($data[0] eq "n/a") { 9398 $data[0] = $data[1]; 9399 } 9400 elsif ($data[0] ne $data[1] 9401 && standardize($data[0]) eq standardize($data[1]) 9402 && $data[1] !~ /[[:upper:]]/) 9403 { 9404 # Also, there is a bug in the file in which "n/a" is omitted, and 9405 # the two fields are identical except for case, and the full name 9406 # is all lower case. Copy the "short" name unto the full one to 9407 # give it some upper case. 9408 9409 $data[1] = $data[0]; 9410 } 9411 9412 # Earlier releases had the pseudo property 'qc' that should expand to 9413 # the ones that replace it below. 9414 if ($property eq 'qc') { 9415 if (lc $data[0] eq 'y') { 9416 $file->insert_lines('NFC_QC; Y ; Yes', 9417 'NFD_QC; Y ; Yes', 9418 'NFKC_QC; Y ; Yes', 9419 'NFKD_QC; Y ; Yes', 9420 ); 9421 } 9422 elsif (lc $data[0] eq 'n') { 9423 $file->insert_lines('NFC_QC; N ; No', 9424 'NFD_QC; N ; No', 9425 'NFKC_QC; N ; No', 9426 'NFKD_QC; N ; No', 9427 ); 9428 } 9429 elsif (lc $data[0] eq 'm') { 9430 $file->insert_lines('NFC_QC; M ; Maybe', 9431 'NFKC_QC; M ; Maybe', 9432 ); 9433 } 9434 else { 9435 $file->carp_bad_line("qc followed by unexpected '$data[0]"); 9436 } 9437 next; 9438 } 9439 9440 # The first field is the short name, 2nd is the full one. 9441 my $property_object = property_ref($property); 9442 my $table = $property_object->add_match_table($data[0], 9443 Full_Name => $data[1]); 9444 9445 # Start looking for more aliases after these two. 9446 for my $i (2 .. @data - 1) { 9447 $table->add_alias($data[$i]); 9448 } 9449 } # End of looping through the file 9450 9451 # As noted in the comments early in the program, it generates tables for 9452 # the default values for all releases, even those for which the concept 9453 # didn't exist at the time. Here we add those if missing. 9454 my $age = property_ref('age'); 9455 if (defined $age && ! defined $age->table('Unassigned')) { 9456 $age->add_match_table('Unassigned'); 9457 } 9458 $block->add_match_table('No_Block') if -e 'Blocks.txt' 9459 && ! defined $block->table('No_Block'); 9460 9461 9462 # Now set the default mappings of the properties from the file. This is 9463 # done after the loop because a number of properties have only @missings 9464 # entries in the file, and may not show up until the end. 9465 my @defaults = $file->get_missings; 9466 foreach my $default_ref (@defaults) { 9467 my $default = $default_ref->[0]; 9468 my $property = property_ref($default_ref->[1]); 9469 $property->set_default_map($default); 9470 } 9471 return; 9472} 9473 9474sub get_old_property_value_aliases () { 9475 # Returns what would be in PropValueAliases.txt if it existed in very old 9476 # versions of Unicode. It was derived from the one in 3.2, and pared 9477 # down. An attempt was made to use the existence of files to mean 9478 # inclusion or not of various aliases, but if this was not sufficient, 9479 # using version numbers was resorted to. 9480 9481 my @return = split /\n/, <<'END'; 9482bc ; AN ; Arabic_Number 9483bc ; B ; Paragraph_Separator 9484bc ; CS ; Common_Separator 9485bc ; EN ; European_Number 9486bc ; ES ; European_Separator 9487bc ; ET ; European_Terminator 9488bc ; L ; Left_To_Right 9489bc ; ON ; Other_Neutral 9490bc ; R ; Right_To_Left 9491bc ; WS ; White_Space 9492 9493Bidi_M; N; No; F; False 9494Bidi_M; Y; Yes; T; True 9495 9496# The standard combining classes are very much different in v1, so only use 9497# ones that look right (not checked thoroughly) 9498ccc; 0; NR ; Not_Reordered 9499ccc; 1; OV ; Overlay 9500ccc; 7; NK ; Nukta 9501ccc; 8; KV ; Kana_Voicing 9502ccc; 9; VR ; Virama 9503ccc; 202; ATBL ; Attached_Below_Left 9504ccc; 216; ATAR ; Attached_Above_Right 9505ccc; 218; BL ; Below_Left 9506ccc; 220; B ; Below 9507ccc; 222; BR ; Below_Right 9508ccc; 224; L ; Left 9509ccc; 228; AL ; Above_Left 9510ccc; 230; A ; Above 9511ccc; 232; AR ; Above_Right 9512ccc; 234; DA ; Double_Above 9513 9514dt ; can ; canonical 9515dt ; enc ; circle 9516dt ; fin ; final 9517dt ; font ; font 9518dt ; fra ; fraction 9519dt ; init ; initial 9520dt ; iso ; isolated 9521dt ; med ; medial 9522dt ; n/a ; none 9523dt ; nb ; noBreak 9524dt ; sqr ; square 9525dt ; sub ; sub 9526dt ; sup ; super 9527 9528gc ; C ; Other # Cc | Cf | Cn | Co | Cs 9529gc ; Cc ; Control 9530gc ; Cn ; Unassigned 9531gc ; Co ; Private_Use 9532gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu 9533gc ; LC ; Cased_Letter # Ll | Lt | Lu 9534gc ; Ll ; Lowercase_Letter 9535gc ; Lm ; Modifier_Letter 9536gc ; Lo ; Other_Letter 9537gc ; Lu ; Uppercase_Letter 9538gc ; M ; Mark # Mc | Me | Mn 9539gc ; Mc ; Spacing_Mark 9540gc ; Mn ; Nonspacing_Mark 9541gc ; N ; Number # Nd | Nl | No 9542gc ; Nd ; Decimal_Number 9543gc ; No ; Other_Number 9544gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps 9545gc ; Pd ; Dash_Punctuation 9546gc ; Pe ; Close_Punctuation 9547gc ; Po ; Other_Punctuation 9548gc ; Ps ; Open_Punctuation 9549gc ; S ; Symbol # Sc | Sk | Sm | So 9550gc ; Sc ; Currency_Symbol 9551gc ; Sm ; Math_Symbol 9552gc ; So ; Other_Symbol 9553gc ; Z ; Separator # Zl | Zp | Zs 9554gc ; Zl ; Line_Separator 9555gc ; Zp ; Paragraph_Separator 9556gc ; Zs ; Space_Separator 9557 9558nt ; de ; Decimal 9559nt ; di ; Digit 9560nt ; n/a ; None 9561nt ; nu ; Numeric 9562END 9563 9564 if (-e 'ArabicShaping.txt') { 9565 push @return, split /\n/, <<'END'; 9566jg ; n/a ; AIN 9567jg ; n/a ; ALEF 9568jg ; n/a ; DAL 9569jg ; n/a ; GAF 9570jg ; n/a ; LAM 9571jg ; n/a ; MEEM 9572jg ; n/a ; NO_JOINING_GROUP 9573jg ; n/a ; NOON 9574jg ; n/a ; QAF 9575jg ; n/a ; SAD 9576jg ; n/a ; SEEN 9577jg ; n/a ; TAH 9578jg ; n/a ; WAW 9579 9580jt ; C ; Join_Causing 9581jt ; D ; Dual_Joining 9582jt ; L ; Left_Joining 9583jt ; R ; Right_Joining 9584jt ; U ; Non_Joining 9585jt ; T ; Transparent 9586END 9587 if ($v_version ge v3.0.0) { 9588 push @return, split /\n/, <<'END'; 9589jg ; n/a ; ALAPH 9590jg ; n/a ; BEH 9591jg ; n/a ; BETH 9592jg ; n/a ; DALATH_RISH 9593jg ; n/a ; E 9594jg ; n/a ; FEH 9595jg ; n/a ; FINAL_SEMKATH 9596jg ; n/a ; GAMAL 9597jg ; n/a ; HAH 9598jg ; n/a ; HAMZA_ON_HEH_GOAL 9599jg ; n/a ; HE 9600jg ; n/a ; HEH 9601jg ; n/a ; HEH_GOAL 9602jg ; n/a ; HETH 9603jg ; n/a ; KAF 9604jg ; n/a ; KAPH 9605jg ; n/a ; KNOTTED_HEH 9606jg ; n/a ; LAMADH 9607jg ; n/a ; MIM 9608jg ; n/a ; NUN 9609jg ; n/a ; PE 9610jg ; n/a ; QAPH 9611jg ; n/a ; REH 9612jg ; n/a ; REVERSED_PE 9613jg ; n/a ; SADHE 9614jg ; n/a ; SEMKATH 9615jg ; n/a ; SHIN 9616jg ; n/a ; SWASH_KAF 9617jg ; n/a ; TAW 9618jg ; n/a ; TEH_MARBUTA 9619jg ; n/a ; TETH 9620jg ; n/a ; YEH 9621jg ; n/a ; YEH_BARREE 9622jg ; n/a ; YEH_WITH_TAIL 9623jg ; n/a ; YUDH 9624jg ; n/a ; YUDH_HE 9625jg ; n/a ; ZAIN 9626END 9627 } 9628 } 9629 9630 9631 if (-e 'EastAsianWidth.txt') { 9632 push @return, split /\n/, <<'END'; 9633ea ; A ; Ambiguous 9634ea ; F ; Fullwidth 9635ea ; H ; Halfwidth 9636ea ; N ; Neutral 9637ea ; Na ; Narrow 9638ea ; W ; Wide 9639END 9640 } 9641 9642 if (-e 'LineBreak.txt') { 9643 push @return, split /\n/, <<'END'; 9644lb ; AI ; Ambiguous 9645lb ; AL ; Alphabetic 9646lb ; B2 ; Break_Both 9647lb ; BA ; Break_After 9648lb ; BB ; Break_Before 9649lb ; BK ; Mandatory_Break 9650lb ; CB ; Contingent_Break 9651lb ; CL ; Close_Punctuation 9652lb ; CM ; Combining_Mark 9653lb ; CR ; Carriage_Return 9654lb ; EX ; Exclamation 9655lb ; GL ; Glue 9656lb ; HY ; Hyphen 9657lb ; ID ; Ideographic 9658lb ; IN ; Inseperable 9659lb ; IS ; Infix_Numeric 9660lb ; LF ; Line_Feed 9661lb ; NS ; Nonstarter 9662lb ; NU ; Numeric 9663lb ; OP ; Open_Punctuation 9664lb ; PO ; Postfix_Numeric 9665lb ; PR ; Prefix_Numeric 9666lb ; QU ; Quotation 9667lb ; SA ; Complex_Context 9668lb ; SG ; Surrogate 9669lb ; SP ; Space 9670lb ; SY ; Break_Symbols 9671lb ; XX ; Unknown 9672lb ; ZW ; ZWSpace 9673END 9674 } 9675 9676 if (-e 'DNormalizationProps.txt') { 9677 push @return, split /\n/, <<'END'; 9678qc ; M ; Maybe 9679qc ; N ; No 9680qc ; Y ; Yes 9681END 9682 } 9683 9684 if (-e 'Scripts.txt') { 9685 push @return, split /\n/, <<'END'; 9686sc ; Arab ; Arabic 9687sc ; Armn ; Armenian 9688sc ; Beng ; Bengali 9689sc ; Bopo ; Bopomofo 9690sc ; Cans ; Canadian_Aboriginal 9691sc ; Cher ; Cherokee 9692sc ; Cyrl ; Cyrillic 9693sc ; Deva ; Devanagari 9694sc ; Dsrt ; Deseret 9695sc ; Ethi ; Ethiopic 9696sc ; Geor ; Georgian 9697sc ; Goth ; Gothic 9698sc ; Grek ; Greek 9699sc ; Gujr ; Gujarati 9700sc ; Guru ; Gurmukhi 9701sc ; Hang ; Hangul 9702sc ; Hani ; Han 9703sc ; Hebr ; Hebrew 9704sc ; Hira ; Hiragana 9705sc ; Ital ; Old_Italic 9706sc ; Kana ; Katakana 9707sc ; Khmr ; Khmer 9708sc ; Knda ; Kannada 9709sc ; Laoo ; Lao 9710sc ; Latn ; Latin 9711sc ; Mlym ; Malayalam 9712sc ; Mong ; Mongolian 9713sc ; Mymr ; Myanmar 9714sc ; Ogam ; Ogham 9715sc ; Orya ; Oriya 9716sc ; Qaai ; Inherited 9717sc ; Runr ; Runic 9718sc ; Sinh ; Sinhala 9719sc ; Syrc ; Syriac 9720sc ; Taml ; Tamil 9721sc ; Telu ; Telugu 9722sc ; Thaa ; Thaana 9723sc ; Thai ; Thai 9724sc ; Tibt ; Tibetan 9725sc ; Yiii ; Yi 9726sc ; Zyyy ; Common 9727END 9728 } 9729 9730 if ($v_version ge v2.0.0) { 9731 push @return, split /\n/, <<'END'; 9732dt ; com ; compat 9733dt ; nar ; narrow 9734dt ; sml ; small 9735dt ; vert ; vertical 9736dt ; wide ; wide 9737 9738gc ; Cf ; Format 9739gc ; Cs ; Surrogate 9740gc ; Lt ; Titlecase_Letter 9741gc ; Me ; Enclosing_Mark 9742gc ; Nl ; Letter_Number 9743gc ; Pc ; Connector_Punctuation 9744gc ; Sk ; Modifier_Symbol 9745END 9746 } 9747 if ($v_version ge v2.1.2) { 9748 push @return, "bc ; S ; Segment_Separator\n"; 9749 } 9750 if ($v_version ge v2.1.5) { 9751 push @return, split /\n/, <<'END'; 9752gc ; Pf ; Final_Punctuation 9753gc ; Pi ; Initial_Punctuation 9754END 9755 } 9756 if ($v_version ge v2.1.8) { 9757 push @return, "ccc; 240; IS ; Iota_Subscript\n"; 9758 } 9759 9760 if ($v_version ge v3.0.0) { 9761 push @return, split /\n/, <<'END'; 9762bc ; AL ; Arabic_Letter 9763bc ; BN ; Boundary_Neutral 9764bc ; LRE ; Left_To_Right_Embedding 9765bc ; LRO ; Left_To_Right_Override 9766bc ; NSM ; Nonspacing_Mark 9767bc ; PDF ; Pop_Directional_Format 9768bc ; RLE ; Right_To_Left_Embedding 9769bc ; RLO ; Right_To_Left_Override 9770 9771ccc; 233; DB ; Double_Below 9772END 9773 } 9774 9775 if ($v_version ge v3.1.0) { 9776 push @return, "ccc; 226; R ; Right\n"; 9777 } 9778 9779 return @return; 9780} 9781 9782sub process_NormalizationsTest { 9783 9784 # Each line looks like: 9785 # source code point; NFC; NFD; NFKC; NFKD 9786 # e.g. 9787 # 1E0A;1E0A;0044 0307;1E0A;0044 0307; 9788 9789 my $file= shift; 9790 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 9791 9792 # Process each line of the file ... 9793 while ($file->next_line) { 9794 9795 next if /^@/; 9796 9797 my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/; 9798 9799 foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) { 9800 $$var = pack "U0U*", map { hex } split " ", $$var; 9801 $$var =~ s/(\\)/$1$1/g; 9802 } 9803 9804 push @normalization_tests, 9805 "Test_N(q$c1, q$c2, q$c3, q$c4, q$c5);\n"; 9806 } # End of looping through the file 9807} 9808 9809sub output_perl_charnames_line ($$) { 9810 9811 # Output the entries in Perl_charnames specially, using 5 digits instead 9812 # of four. This makes the entries a constant length, and simplifies 9813 # charnames.pm which this table is for. Unicode can have 6 digit 9814 # ordinals, but they are all private use or noncharacters which do not 9815 # have names, so won't be in this table. 9816 9817 return sprintf "%05X\t%s\n", $_[0], $_[1]; 9818} 9819 9820{ # Closure 9821 # This is used to store the range list of all the code points usable when 9822 # the little used $compare_versions feature is enabled. 9823 my $compare_versions_range_list; 9824 9825 # These are constants to the $property_info hash in this subroutine, to 9826 # avoid using a quoted-string which might have a typo. 9827 my $TYPE = 'type'; 9828 my $DEFAULT_MAP = 'default_map'; 9829 my $DEFAULT_TABLE = 'default_table'; 9830 my $PSEUDO_MAP_TYPE = 'pseudo_map_type'; 9831 my $MISSINGS = 'missings'; 9832 9833 sub process_generic_property_file { 9834 # This processes a file containing property mappings and puts them 9835 # into internal map tables. It should be used to handle any property 9836 # files that have mappings from a code point or range thereof to 9837 # something else. This means almost all the UCD .txt files. 9838 # each_line_handlers() should be set to adjust the lines of these 9839 # files, if necessary, to what this routine understands: 9840 # 9841 # 0374 ; NFD_QC; N 9842 # 003C..003E ; Math 9843 # 9844 # the fields are: "codepoint-range ; property; map" 9845 # 9846 # meaning the codepoints in the range all have the value 'map' under 9847 # 'property'. 9848 # Beginning and trailing white space in each field are not significant. 9849 # Note there is not a trailing semi-colon in the above. A trailing 9850 # semi-colon means the map is a null-string. An omitted map, as 9851 # opposed to a null-string, is assumed to be 'Y', based on Unicode 9852 # table syntax. (This could have been hidden from this routine by 9853 # doing it in the $file object, but that would require parsing of the 9854 # line there, so would have to parse it twice, or change the interface 9855 # to pass this an array. So not done.) 9856 # 9857 # The map field may begin with a sequence of commands that apply to 9858 # this range. Each such command begins and ends with $CMD_DELIM. 9859 # These are used to indicate, for example, that the mapping for a 9860 # range has a non-default type. 9861 # 9862 # This loops through the file, calling it's next_line() method, and 9863 # then taking the map and adding it to the property's table. 9864 # Complications arise because any number of properties can be in the 9865 # file, in any order, interspersed in any way. The first time a 9866 # property is seen, it gets information about that property and 9867 # caches it for quick retrieval later. It also normalizes the maps 9868 # so that only one of many synonyms is stored. The Unicode input 9869 # files do use some multiple synonyms. 9870 9871 my $file = shift; 9872 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 9873 9874 my %property_info; # To keep track of what properties 9875 # have already had entries in the 9876 # current file, and info about each, 9877 # so don't have to recompute. 9878 my $property_name; # property currently being worked on 9879 my $property_type; # and its type 9880 my $previous_property_name = ""; # name from last time through loop 9881 my $property_object; # pointer to the current property's 9882 # object 9883 my $property_addr; # the address of that object 9884 my $default_map; # the string that code points missing 9885 # from the file map to 9886 my $default_table; # For non-string properties, a 9887 # reference to the match table that 9888 # will contain the list of code 9889 # points that map to $default_map. 9890 9891 # Get the next real non-comment line 9892 LINE: 9893 while ($file->next_line) { 9894 9895 # Default replacement type; means that if parts of the range have 9896 # already been stored in our tables, the new map overrides them if 9897 # they differ more than cosmetically 9898 my $replace = $IF_NOT_EQUIVALENT; 9899 my $map_type; # Default type for the map of this range 9900 9901 #local $to_trace = 1 if main::DEBUG; 9902 trace $_ if main::DEBUG && $to_trace; 9903 9904 # Split the line into components 9905 my ($range, $property_name, $map, @remainder) 9906 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 9907 9908 # If more or less on the line than we are expecting, warn and skip 9909 # the line 9910 if (@remainder) { 9911 $file->carp_bad_line('Extra fields'); 9912 next LINE; 9913 } 9914 elsif ( ! defined $property_name) { 9915 $file->carp_bad_line('Missing property'); 9916 next LINE; 9917 } 9918 9919 # Examine the range. 9920 if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x) 9921 { 9922 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)"); 9923 next LINE; 9924 } 9925 my $low = hex $1; 9926 my $high = (defined $2) ? hex $2 : $low; 9927 9928 # For the very specialized case of comparing two Unicode 9929 # versions... 9930 if (DEBUG && $compare_versions) { 9931 if ($property_name eq 'Age') { 9932 9933 # Only allow code points at least as old as the version 9934 # specified. 9935 my $age = pack "C*", split(/\./, $map); # v string 9936 next LINE if $age gt $compare_versions; 9937 } 9938 else { 9939 9940 # Again, we throw out code points younger than those of 9941 # the specified version. By now, the Age property is 9942 # populated. We use the intersection of each input range 9943 # with this property to find what code points in it are 9944 # valid. To do the intersection, we have to convert the 9945 # Age property map to a Range_list. We only have to do 9946 # this once. 9947 if (! defined $compare_versions_range_list) { 9948 my $age = property_ref('Age'); 9949 if (! -e 'DAge.txt') { 9950 croak "Need to have 'DAge.txt' file to do version comparison"; 9951 } 9952 elsif ($age->count == 0) { 9953 croak "The 'Age' table is empty, but its file exists"; 9954 } 9955 $compare_versions_range_list 9956 = Range_List->new(Initialize => $age); 9957 } 9958 9959 # An undefined map is always 'Y' 9960 $map = 'Y' if ! defined $map; 9961 9962 # Calculate the intersection of the input range with the 9963 # code points that are known in the specified version 9964 my @ranges = ($compare_versions_range_list 9965 & Range->new($low, $high))->ranges; 9966 9967 # If the intersection is empty, throw away this range 9968 next LINE unless @ranges; 9969 9970 # Only examine the first range this time through the loop. 9971 my $this_range = shift @ranges; 9972 9973 # Put any remaining ranges in the queue to be processed 9974 # later. Note that there is unnecessary work here, as we 9975 # will do the intersection again for each of these ranges 9976 # during some future iteration of the LINE loop, but this 9977 # code is not used in production. The later intersections 9978 # are guaranteed to not splinter, so this will not become 9979 # an infinite loop. 9980 my $line = join ';', $property_name, $map; 9981 foreach my $range (@ranges) { 9982 $file->insert_adjusted_lines(sprintf("%04X..%04X; %s", 9983 $range->start, 9984 $range->end, 9985 $line)); 9986 } 9987 9988 # And process the first range, like any other. 9989 $low = $this_range->start; 9990 $high = $this_range->end; 9991 } 9992 } # End of $compare_versions 9993 9994 # If changing to a new property, get the things constant per 9995 # property 9996 if ($previous_property_name ne $property_name) { 9997 9998 $property_object = property_ref($property_name); 9999 if (! defined $property_object) { 10000 $file->carp_bad_line("Unexpected property '$property_name'. Skipped"); 10001 next LINE; 10002 } 10003 { no overloading; $property_addr = pack 'J', $property_object; } 10004 10005 # Defer changing names until have a line that is acceptable 10006 # (the 'next' statement above means is unacceptable) 10007 $previous_property_name = $property_name; 10008 10009 # If not the first time for this property, retrieve info about 10010 # it from the cache 10011 if (defined ($property_info{$property_addr}{$TYPE})) { 10012 $property_type = $property_info{$property_addr}{$TYPE}; 10013 $default_map = $property_info{$property_addr}{$DEFAULT_MAP}; 10014 $map_type 10015 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE}; 10016 $default_table 10017 = $property_info{$property_addr}{$DEFAULT_TABLE}; 10018 } 10019 else { 10020 10021 # Here, is the first time for this property. Set up the 10022 # cache. 10023 $property_type = $property_info{$property_addr}{$TYPE} 10024 = $property_object->type; 10025 $map_type 10026 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE} 10027 = $property_object->pseudo_map_type; 10028 10029 # The Unicode files are set up so that if the map is not 10030 # defined, it is a binary property 10031 if (! defined $map && $property_type != $BINARY) { 10032 if ($property_type != $UNKNOWN 10033 && $property_type != $NON_STRING) 10034 { 10035 $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map"); 10036 } 10037 else { 10038 $property_object->set_type($BINARY); 10039 $property_type 10040 = $property_info{$property_addr}{$TYPE} 10041 = $BINARY; 10042 } 10043 } 10044 10045 # Get any @missings default for this property. This 10046 # should precede the first entry for the property in the 10047 # input file, and is located in a comment that has been 10048 # stored by the Input_file class until we access it here. 10049 # It's possible that there is more than one such line 10050 # waiting for us; collect them all, and parse 10051 my @missings_list = $file->get_missings 10052 if $file->has_missings_defaults; 10053 foreach my $default_ref (@missings_list) { 10054 my $default = $default_ref->[0]; 10055 my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); }; 10056 10057 # For string properties, the default is just what the 10058 # file says, but non-string properties should already 10059 # have set up a table for the default property value; 10060 # use the table for these, so can resolve synonyms 10061 # later to a single standard one. 10062 if ($property_type == $STRING 10063 || $property_type == $UNKNOWN) 10064 { 10065 $property_info{$addr}{$MISSINGS} = $default; 10066 } 10067 else { 10068 $property_info{$addr}{$MISSINGS} 10069 = $property_object->table($default); 10070 } 10071 } 10072 10073 # Finished storing all the @missings defaults in the input 10074 # file so far. Get the one for the current property. 10075 my $missings = $property_info{$property_addr}{$MISSINGS}; 10076 10077 # But we likely have separately stored what the default 10078 # should be. (This is to accommodate versions of the 10079 # standard where the @missings lines are absent or 10080 # incomplete.) Hopefully the two will match. But check 10081 # it out. 10082 $default_map = $property_object->default_map; 10083 10084 # If the map is a ref, it means that the default won't be 10085 # processed until later, so undef it, so next few lines 10086 # will redefine it to something that nothing will match 10087 undef $default_map if ref $default_map; 10088 10089 # Create a $default_map if don't have one; maybe a dummy 10090 # that won't match anything. 10091 if (! defined $default_map) { 10092 10093 # Use any @missings line in the file. 10094 if (defined $missings) { 10095 if (ref $missings) { 10096 $default_map = $missings->full_name; 10097 $default_table = $missings; 10098 } 10099 else { 10100 $default_map = $missings; 10101 } 10102 10103 # And store it with the property for outside use. 10104 $property_object->set_default_map($default_map); 10105 } 10106 else { 10107 10108 # Neither an @missings nor a default map. Create 10109 # a dummy one, so won't have to test definedness 10110 # in the main loop. 10111 $default_map = '_Perl This will never be in a file 10112 from Unicode'; 10113 } 10114 } 10115 10116 # Here, we have $default_map defined, possibly in terms of 10117 # $missings, but maybe not, and possibly is a dummy one. 10118 if (defined $missings) { 10119 10120 # Make sure there is no conflict between the two. 10121 # $missings has priority. 10122 if (ref $missings) { 10123 $default_table 10124 = $property_object->table($default_map); 10125 if (! defined $default_table 10126 || $default_table != $missings) 10127 { 10128 if (! defined $default_table) { 10129 $default_table = $UNDEF; 10130 } 10131 $file->carp_bad_line(<<END 10132The \@missings line for $property_name in $file says that missings default to 10133$missings, but we expect it to be $default_table. $missings used. 10134END 10135 ); 10136 $default_table = $missings; 10137 $default_map = $missings->full_name; 10138 } 10139 $property_info{$property_addr}{$DEFAULT_TABLE} 10140 = $default_table; 10141 } 10142 elsif ($default_map ne $missings) { 10143 $file->carp_bad_line(<<END 10144The \@missings line for $property_name in $file says that missings default to 10145$missings, but we expect it to be $default_map. $missings used. 10146END 10147 ); 10148 $default_map = $missings; 10149 } 10150 } 10151 10152 $property_info{$property_addr}{$DEFAULT_MAP} 10153 = $default_map; 10154 10155 # If haven't done so already, find the table corresponding 10156 # to this map for non-string properties. 10157 if (! defined $default_table 10158 && $property_type != $STRING 10159 && $property_type != $UNKNOWN) 10160 { 10161 $default_table = $property_info{$property_addr} 10162 {$DEFAULT_TABLE} 10163 = $property_object->table($default_map); 10164 } 10165 } # End of is first time for this property 10166 } # End of switching properties. 10167 10168 # Ready to process the line. 10169 # The Unicode files are set up so that if the map is not defined, 10170 # it is a binary property with value 'Y' 10171 if (! defined $map) { 10172 $map = 'Y'; 10173 } 10174 else { 10175 10176 # If the map begins with a special command to us (enclosed in 10177 # delimiters), extract the command(s). 10178 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) { 10179 my $command = $1; 10180 if ($command =~ / ^ $REPLACE_CMD= (.*) /x) { 10181 $replace = $1; 10182 } 10183 elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) { 10184 $map_type = $1; 10185 } 10186 else { 10187 $file->carp_bad_line("Unknown command line: '$1'"); 10188 next LINE; 10189 } 10190 } 10191 } 10192 10193 if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x) 10194 { 10195 10196 # Here, we have a map to a particular code point, and the 10197 # default map is to a code point itself. If the range 10198 # includes the particular code point, change that portion of 10199 # the range to the default. This makes sure that in the final 10200 # table only the non-defaults are listed. 10201 my $decimal_map = hex $map; 10202 if ($low <= $decimal_map && $decimal_map <= $high) { 10203 10204 # If the range includes stuff before or after the map 10205 # we're changing, split it and process the split-off parts 10206 # later. 10207 if ($low < $decimal_map) { 10208 $file->insert_adjusted_lines( 10209 sprintf("%04X..%04X; %s; %s", 10210 $low, 10211 $decimal_map - 1, 10212 $property_name, 10213 $map)); 10214 } 10215 if ($high > $decimal_map) { 10216 $file->insert_adjusted_lines( 10217 sprintf("%04X..%04X; %s; %s", 10218 $decimal_map + 1, 10219 $high, 10220 $property_name, 10221 $map)); 10222 } 10223 $low = $high = $decimal_map; 10224 $map = $CODE_POINT; 10225 } 10226 } 10227 10228 # If we can tell that this is a synonym for the default map, use 10229 # the default one instead. 10230 if ($property_type != $STRING 10231 && $property_type != $UNKNOWN) 10232 { 10233 my $table = $property_object->table($map); 10234 if (defined $table && $table == $default_table) { 10235 $map = $default_map; 10236 } 10237 } 10238 10239 # And figure out the map type if not known. 10240 if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) { 10241 if ($map eq "") { # Nulls are always $NULL map type 10242 $map_type = $NULL; 10243 } # Otherwise, non-strings, and those that don't allow 10244 # $MULTI_CP, and those that aren't multiple code points are 10245 # 0 10246 elsif 10247 (($property_type != $STRING && $property_type != $UNKNOWN) 10248 || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP) 10249 || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x) 10250 { 10251 $map_type = 0; 10252 } 10253 else { 10254 $map_type = $MULTI_CP; 10255 } 10256 } 10257 10258 $property_object->add_map($low, $high, 10259 $map, 10260 Type => $map_type, 10261 Replace => $replace); 10262 } # End of loop through file's lines 10263 10264 return; 10265 } 10266} 10267 10268{ # Closure for UnicodeData.txt handling 10269 10270 # This file was the first one in the UCD; its design leads to some 10271 # awkwardness in processing. Here is a sample line: 10272 # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061; 10273 # The fields in order are: 10274 my $i = 0; # The code point is in field 0, and is shifted off. 10275 my $CHARNAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A") 10276 my $CATEGORY = $i++; # category (e.g. "Lu") 10277 my $CCC = $i++; # Canonical combining class (e.g. "230") 10278 my $BIDI = $i++; # directional class (e.g. "L") 10279 my $PERL_DECOMPOSITION = $i++; # decomposition mapping 10280 my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value 10281 my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript 10282 # Dual-use in this program; see below 10283 my $NUMERIC = $i++; # numeric value 10284 my $MIRRORED = $i++; # ? mirrored 10285 my $UNICODE_1_NAME = $i++; # name in Unicode 1.0 10286 my $COMMENT = $i++; # iso comment 10287 my $UPPER = $i++; # simple uppercase mapping 10288 my $LOWER = $i++; # simple lowercase mapping 10289 my $TITLE = $i++; # simple titlecase mapping 10290 my $input_field_count = $i; 10291 10292 # This routine in addition outputs these extra fields: 10293 10294 my $DECOMP_TYPE = $i++; # Decomposition type 10295 10296 # These fields are modifications of ones above, and are usually 10297 # suppressed; they must come last, as for speed, the loop upper bound is 10298 # normally set to ignore them 10299 my $NAME = $i++; # This is the strict name field, not the one that 10300 # charnames uses. 10301 my $DECOMP_MAP = $i++; # Strict decomposition mapping; not the one used 10302 # by Unicode::Normalize 10303 my $last_field = $i - 1; 10304 10305 # All these are read into an array for each line, with the indices defined 10306 # above. The empty fields in the example line above indicate that the 10307 # value is defaulted. The handler called for each line of the input 10308 # changes these to their defaults. 10309 10310 # Here are the official names of the properties, in a parallel array: 10311 my @field_names; 10312 $field_names[$BIDI] = 'Bidi_Class'; 10313 $field_names[$CATEGORY] = 'General_Category'; 10314 $field_names[$CCC] = 'Canonical_Combining_Class'; 10315 $field_names[$CHARNAME] = 'Perl_Charnames'; 10316 $field_names[$COMMENT] = 'ISO_Comment'; 10317 $field_names[$DECOMP_MAP] = 'Decomposition_Mapping'; 10318 $field_names[$DECOMP_TYPE] = 'Decomposition_Type'; 10319 $field_names[$LOWER] = 'Lowercase_Mapping'; 10320 $field_names[$MIRRORED] = 'Bidi_Mirrored'; 10321 $field_names[$NAME] = 'Name'; 10322 $field_names[$NUMERIC] = 'Numeric_Value'; 10323 $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type'; 10324 $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit'; 10325 $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping'; 10326 $field_names[$TITLE] = 'Titlecase_Mapping'; 10327 $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name'; 10328 $field_names[$UPPER] = 'Uppercase_Mapping'; 10329 10330 # Some of these need a little more explanation: 10331 # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode 10332 # property, but is used in calculating the Numeric_Type. Perl however, 10333 # creates a file from this field, so a Perl property is created from it. 10334 # Similarly, the Other_Digit field is used only for calculating the 10335 # Numeric_Type, and so it can be safely re-used as the place to store 10336 # the value for Numeric_Type; hence it is referred to as 10337 # $NUMERIC_TYPE_OTHER_DIGIT. 10338 # The input field named $PERL_DECOMPOSITION is a combination of both the 10339 # decomposition mapping and its type. Perl creates a file containing 10340 # exactly this field, so it is used for that. The two properties are 10341 # separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE. 10342 # $DECOMP_MAP is usually suppressed (unless the lists are changed to 10343 # output it), as Perl doesn't use it directly. 10344 # The input field named here $CHARNAME is used to construct the 10345 # Perl_Charnames property, which is a combination of the Name property 10346 # (which the input field contains), and the Unicode_1_Name property, and 10347 # others from other files. Since, the strict Name property is not used 10348 # by Perl, this field is used for the table that Perl does use. The 10349 # strict Name property table is usually suppressed (unless the lists are 10350 # changed to output it), so it is accumulated in a separate field, 10351 # $NAME, which to save time is discarded unless the table is actually to 10352 # be output 10353 10354 # This file is processed like most in this program. Control is passed to 10355 # process_generic_property_file() which calls filter_UnicodeData_line() 10356 # for each input line. This filter converts the input into line(s) that 10357 # process_generic_property_file() understands. There is also a setup 10358 # routine called before any of the file is processed, and a handler for 10359 # EOF processing, all in this closure. 10360 10361 # A huge speed-up occurred at the cost of some added complexity when these 10362 # routines were altered to buffer the outputs into ranges. Almost all the 10363 # lines of the input file apply to just one code point, and for most 10364 # properties, the map for the next code point up is the same as the 10365 # current one. So instead of creating a line for each property for each 10366 # input line, filter_UnicodeData_line() remembers what the previous map 10367 # of a property was, and doesn't generate a line to pass on until it has 10368 # to, as when the map changes; and that passed-on line encompasses the 10369 # whole contiguous range of code points that have the same map for that 10370 # property. This means a slight amount of extra setup, and having to 10371 # flush these buffers on EOF, testing if the maps have changed, plus 10372 # remembering state information in the closure. But it means a lot less 10373 # real time in not having to change the data base for each property on 10374 # each line. 10375 10376 # Another complication is that there are already a few ranges designated 10377 # in the input. There are two lines for each, with the same maps except 10378 # the code point and name on each line. This was actually the hardest 10379 # thing to design around. The code points in those ranges may actually 10380 # have real maps not given by these two lines. These maps will either 10381 # be algorithmically determinable, or be in the extracted files furnished 10382 # with the UCD. In the event of conflicts between these extracted files, 10383 # and this one, Unicode says that this one prevails. But it shouldn't 10384 # prevail for conflicts that occur in these ranges. The data from the 10385 # extracted files prevails in those cases. So, this program is structured 10386 # so that those files are processed first, storing maps. Then the other 10387 # files are processed, generally overwriting what the extracted files 10388 # stored. But just the range lines in this input file are processed 10389 # without overwriting. This is accomplished by adding a special string to 10390 # the lines output to tell process_generic_property_file() to turn off the 10391 # overwriting for just this one line. 10392 # A similar mechanism is used to tell it that the map is of a non-default 10393 # type. 10394 10395 sub setup_UnicodeData { # Called before any lines of the input are read 10396 my $file = shift; 10397 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 10398 10399 # Create a new property specially located that is a combination of the 10400 # various Name properties: Name, Unicode_1_Name, Named Sequences, and 10401 # Name_Alias properties. (The final duplicates elements of the 10402 # first.) A comment for it will later be constructed based on the 10403 # actual properties present and used 10404 $perl_charname = Property->new('Perl_Charnames', 10405 Default_Map => "", 10406 Directory => File::Spec->curdir(), 10407 File => 'Name', 10408 Fate => $INTERNAL_ONLY, 10409 Perl_Extension => 1, 10410 Range_Size_1 => \&output_perl_charnames_line, 10411 Type => $STRING, 10412 ); 10413 $perl_charname->set_proxy_for('Name'); 10414 10415 my $Perl_decomp = Property->new('Perl_Decomposition_Mapping', 10416 Directory => File::Spec->curdir(), 10417 File => 'Decomposition', 10418 Format => $DECOMP_STRING_FORMAT, 10419 Fate => $INTERNAL_ONLY, 10420 Perl_Extension => 1, 10421 Default_Map => $CODE_POINT, 10422 10423 # normalize.pm can't cope with these 10424 Output_Range_Counts => 0, 10425 10426 # This is a specially formatted table 10427 # explicitly for normalize.pm, which 10428 # is expecting a particular format, 10429 # which means that mappings containing 10430 # multiple code points are in the main 10431 # body of the table 10432 Map_Type => $COMPUTE_NO_MULTI_CP, 10433 Type => $STRING, 10434 To_Output_Map => $INTERNAL_MAP, 10435 ); 10436 $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type'); 10437 $Perl_decomp->add_comment(join_lines(<<END 10438This mapping is a combination of the Unicode 'Decomposition_Type' and 10439'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is 10440identical to the official Unicode 'Decomposition_Mapping' property except for 10441two things: 10442 1) It omits the algorithmically determinable Hangul syllable decompositions, 10443which normalize.pm handles algorithmically. 10444 2) It contains the decomposition type as well. Non-canonical decompositions 10445begin with a word in angle brackets, like <super>, which denotes the 10446compatible decomposition type. If the map does not begin with the <angle 10447brackets>, the decomposition is canonical. 10448END 10449 )); 10450 10451 my $Decimal_Digit = Property->new("Perl_Decimal_Digit", 10452 Default_Map => "", 10453 Perl_Extension => 1, 10454 Directory => $map_directory, 10455 Type => $STRING, 10456 To_Output_Map => $OUTPUT_ADJUSTED, 10457 ); 10458 $Decimal_Digit->add_comment(join_lines(<<END 10459This file gives the mapping of all code points which represent a single 10460decimal digit [0-9] to their respective digits, but it has ranges of 10 code 10461points, and the mapping of each non-initial element of each range is actually 10462not to "0", but to the offset that element has from its corresponding DIGIT 0. 10463These code points are those that have Numeric_Type=Decimal; not special 10464things, like subscripts nor Roman numerals. 10465END 10466 )); 10467 10468 # These properties are not used for generating anything else, and are 10469 # usually not output. By making them last in the list, we can just 10470 # change the high end of the loop downwards to avoid the work of 10471 # generating a table(s) that is/are just going to get thrown away. 10472 if (! property_ref('Decomposition_Mapping')->to_output_map 10473 && ! property_ref('Name')->to_output_map) 10474 { 10475 $last_field = min($NAME, $DECOMP_MAP) - 1; 10476 } elsif (property_ref('Decomposition_Mapping')->to_output_map) { 10477 $last_field = $DECOMP_MAP; 10478 } elsif (property_ref('Name')->to_output_map) { 10479 $last_field = $NAME; 10480 } 10481 return; 10482 } 10483 10484 my $first_time = 1; # ? Is this the first line of the file 10485 my $in_range = 0; # ? Are we in one of the file's ranges 10486 my $previous_cp; # hex code point of previous line 10487 my $decimal_previous_cp = -1; # And its decimal equivalent 10488 my @start; # For each field, the current starting 10489 # code point in hex for the range 10490 # being accumulated. 10491 my @fields; # The input fields; 10492 my @previous_fields; # And those from the previous call 10493 10494 sub filter_UnicodeData_line { 10495 # Handle a single input line from UnicodeData.txt; see comments above 10496 # Conceptually this takes a single line from the file containing N 10497 # properties, and converts it into N lines with one property per line, 10498 # which is what the final handler expects. But there are 10499 # complications due to the quirkiness of the input file, and to save 10500 # time, it accumulates ranges where the property values don't change 10501 # and only emits lines when necessary. This is about an order of 10502 # magnitude fewer lines emitted. 10503 10504 my $file = shift; 10505 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 10506 10507 # $_ contains the input line. 10508 # -1 in split means retain trailing null fields 10509 (my $cp, @fields) = split /\s*;\s*/, $_, -1; 10510 10511 #local $to_trace = 1 if main::DEBUG; 10512 trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace; 10513 if (@fields > $input_field_count) { 10514 $file->carp_bad_line('Extra fields'); 10515 $_ = ""; 10516 return; 10517 } 10518 10519 my $decimal_cp = hex $cp; 10520 10521 # We have to output all the buffered ranges when the next code point 10522 # is not exactly one after the previous one, which means there is a 10523 # gap in the ranges. 10524 my $force_output = ($decimal_cp != $decimal_previous_cp + 1); 10525 10526 # The decomposition mapping field requires special handling. It looks 10527 # like either: 10528 # 10529 # <compat> 0032 0020 10530 # 0041 0300 10531 # 10532 # The decomposition type is enclosed in <brackets>; if missing, it 10533 # means the type is canonical. There are two decomposition mapping 10534 # tables: the one for use by Perl's normalize.pm has a special format 10535 # which is this field intact; the other, for general use is of 10536 # standard format. In either case we have to find the decomposition 10537 # type. Empty fields have None as their type, and map to the code 10538 # point itself 10539 if ($fields[$PERL_DECOMPOSITION] eq "") { 10540 $fields[$DECOMP_TYPE] = 'None'; 10541 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT; 10542 } 10543 else { 10544 ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION] 10545 =~ / < ( .+? ) > \s* ( .+ ) /x; 10546 if (! defined $fields[$DECOMP_TYPE]) { 10547 $fields[$DECOMP_TYPE] = 'Canonical'; 10548 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION]; 10549 } 10550 else { 10551 $fields[$DECOMP_MAP] = $map; 10552 } 10553 } 10554 10555 # The 3 numeric fields also require special handling. The 2 digit 10556 # fields must be either empty or match the number field. This means 10557 # that if it is empty, they must be as well, and the numeric type is 10558 # None, and the numeric value is 'Nan'. 10559 # The decimal digit field must be empty or match the other digit 10560 # field. If the decimal digit field is non-empty, the code point is 10561 # a decimal digit, and the other two fields will have the same value. 10562 # If it is empty, but the other digit field is non-empty, the code 10563 # point is an 'other digit', and the number field will have the same 10564 # value as the other digit field. If the other digit field is empty, 10565 # but the number field is non-empty, the code point is a generic 10566 # numeric type. 10567 if ($fields[$NUMERIC] eq "") { 10568 if ($fields[$PERL_DECIMAL_DIGIT] ne "" 10569 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "" 10570 ) { 10571 $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway"); 10572 } 10573 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None'; 10574 $fields[$NUMERIC] = 'NaN'; 10575 } 10576 else { 10577 $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x; 10578 if ($fields[$PERL_DECIMAL_DIGIT] ne "") { 10579 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC]; 10580 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'. Processing as Decimal") if $fields[$CATEGORY] ne "Nd"; 10581 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal'; 10582 } 10583 elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") { 10584 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC]; 10585 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit'; 10586 } 10587 else { 10588 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric'; 10589 10590 # Rationals require extra effort. 10591 register_fraction($fields[$NUMERIC]) 10592 if $fields[$NUMERIC] =~ qr{/}; 10593 } 10594 } 10595 10596 # For the properties that have empty fields in the file, and which 10597 # mean something different from empty, change them to that default. 10598 # Certain fields just haven't been empty so far in any Unicode 10599 # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC, 10600 # $CATEGORY. This leaves just the two fields, and so we hard-code in 10601 # the defaults; which are very unlikely to ever change. 10602 $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq ""; 10603 $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq ""; 10604 10605 # UAX44 says that if title is empty, it is the same as whatever upper 10606 # is, 10607 $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq ""; 10608 10609 # There are a few pairs of lines like: 10610 # AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; 10611 # D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; 10612 # that define ranges. These should be processed after the fields are 10613 # adjusted above, as they may override some of them; but mostly what 10614 # is left is to possibly adjust the $CHARNAME field. The names of all the 10615 # paired lines start with a '<', but this is also true of '<control>, 10616 # which isn't one of these special ones. 10617 if ($fields[$CHARNAME] eq '<control>') { 10618 10619 # Some code points in this file have the pseudo-name 10620 # '<control>', but the official name for such ones is the null 10621 # string. 10622 $fields[$NAME] = $fields[$CHARNAME] = ""; 10623 10624 # We had better not be in between range lines. 10625 if ($in_range) { 10626 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway"); 10627 $in_range = 0; 10628 } 10629 } 10630 elsif (substr($fields[$CHARNAME], 0, 1) ne '<') { 10631 10632 # Here is a non-range line. We had better not be in between range 10633 # lines. 10634 if ($in_range) { 10635 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway"); 10636 $in_range = 0; 10637 } 10638 if ($fields[$CHARNAME] =~ s/- $cp $//x) { 10639 10640 # These are code points whose names end in their code points, 10641 # which means the names are algorithmically derivable from the 10642 # code points. To shorten the output Name file, the algorithm 10643 # for deriving these is placed in the file instead of each 10644 # code point, so they have map type $CP_IN_NAME 10645 $fields[$CHARNAME] = $CMD_DELIM 10646 . $MAP_TYPE_CMD 10647 . '=' 10648 . $CP_IN_NAME 10649 . $CMD_DELIM 10650 . $fields[$CHARNAME]; 10651 } 10652 $fields[$NAME] = $fields[$CHARNAME]; 10653 } 10654 elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) { 10655 $fields[$CHARNAME] = $fields[$NAME] = $1; 10656 10657 # Here we are at the beginning of a range pair. 10658 if ($in_range) { 10659 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'. Trying anyway"); 10660 } 10661 $in_range = 1; 10662 10663 # Because the properties in the range do not overwrite any already 10664 # in the db, we must flush the buffers of what's already there, so 10665 # they get handled in the normal scheme. 10666 $force_output = 1; 10667 10668 } 10669 elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) { 10670 $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME]. Ignoring this line."); 10671 $_ = ""; 10672 return; 10673 } 10674 else { # Here, we are at the last line of a range pair. 10675 10676 if (! $in_range) { 10677 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one. Ignoring this line."); 10678 $_ = ""; 10679 return; 10680 } 10681 $in_range = 0; 10682 10683 $fields[$NAME] = $fields[$CHARNAME]; 10684 10685 # Check that the input is valid: that the closing of the range is 10686 # the same as the beginning. 10687 foreach my $i (0 .. $last_field) { 10688 next if $fields[$i] eq $previous_fields[$i]; 10689 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway"); 10690 } 10691 10692 # The processing differs depending on the type of range, 10693 # determined by its $CHARNAME 10694 if ($fields[$CHARNAME] =~ /^Hangul Syllable/) { 10695 10696 # Check that the data looks right. 10697 if ($decimal_previous_cp != $SBase) { 10698 $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong"); 10699 } 10700 if ($decimal_cp != $SBase + $SCount - 1) { 10701 $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong"); 10702 } 10703 10704 # The Hangul syllable range has a somewhat complicated name 10705 # generation algorithm. Each code point in it has a canonical 10706 # decomposition also computable by an algorithm. The 10707 # perl decomposition map table built from these is used only 10708 # by normalize.pm, which has the algorithm built in it, so the 10709 # decomposition maps are not needed, and are large, so are 10710 # omitted from it. If the full decomposition map table is to 10711 # be output, the decompositions are generated for it, in the 10712 # EOF handling code for this input file. 10713 10714 $previous_fields[$DECOMP_TYPE] = 'Canonical'; 10715 10716 # This range is stored in our internal structure with its 10717 # own map type, different from all others. 10718 $previous_fields[$CHARNAME] = $previous_fields[$NAME] 10719 = $CMD_DELIM 10720 . $MAP_TYPE_CMD 10721 . '=' 10722 . $HANGUL_SYLLABLE 10723 . $CMD_DELIM 10724 . $fields[$CHARNAME]; 10725 } 10726 elsif ($fields[$CHARNAME] =~ /^CJK/) { 10727 10728 # The name for these contains the code point itself, and all 10729 # are defined to have the same base name, regardless of what 10730 # is in the file. They are stored in our internal structure 10731 # with a map type of $CP_IN_NAME 10732 $previous_fields[$CHARNAME] = $previous_fields[$NAME] 10733 = $CMD_DELIM 10734 . $MAP_TYPE_CMD 10735 . '=' 10736 . $CP_IN_NAME 10737 . $CMD_DELIM 10738 . 'CJK UNIFIED IDEOGRAPH'; 10739 10740 } 10741 elsif ($fields[$CATEGORY] eq 'Co' 10742 || $fields[$CATEGORY] eq 'Cs') 10743 { 10744 # The names of all the code points in these ranges are set to 10745 # null, as there are no names for the private use and 10746 # surrogate code points. 10747 10748 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = ""; 10749 } 10750 else { 10751 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY]. Attempting to process it."); 10752 } 10753 10754 # The first line of the range caused everything else to be output, 10755 # and then its values were stored as the beginning values for the 10756 # next set of ranges, which this one ends. Now, for each value, 10757 # add a command to tell the handler that these values should not 10758 # replace any existing ones in our database. 10759 foreach my $i (0 .. $last_field) { 10760 $previous_fields[$i] = $CMD_DELIM 10761 . $REPLACE_CMD 10762 . '=' 10763 . $NO 10764 . $CMD_DELIM 10765 . $previous_fields[$i]; 10766 } 10767 10768 # And change things so it looks like the entire range has been 10769 # gone through with this being the final part of it. Adding the 10770 # command above to each field will cause this range to be flushed 10771 # during the next iteration, as it guaranteed that the stored 10772 # field won't match whatever value the next one has. 10773 $previous_cp = $cp; 10774 $decimal_previous_cp = $decimal_cp; 10775 10776 # We are now set up for the next iteration; so skip the remaining 10777 # code in this subroutine that does the same thing, but doesn't 10778 # know about these ranges. 10779 $_ = ""; 10780 10781 return; 10782 } 10783 10784 # On the very first line, we fake it so the code below thinks there is 10785 # nothing to output, and initialize so that when it does get output it 10786 # uses the first line's values for the lowest part of the range. 10787 # (One could avoid this by using peek(), but then one would need to 10788 # know the adjustments done above and do the same ones in the setup 10789 # routine; not worth it) 10790 if ($first_time) { 10791 $first_time = 0; 10792 @previous_fields = @fields; 10793 @start = ($cp) x scalar @fields; 10794 $decimal_previous_cp = $decimal_cp - 1; 10795 } 10796 10797 # For each field, output the stored up ranges that this code point 10798 # doesn't fit in. Earlier we figured out if all ranges should be 10799 # terminated because of changing the replace or map type styles, or if 10800 # there is a gap between this new code point and the previous one, and 10801 # that is stored in $force_output. But even if those aren't true, we 10802 # need to output the range if this new code point's value for the 10803 # given property doesn't match the stored range's. 10804 #local $to_trace = 1 if main::DEBUG; 10805 foreach my $i (0 .. $last_field) { 10806 my $field = $fields[$i]; 10807 if ($force_output || $field ne $previous_fields[$i]) { 10808 10809 # Flush the buffer of stored values. 10810 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]"); 10811 10812 # Start a new range with this code point and its value 10813 $start[$i] = $cp; 10814 $previous_fields[$i] = $field; 10815 } 10816 } 10817 10818 # Set the values for the next time. 10819 $previous_cp = $cp; 10820 $decimal_previous_cp = $decimal_cp; 10821 10822 # The input line has generated whatever adjusted lines are needed, and 10823 # should not be looked at further. 10824 $_ = ""; 10825 return; 10826 } 10827 10828 sub EOF_UnicodeData { 10829 # Called upon EOF to flush the buffers, and create the Hangul 10830 # decomposition mappings if needed. 10831 10832 my $file = shift; 10833 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 10834 10835 # Flush the buffers. 10836 foreach my $i (0 .. $last_field) { 10837 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]"); 10838 } 10839 10840 if (-e 'Jamo.txt') { 10841 10842 # The algorithm is published by Unicode, based on values in 10843 # Jamo.txt, (which should have been processed before this 10844 # subroutine), and the results left in %Jamo 10845 unless (%Jamo) { 10846 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated."); 10847 return; 10848 } 10849 10850 # If the full decomposition map table is being output, insert 10851 # into it the Hangul syllable mappings. This is to avoid having 10852 # to publish a subroutine in it to compute them. (which would 10853 # essentially be this code.) This uses the algorithm published by 10854 # Unicode. (No hangul syllables in version 1) 10855 if ($v_version ge v2.0.0 10856 && property_ref('Decomposition_Mapping')->to_output_map) { 10857 for (my $S = $SBase; $S < $SBase + $SCount; $S++) { 10858 use integer; 10859 my $SIndex = $S - $SBase; 10860 my $L = $LBase + $SIndex / $NCount; 10861 my $V = $VBase + ($SIndex % $NCount) / $TCount; 10862 my $T = $TBase + $SIndex % $TCount; 10863 10864 trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace; 10865 my $decomposition = sprintf("%04X %04X", $L, $V); 10866 $decomposition .= sprintf(" %04X", $T) if $T != $TBase; 10867 $file->insert_adjusted_lines( 10868 sprintf("%04X; Decomposition_Mapping; %s", 10869 $S, 10870 $decomposition)); 10871 } 10872 } 10873 } 10874 10875 return; 10876 } 10877 10878 sub filter_v1_ucd { 10879 # Fix UCD lines in version 1. This is probably overkill, but this 10880 # fixes some glaring errors in Version 1 UnicodeData.txt. That file: 10881 # 1) had many Hangul (U+3400 - U+4DFF) code points that were later 10882 # removed. This program retains them 10883 # 2) didn't include ranges, which it should have, and which are now 10884 # added in @corrected_lines below. It was hand populated by 10885 # taking the data from Version 2, verified by analyzing 10886 # DAge.txt. 10887 # 3) There is a syntax error in the entry for U+09F8 which could 10888 # cause problems for utf8_heavy, and so is changed. It's 10889 # numeric value was simply a minus sign, without any number. 10890 # (Eventually Unicode changed the code point to non-numeric.) 10891 # 4) The decomposition types often don't match later versions 10892 # exactly, and the whole syntax of that field is different; so 10893 # the syntax is changed as well as the types to their later 10894 # terminology. Otherwise normalize.pm would be very unhappy 10895 # 5) Many ccc classes are different. These are left intact. 10896 # 6) U+FF10..U+FF19 are missing their numeric values in all three 10897 # fields. These are unchanged because it doesn't really cause 10898 # problems for Perl. 10899 # 7) A number of code points, such as controls, don't have their 10900 # Unicode Version 1 Names in this file. These are added. 10901 # 8) A number of Symbols were marked as Lm. This changes those in 10902 # the Latin1 range, so that regexes work. 10903 # 9) The odd characters U+03DB .. U+03E1 weren't encoded but are 10904 # referred to by their lc equivalents. Not fixed. 10905 10906 my @corrected_lines = split /\n/, <<'END'; 109074E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;; 109089FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; 10909E000;<Private Use, First>;Co;0;L;;;;;N;;;;; 10910F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;; 10911F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;; 10912FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;; 10913END 10914 10915 my $file = shift; 10916 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 10917 10918 #local $to_trace = 1 if main::DEBUG; 10919 trace $_ if main::DEBUG && $to_trace; 10920 10921 # -1 => retain trailing null fields 10922 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 10923 10924 # At the first place that is wrong in the input, insert all the 10925 # corrections, replacing the wrong line. 10926 if ($code_point eq '4E00') { 10927 my @copy = @corrected_lines; 10928 $_ = shift @copy; 10929 ($code_point, @fields) = split /\s*;\s*/, $_, -1; 10930 10931 $file->insert_lines(@copy); 10932 } 10933 elsif ($code_point =~ /^00/ && $fields[$CATEGORY] eq 'Lm') { 10934 10935 # There are no Lm characters in Latin1; these should be 'Sk', but 10936 # there isn't that in V1. 10937 $fields[$CATEGORY] = 'So'; 10938 } 10939 10940 if ($fields[$NUMERIC] eq '-') { 10941 $fields[$NUMERIC] = '-1'; # This is what 2.0 made it. 10942 } 10943 10944 if ($fields[$PERL_DECOMPOSITION] ne "") { 10945 10946 # Several entries have this change to superscript 2 or 3 in the 10947 # middle. Convert these to the modern version, which is to use 10948 # the actual U+00B2 and U+00B3 (the superscript forms) instead. 10949 # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes 10950 # 'HHHH HHHH 00B3 HHHH'. 10951 # It turns out that all of these that don't have another 10952 # decomposition defined at the beginning of the line have the 10953 # <square> decomposition in later releases. 10954 if ($code_point ne '00B2' && $code_point ne '00B3') { 10955 if ($fields[$PERL_DECOMPOSITION] 10956 =~ s/<\+sup> 003([23]) <-sup>/00B$1/) 10957 { 10958 if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') { 10959 $fields[$PERL_DECOMPOSITION] = '<square> ' 10960 . $fields[$PERL_DECOMPOSITION]; 10961 } 10962 } 10963 } 10964 10965 # If is like '<+circled> 0052 <-circled>', convert to 10966 # '<circled> 0052' 10967 $fields[$PERL_DECOMPOSITION] =~ 10968 s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/xg; 10969 10970 # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc. 10971 $fields[$PERL_DECOMPOSITION] =~ 10972 s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x 10973 or $fields[$PERL_DECOMPOSITION] =~ 10974 s/ <join> \s* (.*?) \s* <join> /<medial> $1/x 10975 or $fields[$PERL_DECOMPOSITION] =~ 10976 s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x 10977 or $fields[$PERL_DECOMPOSITION] =~ 10978 s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x; 10979 10980 # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc. 10981 $fields[$PERL_DECOMPOSITION] =~ 10982 s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x; 10983 10984 # Change names to modern form. 10985 $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g; 10986 $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g; 10987 $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g; 10988 $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g; 10989 10990 # One entry has weird braces 10991 $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g; 10992 10993 # One entry at U+2116 has an extra <sup> 10994 $fields[$PERL_DECOMPOSITION] =~ s/( < .*? > .* ) < .*? > \ * /$1/x; 10995 } 10996 10997 $_ = join ';', $code_point, @fields; 10998 trace $_ if main::DEBUG && $to_trace; 10999 return; 11000 } 11001 11002 sub filter_bad_Nd_ucd { 11003 # Early versions specified a value in the decimal digit field even 11004 # though the code point wasn't a decimal digit. Clear the field in 11005 # that situation, so that the main code doesn't think it is a decimal 11006 # digit. 11007 11008 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 11009 if ($fields[$PERL_DECIMAL_DIGIT] ne "" && $fields[$CATEGORY] ne 'Nd') { 11010 $fields[$PERL_DECIMAL_DIGIT] = ""; 11011 $_ = join ';', $code_point, @fields; 11012 } 11013 return; 11014 } 11015 11016 my @U1_control_names = split /\n/, <<'END'; 11017NULL 11018START OF HEADING 11019START OF TEXT 11020END OF TEXT 11021END OF TRANSMISSION 11022ENQUIRY 11023ACKNOWLEDGE 11024BELL 11025BACKSPACE 11026HORIZONTAL TABULATION 11027LINE FEED 11028VERTICAL TABULATION 11029FORM FEED 11030CARRIAGE RETURN 11031SHIFT OUT 11032SHIFT IN 11033DATA LINK ESCAPE 11034DEVICE CONTROL ONE 11035DEVICE CONTROL TWO 11036DEVICE CONTROL THREE 11037DEVICE CONTROL FOUR 11038NEGATIVE ACKNOWLEDGE 11039SYNCHRONOUS IDLE 11040END OF TRANSMISSION BLOCK 11041CANCEL 11042END OF MEDIUM 11043SUBSTITUTE 11044ESCAPE 11045FILE SEPARATOR 11046GROUP SEPARATOR 11047RECORD SEPARATOR 11048UNIT SEPARATOR 11049DELETE 11050BREAK PERMITTED HERE 11051NO BREAK HERE 11052INDEX 11053NEXT LINE 11054START OF SELECTED AREA 11055END OF SELECTED AREA 11056CHARACTER TABULATION SET 11057CHARACTER TABULATION WITH JUSTIFICATION 11058LINE TABULATION SET 11059PARTIAL LINE DOWN 11060PARTIAL LINE UP 11061REVERSE LINE FEED 11062SINGLE SHIFT TWO 11063SINGLE SHIFT THREE 11064DEVICE CONTROL STRING 11065PRIVATE USE ONE 11066PRIVATE USE TWO 11067SET TRANSMIT STATE 11068CANCEL CHARACTER 11069MESSAGE WAITING 11070START OF GUARDED AREA 11071END OF GUARDED AREA 11072START OF STRING 11073SINGLE CHARACTER INTRODUCER 11074CONTROL SEQUENCE INTRODUCER 11075STRING TERMINATOR 11076OPERATING SYSTEM COMMAND 11077PRIVACY MESSAGE 11078APPLICATION PROGRAM COMMAND 11079END 11080 11081 sub filter_early_U1_names { 11082 # Very early versions did not have the Unicode_1_name field specified. 11083 # They differed in which ones were present; make sure a U1 name 11084 # exists, so that Unicode::UCD::charinfo will work 11085 11086 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 11087 11088 11089 # @U1_control names above are entirely positional, so we pull them out 11090 # in the exact order required, with gaps for the ones that don't have 11091 # names. 11092 if ($code_point =~ /^00[01]/ 11093 || $code_point eq '007F' 11094 || $code_point =~ /^008[2-9A-F]/ 11095 || $code_point =~ /^009[0-8A-F]/) 11096 { 11097 my $u1_name = shift @U1_control_names; 11098 $fields[$UNICODE_1_NAME] = $u1_name unless $fields[$UNICODE_1_NAME]; 11099 $_ = join ';', $code_point, @fields; 11100 } 11101 return; 11102 } 11103 11104 sub filter_v2_1_5_ucd { 11105 # A dozen entries in this 2.1.5 file had the mirrored and numeric 11106 # columns swapped; These all had mirrored be 'N'. So if the numeric 11107 # column appears to be N, swap it back. 11108 11109 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 11110 if ($fields[$NUMERIC] eq 'N') { 11111 $fields[$NUMERIC] = $fields[$MIRRORED]; 11112 $fields[$MIRRORED] = 'N'; 11113 $_ = join ';', $code_point, @fields; 11114 } 11115 return; 11116 } 11117 11118 sub filter_v6_ucd { 11119 11120 # Unicode 6.0 co-opted the name BELL for U+1F514, but until 5.17, 11121 # it wasn't accepted, to allow for some deprecation cycles. This 11122 # function is not called after 5.16 11123 11124 return if $_ !~ /^(?:0007|1F514|070F);/; 11125 11126 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 11127 if ($code_point eq '0007') { 11128 $fields[$CHARNAME] = ""; 11129 } 11130 elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see 11131 # http://www.unicode.org/versions/corrigendum8.html 11132 $fields[$BIDI] = "AL"; 11133 } 11134 elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name 11135 $fields[$CHARNAME] = ""; 11136 } 11137 11138 $_ = join ';', $code_point, @fields; 11139 11140 return; 11141 } 11142} # End closure for UnicodeData 11143 11144sub process_GCB_test { 11145 11146 my $file = shift; 11147 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 11148 11149 while ($file->next_line) { 11150 push @backslash_X_tests, $_; 11151 } 11152 11153 return; 11154} 11155 11156sub process_NamedSequences { 11157 # NamedSequences.txt entries are just added to an array. Because these 11158 # don't look like the other tables, they have their own handler. 11159 # An example: 11160 # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300 11161 # 11162 # This just adds the sequence to an array for later handling 11163 11164 my $file = shift; 11165 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 11166 11167 while ($file->next_line) { 11168 my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1; 11169 if (@remainder) { 11170 $file->carp_bad_line( 11171 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'"); 11172 next; 11173 } 11174 11175 # Note single \t in keeping with special output format of 11176 # Perl_charnames. But it turns out that the code points don't have to 11177 # be 5 digits long, like the rest, based on the internal workings of 11178 # charnames.pm. This could be easily changed for consistency. 11179 push @named_sequences, "$sequence\t$name"; 11180 } 11181 return; 11182} 11183 11184{ # Closure 11185 11186 my $first_range; 11187 11188 sub filter_early_ea_lb { 11189 # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a 11190 # third field be the name of the code point, which can be ignored in 11191 # most cases. But it can be meaningful if it marks a range: 11192 # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE 11193 # 3400;W;<CJK Ideograph Extension A, First> 11194 # 11195 # We need to see the First in the example above to know it's a range. 11196 # They did not use the later range syntaxes. This routine changes it 11197 # to use the modern syntax. 11198 # $1 is the Input_file object. 11199 11200 my @fields = split /\s*;\s*/; 11201 if ($fields[2] =~ /^<.*, First>/) { 11202 $first_range = $fields[0]; 11203 $_ = ""; 11204 } 11205 elsif ($fields[2] =~ /^<.*, Last>/) { 11206 $_ = $_ = "$first_range..$fields[0]; $fields[1]"; 11207 } 11208 else { 11209 undef $first_range; 11210 $_ = "$fields[0]; $fields[1]"; 11211 } 11212 11213 return; 11214 } 11215} 11216 11217sub filter_old_style_arabic_shaping { 11218 # Early versions used a different term for the later one. 11219 11220 my @fields = split /\s*;\s*/; 11221 $fields[3] =~ s/<no shaping>/No_Joining_Group/; 11222 $fields[3] =~ s/\s+/_/g; # Change spaces to underscores 11223 $_ = join ';', @fields; 11224 return; 11225} 11226 11227sub filter_arabic_shaping_line { 11228 # ArabicShaping.txt has entries that look like: 11229 # 062A; TEH; D; BEH 11230 # The field containing 'TEH' is not used. The next field is Joining_Type 11231 # and the last is Joining_Group 11232 # This generates two lines to pass on, one for each property on the input 11233 # line. 11234 11235 my $file = shift; 11236 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 11237 11238 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 11239 11240 if (@fields > 4) { 11241 $file->carp_bad_line('Extra fields'); 11242 $_ = ""; 11243 return; 11244 } 11245 11246 $file->insert_adjusted_lines("$fields[0]; Joining_Group; $fields[3]"); 11247 $_ = "$fields[0]; Joining_Type; $fields[2]"; 11248 11249 return; 11250} 11251 11252{ # Closure 11253 my $lc; # Table for lowercase mapping 11254 my $tc; 11255 my $uc; 11256 my %special_casing_code_points; 11257 11258 sub setup_special_casing { 11259 # SpecialCasing.txt contains the non-simple case change mappings. The 11260 # simple ones are in UnicodeData.txt, which should already have been 11261 # read in to the full property data structures, so as to initialize 11262 # these with the simple ones. Then the SpecialCasing.txt entries 11263 # add or overwrite the ones which have different full mappings. 11264 11265 # This routine sees if the simple mappings are to be output, and if 11266 # so, copies what has already been put into the full mapping tables, 11267 # while they still contain only the simple mappings. 11268 11269 # The reason it is done this way is that the simple mappings are 11270 # probably not going to be output, so it saves work to initialize the 11271 # full tables with the simple mappings, and then overwrite those 11272 # relatively few entries in them that have different full mappings, 11273 # and thus skip the simple mapping tables altogether. 11274 11275 my $file= shift; 11276 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 11277 11278 $lc = property_ref('lc'); 11279 $tc = property_ref('tc'); 11280 $uc = property_ref('uc'); 11281 11282 # For each of the case change mappings... 11283 foreach my $full_table ($lc, $tc, $uc) { 11284 my $full_name = $full_table->name; 11285 unless (defined $full_table && ! $full_table->is_empty) { 11286 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated."); 11287 } 11288 11289 # Create a table in the old-style format and with the original 11290 # file name for backwards compatibility with applications that 11291 # read it directly. The new tables contain both the simple and 11292 # full maps, and the old are missing simple maps when there is a 11293 # conflicting full one. Probably it would have been ok to add 11294 # those to the legacy version, as was already done in 5.14 to the 11295 # case folding one, but this was not done, out of an abundance of 11296 # caution. The tables are set up here before we deal with the 11297 # full maps so that as we handle those, we can override the simple 11298 # maps for them in the legacy table, and merely add them in the 11299 # new-style one. 11300 my $legacy = Property->new("Legacy_" . $full_table->full_name, 11301 File => $full_table->full_name =~ 11302 s/case_Mapping//r, 11303 Range_Size_1 => 1, 11304 Format => $HEX_FORMAT, 11305 Default_Map => $CODE_POINT, 11306 UCD => 0, 11307 Initialize => $full_table, 11308 To_Output_Map => $EXTERNAL_MAP, 11309 ); 11310 11311 $full_table->add_comment(join_lines( <<END 11312This file includes both the simple and full case changing maps. The simple 11313ones are in the main body of the table below, and the full ones adding to or 11314overriding them are in the hash. 11315END 11316 )); 11317 11318 # The simple version's name in each mapping merely has an 's' in 11319 # front of the full one's 11320 my $simple_name = 's' . $full_name; 11321 my $simple = property_ref($simple_name); 11322 $simple->initialize($full_table) if $simple->to_output_map(); 11323 } 11324 11325 return; 11326 } 11327 11328 sub filter_2_1_8_special_casing_line { 11329 11330 # This version had duplicate entries in this file. Delete all but the 11331 # first one 11332 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null 11333 # fields 11334 if (exists $special_casing_code_points{$fields[0]}) { 11335 $_ = ""; 11336 return; 11337 } 11338 11339 $special_casing_code_points{$fields[0]} = 1; 11340 filter_special_casing_line(@_); 11341 } 11342 11343 sub filter_special_casing_line { 11344 # Change the format of $_ from SpecialCasing.txt into something that 11345 # the generic handler understands. Each input line contains three 11346 # case mappings. This will generate three lines to pass to the 11347 # generic handler for each of those. 11348 11349 # The input syntax (after stripping comments and trailing white space 11350 # is like one of the following (with the final two being entries that 11351 # we ignore): 11352 # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S 11353 # 03A3; 03C2; 03A3; 03A3; Final_Sigma; 11354 # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE 11355 # Note the trailing semi-colon, unlike many of the input files. That 11356 # means that there will be an extra null field generated by the split 11357 11358 my $file = shift; 11359 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 11360 11361 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null 11362 # fields 11363 11364 # field #4 is when this mapping is conditional. If any of these get 11365 # implemented, it would be by hard-coding in the casing functions in 11366 # the Perl core, not through tables. But if there is a new condition 11367 # we don't know about, output a warning. We know about all the 11368 # conditions through 6.0 11369 if ($fields[4] ne "") { 11370 my @conditions = split ' ', $fields[4]; 11371 if ($conditions[0] ne 'tr' # We know that these languages have 11372 # conditions, and some are multiple 11373 && $conditions[0] ne 'az' 11374 && $conditions[0] ne 'lt' 11375 11376 # And, we know about a single condition Final_Sigma, but 11377 # nothing else. 11378 && ($v_version gt v5.2.0 11379 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma'))) 11380 { 11381 $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore"); 11382 } 11383 elsif ($conditions[0] ne 'Final_Sigma') { 11384 11385 # Don't print out a message for Final_Sigma, because we 11386 # have hard-coded handling for it. (But the standard 11387 # could change what the rule should be, but it wouldn't 11388 # show up here anyway. 11389 11390 print "# SKIPPING Special Casing: $_\n" 11391 if $verbosity >= $VERBOSE; 11392 } 11393 $_ = ""; 11394 return; 11395 } 11396 elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) { 11397 $file->carp_bad_line('Extra fields'); 11398 $_ = ""; 11399 return; 11400 } 11401 11402 my $decimal_code_point = hex $fields[0]; 11403 11404 # Loop to handle each of the three mappings in the input line, in 11405 # order, with $i indicating the current field number. 11406 my $i = 0; 11407 for my $object ($lc, $tc, $uc) { 11408 $i++; # First time through, $i = 0 ... 3rd time = 3 11409 11410 my $value = $object->value_of($decimal_code_point); 11411 $value = ($value eq $CODE_POINT) 11412 ? $decimal_code_point 11413 : hex $value; 11414 11415 # If this isn't a multi-character mapping, it should already have 11416 # been read in. 11417 if ($fields[$i] !~ / /) { 11418 if ($value != hex $fields[$i]) { 11419 Carp::my_carp("Bad news. UnicodeData.txt thinks " 11420 . $object->name 11421 . "(0x$fields[0]) is $value" 11422 . " and SpecialCasing.txt thinks it is " 11423 . hex($fields[$i]) 11424 . ". Good luck. Retaining UnicodeData value, and proceeding anyway."); 11425 } 11426 } 11427 else { 11428 11429 # The mapping goes into both the legacy table, in which it 11430 # replaces the simple one... 11431 $file->insert_adjusted_lines("$fields[0]; Legacy_" 11432 . $object->full_name 11433 . "; $fields[$i]"); 11434 11435 # ... and, the The regular table, in which it is additional, 11436 # beyond the simple mapping. 11437 $file->insert_adjusted_lines("$fields[0]; " 11438 . $object->name 11439 . "; " 11440 . $CMD_DELIM 11441 . "$REPLACE_CMD=$MULTIPLE_BEFORE" 11442 . $CMD_DELIM 11443 . $fields[$i]); 11444 } 11445 } 11446 11447 # Everything has been handled by the insert_adjusted_lines() 11448 $_ = ""; 11449 11450 return; 11451 } 11452} 11453 11454sub filter_old_style_case_folding { 11455 # This transforms $_ containing the case folding style of 3.0.1, to 3.1 11456 # and later style. Different letters were used in the earlier. 11457 11458 my $file = shift; 11459 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 11460 11461 my @fields = split /\s*;\s*/; 11462 if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields 11463 $fields[1] = 'I'; 11464 } 11465 elsif ($fields[1] eq 'L') { 11466 $fields[1] = 'C'; # L => C always 11467 } 11468 elsif ($fields[1] eq 'E') { 11469 if ($fields[2] =~ / /) { # E => C if one code point; F otherwise 11470 $fields[1] = 'F' 11471 } 11472 else { 11473 $fields[1] = 'C' 11474 } 11475 } 11476 else { 11477 $file->carp_bad_line("Expecting L or E in second field"); 11478 $_ = ""; 11479 return; 11480 } 11481 $_ = join("; ", @fields) . ';'; 11482 return; 11483} 11484 11485{ # Closure for case folding 11486 11487 # Create the map for simple only if are going to output it, for otherwise 11488 # it takes no part in anything we do. 11489 my $to_output_simple; 11490 my $all_folds; 11491 11492 sub setup_case_folding($) { 11493 # Read in the case foldings in CaseFolding.txt. This handles both 11494 # simple and full case folding. 11495 11496 $to_output_simple 11497 = property_ref('Simple_Case_Folding')->to_output_map; 11498 11499 if (! $to_output_simple) { 11500 property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding'); 11501 } 11502 11503 $all_folds = $perl->add_match_table("_Perl_Any_Folds", 11504 Perl_Extension => 1, 11505 Fate => $INTERNAL_ONLY, 11506 Description => "Code points that particpate in some fold", 11507 ); 11508 11509 # If we ever wanted to show that these tables were combined, a new 11510 # property method could be created, like set_combined_props() 11511 property_ref('Case_Folding')->add_comment(join_lines( <<END 11512This file includes both the simple and full case folding maps. The simple 11513ones are in the main body of the table below, and the full ones adding to or 11514overriding them are in the hash. 11515END 11516 )); 11517 return; 11518 } 11519 11520 sub filter_case_folding_line { 11521 # Called for each line in CaseFolding.txt 11522 # Input lines look like: 11523 # 0041; C; 0061; # LATIN CAPITAL LETTER A 11524 # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S 11525 # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S 11526 # 11527 # 'C' means that folding is the same for both simple and full 11528 # 'F' that it is only for full folding 11529 # 'S' that it is only for simple folding 11530 # 'T' is locale-dependent, and ignored 11531 # 'I' is a type of 'F' used in some early releases. 11532 # Note the trailing semi-colon, unlike many of the input files. That 11533 # means that there will be an extra null field generated by the split 11534 # below, which we ignore and hence is not an error. 11535 11536 my $file = shift; 11537 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 11538 11539 my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1; 11540 if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) { 11541 $file->carp_bad_line('Extra fields'); 11542 $_ = ""; 11543 return; 11544 } 11545 11546 if ($type =~ / ^ [IT] $/x) { # Skip Turkic case folding, is locale dependent 11547 $_ = ""; 11548 return; 11549 } 11550 11551 # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase 11552 # I are all full foldings; S is single-char. For S, there is always 11553 # an F entry, so we must allow multiple values for the same code 11554 # point. Fortunately this table doesn't need further manipulation 11555 # which would preclude using multiple-values. The S is now included 11556 # so that _swash_inversion_hash() is able to construct closures 11557 # without having to worry about F mappings. 11558 if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') { 11559 my $from = hex $range; # Assumes range is single 11560 $all_folds->add_range($from, $from); 11561 $_ = "$range; Case_Folding; " 11562 . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map"; 11563 11564 if ($type eq 'F') { 11565 my @string = split " ", $map; 11566 for my $i (0 .. @string - 1 -1) { 11567 my $decimal = hex $string[$i]; 11568 $all_folds->add_range($decimal, $decimal); 11569 } 11570 } 11571 else { 11572 $all_folds->add_range(hex $map, hex $map); 11573 } 11574 } 11575 else { 11576 $_ = ""; 11577 $file->carp_bad_line('Expecting C F I S or T in second field'); 11578 } 11579 11580 # C and S are simple foldings, but simple case folding is not needed 11581 # unless we explicitly want its map table output. 11582 if ($to_output_simple && $type eq 'C' || $type eq 'S') { 11583 $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map"); 11584 } 11585 11586 return; 11587 } 11588 11589} # End case fold closure 11590 11591sub filter_jamo_line { 11592 # Filter Jamo.txt lines. This routine mainly is used to populate hashes 11593 # from this file that is used in generating the Name property for Jamo 11594 # code points. But, it also is used to convert early versions' syntax 11595 # into the modern form. Here are two examples: 11596 # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax 11597 # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax 11598 # 11599 # The input is $_, the output is $_ filtered. 11600 11601 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 11602 11603 # Let the caller handle unexpected input. In earlier versions, there was 11604 # a third field which is supposed to be a comment, but did not have a '#' 11605 # before it. 11606 return if @fields > (($v_version gt v3.0.0) ? 2 : 3); 11607 11608 $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous 11609 # beginning. 11610 11611 # Some 2.1 versions had this wrong. Causes havoc with the algorithm. 11612 $fields[1] = 'R' if $fields[0] eq '1105'; 11613 11614 # Add to structure so can generate Names from it. 11615 my $cp = hex $fields[0]; 11616 my $short_name = $fields[1]; 11617 $Jamo{$cp} = $short_name; 11618 if ($cp <= $LBase + $LCount) { 11619 $Jamo_L{$short_name} = $cp - $LBase; 11620 } 11621 elsif ($cp <= $VBase + $VCount) { 11622 $Jamo_V{$short_name} = $cp - $VBase; 11623 } 11624 elsif ($cp <= $TBase + $TCount) { 11625 $Jamo_T{$short_name} = $cp - $TBase; 11626 } 11627 else { 11628 Carp::my_carp_bug("Unexpected Jamo code point in $_"); 11629 } 11630 11631 11632 # Reassemble using just the first two fields to look like a typical 11633 # property file line 11634 $_ = "$fields[0]; $fields[1]"; 11635 11636 return; 11637} 11638 11639sub register_fraction($) { 11640 # This registers the input rational number so that it can be passed on to 11641 # utf8_heavy.pl, both in rational and floating forms. 11642 11643 my $rational = shift; 11644 11645 my $float = eval $rational; 11646 $nv_floating_to_rational{$float} = $rational; 11647 return; 11648} 11649 11650sub filter_numeric_value_line { 11651 # DNumValues contains lines of a different syntax than the typical 11652 # property file: 11653 # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO 11654 # 11655 # This routine transforms $_ containing the anomalous syntax to the 11656 # typical, by filtering out the extra columns, and convert early version 11657 # decimal numbers to strings that look like rational numbers. 11658 11659 my $file = shift; 11660 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 11661 11662 # Starting in 5.1, there is a rational field. Just use that, omitting the 11663 # extra columns. Otherwise convert the decimal number in the second field 11664 # to a rational, and omit extraneous columns. 11665 my @fields = split /\s*;\s*/, $_, -1; 11666 my $rational; 11667 11668 if ($v_version ge v5.1.0) { 11669 if (@fields != 4) { 11670 $file->carp_bad_line('Not 4 semi-colon separated fields'); 11671 $_ = ""; 11672 return; 11673 } 11674 $rational = $fields[3]; 11675 $_ = join '; ', @fields[ 0, 3 ]; 11676 } 11677 else { 11678 11679 # Here, is an older Unicode file, which has decimal numbers instead of 11680 # rationals in it. Use the fraction to calculate the denominator and 11681 # convert to rational. 11682 11683 if (@fields != 2 && @fields != 3) { 11684 $file->carp_bad_line('Not 2 or 3 semi-colon separated fields'); 11685 $_ = ""; 11686 return; 11687 } 11688 11689 my $codepoints = $fields[0]; 11690 my $decimal = $fields[1]; 11691 if ($decimal =~ s/\.0+$//) { 11692 11693 # Anything ending with a decimal followed by nothing but 0's is an 11694 # integer 11695 $_ = "$codepoints; $decimal"; 11696 $rational = $decimal; 11697 } 11698 else { 11699 11700 my $denominator; 11701 if ($decimal =~ /\.50*$/) { 11702 $denominator = 2; 11703 } 11704 11705 # Here have the hardcoded repeating decimals in the fraction, and 11706 # the denominator they imply. There were only a few denominators 11707 # in the older Unicode versions of this file which this code 11708 # handles, so it is easy to convert them. 11709 11710 # The 4 is because of a round-off error in the Unicode 3.2 files 11711 elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) { 11712 $denominator = 3; 11713 } 11714 elsif ($decimal =~ /\.[27]50*$/) { 11715 $denominator = 4; 11716 } 11717 elsif ($decimal =~ /\.[2468]0*$/) { 11718 $denominator = 5; 11719 } 11720 elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) { 11721 $denominator = 6; 11722 } 11723 elsif ($decimal =~ /\.(12|37|62|87)50*$/) { 11724 $denominator = 8; 11725 } 11726 if ($denominator) { 11727 my $sign = ($decimal < 0) ? "-" : ""; 11728 my $numerator = int((abs($decimal) * $denominator) + .5); 11729 $rational = "$sign$numerator/$denominator"; 11730 $_ = "$codepoints; $rational"; 11731 } 11732 else { 11733 $file->carp_bad_line("Can't cope with number '$decimal'."); 11734 $_ = ""; 11735 return; 11736 } 11737 } 11738 } 11739 11740 register_fraction($rational) if $rational =~ qr{/}; 11741 return; 11742} 11743 11744{ # Closure 11745 my %unihan_properties; 11746 11747 sub setup_unihan { 11748 # Do any special setup for Unihan properties. 11749 11750 # This property gives the wrong computed type, so override. 11751 my $usource = property_ref('kIRG_USource'); 11752 $usource->set_type($STRING) if defined $usource; 11753 11754 # This property is to be considered binary (it says so in 11755 # http://www.unicode.org/reports/tr38/) 11756 my $iicore = property_ref('kIICore'); 11757 if (defined $iicore) { 11758 $iicore->set_type($FORCED_BINARY); 11759 $iicore->table("Y")->add_note("Forced to a binary property as per unicode.org UAX #38."); 11760 11761 # Unicode doesn't include the maps for this property, so don't 11762 # warn that they are missing. 11763 $iicore->set_pre_declared_maps(0); 11764 $iicore->add_comment(join_lines( <<END 11765This property contains enum values, but Unicode UAX #38 says it should be 11766interpreted as binary, so Perl creates tables for both 1) its enum values, 11767plus 2) true/false tables in which it is considered true for all code points 11768that have a non-null value 11769END 11770 )); 11771 } 11772 11773 return; 11774 } 11775 11776 sub filter_unihan_line { 11777 # Change unihan db lines to look like the others in the db. Here is 11778 # an input sample: 11779 # U+341C kCangjie IEKN 11780 11781 # Tabs are used instead of semi-colons to separate fields; therefore 11782 # they may have semi-colons embedded in them. Change these to periods 11783 # so won't screw up the rest of the code. 11784 s/;/./g; 11785 11786 # Remove lines that don't look like ones we accept. 11787 if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) { 11788 $_ = ""; 11789 return; 11790 } 11791 11792 # Extract the property, and save a reference to its object. 11793 my $property = $1; 11794 if (! exists $unihan_properties{$property}) { 11795 $unihan_properties{$property} = property_ref($property); 11796 } 11797 11798 # Don't do anything unless the property is one we're handling, which 11799 # we determine by seeing if there is an object defined for it or not 11800 if (! defined $unihan_properties{$property}) { 11801 $_ = ""; 11802 return; 11803 } 11804 11805 # Convert the tab separators to our standard semi-colons, and convert 11806 # the U+HHHH notation to the rest of the standard's HHHH 11807 s/\t/;/g; 11808 s/\b U \+ (?= $code_point_re )//xg; 11809 11810 #local $to_trace = 1 if main::DEBUG; 11811 trace $_ if main::DEBUG && $to_trace; 11812 11813 return; 11814 } 11815} 11816 11817sub filter_blocks_lines { 11818 # In the Blocks.txt file, the names of the blocks don't quite match the 11819 # names given in PropertyValueAliases.txt, so this changes them so they 11820 # do match: Blanks and hyphens are changed into underscores. Also makes 11821 # early release versions look like later ones 11822 # 11823 # $_ is transformed to the correct value. 11824 11825 my $file = shift; 11826 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 11827 11828 if ($v_version lt v3.2.0) { 11829 if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted 11830 $_ = ""; 11831 return; 11832 } 11833 11834 # Old versions used a different syntax to mark the range. 11835 $_ =~ s/;\s+/../ if $v_version lt v3.1.0; 11836 } 11837 11838 my @fields = split /\s*;\s*/, $_, -1; 11839 if (@fields != 2) { 11840 $file->carp_bad_line("Expecting exactly two fields"); 11841 $_ = ""; 11842 return; 11843 } 11844 11845 # Change hyphens and blanks in the block name field only 11846 $fields[1] =~ s/[ -]/_/g; 11847 $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g; # Capitalize first letter of word 11848 11849 $_ = join("; ", @fields); 11850 return; 11851} 11852 11853{ # Closure 11854 my $current_property; 11855 11856 sub filter_old_style_proplist { 11857 # PropList.txt has been in Unicode since version 2.0. Until 3.1, it 11858 # was in a completely different syntax. Ken Whistler of Unicode says 11859 # that it was something he used as an aid for his own purposes, but 11860 # was never an official part of the standard. Many of the properties 11861 # in it were incorporated into the later PropList.txt, but some were 11862 # not. This program uses this early file to generate property tables 11863 # that are otherwise not accessible in the early UCD's. It does this 11864 # for the ones that eventually became official, and don't appear to be 11865 # too different in their contents from the later official version, and 11866 # throws away the rest. It could be argued that the ones it generates 11867 # were probably not really official at that time, so should be 11868 # ignored. You can easily modify things to skip all of them by 11869 # changing this function to just set $_ to "", and return; and to skip 11870 # certain of them by by simply removing their declarations from 11871 # get_old_property_aliases(). 11872 # 11873 # Here is a list of all the ones that are thrown away: 11874 # Alphabetic The definitions for this are very 11875 # defective, so better to not mislead 11876 # people into thinking it works. 11877 # Instead the Perl extension of the 11878 # same name is constructed from first 11879 # principles. 11880 # Bidi=* duplicates UnicodeData.txt 11881 # Combining never made into official property; 11882 # is \P{ccc=0} 11883 # Composite never made into official property. 11884 # Currency Symbol duplicates UnicodeData.txt: gc=sc 11885 # Decimal Digit duplicates UnicodeData.txt: gc=nd 11886 # Delimiter never made into official property; 11887 # removed in 3.0.1 11888 # Format Control never made into official property; 11889 # similar to gc=cf 11890 # High Surrogate duplicates Blocks.txt 11891 # Ignorable Control never made into official property; 11892 # similar to di=y 11893 # ISO Control duplicates UnicodeData.txt: gc=cc 11894 # Left of Pair never made into official property; 11895 # Line Separator duplicates UnicodeData.txt: gc=zl 11896 # Low Surrogate duplicates Blocks.txt 11897 # Non-break was actually listed as a property 11898 # in 3.2, but without any code 11899 # points. Unicode denies that this 11900 # was ever an official property 11901 # Non-spacing duplicate UnicodeData.txt: gc=mn 11902 # Numeric duplicates UnicodeData.txt: gc=cc 11903 # Paired Punctuation never made into official property; 11904 # appears to be gc=ps + gc=pe 11905 # Paragraph Separator duplicates UnicodeData.txt: gc=cc 11906 # Private Use duplicates UnicodeData.txt: gc=co 11907 # Private Use High Surrogate duplicates Blocks.txt 11908 # Punctuation duplicates UnicodeData.txt: gc=p 11909 # Space different definition than eventual 11910 # one. 11911 # Titlecase duplicates UnicodeData.txt: gc=lt 11912 # Unassigned Code Value duplicates UnicodeData.txt: gc=cn 11913 # Zero-width never made into official property; 11914 # subset of gc=cf 11915 # Most of the properties have the same names in this file as in later 11916 # versions, but a couple do not. 11917 # 11918 # This subroutine filters $_, converting it from the old style into 11919 # the new style. Here's a sample of the old-style 11920 # 11921 # ******************************************* 11922 # 11923 # Property dump for: 0x100000A0 (Join Control) 11924 # 11925 # 200C..200D (2 chars) 11926 # 11927 # In the example, the property is "Join Control". It is kept in this 11928 # closure between calls to the subroutine. The numbers beginning with 11929 # 0x were internal to Ken's program that generated this file. 11930 11931 # If this line contains the property name, extract it. 11932 if (/^Property dump for: [^(]*\((.*)\)/) { 11933 $_ = $1; 11934 11935 # Convert white space to underscores. 11936 s/ /_/g; 11937 11938 # Convert the few properties that don't have the same name as 11939 # their modern counterparts 11940 s/Identifier_Part/ID_Continue/ 11941 or s/Not_a_Character/NChar/; 11942 11943 # If the name matches an existing property, use it. 11944 if (defined property_ref($_)) { 11945 trace "new property=", $_ if main::DEBUG && $to_trace; 11946 $current_property = $_; 11947 } 11948 else { # Otherwise discard it 11949 trace "rejected property=", $_ if main::DEBUG && $to_trace; 11950 undef $current_property; 11951 } 11952 $_ = ""; # The property is saved for the next lines of the 11953 # file, but this defining line is of no further use, 11954 # so clear it so that the caller won't process it 11955 # further. 11956 } 11957 elsif (! defined $current_property || $_ !~ /^$code_point_re/) { 11958 11959 # Here, the input line isn't a header defining a property for the 11960 # following section, and either we aren't in such a section, or 11961 # the line doesn't look like one that defines the code points in 11962 # such a section. Ignore this line. 11963 $_ = ""; 11964 } 11965 else { 11966 11967 # Here, we have a line defining the code points for the current 11968 # stashed property. Anything starting with the first blank is 11969 # extraneous. Otherwise, it should look like a normal range to 11970 # the caller. Append the property name so that it looks just like 11971 # a modern PropList entry. 11972 11973 $_ =~ s/\s.*//; 11974 $_ .= "; $current_property"; 11975 } 11976 trace $_ if main::DEBUG && $to_trace; 11977 return; 11978 } 11979} # End closure for old style proplist 11980 11981sub filter_old_style_normalization_lines { 11982 # For early releases of Unicode, the lines were like: 11983 # 74..2A76 ; NFKD_NO 11984 # For later releases this became: 11985 # 74..2A76 ; NFKD_QC; N 11986 # Filter $_ to look like those in later releases. 11987 # Similarly for MAYBEs 11988 11989 s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x; 11990 11991 # Also, the property FC_NFKC was abbreviated to FNC 11992 s/FNC/FC_NFKC/; 11993 return; 11994} 11995 11996sub setup_script_extensions { 11997 # The Script_Extensions property starts out with a clone of the Script 11998 # property. 11999 12000 my $scx = property_ref("Script_Extensions"); 12001 $scx = Property->new("scx", Full_Name => "Script_Extensions") 12002 if ! defined $scx; 12003 $scx->_set_format($STRING_WHITE_SPACE_LIST); 12004 $scx->initialize($script); 12005 $scx->set_default_map($script->default_map); 12006 $scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these 12007 $scx->add_comment(join_lines( <<END 12008The values for code points that appear in one script are just the same as for 12009the 'Script' property. Likewise the values for those that appear in many 12010scripts are either 'Common' or 'Inherited', same as with 'Script'. But the 12011values of code points that appear in a few scripts are a space separated list 12012of those scripts. 12013END 12014 )); 12015 12016 # Initialize scx's tables and the aliases for them to be the same as sc's 12017 foreach my $table ($script->tables) { 12018 my $scx_table = $scx->add_match_table($table->name, 12019 Full_Name => $table->full_name); 12020 foreach my $alias ($table->aliases) { 12021 $scx_table->add_alias($alias->name); 12022 } 12023 } 12024} 12025 12026sub filter_script_extensions_line { 12027 # The Scripts file comes with the full name for the scripts; the 12028 # ScriptExtensions, with the short name. The final mapping file is a 12029 # combination of these, and without adjustment, would have inconsistent 12030 # entries. This filters the latter file to convert to full names. 12031 # Entries look like this: 12032 # 064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW 12033 12034 my @fields = split /\s*;\s*/; 12035 12036 # This script was erroneously omitted in this Unicode version. 12037 $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/; 12038 12039 my @full_names; 12040 foreach my $short_name (split " ", $fields[1]) { 12041 push @full_names, $script->table($short_name)->full_name; 12042 } 12043 $fields[1] = join " ", @full_names; 12044 $_ = join "; ", @fields; 12045 12046 return; 12047} 12048 12049sub generate_hst { 12050 12051 # Populates the Hangul Syllable Type property from first principles 12052 12053 my $file= shift; 12054 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 12055 12056 # These few ranges are hard-coded in. 12057 $file->insert_lines(split /\n/, <<'END' 120581100..1159 ; L 12059115F ; L 120601160..11A2 ; V 1206111A8..11F9 ; T 12062END 12063); 12064 12065 # The Hangul syllables in version 1 are completely different than what came 12066 # after, so just ignore them there. 12067 if ($v_version lt v2.0.0) { 12068 my $property = property_ref($file->property); 12069 push @tables_that_may_be_empty, $property->table('LV')->complete_name; 12070 push @tables_that_may_be_empty, $property->table('LVT')->complete_name; 12071 return; 12072 } 12073 12074 # The algorithmically derived syllables are almost all LVT ones, so 12075 # initialize the whole range with that. 12076 $file->insert_lines(sprintf "%04X..%04X; LVT\n", 12077 $SBase, $SBase + $SCount -1); 12078 12079 # Those ones that aren't LVT are LV, and they occur at intervals of 12080 # $TCount code points, starting with the first code point, at $SBase. 12081 for (my $i = $SBase; $i < $SBase + $SCount; $i += $TCount) { 12082 $file->insert_lines(sprintf "%04X..%04X; LV\n", $i, $i); 12083 } 12084 12085 return; 12086} 12087 12088sub generate_GCB { 12089 12090 # Populates the Grapheme Cluster Break property from first principles 12091 12092 my $file= shift; 12093 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 12094 12095 # All these definitions are from 12096 # http://www.unicode.org/reports/tr29/tr29-3.html with confirmation 12097 # from http://www.unicode.org/reports/tr29/tr29-4.html 12098 12099 foreach my $range ($gc->ranges) { 12100 12101 # Extend includes gc=Me and gc=Mn, while Control includes gc=Cc 12102 # and gc=Cf 12103 if ($range->value =~ / ^ M [en] $ /x) { 12104 $file->insert_lines(sprintf "%04X..%04X; Extend", 12105 $range->start, $range->end); 12106 } 12107 elsif ($range->value =~ / ^ C [cf] $ /x) { 12108 $file->insert_lines(sprintf "%04X..%04X; Control", 12109 $range->start, $range->end); 12110 } 12111 } 12112 $file->insert_lines("2028; Control"); # Line Separator 12113 $file->insert_lines("2029; Control"); # Paragraph Separator 12114 12115 $file->insert_lines("000D; CR"); 12116 $file->insert_lines("000A; LF"); 12117 12118 # Also from http://www.unicode.org/reports/tr29/tr29-3.html. 12119 foreach my $code_point ( qw{ 12120 40000 12121 09BE 09D7 0B3E 0B57 0BBE 0BD7 0CC2 0CD5 0CD6 12122 0D3E 0D57 0DCF 0DDF FF9E FF9F 1D165 1D16E 1D16F 12123 } 12124 ) { 12125 my $category = $gc->value_of(hex $code_point); 12126 next if ! defined $category || $category eq 'Cn'; # But not if 12127 # unassigned in this 12128 # release 12129 $file->insert_lines("$code_point; Extend"); 12130 } 12131 12132 my $hst = property_ref('Hangul_Syllable_Type'); 12133 if ($hst->count > 0) { 12134 foreach my $range ($hst->ranges) { 12135 $file->insert_lines(sprintf "%04X..%04X; %s", 12136 $range->start, $range->end, $range->value); 12137 } 12138 } 12139 else { 12140 generate_hst($file); 12141 } 12142 12143 return; 12144} 12145 12146sub setup_early_name_alias { 12147 my $file= shift; 12148 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 12149 12150 # This has the effect of pretending that the Name_Alias property was 12151 # available in all Unicode releases. Strictly speaking, this property 12152 # should not be availabe in early releases, but doing this allows 12153 # charnames.pm to work on older releases without change. Prior to v5.16 12154 # it had these names hard-coded inside it. Unicode 6.1 came along and 12155 # created these names, and so they were removed from charnames. 12156 12157 my $aliases = property_ref('Name_Alias'); 12158 if (! defined $aliases) { 12159 $aliases = Property->new('Name_Alias', Default_Map => ""); 12160 } 12161 12162 $file->insert_lines(get_old_name_aliases()); 12163 12164 return; 12165} 12166 12167sub get_old_name_aliases () { 12168 12169 # The Unicode_1_Name field, contains most of these names. One would 12170 # expect, given the field's name, that its values would be fixed across 12171 # versions, giving the true Unicode version 1 name for the character. 12172 # Sadly, this is not the case. Actually Version 1.1.5 had no names for 12173 # any of the controls; Version 2.0 introduced names for the C0 controls, 12174 # and 3.0 introduced C1 names. 3.0.1 removed the name INDEX; and 3.2 12175 # changed some names: it 12176 # changed to parenthesized versions like "NEXT LINE" to 12177 # "NEXT LINE (NEL)"; 12178 # changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD 12179 # changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;; 12180 # changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR 12181 # This list contains all the names that were defined so that 12182 # charnames::vianame(), etc. understand them all EVEN if this version of 12183 # Unicode didn't specify them (this could be construed as a bug). 12184 # mktables elsewhere gives preference to the Unicode_1_Name field over 12185 # these names, so that viacode() will return the correct value for that 12186 # version of Unicode, except when that version doesn't define a name, 12187 # viacode() will return one anyway (this also could be construed as a 12188 # bug). But these potential "bugs" allow for the smooth working of code 12189 # on earlier Unicode releases. 12190 12191 my @return = split /\n/, <<'END'; 121920000;NULL;control 121930000;NUL;abbreviation 121940001;START OF HEADING;control 121950001;SOH;abbreviation 121960002;START OF TEXT;control 121970002;STX;abbreviation 121980003;END OF TEXT;control 121990003;ETX;abbreviation 122000004;END OF TRANSMISSION;control 122010004;EOT;abbreviation 122020005;ENQUIRY;control 122030005;ENQ;abbreviation 122040006;ACKNOWLEDGE;control 122050006;ACK;abbreviation 122060007;BELL;control 122070007;BEL;abbreviation 122080008;BACKSPACE;control 122090008;BS;abbreviation 122100009;CHARACTER TABULATION;control 122110009;HORIZONTAL TABULATION;control 122120009;HT;abbreviation 122130009;TAB;abbreviation 12214000A;LINE FEED;control 12215000A;LINE FEED (LF);control 12216000A;NEW LINE;control 12217000A;END OF LINE;control 12218000A;LF;abbreviation 12219000A;NL;abbreviation 12220000A;EOL;abbreviation 12221000B;LINE TABULATION;control 12222000B;VERTICAL TABULATION;control 12223000B;VT;abbreviation 12224000C;FORM FEED;control 12225000C;FORM FEED (FF);control 12226000C;FF;abbreviation 12227000D;CARRIAGE RETURN;control 12228000D;CARRIAGE RETURN (CR);control 12229000D;CR;abbreviation 12230000E;SHIFT OUT;control 12231000E;LOCKING-SHIFT ONE;control 12232000E;SO;abbreviation 12233000F;SHIFT IN;control 12234000F;LOCKING-SHIFT ZERO;control 12235000F;SI;abbreviation 122360010;DATA LINK ESCAPE;control 122370010;DLE;abbreviation 122380011;DEVICE CONTROL ONE;control 122390011;DC1;abbreviation 122400012;DEVICE CONTROL TWO;control 122410012;DC2;abbreviation 122420013;DEVICE CONTROL THREE;control 122430013;DC3;abbreviation 122440014;DEVICE CONTROL FOUR;control 122450014;DC4;abbreviation 122460015;NEGATIVE ACKNOWLEDGE;control 122470015;NAK;abbreviation 122480016;SYNCHRONOUS IDLE;control 122490016;SYN;abbreviation 122500017;END OF TRANSMISSION BLOCK;control 122510017;ETB;abbreviation 122520018;CANCEL;control 122530018;CAN;abbreviation 122540019;END OF MEDIUM;control 122550019;EOM;abbreviation 12256001A;SUBSTITUTE;control 12257001A;SUB;abbreviation 12258001B;ESCAPE;control 12259001B;ESC;abbreviation 12260001C;INFORMATION SEPARATOR FOUR;control 12261001C;FILE SEPARATOR;control 12262001C;FS;abbreviation 12263001D;INFORMATION SEPARATOR THREE;control 12264001D;GROUP SEPARATOR;control 12265001D;GS;abbreviation 12266001E;INFORMATION SEPARATOR TWO;control 12267001E;RECORD SEPARATOR;control 12268001E;RS;abbreviation 12269001F;INFORMATION SEPARATOR ONE;control 12270001F;UNIT SEPARATOR;control 12271001F;US;abbreviation 122720020;SP;abbreviation 12273007F;DELETE;control 12274007F;DEL;abbreviation 122750080;PADDING CHARACTER;figment 122760080;PAD;abbreviation 122770081;HIGH OCTET PRESET;figment 122780081;HOP;abbreviation 122790082;BREAK PERMITTED HERE;control 122800082;BPH;abbreviation 122810083;NO BREAK HERE;control 122820083;NBH;abbreviation 122830084;INDEX;control 122840084;IND;abbreviation 122850085;NEXT LINE;control 122860085;NEXT LINE (NEL);control 122870085;NEL;abbreviation 122880086;START OF SELECTED AREA;control 122890086;SSA;abbreviation 122900087;END OF SELECTED AREA;control 122910087;ESA;abbreviation 122920088;CHARACTER TABULATION SET;control 122930088;HORIZONTAL TABULATION SET;control 122940088;HTS;abbreviation 122950089;CHARACTER TABULATION WITH JUSTIFICATION;control 122960089;HORIZONTAL TABULATION WITH JUSTIFICATION;control 122970089;HTJ;abbreviation 12298008A;LINE TABULATION SET;control 12299008A;VERTICAL TABULATION SET;control 12300008A;VTS;abbreviation 12301008B;PARTIAL LINE FORWARD;control 12302008B;PARTIAL LINE DOWN;control 12303008B;PLD;abbreviation 12304008C;PARTIAL LINE BACKWARD;control 12305008C;PARTIAL LINE UP;control 12306008C;PLU;abbreviation 12307008D;REVERSE LINE FEED;control 12308008D;REVERSE INDEX;control 12309008D;RI;abbreviation 12310008E;SINGLE SHIFT TWO;control 12311008E;SINGLE-SHIFT-2;control 12312008E;SS2;abbreviation 12313008F;SINGLE SHIFT THREE;control 12314008F;SINGLE-SHIFT-3;control 12315008F;SS3;abbreviation 123160090;DEVICE CONTROL STRING;control 123170090;DCS;abbreviation 123180091;PRIVATE USE ONE;control 123190091;PRIVATE USE-1;control 123200091;PU1;abbreviation 123210092;PRIVATE USE TWO;control 123220092;PRIVATE USE-2;control 123230092;PU2;abbreviation 123240093;SET TRANSMIT STATE;control 123250093;STS;abbreviation 123260094;CANCEL CHARACTER;control 123270094;CCH;abbreviation 123280095;MESSAGE WAITING;control 123290095;MW;abbreviation 123300096;START OF GUARDED AREA;control 123310096;START OF PROTECTED AREA;control 123320096;SPA;abbreviation 123330097;END OF GUARDED AREA;control 123340097;END OF PROTECTED AREA;control 123350097;EPA;abbreviation 123360098;START OF STRING;control 123370098;SOS;abbreviation 123380099;SINGLE GRAPHIC CHARACTER INTRODUCER;figment 123390099;SGC;abbreviation 12340009A;SINGLE CHARACTER INTRODUCER;control 12341009A;SCI;abbreviation 12342009B;CONTROL SEQUENCE INTRODUCER;control 12343009B;CSI;abbreviation 12344009C;STRING TERMINATOR;control 12345009C;ST;abbreviation 12346009D;OPERATING SYSTEM COMMAND;control 12347009D;OSC;abbreviation 12348009E;PRIVACY MESSAGE;control 12349009E;PM;abbreviation 12350009F;APPLICATION PROGRAM COMMAND;control 12351009F;APC;abbreviation 1235200A0;NBSP;abbreviation 1235300AD;SHY;abbreviation 12354200B;ZWSP;abbreviation 12355200C;ZWNJ;abbreviation 12356200D;ZWJ;abbreviation 12357200E;LRM;abbreviation 12358200F;RLM;abbreviation 12359202A;LRE;abbreviation 12360202B;RLE;abbreviation 12361202C;PDF;abbreviation 12362202D;LRO;abbreviation 12363202E;RLO;abbreviation 12364FEFF;BYTE ORDER MARK;alternate 12365FEFF;BOM;abbreviation 12366FEFF;ZWNBSP;abbreviation 12367END 12368 12369 if ($v_version ge v3.0.0) { 12370 push @return, split /\n/, <<'END'; 12371180B; FVS1; abbreviation 12372180C; FVS2; abbreviation 12373180D; FVS3; abbreviation 12374180E; MVS; abbreviation 12375202F; NNBSP; abbreviation 12376END 12377 } 12378 12379 if ($v_version ge v3.2.0) { 12380 push @return, split /\n/, <<'END'; 12381034F; CGJ; abbreviation 12382205F; MMSP; abbreviation 123832060; WJ; abbreviation 12384END 12385 # Add in VS1..VS16 12386 my $cp = 0xFE00 - 1; 12387 for my $i (1..16) { 12388 push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i); 12389 } 12390 } 12391 if ($v_version ge v4.0.0) { # Add in VS17..VS256 12392 my $cp = 0xE0100 - 17; 12393 for my $i (17..256) { 12394 push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i); 12395 } 12396 } 12397 12398 # ALERT did not come along until 6.0, at which point it became preferred 12399 # over BELL, and was never in the Unicode_1_Name field. For the same 12400 # reasons, that the other names are made known to all releases by this 12401 # function, we make ALERT known too. By inserting it 12402 # last in early releases, BELL is preferred over it; and vice-vers in 6.0 12403 my $alert = '0007; ALERT; control'; 12404 if ($v_version lt v6.0.0) { 12405 push @return, $alert; 12406 } 12407 else { 12408 unshift @return, $alert; 12409 } 12410 12411 return @return; 12412} 12413 12414sub filter_later_version_name_alias_line { 12415 12416 # This file has an extra entry per line for the alias type. This is 12417 # handled by creating a compound entry: "$alias: $type"; First, split 12418 # the line into components. 12419 my ($range, $alias, $type, @remainder) 12420 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 12421 12422 # This file contains multiple entries for some components, so tell the 12423 # downstream code to allow this in our internal tables; the 12424 # $MULTIPLE_AFTER preserves the input ordering. 12425 $_ = join ";", $range, $CMD_DELIM 12426 . $REPLACE_CMD 12427 . '=' 12428 . $MULTIPLE_AFTER 12429 . $CMD_DELIM 12430 . "$alias: $type", 12431 @remainder; 12432 return; 12433} 12434 12435sub filter_early_version_name_alias_line { 12436 12437 # Early versions did not have the trailing alias type field; implicitly it 12438 # was 'correction'. But our synthetic lines we add in this program do 12439 # have it, so test for the type field. 12440 $_ .= "; correction" if $_ !~ /;.*;/; 12441 12442 filter_later_version_name_alias_line; 12443 return; 12444} 12445 12446sub finish_Unicode() { 12447 # This routine should be called after all the Unicode files have been read 12448 # in. It: 12449 # 1) Creates properties that are missing from the version of Unicode being 12450 # compiled, and which, for whatever reason, are needed for the Perl 12451 # core to function properly. These are minimally populated as 12452 # necessary. 12453 # 2) Adds the mappings for code points missing from the files which have 12454 # defaults specified for them. 12455 # 3) At this this point all mappings are known, so it computes the type of 12456 # each property whose type hasn't been determined yet. 12457 # 4) Calculates all the regular expression match tables based on the 12458 # mappings. 12459 # 5) Calculates and adds the tables which are defined by Unicode, but 12460 # which aren't derived by them, and certain derived tables that Perl 12461 # uses. 12462 12463 # Folding information was introduced later into Unicode data. To get 12464 # Perl's case ignore (/i) to work at all in releases that don't have 12465 # folding, use the best available alternative, which is lower casing. 12466 my $fold = property_ref('Case_Folding'); 12467 if ($fold->is_empty) { 12468 $fold->initialize(property_ref('Lowercase_Mapping')); 12469 $fold->add_note(join_lines(<<END 12470WARNING: This table uses lower case as a substitute for missing fold 12471information 12472END 12473 )); 12474 } 12475 12476 # Multiple-character mapping was introduced later into Unicode data, so it 12477 # is by default the simple version. If to output the simple versions and 12478 # not present, just use the regular (which in these Unicode versions is 12479 # the simple as well). 12480 foreach my $map (qw { Uppercase_Mapping 12481 Lowercase_Mapping 12482 Titlecase_Mapping 12483 Case_Folding 12484 } ) 12485 { 12486 my $simple = property_ref("Simple_$map"); 12487 next if ! $simple->is_empty; 12488 if ($simple->to_output_map) { 12489 $simple->initialize(property_ref($map)); 12490 } 12491 else { 12492 property_ref($map)->set_proxy_for($simple->name); 12493 } 12494 } 12495 12496 # For each property, fill in any missing mappings, and calculate the re 12497 # match tables. If a property has more than one missing mapping, the 12498 # default is a reference to a data structure, and requires data from other 12499 # properties to resolve. The sort is used to cause these to be processed 12500 # last, after all the other properties have been calculated. 12501 # (Fortunately, the missing properties so far don't depend on each other.) 12502 foreach my $property 12503 (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 } 12504 property_ref('*')) 12505 { 12506 # $perl has been defined, but isn't one of the Unicode properties that 12507 # need to be finished up. 12508 next if $property == $perl; 12509 12510 # Nor do we need to do anything with properties that aren't going to 12511 # be output. 12512 next if $property->fate == $SUPPRESSED; 12513 12514 # Handle the properties that have more than one possible default 12515 if (ref $property->default_map) { 12516 my $default_map = $property->default_map; 12517 12518 # These properties have stored in the default_map: 12519 # One or more of: 12520 # 1) A default map which applies to all code points in a 12521 # certain class 12522 # 2) an expression which will evaluate to the list of code 12523 # points in that class 12524 # And 12525 # 3) the default map which applies to every other missing code 12526 # point. 12527 # 12528 # Go through each list. 12529 while (my ($default, $eval) = $default_map->get_next_defaults) { 12530 12531 # Get the class list, and intersect it with all the so-far 12532 # unspecified code points yielding all the code points 12533 # in the class that haven't been specified. 12534 my $list = eval $eval; 12535 if ($@) { 12536 Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'"); 12537 last; 12538 } 12539 12540 # Narrow down the list to just those code points we don't have 12541 # maps for yet. 12542 $list = $list & $property->inverse_list; 12543 12544 # Add mappings to the property for each code point in the list 12545 foreach my $range ($list->ranges) { 12546 $property->add_map($range->start, $range->end, $default, 12547 Replace => $CROAK); 12548 } 12549 } 12550 12551 # All remaining code points have the other mapping. Set that up 12552 # so the normal single-default mapping code will work on them 12553 $property->set_default_map($default_map->other_default); 12554 12555 # And fall through to do that 12556 } 12557 12558 # We should have enough data now to compute the type of the property. 12559 $property->compute_type; 12560 my $property_type = $property->type; 12561 12562 next if ! $property->to_create_match_tables; 12563 12564 # Here want to create match tables for this property 12565 12566 # The Unicode db always (so far, and they claim into the future) have 12567 # the default for missing entries in binary properties be 'N' (unless 12568 # there is a '@missing' line that specifies otherwise) 12569 if ($property_type == $BINARY && ! defined $property->default_map) { 12570 $property->set_default_map('N'); 12571 } 12572 12573 # Add any remaining code points to the mapping, using the default for 12574 # missing code points. 12575 my $default_table; 12576 if (defined (my $default_map = $property->default_map)) { 12577 12578 # Make sure there is a match table for the default 12579 if (! defined ($default_table = $property->table($default_map))) { 12580 $default_table = $property->add_match_table($default_map); 12581 } 12582 12583 # And, if the property is binary, the default table will just 12584 # be the complement of the other table. 12585 if ($property_type == $BINARY) { 12586 my $non_default_table; 12587 12588 # Find the non-default table. 12589 for my $table ($property->tables) { 12590 next if $table == $default_table; 12591 $non_default_table = $table; 12592 } 12593 $default_table->set_complement($non_default_table); 12594 } 12595 else { 12596 12597 # This fills in any missing values with the default. It's not 12598 # necessary to do this with binary properties, as the default 12599 # is defined completely in terms of the Y table. 12600 $property->add_map(0, $MAX_UNICODE_CODEPOINT, 12601 $default_map, Replace => $NO); 12602 } 12603 } 12604 12605 # Have all we need to populate the match tables. 12606 my $property_name = $property->name; 12607 my $maps_should_be_defined = $property->pre_declared_maps; 12608 foreach my $range ($property->ranges) { 12609 my $map = $range->value; 12610 my $table = $property->table($map); 12611 if (! defined $table) { 12612 12613 # Integral and rational property values are not necessarily 12614 # defined in PropValueAliases, but whether all the other ones 12615 # should be depends on the property. 12616 if ($maps_should_be_defined 12617 && $map !~ /^ -? \d+ ( \/ \d+ )? $/x) 12618 { 12619 Carp::my_carp("Table '$property_name=$map' should have been defined. Defining it now.") 12620 } 12621 $table = $property->add_match_table($map); 12622 } 12623 12624 next if $table->complement != 0; # Don't need to populate these 12625 $table->add_range($range->start, $range->end); 12626 } 12627 12628 # A forced binary property has additional true/false tables which 12629 # should have been set up when it was forced into binary. The false 12630 # table matches exactly the same set as the property's default table. 12631 # The true table matches the complement of that. The false table is 12632 # not the same as an additional set of aliases on top of the default 12633 # table, so use 'set_equivalent_to'. If it were implemented as 12634 # additional aliases, various things would have to be adjusted, but 12635 # especially, if the user wants to get a list of names for the table 12636 # using Unicode::UCD::prop_value_aliases(), s/he should get a 12637 # different set depending on whether they want the default table or 12638 # the false table. 12639 if ($property_type == $FORCED_BINARY) { 12640 $property->table('N')->set_equivalent_to($default_table, 12641 Related => 1); 12642 $property->table('Y')->set_complement($default_table); 12643 } 12644 12645 # For Perl 5.6 compatibility, all properties matchable in regexes can 12646 # have an optional 'Is_' prefix. This is now done in utf8_heavy.pl. 12647 # But warn if this creates a conflict with a (new) Unicode property 12648 # name, although it appears that Unicode has made a decision never to 12649 # begin a property name with 'Is_', so this shouldn't happen. 12650 foreach my $alias ($property->aliases) { 12651 my $Is_name = 'Is_' . $alias->name; 12652 if (defined (my $pre_existing = property_ref($Is_name))) { 12653 Carp::my_carp(<<END 12654There is already an alias named $Is_name (from " . $pre_existing . "), so 12655creating one for $property won't work. This is bad news. If it is not too 12656late, get Unicode to back off. Otherwise go back to the old scheme (findable 12657from the git blame log for this area of the code that suppressed individual 12658aliases that conflict with the new Unicode names. Proceeding anyway. 12659END 12660 ); 12661 } 12662 } # End of loop through aliases for this property 12663 } # End of loop through all Unicode properties. 12664 12665 # Fill in the mappings that Unicode doesn't completely furnish. First the 12666 # single letter major general categories. If Unicode were to start 12667 # delivering the values, this would be redundant, but better that than to 12668 # try to figure out if should skip and not get it right. Ths could happen 12669 # if a new major category were to be introduced, and the hard-coded test 12670 # wouldn't know about it. 12671 # This routine depends on the standard names for the general categories 12672 # being what it thinks they are, like 'Cn'. The major categories are the 12673 # union of all the general category tables which have the same first 12674 # letters. eg. L = Lu + Lt + Ll + Lo + Lm 12675 foreach my $minor_table ($gc->tables) { 12676 my $minor_name = $minor_table->name; 12677 next if length $minor_name == 1; 12678 if (length $minor_name != 2) { 12679 Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped."); 12680 next; 12681 } 12682 12683 my $major_name = uc(substr($minor_name, 0, 1)); 12684 my $major_table = $gc->table($major_name); 12685 $major_table += $minor_table; 12686 } 12687 12688 # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt 12689 # defines it as LC) 12690 my $LC = $gc->table('LC'); 12691 $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards... 12692 $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility. 12693 12694 12695 if ($LC->is_empty) { # Assume if not empty that Unicode has started to 12696 # deliver the correct values in it 12697 $LC->initialize($gc->table('Ll') + $gc->table('Lu')); 12698 12699 # Lt not in release 1. 12700 if (defined $gc->table('Lt')) { 12701 $LC += $gc->table('Lt'); 12702 $gc->table('Lt')->set_caseless_equivalent($LC); 12703 } 12704 } 12705 $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]'); 12706 12707 $gc->table('Ll')->set_caseless_equivalent($LC); 12708 $gc->table('Lu')->set_caseless_equivalent($LC); 12709 12710 my $Cs = $gc->table('Cs'); 12711 12712 # Create digit and case fold tables with the original file names for 12713 # backwards compatibility with applications that read them directly. 12714 my $Digit = Property->new("Legacy_Perl_Decimal_Digit", 12715 Default_Map => "", 12716 Perl_Extension => 1, 12717 File => 'Digit', # Trad. location 12718 Directory => $map_directory, 12719 UCD => 0, 12720 Type => $STRING, 12721 To_Output_Map => $EXTERNAL_MAP, 12722 Range_Size_1 => 1, 12723 Initialize => property_ref('Perl_Decimal_Digit'), 12724 ); 12725 $Digit->add_comment(join_lines(<<END 12726This file gives the mapping of all code points which represent a single 12727decimal digit [0-9] to their respective digits. For example, the code point 12728U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those 12729that have Numeric_Type=Decimal; not special things, like subscripts nor Roman 12730numerals. 12731END 12732 )); 12733 12734 Property->new('Legacy_Case_Folding', 12735 File => "Fold", 12736 Directory => $map_directory, 12737 Default_Map => $CODE_POINT, 12738 UCD => 0, 12739 Range_Size_1 => 1, 12740 Type => $STRING, 12741 To_Output_Map => $EXTERNAL_MAP, 12742 Format => $HEX_FORMAT, 12743 Initialize => property_ref('cf'), 12744 ); 12745 12746 # The Script_Extensions property started out as a clone of the Script 12747 # property. But processing its data file caused some elements to be 12748 # replaced with different data. (These elements were for the Common and 12749 # Inherited properties.) This data is a qw() list of all the scripts that 12750 # the code points in the given range are in. An example line is: 12751 # 060C ; Arab Syrc Thaa # Po ARABIC COMMA 12752 # 12753 # The code above has created a new match table named "Arab Syrc Thaa" 12754 # which contains 060C. (The cloned table started out with this code point 12755 # mapping to "Common".) Now we add 060C to each of the Arab, Syrc, and 12756 # Thaa match tables. Then we delete the now spurious "Arab Syrc Thaa" 12757 # match table. This is repeated for all these tables and ranges. The map 12758 # data is retained in the map table for reference, but the spurious match 12759 # tables are deleted. 12760 12761 my $scx = property_ref("Script_Extensions"); 12762 if (defined $scx) { 12763 foreach my $table ($scx->tables) { 12764 next unless $table->name =~ /\s/; # All the new and only the new 12765 # tables have a space in their 12766 # names 12767 my @scripts = split /\s+/, $table->name; 12768 foreach my $script (@scripts) { 12769 my $script_table = $scx->table($script); 12770 $script_table += $table; 12771 } 12772 $scx->delete_match_table($table); 12773 } 12774 } 12775 12776 return; 12777} 12778 12779sub pre_3_dot_1_Nl () { 12780 12781 # Return a range list for gc=nl for Unicode versions prior to 3.1, which 12782 # is when Unicode's became fully usable. These code points were 12783 # determined by inspection and experimentation. gc=nl is important for 12784 # certain Perl-extension properties that should be available in all 12785 # releases. 12786 12787 my $Nl = Range_List->new(); 12788 if (defined (my $official = $gc->table('Nl'))) { 12789 $Nl += $official; 12790 } 12791 else { 12792 $Nl->add_range(0x2160, 0x2182); 12793 $Nl->add_range(0x3007, 0x3007); 12794 $Nl->add_range(0x3021, 0x3029); 12795 } 12796 $Nl->add_range(0xFE20, 0xFE23); 12797 $Nl->add_range(0x16EE, 0x16F0) if $v_version ge v3.0.0; # 3.0 was when 12798 # these were added 12799 return $Nl; 12800} 12801 12802sub compile_perl() { 12803 # Create perl-defined tables. Almost all are part of the pseudo-property 12804 # named 'perl' internally to this program. Many of these are recommended 12805 # in UTS#18 "Unicode Regular Expressions", and their derivations are based 12806 # on those found there. 12807 # Almost all of these are equivalent to some Unicode property. 12808 # A number of these properties have equivalents restricted to the ASCII 12809 # range, with their names prefaced by 'Posix', to signify that these match 12810 # what the Posix standard says they should match. A couple are 12811 # effectively this, but the name doesn't have 'Posix' in it because there 12812 # just isn't any Posix equivalent. 'XPosix' are the Posix tables extended 12813 # to the full Unicode range, by our guesses as to what is appropriate. 12814 12815 # 'Any' is all code points. As an error check, instead of just setting it 12816 # to be that, construct it to be the union of all the major categories 12817 $Any = $perl->add_match_table('Any', 12818 Description => "[\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]", 12819 Matches_All => 1); 12820 12821 foreach my $major_table ($gc->tables) { 12822 12823 # Major categories are the ones with single letter names. 12824 next if length($major_table->name) != 1; 12825 12826 $Any += $major_table; 12827 } 12828 12829 if ($Any->max != $MAX_UNICODE_CODEPOINT) { 12830 Carp::my_carp_bug("Generated highest code point (" 12831 . sprintf("%X", $Any->max) 12832 . ") doesn't match expected value $MAX_UNICODE_CODEPOINT_STRING.") 12833 } 12834 if ($Any->range_count != 1 || $Any->min != 0) { 12835 Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.") 12836 } 12837 12838 $Any->add_alias('All'); 12839 12840 # Assigned is the opposite of gc=unassigned 12841 my $Assigned = $perl->add_match_table('Assigned', 12842 Description => "All assigned code points", 12843 Initialize => ~ $gc->table('Unassigned'), 12844 ); 12845 12846 # Our internal-only property should be treated as more than just a 12847 # synonym; grandfather it in to the pod. 12848 $perl->add_match_table('_CombAbove', Re_Pod_Entry => 1, 12849 Fate => $INTERNAL_ONLY, Status => $DISCOURAGED) 12850 ->set_equivalent_to(property_ref('ccc')->table('Above'), 12851 Related => 1); 12852 12853 my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]'); 12854 if (defined $block) { # This is equivalent to the block if have it. 12855 my $Unicode_ASCII = $block->table('Basic_Latin'); 12856 if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) { 12857 $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1); 12858 } 12859 } 12860 12861 # Very early releases didn't have blocks, so initialize ASCII ourselves if 12862 # necessary 12863 if ($ASCII->is_empty) { 12864 $ASCII->add_range(0, 127); 12865 } 12866 12867 # Get the best available case definitions. Early Unicode versions didn't 12868 # have Uppercase and Lowercase defined, so use the general category 12869 # instead for them, modified by hard-coding in the code points each is 12870 # missing. 12871 my $Lower = $perl->add_match_table('Lower'); 12872 my $Unicode_Lower = property_ref('Lowercase'); 12873 if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) { 12874 $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1); 12875 12876 } 12877 else { 12878 $Lower += $gc->table('Lowercase_Letter'); 12879 12880 # There are quite a few code points in Lower, that aren't in gc=lc, 12881 # and not all are in all releases. 12882 foreach my $code_point ( 0x00AA, 12883 0x00BA, 12884 0x02B0 .. 0x02B8, 12885 0x02C0 .. 0x02C1, 12886 0x02E0 .. 0x02E4, 12887 0x0345, 12888 0x037A, 12889 0x1D2C .. 0x1D6A, 12890 0x1D78, 12891 0x1D9B .. 0x1DBF, 12892 0x2071, 12893 0x207F, 12894 0x2090 .. 0x209C, 12895 0x2170 .. 0x217F, 12896 0x24D0 .. 0x24E9, 12897 0x2C7C .. 0x2C7D, 12898 0xA770, 12899 0xA7F8 .. 0xA7F9, 12900 ) { 12901 # Don't include the code point unless it is assigned in this 12902 # release 12903 my $category = $gc->value_of(hex $code_point); 12904 next if ! defined $category || $category eq 'Cn'; 12905 12906 $Lower += $code_point; 12907 } 12908 } 12909 $Lower->add_alias('XPosixLower'); 12910 my $Posix_Lower = $perl->add_match_table("PosixLower", 12911 Description => "[a-z]", 12912 Initialize => $Lower & $ASCII, 12913 ); 12914 12915 my $Upper = $perl->add_match_table('Upper'); 12916 my $Unicode_Upper = property_ref('Uppercase'); 12917 if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) { 12918 $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1); 12919 } 12920 else { 12921 12922 # Unlike Lower, there are only two ranges in Upper that aren't in 12923 # gc=Lu, and all code points were assigned in all releases. 12924 $Upper += $gc->table('Uppercase_Letter'); 12925 $Upper->add_range(0x2160, 0x216F); # Uppercase Roman numerals 12926 $Upper->add_range(0x24B6, 0x24CF); # Circled Latin upper case letters 12927 } 12928 $Upper->add_alias('XPosixUpper'); 12929 my $Posix_Upper = $perl->add_match_table("PosixUpper", 12930 Description => "[A-Z]", 12931 Initialize => $Upper & $ASCII, 12932 ); 12933 12934 # Earliest releases didn't have title case. Initialize it to empty if not 12935 # otherwise present 12936 my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase', 12937 Description => '(= \p{Gc=Lt})'); 12938 my $lt = $gc->table('Lt'); 12939 12940 # Earlier versions of mktables had this related to $lt since they have 12941 # identical code points, but their caseless equivalents are not the same, 12942 # one being 'Cased' and the other being 'LC', and so now must be kept as 12943 # separate entities. 12944 if (defined $lt) { 12945 $Title += $lt; 12946 } 12947 else { 12948 push @tables_that_may_be_empty, $Title->complete_name; 12949 } 12950 12951 my $Unicode_Cased = property_ref('Cased'); 12952 if (defined $Unicode_Cased) { 12953 my $yes = $Unicode_Cased->table('Y'); 12954 my $no = $Unicode_Cased->table('N'); 12955 $Title->set_caseless_equivalent($yes); 12956 if (defined $Unicode_Upper) { 12957 $Unicode_Upper->table('Y')->set_caseless_equivalent($yes); 12958 $Unicode_Upper->table('N')->set_caseless_equivalent($no); 12959 } 12960 $Upper->set_caseless_equivalent($yes); 12961 if (defined $Unicode_Lower) { 12962 $Unicode_Lower->table('Y')->set_caseless_equivalent($yes); 12963 $Unicode_Lower->table('N')->set_caseless_equivalent($no); 12964 } 12965 $Lower->set_caseless_equivalent($yes); 12966 } 12967 else { 12968 # If this Unicode version doesn't have Cased, set up the Perl 12969 # extension from first principles. From Unicode 5.1: Definition D120: 12970 # A character C is defined to be cased if and only if C has the 12971 # Lowercase or Uppercase property or has a General_Category value of 12972 # Titlecase_Letter. 12973 my $cased = $perl->add_match_table('Cased', 12974 Initialize => $Lower + $Upper + $Title, 12975 Description => 'Uppercase or Lowercase or Titlecase', 12976 ); 12977 # $notcased is purely for the caseless equivalents below 12978 my $notcased = $perl->add_match_table('_Not_Cased', 12979 Initialize => ~ $cased, 12980 Fate => $INTERNAL_ONLY, 12981 Description => 'All not-cased code points'); 12982 $Title->set_caseless_equivalent($cased); 12983 if (defined $Unicode_Upper) { 12984 $Unicode_Upper->table('Y')->set_caseless_equivalent($cased); 12985 $Unicode_Upper->table('N')->set_caseless_equivalent($notcased); 12986 } 12987 $Upper->set_caseless_equivalent($cased); 12988 if (defined $Unicode_Lower) { 12989 $Unicode_Lower->table('Y')->set_caseless_equivalent($cased); 12990 $Unicode_Lower->table('N')->set_caseless_equivalent($notcased); 12991 } 12992 $Lower->set_caseless_equivalent($cased); 12993 } 12994 12995 # Similarly, set up our own Case_Ignorable property if this Unicode 12996 # version doesn't have it. From Unicode 5.1: Definition D121: A character 12997 # C is defined to be case-ignorable if C has the value MidLetter or the 12998 # value MidNumLet for the Word_Break property or its General_Category is 12999 # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf), 13000 # Modifier_Letter (Lm), or Modifier_Symbol (Sk). 13001 13002 # Perl has long had an internal-only alias for this property; grandfather 13003 # it in to the pod, but discourage its use. 13004 my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable', 13005 Re_Pod_Entry => 1, 13006 Fate => $INTERNAL_ONLY, 13007 Status => $DISCOURAGED); 13008 my $case_ignorable = property_ref('Case_Ignorable'); 13009 if (defined $case_ignorable && ! $case_ignorable->is_empty) { 13010 $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'), 13011 Related => 1); 13012 } 13013 else { 13014 13015 $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm')); 13016 13017 # The following three properties are not in early releases 13018 $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me'); 13019 $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf'); 13020 $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk'); 13021 13022 # For versions 4.1 - 5.0, there is no MidNumLet property, and 13023 # correspondingly the case-ignorable definition lacks that one. For 13024 # 4.0, it appears that it was meant to be the same definition, but was 13025 # inadvertently omitted from the standard's text, so add it if the 13026 # property actually is there 13027 my $wb = property_ref('Word_Break'); 13028 if (defined $wb) { 13029 my $midlet = $wb->table('MidLetter'); 13030 $perl_case_ignorable += $midlet if defined $midlet; 13031 my $midnumlet = $wb->table('MidNumLet'); 13032 $perl_case_ignorable += $midnumlet if defined $midnumlet; 13033 } 13034 else { 13035 13036 # In earlier versions of the standard, instead of the above two 13037 # properties , just the following characters were used: 13038 $perl_case_ignorable += 0x0027 # APOSTROPHE 13039 + 0x00AD # SOFT HYPHEN (SHY) 13040 + 0x2019; # RIGHT SINGLE QUOTATION MARK 13041 } 13042 } 13043 13044 # The remaining perl defined tables are mostly based on Unicode TR 18, 13045 # "Annex C: Compatibility Properties". All of these have two versions, 13046 # one whose name generally begins with Posix that is posix-compliant, and 13047 # one that matches Unicode characters beyond the Posix, ASCII range 13048 13049 my $Alpha = $perl->add_match_table('Alpha'); 13050 13051 # Alphabetic was not present in early releases 13052 my $Alphabetic = property_ref('Alphabetic'); 13053 if (defined $Alphabetic && ! $Alphabetic->is_empty) { 13054 $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1); 13055 } 13056 else { 13057 13058 # The Alphabetic property doesn't exist for early releases, so 13059 # generate it. The actual definition, in 5.2 terms is: 13060 # 13061 # gc=L + gc=Nl + Other_Alphabetic 13062 # 13063 # Other_Alphabetic is also not defined in these early releases, but it 13064 # contains one gc=So range plus most of gc=Mn and gc=Mc, so we add 13065 # those last two as well, then subtract the relatively few of them that 13066 # shouldn't have been added. (The gc=So range is the circled capital 13067 # Latin characters. Early releases mistakenly didn't also include the 13068 # lower-case versions of these characters, and so we don't either, to 13069 # maintain consistency with those releases that first had this 13070 # property. 13071 $Alpha->initialize($gc->table('Letter') 13072 + pre_3_dot_1_Nl() 13073 + $gc->table('Mn') 13074 + $gc->table('Mc') 13075 ); 13076 $Alpha->add_range(0x24D0, 0x24E9); # gc=So 13077 foreach my $range ( [ 0x0300, 0x0344 ], 13078 [ 0x0346, 0x034E ], 13079 [ 0x0360, 0x0362 ], 13080 [ 0x0483, 0x0486 ], 13081 [ 0x0591, 0x05AF ], 13082 [ 0x06DF, 0x06E0 ], 13083 [ 0x06EA, 0x06EC ], 13084 [ 0x0740, 0x074A ], 13085 0x093C, 13086 0x094D, 13087 [ 0x0951, 0x0954 ], 13088 0x09BC, 13089 0x09CD, 13090 0x0A3C, 13091 0x0A4D, 13092 0x0ABC, 13093 0x0ACD, 13094 0x0B3C, 13095 0x0B4D, 13096 0x0BCD, 13097 0x0C4D, 13098 0x0CCD, 13099 0x0D4D, 13100 0x0DCA, 13101 [ 0x0E47, 0x0E4C ], 13102 0x0E4E, 13103 [ 0x0EC8, 0x0ECC ], 13104 [ 0x0F18, 0x0F19 ], 13105 0x0F35, 13106 0x0F37, 13107 0x0F39, 13108 [ 0x0F3E, 0x0F3F ], 13109 [ 0x0F82, 0x0F84 ], 13110 [ 0x0F86, 0x0F87 ], 13111 0x0FC6, 13112 0x1037, 13113 0x1039, 13114 [ 0x17C9, 0x17D3 ], 13115 [ 0x20D0, 0x20DC ], 13116 0x20E1, 13117 [ 0x302A, 0x302F ], 13118 [ 0x3099, 0x309A ], 13119 [ 0xFE20, 0xFE23 ], 13120 [ 0x1D165, 0x1D169 ], 13121 [ 0x1D16D, 0x1D172 ], 13122 [ 0x1D17B, 0x1D182 ], 13123 [ 0x1D185, 0x1D18B ], 13124 [ 0x1D1AA, 0x1D1AD ], 13125 ) { 13126 if (ref $range) { 13127 $Alpha->delete_range($range->[0], $range->[1]); 13128 } 13129 else { 13130 $Alpha->delete_range($range, $range); 13131 } 13132 } 13133 $Alpha->add_description('Alphabetic'); 13134 $Alpha->add_alias('Alphabetic'); 13135 } 13136 $Alpha->add_alias('XPosixAlpha'); 13137 my $Posix_Alpha = $perl->add_match_table("PosixAlpha", 13138 Description => "[A-Za-z]", 13139 Initialize => $Alpha & $ASCII, 13140 ); 13141 $Posix_Upper->set_caseless_equivalent($Posix_Alpha); 13142 $Posix_Lower->set_caseless_equivalent($Posix_Alpha); 13143 13144 my $Alnum = $perl->add_match_table('Alnum', 13145 Description => 'Alphabetic and (decimal) Numeric', 13146 Initialize => $Alpha + $gc->table('Decimal_Number'), 13147 ); 13148 $Alnum->add_alias('XPosixAlnum'); 13149 $perl->add_match_table("PosixAlnum", 13150 Description => "[A-Za-z0-9]", 13151 Initialize => $Alnum & $ASCII, 13152 ); 13153 13154 my $Word = $perl->add_match_table('Word', 13155 Description => '\w, including beyond ASCII;' 13156 . ' = \p{Alnum} + \pM + \p{Pc}', 13157 Initialize => $Alnum + $gc->table('Mark'), 13158 ); 13159 $Word->add_alias('XPosixWord'); 13160 my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1 13161 if (defined $Pc) { 13162 $Word += $Pc; 13163 } 13164 else { 13165 $Word += ord('_'); # Make sure this is a $Word 13166 } 13167 my $JC = property_ref('Join_Control'); # Wasn't in release 1 13168 if (defined $JC) { 13169 $Word += $JC->table('Y'); 13170 } 13171 else { 13172 $Word += 0x200C + 0x200D; 13173 } 13174 13175 # This is a Perl extension, so the name doesn't begin with Posix. 13176 my $PerlWord = $perl->add_match_table('PerlWord', 13177 Description => '\w, restricted to ASCII = [A-Za-z0-9_]', 13178 Initialize => $Word & $ASCII, 13179 ); 13180 $PerlWord->add_alias('PosixWord'); 13181 13182 my $Blank = $perl->add_match_table('Blank', 13183 Description => '\h, Horizontal white space', 13184 13185 # 200B is Zero Width Space which is for line 13186 # break control, and was listed as 13187 # Space_Separator in early releases 13188 Initialize => $gc->table('Space_Separator') 13189 + 0x0009 # TAB 13190 - 0x200B, # ZWSP 13191 ); 13192 $Blank->add_alias('HorizSpace'); # Another name for it. 13193 $Blank->add_alias('XPosixBlank'); 13194 $perl->add_match_table("PosixBlank", 13195 Description => "\\t and ' '", 13196 Initialize => $Blank & $ASCII, 13197 ); 13198 13199 my $VertSpace = $perl->add_match_table('VertSpace', 13200 Description => '\v', 13201 Initialize => $gc->table('Line_Separator') 13202 + $gc->table('Paragraph_Separator') 13203 + 0x000A # LINE FEED 13204 + 0x000B # VERTICAL TAB 13205 + 0x000C # FORM FEED 13206 + 0x000D # CARRIAGE RETURN 13207 + 0x0085, # NEL 13208 ); 13209 # No Posix equivalent for vertical space 13210 13211 my $Space = $perl->add_match_table('Space', 13212 Description => '\s including beyond ASCII and vertical tab', 13213 Initialize => $Blank + $VertSpace, 13214 ); 13215 $Space->add_alias('XPosixSpace'); 13216 my $posix_space = $perl->add_match_table("PosixSpace", 13217 Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)", 13218 Initialize => $Space & $ASCII, 13219 ); 13220 13221 # Perl's traditional space doesn't include Vertical Tab prior to v5.18 13222 my $XPerlSpace = $perl->add_match_table('XPerlSpace', 13223 Description => '\s, including beyond ASCII', 13224 #Initialize => $Space - 0x000B, 13225 Initialize => $Space, 13226 ); 13227 $XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym 13228 my $PerlSpace = $perl->add_match_table('PerlSpace', 13229 Description => '\s, restricted to ASCII = [ \f\n\r\t] plus vertical tab', 13230 Initialize => $XPerlSpace & $ASCII, 13231 ); 13232 13233 13234 my $Cntrl = $perl->add_match_table('Cntrl', 13235 Description => 'Control characters'); 13236 $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1); 13237 $Cntrl->add_alias('XPosixCntrl'); 13238 $perl->add_match_table("PosixCntrl", 13239 Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL", 13240 Initialize => $Cntrl & $ASCII, 13241 ); 13242 13243 # $controls is a temporary used to construct Graph. 13244 my $controls = Range_List->new(Initialize => $gc->table('Unassigned') 13245 + $gc->table('Control')); 13246 # Cs not in release 1 13247 $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate'); 13248 13249 # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls) 13250 my $Graph = $perl->add_match_table('Graph', 13251 Description => 'Characters that are graphical', 13252 Initialize => ~ ($Space + $controls), 13253 ); 13254 $Graph->add_alias('XPosixGraph'); 13255 $perl->add_match_table("PosixGraph", 13256 Description => 13257 '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]', 13258 Initialize => $Graph & $ASCII, 13259 ); 13260 13261 $print = $perl->add_match_table('Print', 13262 Description => 'Characters that are graphical plus space characters (but no controls)', 13263 Initialize => $Blank + $Graph - $gc->table('Control'), 13264 ); 13265 $print->add_alias('XPosixPrint'); 13266 $perl->add_match_table("PosixPrint", 13267 Description => 13268 '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]', 13269 Initialize => $print & $ASCII, 13270 ); 13271 13272 my $Punct = $perl->add_match_table('Punct'); 13273 $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1); 13274 13275 # \p{punct} doesn't include the symbols, which posix does 13276 my $XPosixPunct = $perl->add_match_table('XPosixPunct', 13277 Description => '\p{Punct} + ASCII-range \p{Symbol}', 13278 Initialize => $gc->table('Punctuation') 13279 + ($ASCII & $gc->table('Symbol')), 13280 Perl_Extension => 1 13281 ); 13282 $perl->add_match_table('PosixPunct', Perl_Extension => 1, 13283 Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]', 13284 Initialize => $ASCII & $XPosixPunct, 13285 ); 13286 13287 my $Digit = $perl->add_match_table('Digit', 13288 Description => '[0-9] + all other decimal digits'); 13289 $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1); 13290 $Digit->add_alias('XPosixDigit'); 13291 my $PosixDigit = $perl->add_match_table("PosixDigit", 13292 Description => '[0-9]', 13293 Initialize => $Digit & $ASCII, 13294 ); 13295 13296 # Hex_Digit was not present in first release 13297 my $Xdigit = $perl->add_match_table('XDigit'); 13298 $Xdigit->add_alias('XPosixXDigit'); 13299 my $Hex = property_ref('Hex_Digit'); 13300 if (defined $Hex && ! $Hex->is_empty) { 13301 $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1); 13302 } 13303 else { 13304 # (Have to use hex instead of e.g. '0', because could be running on an 13305 # non-ASCII machine, and we want the Unicode (ASCII) values) 13306 $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66, 13307 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]); 13308 $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO'); 13309 } 13310 13311 # AHex was not present in early releases 13312 my $PosixXDigit = $perl->add_match_table('PosixXDigit'); 13313 my $AHex = property_ref('ASCII_Hex_Digit'); 13314 if (defined $AHex && ! $AHex->is_empty) { 13315 $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1); 13316 } 13317 else { 13318 $PosixXDigit->initialize($Xdigit & $ASCII); 13319 $PosixXDigit->add_alias('AHex'); 13320 $PosixXDigit->add_alias('Ascii_Hex_Digit'); 13321 } 13322 $PosixXDigit->add_description('[0-9A-Fa-f]'); 13323 13324 my $dt = property_ref('Decomposition_Type'); 13325 $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical', 13326 Initialize => ~ ($dt->table('None') + $dt->table('Canonical')), 13327 Perl_Extension => 1, 13328 Note => 'Union of all non-canonical decompositions', 13329 ); 13330 13331 # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier 13332 # than SD appeared, construct it ourselves, based on the first release SD 13333 # was in. A pod entry is grandfathered in for it 13334 my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ', Re_Pod_Entry => 1, 13335 Perl_Extension => 1, 13336 Fate => $INTERNAL_ONLY, 13337 Status => $DISCOURAGED); 13338 my $soft_dotted = property_ref('Soft_Dotted'); 13339 if (defined $soft_dotted && ! $soft_dotted->is_empty) { 13340 $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1); 13341 } 13342 else { 13343 13344 # This list came from 3.2 Soft_Dotted; all of these code points are in 13345 # all releases 13346 $CanonDCIJ->initialize([ 0x0069, 13347 0x006A, 13348 0x012F, 13349 0x0268, 13350 0x0456, 13351 0x0458, 13352 0x1E2D, 13353 0x1ECB, 13354 ]); 13355 $CanonDCIJ = $CanonDCIJ & $Assigned; 13356 } 13357 13358 # For backward compatibility, Perl has its own definition for IDStart. 13359 # It is regular XID_Start plus the underscore, but all characters must be 13360 # Word characters as well 13361 my $XID_Start = property_ref('XID_Start'); 13362 my $perl_xids = $perl->add_match_table('_Perl_IDStart', 13363 Perl_Extension => 1, 13364 Fate => $INTERNAL_ONLY, 13365 Initialize => ord('_') 13366 ); 13367 if (defined $XID_Start 13368 || defined ($XID_Start = property_ref('ID_Start'))) 13369 { 13370 $perl_xids += $XID_Start->table('Y'); 13371 } 13372 else { 13373 # For Unicode versions that don't have the property, construct our own 13374 # from first principles. The actual definition is: 13375 # Letters 13376 # + letter numbers (Nl) 13377 # - Pattern_Syntax 13378 # - Pattern_White_Space 13379 # + stability extensions 13380 # - NKFC modifications 13381 # 13382 # What we do in the code below is to include the identical code points 13383 # that are in the first release that had Unicode's version of this 13384 # property, essentially extrapolating backwards. There were no 13385 # stability extensions until v4.1, so none are included; likewise in 13386 # no Unicode version so far do subtracting PatSyn and PatWS make any 13387 # difference, so those also are ignored. 13388 $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl(); 13389 13390 # We do subtract the NFKC modifications that are in the first version 13391 # that had this property. We don't bother to test if they are in the 13392 # version in question, because if they aren't, the operation is a 13393 # no-op. The NKFC modifications are discussed in 13394 # http://www.unicode.org/reports/tr31/#NFKC_Modifications 13395 foreach my $range ( 0x037A, 13396 0x0E33, 13397 0x0EB3, 13398 [ 0xFC5E, 0xFC63 ], 13399 [ 0xFDFA, 0xFE70 ], 13400 [ 0xFE72, 0xFE76 ], 13401 0xFE78, 13402 0xFE7A, 13403 0xFE7C, 13404 0xFE7E, 13405 [ 0xFF9E, 0xFF9F ], 13406 ) { 13407 if (ref $range) { 13408 $perl_xids->delete_range($range->[0], $range->[1]); 13409 } 13410 else { 13411 $perl_xids->delete_range($range, $range); 13412 } 13413 } 13414 } 13415 13416 $perl_xids &= $Word; 13417 13418 my $perl_xidc = $perl->add_match_table('_Perl_IDCont', 13419 Perl_Extension => 1, 13420 Fate => $INTERNAL_ONLY); 13421 my $XIDC = property_ref('XID_Continue'); 13422 if (defined $XIDC 13423 || defined ($XIDC = property_ref('ID_Continue'))) 13424 { 13425 $perl_xidc += $XIDC->table('Y'); 13426 } 13427 else { 13428 # Similarly, we construct our own XIDC if necessary for early Unicode 13429 # versions. The definition is: 13430 # everything in XIDS 13431 # + Gc=Mn 13432 # + Gc=Mc 13433 # + Gc=Nd 13434 # + Gc=Pc 13435 # - Pattern_Syntax 13436 # - Pattern_White_Space 13437 # + stability extensions 13438 # - NFKC modifications 13439 # 13440 # The same thing applies to this as with XIDS for the PatSyn, PatWS, 13441 # and stability extensions. There is a somewhat different set of NFKC 13442 # mods to remove (and add in this case). The ones below make this 13443 # have identical code points as in the first release that defined it. 13444 $perl_xidc += $perl_xids 13445 + $gc->table('L') 13446 + $gc->table('Mn') 13447 + $gc->table('Mc') 13448 + $gc->table('Nd') 13449 + 0x00B7 13450 ; 13451 if (defined (my $pc = $gc->table('Pc'))) { 13452 $perl_xidc += $pc; 13453 } 13454 else { # 1.1.5 didn't have Pc, but these should have been in it 13455 $perl_xidc += 0xFF3F; 13456 $perl_xidc->add_range(0x203F, 0x2040); 13457 $perl_xidc->add_range(0xFE33, 0xFE34); 13458 $perl_xidc->add_range(0xFE4D, 0xFE4F); 13459 } 13460 13461 # Subtract the NFKC mods 13462 foreach my $range ( 0x037A, 13463 [ 0xFC5E, 0xFC63 ], 13464 [ 0xFDFA, 0xFE1F ], 13465 0xFE70, 13466 [ 0xFE72, 0xFE76 ], 13467 0xFE78, 13468 0xFE7A, 13469 0xFE7C, 13470 0xFE7E, 13471 ) { 13472 if (ref $range) { 13473 $perl_xidc->delete_range($range->[0], $range->[1]); 13474 } 13475 else { 13476 $perl_xidc->delete_range($range, $range); 13477 } 13478 } 13479 } 13480 13481 $perl_xidc &= $Word; 13482 13483 my $charname_begin = $perl->add_match_table('_Perl_Charname_Begin', 13484 Perl_Extension => 1, 13485 Fate => $INTERNAL_ONLY, 13486 Initialize => $gc->table('Letter') & $Alpha & $perl_xids, 13487 ); 13488 13489 my $charname_continue = $perl->add_match_table('_Perl_Charname_Continue', 13490 Perl_Extension => 1, 13491 Fate => $INTERNAL_ONLY, 13492 Initialize => $perl_xidc 13493 + 0x0020 # SPACE 13494 + 0x0028 # ( 13495 + 0x0029 # ) 13496 + 0x002D # - 13497 + 0x00A0 # NBSP 13498 ); 13499 13500 # These two tables are for matching \X, which is based on the 'extended' 13501 # grapheme cluster, which came in 5.1; create empty ones if not already 13502 # present. The straight 'grapheme cluster' (non-extended) is used prior 13503 # to 5.1, and differs from the extended (see 13504 # http://www.unicode.org/reports/tr29/) only by these two tables, so we 13505 # get the older definition automatically when they are empty. 13506 my $gcb = property_ref('Grapheme_Cluster_Break'); 13507 my $perl_prepend = $perl->add_match_table('_X_GCB_Prepend', 13508 Perl_Extension => 1, 13509 Fate => $INTERNAL_ONLY); 13510 if (defined (my $gcb_prepend = $gcb->table('Prepend'))) { 13511 $perl_prepend->set_equivalent_to($gcb_prepend, Related => 1); 13512 } 13513 else { 13514 push @tables_that_may_be_empty, $perl_prepend->complete_name; 13515 } 13516 13517 # All the tables with _X_ in their names are used in defining \X handling, 13518 # and are based on the Unicode GCB property. Basically, \X matches: 13519 # CR LF 13520 # | Prepend* Begin Extend* 13521 # | . 13522 # Begin is: ( Special_Begin | ! Control ) 13523 # Begin is also: ( Regular_Begin | Special_Begin ) 13524 # where Regular_Begin is defined as ( ! Control - Special_Begin ) 13525 # Special_Begin is: ( Regional-Indicator+ | Hangul-syllable ) 13526 # Extend is: ( Grapheme_Extend | Spacing_Mark ) 13527 # Control is: [ GCB_Control | CR | LF ] 13528 # Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) )) 13529 13530 foreach my $gcb_name (qw{ L V T LV LVT }) { 13531 13532 # The perl internal extension's name is the gcb table name prepended 13533 # with an '_X_' 13534 my $perl_table = $perl->add_match_table('_X_GCB_' . $gcb_name, 13535 Perl_Extension => 1, 13536 Fate => $INTERNAL_ONLY, 13537 Initialize => $gcb->table($gcb_name), 13538 ); 13539 # Version 1 had mostly different Hangul syllables that were removed 13540 # from later versions, so some of the tables may not apply. 13541 if ($v_version lt v2.0) { 13542 push @tables_that_may_be_empty, $perl_table->complete_name; 13543 } 13544 } 13545 13546 # More GCB. Populate a combined hangul syllables table 13547 my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V', 13548 Perl_Extension => 1, 13549 Fate => $INTERNAL_ONLY); 13550 $lv_lvt_v += $gcb->table('LV') + $gcb->table('LVT') + $gcb->table('V'); 13551 $lv_lvt_v->add_comment('For use in \X; matches: gcb=LV | gcb=LVT | gcb=V'); 13552 13553 my $ri = $perl->add_match_table('_X_RI', Perl_Extension => 1, 13554 Fate => $INTERNAL_ONLY); 13555 if ($v_version ge v6.2) { 13556 $ri += $gcb->table('RI'); 13557 } 13558 else { 13559 push @tables_that_may_be_empty, $ri->full_name; 13560 } 13561 13562 my $specials_begin = $perl->add_match_table('_X_Special_Begin_Start', 13563 Perl_Extension => 1, 13564 Fate => $INTERNAL_ONLY, 13565 Initialize => $lv_lvt_v 13566 + $gcb->table('L') 13567 + $gcb->table('T') 13568 + $ri 13569 ); 13570 $specials_begin->add_comment(join_lines( <<END 13571For use in \\X; matches first (perhaps only) character of potential 13572multi-character sequences that can begin an extended grapheme cluster. They 13573need special handling because of their complicated nature. 13574END 13575 )); 13576 my $regular_begin = $perl->add_match_table('_X_Regular_Begin', 13577 Perl_Extension => 1, 13578 Fate => $INTERNAL_ONLY, 13579 Initialize => ~ $gcb->table('Control') 13580 - $specials_begin 13581 - $gcb->table('CR') 13582 - $gcb->table('LF') 13583 ); 13584 $regular_begin->add_comment(join_lines( <<END 13585For use in \\X; matches first character of anything that can begin an extended 13586grapheme cluster, except those that require special handling. 13587END 13588 )); 13589 13590 my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1, 13591 Fate => $INTERNAL_ONLY, 13592 Initialize => $gcb->table('Extend') 13593 ); 13594 if (defined (my $sm = $gcb->table('SpacingMark'))) { 13595 $extend += $sm; 13596 } 13597 $extend->add_comment('For use in \X; matches: Extend | SpacingMark'); 13598 13599 # End of GCB \X processing 13600 13601 my @composition = ('Name', 'Unicode_1_Name', 'Name_Alias'); 13602 13603 if (@named_sequences) { 13604 push @composition, 'Named_Sequence'; 13605 foreach my $sequence (@named_sequences) { 13606 $perl_charname->add_anomalous_entry($sequence); 13607 } 13608 } 13609 13610 my $alias_sentence = ""; 13611 my %abbreviations; 13612 my $alias = property_ref('Name_Alias'); 13613 $perl_charname->set_proxy_for('Name_Alias'); 13614 13615 # Add each entry in Name_Alias to Perl_Charnames. Where these go with 13616 # respect to any existing entry depends on the entry type. Corrections go 13617 # before said entry, as they should be returned in preference over the 13618 # existing entry. (A correction to a correction should be later in the 13619 # Name_Alias table, so it will correctly precede the erroneous correction 13620 # in Perl_Charnames.) 13621 # 13622 # Abbreviations go after everything else, so they are saved temporarily in 13623 # a hash for later. 13624 # 13625 # Everything else is added added afterwards, which preserves the input 13626 # ordering 13627 13628 foreach my $range ($alias->ranges) { 13629 next if $range->value eq ""; 13630 my $code_point = $range->start; 13631 if ($code_point != $range->end) { 13632 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;"); 13633 } 13634 my ($value, $type) = split ': ', $range->value; 13635 my $replace_type; 13636 if ($type eq 'correction') { 13637 $replace_type = $MULTIPLE_BEFORE; 13638 } 13639 elsif ($type eq 'abbreviation') { 13640 13641 # Save for later 13642 $abbreviations{$value} = $code_point; 13643 next; 13644 } 13645 else { 13646 $replace_type = $MULTIPLE_AFTER; 13647 } 13648 13649 # Actually add; before or after current entry(ies) as determined 13650 # above. 13651 13652 $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type); 13653 } 13654 $alias_sentence = <<END; 13655The Name_Alias property adds duplicate code point entries that are 13656alternatives to the original name. If an addition is a corrected 13657name, it will be physically first in the table. The original (less correct, 13658but still valid) name will be next; then any alternatives, in no particular 13659order; and finally any abbreviations, again in no particular order. 13660END 13661 13662 # Now add the Unicode_1 names for the controls. The Unicode_1 names had 13663 # precedence before 6.1, so should be first in the file; the other names 13664 # have precedence starting in 6.1, 13665 my $before_or_after = ($v_version lt v6.1.0) 13666 ? $MULTIPLE_BEFORE 13667 : $MULTIPLE_AFTER; 13668 13669 foreach my $range (property_ref('Unicode_1_Name')->ranges) { 13670 my $code_point = $range->start; 13671 my $unicode_1_value = $range->value; 13672 next if $unicode_1_value eq ""; # Skip if name doesn't exist. 13673 13674 if ($code_point != $range->end) { 13675 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;"); 13676 } 13677 13678 # To handle EBCDIC, we don't hard code in the code points of the 13679 # controls; instead realizing that all of them are below 256. 13680 last if $code_point > 255; 13681 13682 # We only add in the controls. 13683 next if $gc->value_of($code_point) ne 'Cc'; 13684 13685 # We reject this Unicode1 name for later Perls, as it is used for 13686 # another code point 13687 next if $unicode_1_value eq 'BELL' && $^V ge v5.17.0; 13688 13689 # This won't add an exact duplicate. 13690 $perl_charname->add_duplicate($code_point, $unicode_1_value, 13691 Replace => $before_or_after); 13692 } 13693 13694 # But in this version only, the ALERT has precedence over BELL, the 13695 # Unicode_1_Name that would otherwise have precedence. 13696 if ($v_version eq v6.0.0) { 13697 $perl_charname->add_duplicate(7, 'ALERT', Replace => $MULTIPLE_BEFORE); 13698 } 13699 13700 # Now that have everything added, add in abbreviations after 13701 # everything else. Sort so results don't change between runs of this 13702 # program 13703 foreach my $value (sort keys %abbreviations) { 13704 $perl_charname->add_duplicate($abbreviations{$value}, $value, 13705 Replace => $MULTIPLE_AFTER); 13706 } 13707 13708 my $comment; 13709 if (@composition <= 2) { # Always at least 2 13710 $comment = join " and ", @composition; 13711 } 13712 else { 13713 $comment = join ", ", @composition[0 .. scalar @composition - 2]; 13714 $comment .= ", and $composition[-1]"; 13715 } 13716 13717 $perl_charname->add_comment(join_lines( <<END 13718This file is for charnames.pm. It is the union of the $comment properties. 13719Unicode_1_Name entries are used only for nameless code points in the Name 13720property. 13721$alias_sentence 13722This file doesn't include the algorithmically determinable names. For those, 13723use 'unicore/Name.pm' 13724END 13725 )); 13726 property_ref('Name')->add_comment(join_lines( <<END 13727This file doesn't include the algorithmically determinable names. For those, 13728use 'unicore/Name.pm' 13729END 13730 )); 13731 13732 # Construct the Present_In property from the Age property. 13733 if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) { 13734 my $default_map = $age->default_map; 13735 my $in = Property->new('In', 13736 Default_Map => $default_map, 13737 Full_Name => "Present_In", 13738 Perl_Extension => 1, 13739 Type => $ENUM, 13740 Initialize => $age, 13741 ); 13742 $in->add_comment(join_lines(<<END 13743THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE. The values in this file are the 13744same as for $age, and not for what $in really means. This is because anything 13745defined in a given release should have multiple values: that release and all 13746higher ones. But only one value per code point can be represented in a table 13747like this. 13748END 13749 )); 13750 13751 # The Age tables are named like 1.5, 2.0, 2.1, .... Sort so that the 13752 # lowest numbered (earliest) come first, with the non-numeric one 13753 # last. 13754 my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/) 13755 ? 1 13756 : ($b->name !~ /^[\d.]*$/) 13757 ? -1 13758 : $a->name <=> $b->name 13759 } $age->tables; 13760 13761 # The Present_In property is the cumulative age properties. The first 13762 # one hence is identical to the first age one. 13763 my $previous_in = $in->add_match_table($first_age->name); 13764 $previous_in->set_equivalent_to($first_age, Related => 1); 13765 13766 my $description_start = "Code point's usage introduced in version "; 13767 $first_age->add_description($description_start . $first_age->name); 13768 13769 # To construct the accumulated values, for each of the age tables 13770 # starting with the 2nd earliest, merge the earliest with it, to get 13771 # all those code points existing in the 2nd earliest. Repeat merging 13772 # the new 2nd earliest with the 3rd earliest to get all those existing 13773 # in the 3rd earliest, and so on. 13774 foreach my $current_age (@rest_ages) { 13775 next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric 13776 13777 my $current_in = $in->add_match_table( 13778 $current_age->name, 13779 Initialize => $current_age + $previous_in, 13780 Description => $description_start 13781 . $current_age->name 13782 . ' or earlier', 13783 ); 13784 $previous_in = $current_in; 13785 13786 # Add clarifying material for the corresponding age file. This is 13787 # in part because of the confusing and contradictory information 13788 # given in the Standard's documentation itself, as of 5.2. 13789 $current_age->add_description( 13790 "Code point's usage was introduced in version " 13791 . $current_age->name); 13792 $current_age->add_note("See also $in"); 13793 13794 } 13795 13796 # And finally the code points whose usages have yet to be decided are 13797 # the same in both properties. Note that permanently unassigned code 13798 # points actually have their usage assigned (as being permanently 13799 # unassigned), so that these tables are not the same as gc=cn. 13800 my $unassigned = $in->add_match_table($default_map); 13801 my $age_default = $age->table($default_map); 13802 $age_default->add_description(<<END 13803Code point's usage has not been assigned in any Unicode release thus far. 13804END 13805 ); 13806 $unassigned->set_equivalent_to($age_default, Related => 1); 13807 } 13808 13809 # See L<perlfunc/quotemeta> 13810 my $quotemeta = $perl->add_match_table('_Perl_Quotemeta', 13811 Perl_Extension => 1, 13812 Fate => $INTERNAL_ONLY, 13813 13814 # Initialize to what's common in 13815 # all Unicode releases. 13816 Initialize => 13817 $Space 13818 + $gc->table('Control') 13819 ); 13820 13821 # In early releases without the proper Unicode properties, just set to \W. 13822 if (! defined (my $patsyn = property_ref('Pattern_Syntax')) 13823 || ! defined (my $patws = property_ref('Pattern_White_Space')) 13824 || ! defined (my $di = property_ref('Default_Ignorable_Code_Point'))) 13825 { 13826 $quotemeta += ~ $Word; 13827 } 13828 else { 13829 $quotemeta += $patsyn->table('Y') 13830 + $patws->table('Y') 13831 + $di->table('Y') 13832 + ((~ $Word) & $ASCII); 13833 } 13834 13835 # Finished creating all the perl properties. All non-internal non-string 13836 # ones have a synonym of 'Is_' prefixed. (Internal properties begin with 13837 # an underscore.) These do not get a separate entry in the pod file 13838 foreach my $table ($perl->tables) { 13839 foreach my $alias ($table->aliases) { 13840 next if $alias->name =~ /^_/; 13841 $table->add_alias('Is_' . $alias->name, 13842 Re_Pod_Entry => 0, 13843 UCD => 0, 13844 Status => $alias->status, 13845 OK_as_Filename => 0); 13846 } 13847 } 13848 13849 # Here done with all the basic stuff. Ready to populate the information 13850 # about each character if annotating them. 13851 if ($annotate) { 13852 13853 # See comments at its declaration 13854 $annotate_ranges = Range_Map->new; 13855 13856 # This separates out the non-characters from the other unassigneds, so 13857 # can give different annotations for each. 13858 $unassigned_sans_noncharacters = Range_List->new( 13859 Initialize => $gc->table('Unassigned')); 13860 if (defined (my $nonchars = property_ref('Noncharacter_Code_Point'))) { 13861 $unassigned_sans_noncharacters &= $nonchars->table('N'); 13862 } 13863 13864 for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT; $i++ ) { 13865 $i = populate_char_info($i); # Note sets $i so may cause skips 13866 } 13867 } 13868 13869 return; 13870} 13871 13872sub add_perl_synonyms() { 13873 # A number of Unicode tables have Perl synonyms that are expressed in 13874 # the single-form, \p{name}. These are: 13875 # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and 13876 # \p{Is_Name} as synonyms 13877 # \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms 13878 # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms 13879 # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no 13880 # conflict, \p{Value} and \p{Is_Value} as well 13881 # 13882 # This routine generates these synonyms, warning of any unexpected 13883 # conflicts. 13884 13885 # Construct the list of tables to get synonyms for. Start with all the 13886 # binary and the General_Category ones. 13887 my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY } 13888 property_ref('*'); 13889 push @tables, $gc->tables; 13890 13891 # If the version of Unicode includes the Script property, add its tables 13892 push @tables, $script->tables if defined $script; 13893 13894 # The Block tables are kept separate because they are treated differently. 13895 # And the earliest versions of Unicode didn't include them, so add only if 13896 # there are some. 13897 my @blocks; 13898 push @blocks, $block->tables if defined $block; 13899 13900 # Here, have the lists of tables constructed. Process blocks last so that 13901 # if there are name collisions with them, blocks have lowest priority. 13902 # Should there ever be other collisions, manual intervention would be 13903 # required. See the comments at the beginning of the program for a 13904 # possible way to handle those semi-automatically. 13905 foreach my $table (@tables, @blocks) { 13906 13907 # For non-binary properties, the synonym is just the name of the 13908 # table, like Greek, but for binary properties the synonym is the name 13909 # of the property, and means the code points in its 'Y' table. 13910 my $nominal = $table; 13911 my $nominal_property = $nominal->property; 13912 my $actual; 13913 if (! $nominal->isa('Property')) { 13914 $actual = $table; 13915 } 13916 else { 13917 13918 # Here is a binary property. Use the 'Y' table. Verify that is 13919 # there 13920 my $yes = $nominal->table('Y'); 13921 unless (defined $yes) { # Must be defined, but is permissible to 13922 # be empty. 13923 Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping."); 13924 next; 13925 } 13926 $actual = $yes; 13927 } 13928 13929 foreach my $alias ($nominal->aliases) { 13930 13931 # Attempt to create a table in the perl directory for the 13932 # candidate table, using whatever aliases in it that don't 13933 # conflict. Also add non-conflicting aliases for all these 13934 # prefixed by 'Is_' (and/or 'In_' for Block property tables) 13935 PREFIX: 13936 foreach my $prefix ("", 'Is_', 'In_') { 13937 13938 # Only Block properties can have added 'In_' aliases. 13939 next if $prefix eq 'In_' and $nominal_property != $block; 13940 13941 my $proposed_name = $prefix . $alias->name; 13942 13943 # No Is_Is, In_In, nor combinations thereof 13944 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x; 13945 next if $proposed_name =~ /^ I [ns] _I [ns] _/x; 13946 13947 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace; 13948 13949 # Get a reference to any existing table in the perl 13950 # directory with the desired name. 13951 my $pre_existing = $perl->table($proposed_name); 13952 13953 if (! defined $pre_existing) { 13954 13955 # No name collision, so ok to add the perl synonym. 13956 13957 my $make_re_pod_entry; 13958 my $ok_as_filename; 13959 my $status = $alias->status; 13960 if ($nominal_property == $block) { 13961 13962 # For block properties, the 'In' form is preferred for 13963 # external use; the pod file contains wild cards for 13964 # this and the 'Is' form so no entries for those; and 13965 # we don't want people using the name without the 13966 # 'In', so discourage that. 13967 if ($prefix eq "") { 13968 $make_re_pod_entry = 1; 13969 $status = $status || $DISCOURAGED; 13970 $ok_as_filename = 0; 13971 } 13972 elsif ($prefix eq 'In_') { 13973 $make_re_pod_entry = 0; 13974 $status = $status || $NORMAL; 13975 $ok_as_filename = 1; 13976 } 13977 else { 13978 $make_re_pod_entry = 0; 13979 $status = $status || $DISCOURAGED; 13980 $ok_as_filename = 0; 13981 } 13982 } 13983 elsif ($prefix ne "") { 13984 13985 # The 'Is' prefix is handled in the pod by a wild 13986 # card, and we won't use it for an external name 13987 $make_re_pod_entry = 0; 13988 $status = $status || $NORMAL; 13989 $ok_as_filename = 0; 13990 } 13991 else { 13992 13993 # Here, is an empty prefix, non block. This gets its 13994 # own pod entry and can be used for an external name. 13995 $make_re_pod_entry = 1; 13996 $status = $status || $NORMAL; 13997 $ok_as_filename = 1; 13998 } 13999 14000 # Here, there isn't a perl pre-existing table with the 14001 # name. Look through the list of equivalents of this 14002 # table to see if one is a perl table. 14003 foreach my $equivalent ($actual->leader->equivalents) { 14004 next if $equivalent->property != $perl; 14005 14006 # Here, have found a table for $perl. Add this alias 14007 # to it, and are done with this prefix. 14008 $equivalent->add_alias($proposed_name, 14009 Re_Pod_Entry => $make_re_pod_entry, 14010 14011 # Currently don't output these in the 14012 # ucd pod, as are strongly discouraged 14013 # from being used 14014 UCD => 0, 14015 14016 Status => $status, 14017 OK_as_Filename => $ok_as_filename); 14018 trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace; 14019 next PREFIX; 14020 } 14021 14022 # Here, $perl doesn't already have a table that is a 14023 # synonym for this property, add one. 14024 my $added_table = $perl->add_match_table($proposed_name, 14025 Re_Pod_Entry => $make_re_pod_entry, 14026 14027 # See UCD comment just above 14028 UCD => 0, 14029 14030 Status => $status, 14031 OK_as_Filename => $ok_as_filename); 14032 # And it will be related to the actual table, since it is 14033 # based on it. 14034 $added_table->set_equivalent_to($actual, Related => 1); 14035 trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace; 14036 next; 14037 } # End of no pre-existing. 14038 14039 # Here, there is a pre-existing table that has the proposed 14040 # name. We could be in trouble, but not if this is just a 14041 # synonym for another table that we have already made a child 14042 # of the pre-existing one. 14043 if ($pre_existing->is_set_equivalent_to($actual)) { 14044 trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace; 14045 $pre_existing->add_alias($proposed_name); 14046 next; 14047 } 14048 14049 # Here, there is a name collision, but it still could be ok if 14050 # the tables match the identical set of code points, in which 14051 # case, we can combine the names. Compare each table's code 14052 # point list to see if they are identical. 14053 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace; 14054 if ($pre_existing->matches_identically_to($actual)) { 14055 14056 # Here, they do match identically. Not a real conflict. 14057 # Make the perl version a child of the Unicode one, except 14058 # in the non-obvious case of where the perl name is 14059 # already a synonym of another Unicode property. (This is 14060 # excluded by the test for it being its own parent.) The 14061 # reason for this exclusion is that then the two Unicode 14062 # properties become related; and we don't really know if 14063 # they are or not. We generate documentation based on 14064 # relatedness, and this would be misleading. Code 14065 # later executed in the process will cause the tables to 14066 # be represented by a single file anyway, without making 14067 # it look in the pod like they are necessarily related. 14068 if ($pre_existing->parent == $pre_existing 14069 && ($pre_existing->property == $perl 14070 || $actual->property == $perl)) 14071 { 14072 trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace; 14073 $pre_existing->set_equivalent_to($actual, Related => 1); 14074 } 14075 elsif (main::DEBUG && $to_trace) { 14076 trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases"; 14077 trace $pre_existing->parent; 14078 } 14079 next PREFIX; 14080 } 14081 14082 # Here they didn't match identically, there is a real conflict 14083 # between our new name and a pre-existing property. 14084 $actual->add_conflicting($proposed_name, 'p', $pre_existing); 14085 $pre_existing->add_conflicting($nominal->full_name, 14086 'p', 14087 $actual); 14088 14089 # Don't output a warning for aliases for the block 14090 # properties (unless they start with 'In_') as it is 14091 # expected that there will be conflicts and the block 14092 # form loses. 14093 if ($verbosity >= $NORMAL_VERBOSITY 14094 && ($actual->property != $block || $prefix eq 'In_')) 14095 { 14096 print simple_fold(join_lines(<<END 14097There is already an alias named $proposed_name (from $pre_existing), 14098so not creating this alias for $actual 14099END 14100 ), "", 4); 14101 } 14102 14103 # Keep track for documentation purposes. 14104 $has_In_conflicts++ if $prefix eq 'In_'; 14105 $has_Is_conflicts++ if $prefix eq 'Is_'; 14106 } 14107 } 14108 } 14109 14110 # There are some properties which have No and Yes (and N and Y) as 14111 # property values, but aren't binary, and could possibly be confused with 14112 # binary ones. So create caveats for them. There are tables that are 14113 # named 'No', and tables that are named 'N', but confusion is not likely 14114 # unless they are the same table. For example, N meaning Number or 14115 # Neutral is not likely to cause confusion, so don't add caveats to things 14116 # like them. 14117 foreach my $property (grep { $_->type != $BINARY 14118 && $_->type != $FORCED_BINARY } 14119 property_ref('*')) 14120 { 14121 my $yes = $property->table('Yes'); 14122 if (defined $yes) { 14123 my $y = $property->table('Y'); 14124 if (defined $y && $yes == $y) { 14125 foreach my $alias ($property->aliases) { 14126 $yes->add_conflicting($alias->name); 14127 } 14128 } 14129 } 14130 my $no = $property->table('No'); 14131 if (defined $no) { 14132 my $n = $property->table('N'); 14133 if (defined $n && $no == $n) { 14134 foreach my $alias ($property->aliases) { 14135 $no->add_conflicting($alias->name, 'P'); 14136 } 14137 } 14138 } 14139 } 14140 14141 return; 14142} 14143 14144sub register_file_for_name($$$) { 14145 # Given info about a table and a datafile that it should be associated 14146 # with, register that association 14147 14148 my $table = shift; 14149 my $directory_ref = shift; # Array of the directory path for the file 14150 my $file = shift; # The file name in the final directory. 14151 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 14152 14153 trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace; 14154 14155 if ($table->isa('Property')) { 14156 $table->set_file_path(@$directory_ref, $file); 14157 push @map_properties, $table; 14158 14159 # No swash means don't do the rest of this. 14160 return if $table->fate != $ORDINARY; 14161 14162 # Get the path to the file 14163 my @path = $table->file_path; 14164 14165 # Use just the file name if no subdirectory. 14166 shift @path if $path[0] eq File::Spec->curdir(); 14167 14168 my $file = join '/', @path; 14169 14170 # Create a hash entry for utf8_heavy to get the file that stores this 14171 # property's map table 14172 foreach my $alias ($table->aliases) { 14173 my $name = $alias->name; 14174 $loose_property_to_file_of{standardize($name)} = $file; 14175 } 14176 14177 # And a way for utf8_heavy to find the proper key in the SwashInfo 14178 # hash for this property. 14179 $file_to_swash_name{$file} = "To" . $table->swash_name; 14180 return; 14181 } 14182 14183 # Do all of the work for all equivalent tables when called with the leader 14184 # table, so skip if isn't the leader. 14185 return if $table->leader != $table; 14186 14187 # If this is a complement of another file, use that other file instead, 14188 # with a ! prepended to it. 14189 my $complement; 14190 if (($complement = $table->complement) != 0) { 14191 my @directories = $complement->file_path; 14192 14193 # This assumes that the 0th element is something like 'lib', 14194 # the 1th element the property name (in its own directory), like 14195 # 'AHex', and the 2th element the file like 'Y' which will have a .pl 14196 # appended to it later. 14197 $directories[1] =~ s/^/!/; 14198 $file = pop @directories; 14199 $directory_ref =\@directories; 14200 } 14201 14202 # Join all the file path components together, using slashes. 14203 my $full_filename = join('/', @$directory_ref, $file); 14204 14205 # All go in the same subdirectory of unicore 14206 if ($directory_ref->[0] ne $matches_directory) { 14207 Carp::my_carp("Unexpected directory in " 14208 . join('/', @{$directory_ref}, $file)); 14209 } 14210 14211 # For this table and all its equivalents ... 14212 foreach my $table ($table, $table->equivalents) { 14213 14214 # Associate it with its file internally. Don't include the 14215 # $matches_directory first component 14216 $table->set_file_path(@$directory_ref, $file); 14217 14218 # No swash means don't do the rest of this. 14219 next if $table->isa('Map_Table') && $table->fate != $ORDINARY; 14220 14221 my $sub_filename = join('/', $directory_ref->[1, -1], $file); 14222 14223 my $property = $table->property; 14224 my $property_name = ($property == $perl) 14225 ? "" # 'perl' is never explicitly stated 14226 : standardize($property->name) . '='; 14227 14228 my $is_default = 0; # Is this table the default one for the property? 14229 14230 # To calculate $is_default, we find if this table is the same as the 14231 # default one for the property. But this is complicated by the 14232 # possibility that there is a master table for this one, and the 14233 # information is stored there instead of here. 14234 my $parent = $table->parent; 14235 my $leader_prop = $parent->property; 14236 my $default_map = $leader_prop->default_map; 14237 if (defined $default_map) { 14238 my $default_table = $leader_prop->table($default_map); 14239 $is_default = 1 if defined $default_table && $parent == $default_table; 14240 } 14241 14242 # Calculate the loose name for this table. Mostly it's just its name, 14243 # standardized. But in the case of Perl tables that are single-form 14244 # equivalents to Unicode properties, it is the latter's name. 14245 my $loose_table_name = 14246 ($property != $perl || $leader_prop == $perl) 14247 ? standardize($table->name) 14248 : standardize($parent->name); 14249 14250 my $deprecated = ($table->status eq $DEPRECATED) 14251 ? $table->status_info 14252 : ""; 14253 my $caseless_equivalent = $table->caseless_equivalent; 14254 14255 # And for each of the table's aliases... This inner loop eventually 14256 # goes through all aliases in the UCD that we generate regex match 14257 # files for 14258 foreach my $alias ($table->aliases) { 14259 my $standard = utf8_heavy_name($table, $alias); 14260 14261 # Generate an entry in either the loose or strict hashes, which 14262 # will translate the property and alias names combination into the 14263 # file where the table for them is stored. 14264 if ($alias->loose_match) { 14265 if (exists $loose_to_file_of{$standard}) { 14266 Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'."); 14267 } 14268 else { 14269 $loose_to_file_of{$standard} = $sub_filename; 14270 } 14271 } 14272 else { 14273 if (exists $stricter_to_file_of{$standard}) { 14274 Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'."); 14275 } 14276 else { 14277 $stricter_to_file_of{$standard} = $sub_filename; 14278 14279 # Tightly coupled with how utf8_heavy.pl works, for a 14280 # floating point number that is a whole number, get rid of 14281 # the trailing decimal point and 0's, so that utf8_heavy 14282 # will work. Also note that this assumes that such a 14283 # number is matched strictly; so if that were to change, 14284 # this would be wrong. 14285 if ((my $integer_name = $alias->name) 14286 =~ s/^ ( -? \d+ ) \.0+ $ /$1/x) 14287 { 14288 $stricter_to_file_of{$property_name . $integer_name} 14289 = $sub_filename; 14290 } 14291 } 14292 } 14293 14294 # For Unicode::UCD, create a mapping of the prop=value to the 14295 # canonical =value for that property. 14296 if ($standard =~ /=/) { 14297 14298 # This could happen if a strict name mapped into an existing 14299 # loose name. In that event, the strict names would have to 14300 # be moved to a new hash. 14301 if (exists($loose_to_standard_value{$standard})) { 14302 Carp::my_carp_bug("'$standard' conflicts with a pre-existing use. Bad News. Continuing anyway"); 14303 } 14304 $loose_to_standard_value{$standard} = $loose_table_name; 14305 } 14306 14307 # Keep a list of the deprecated properties and their filenames 14308 if ($deprecated && $complement == 0) { 14309 $utf8::why_deprecated{$sub_filename} = $deprecated; 14310 } 14311 14312 # And a substitute table, if any, for case-insensitive matching 14313 if ($caseless_equivalent != 0) { 14314 $caseless_equivalent_to{$standard} = $caseless_equivalent; 14315 } 14316 14317 # Add to defaults list if the table this alias belongs to is the 14318 # default one 14319 $loose_defaults{$standard} = 1 if $is_default; 14320 } 14321 } 14322 14323 return; 14324} 14325 14326{ # Closure 14327 my %base_names; # Names already used for avoiding DOS 8.3 filesystem 14328 # conflicts 14329 my %full_dir_name_of; # Full length names of directories used. 14330 14331 sub construct_filename($$$) { 14332 # Return a file name for a table, based on the table name, but perhaps 14333 # changed to get rid of non-portable characters in it, and to make 14334 # sure that it is unique on a file system that allows the names before 14335 # any period to be at most 8 characters (DOS). While we're at it 14336 # check and complain if there are any directory conflicts. 14337 14338 my $name = shift; # The name to start with 14339 my $mutable = shift; # Boolean: can it be changed? If no, but 14340 # yet it must be to work properly, a warning 14341 # is given 14342 my $directories_ref = shift; # A reference to an array containing the 14343 # path to the file, with each element one path 14344 # component. This is used because the same 14345 # name can be used in different directories. 14346 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 14347 14348 my $warn = ! defined wantarray; # If true, then if the name is 14349 # changed, a warning is issued as well. 14350 14351 if (! defined $name) { 14352 Carp::my_carp("Undefined name in directory " 14353 . File::Spec->join(@$directories_ref) 14354 . ". '_' used"); 14355 return '_'; 14356 } 14357 14358 # Make sure that no directory names conflict with each other. Look at 14359 # each directory in the input file's path. If it is already in use, 14360 # assume it is correct, and is merely being re-used, but if we 14361 # truncate it to 8 characters, and find that there are two directories 14362 # that are the same for the first 8 characters, but differ after that, 14363 # then that is a problem. 14364 foreach my $directory (@$directories_ref) { 14365 my $short_dir = substr($directory, 0, 8); 14366 if (defined $full_dir_name_of{$short_dir}) { 14367 next if $full_dir_name_of{$short_dir} eq $directory; 14368 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}. Bad News. Continuing anyway"); 14369 } 14370 else { 14371 $full_dir_name_of{$short_dir} = $directory; 14372 } 14373 } 14374 14375 my $path = join '/', @$directories_ref; 14376 $path .= '/' if $path; 14377 14378 # Remove interior underscores. 14379 (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg; 14380 14381 # Change any non-word character into an underscore, and truncate to 8. 14382 $filename =~ s/\W+/_/g; # eg., "L&" -> "L_" 14383 substr($filename, 8) = "" if length($filename) > 8; 14384 14385 # Make sure the basename doesn't conflict with something we 14386 # might have already written. If we have, say, 14387 # InGreekExtended1 14388 # InGreekExtended2 14389 # they become 14390 # InGreekE 14391 # InGreek2 14392 my $warned = 0; 14393 while (my $num = $base_names{$path}{lc $filename}++) { 14394 $num++; # so basenames with numbers start with '2', which 14395 # just looks more natural. 14396 14397 # Want to append $num, but if it'll make the basename longer 14398 # than 8 characters, pre-truncate $filename so that the result 14399 # is acceptable. 14400 my $delta = length($filename) + length($num) - 8; 14401 if ($delta > 0) { 14402 substr($filename, -$delta) = $num; 14403 } 14404 else { 14405 $filename .= $num; 14406 } 14407 if ($warn && ! $warned) { 14408 $warned = 1; 14409 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway."); 14410 } 14411 } 14412 14413 return $filename if $mutable; 14414 14415 # If not changeable, must return the input name, but warn if needed to 14416 # change it beyond shortening it. 14417 if ($name ne $filename 14418 && substr($name, 0, length($filename)) ne $filename) { 14419 Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway."); 14420 } 14421 return $name; 14422 } 14423} 14424 14425# The pod file contains a very large table. Many of the lines in that table 14426# would exceed a typical output window's size, and so need to be wrapped with 14427# a hanging indent to make them look good. The pod language is really 14428# insufficient here. There is no general construct to do that in pod, so it 14429# is done here by beginning each such line with a space to cause the result to 14430# be output without formatting, and doing all the formatting here. This leads 14431# to the result that if the eventual display window is too narrow it won't 14432# look good, and if the window is too wide, no advantage is taken of that 14433# extra width. A further complication is that the output may be indented by 14434# the formatter so that there is less space than expected. What I (khw) have 14435# done is to assume that that indent is a particular number of spaces based on 14436# what it is in my Linux system; people can always resize their windows if 14437# necessary, but this is obviously less than desirable, but the best that can 14438# be expected. 14439my $automatic_pod_indent = 8; 14440 14441# Try to format so that uses fewest lines, but few long left column entries 14442# slide into the right column. An experiment on 5.1 data yielded the 14443# following percentages that didn't cut into the other side along with the 14444# associated first-column widths 14445# 69% = 24 14446# 80% not too bad except for a few blocks 14447# 90% = 33; # , cuts 353/3053 lines from 37 = 12% 14448# 95% = 37; 14449my $indent_info_column = 27; # 75% of lines didn't have overlap 14450 14451my $FILLER = 3; # Length of initial boiler-plate columns in a pod line 14452 # The 3 is because of: 14453 # 1 for the leading space to tell the pod formatter to 14454 # output as-is 14455 # 1 for the flag 14456 # 1 for the space between the flag and the main data 14457 14458sub format_pod_line ($$$;$$) { 14459 # Take a pod line and return it, formatted properly 14460 14461 my $first_column_width = shift; 14462 my $entry = shift; # Contents of left column 14463 my $info = shift; # Contents of right column 14464 14465 my $status = shift || ""; # Any flag 14466 14467 my $loose_match = shift; # Boolean. 14468 $loose_match = 1 unless defined $loose_match; 14469 14470 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 14471 14472 my $flags = ""; 14473 $flags .= $STRICTER if ! $loose_match; 14474 14475 $flags .= $status if $status; 14476 14477 # There is a blank in the left column to cause the pod formatter to 14478 # output the line as-is. 14479 return sprintf " %-*s%-*s %s\n", 14480 # The first * in the format is replaced by this, the -1 is 14481 # to account for the leading blank. There isn't a 14482 # hard-coded blank after this to separate the flags from 14483 # the rest of the line, so that in the unlikely event that 14484 # multiple flags are shown on the same line, they both 14485 # will get displayed at the expense of that separation, 14486 # but since they are left justified, a blank will be 14487 # inserted in the normal case. 14488 $FILLER - 1, 14489 $flags, 14490 14491 # The other * in the format is replaced by this number to 14492 # cause the first main column to right fill with blanks. 14493 # The -1 is for the guaranteed blank following it. 14494 $first_column_width - $FILLER - 1, 14495 $entry, 14496 $info; 14497} 14498 14499my @zero_match_tables; # List of tables that have no matches in this release 14500 14501sub make_re_pod_entries($) { 14502 # This generates the entries for the pod file for a given table. 14503 # Also done at this time are any children tables. The output looks like: 14504 # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178) 14505 14506 my $input_table = shift; # Table the entry is for 14507 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 14508 14509 # Generate parent and all its children at the same time. 14510 return if $input_table->parent != $input_table; 14511 14512 my $property = $input_table->property; 14513 my $type = $property->type; 14514 my $full_name = $property->full_name; 14515 14516 my $count = $input_table->count; 14517 my $string_count = clarify_number($count); 14518 my $status = $input_table->status; 14519 my $status_info = $input_table->status_info; 14520 my $caseless_equivalent = $input_table->caseless_equivalent; 14521 14522 # Don't mention a placeholder equivalent as it isn't to be listed in the 14523 # pod 14524 $caseless_equivalent = 0 if $caseless_equivalent != 0 14525 && $caseless_equivalent->fate > $ORDINARY; 14526 14527 my $entry_for_first_table; # The entry for the first table output. 14528 # Almost certainly, it is the parent. 14529 14530 # For each related table (including itself), we will generate a pod entry 14531 # for each name each table goes by 14532 foreach my $table ($input_table, $input_table->children) { 14533 14534 # utf8_heavy.pl cannot deal with null string property values, so skip 14535 # any tables that have no non-null names. 14536 next if ! grep { $_->name ne "" } $table->aliases; 14537 14538 # First, gather all the info that applies to this table as a whole. 14539 14540 push @zero_match_tables, $table if $count == 0 14541 # Don't mention special tables 14542 # as being zero length 14543 && $table->fate == $ORDINARY; 14544 14545 my $table_property = $table->property; 14546 14547 # The short name has all the underscores removed, while the full name 14548 # retains them. Later, we decide whether to output a short synonym 14549 # for the full one, we need to compare apples to apples, so we use the 14550 # short name's length including underscores. 14551 my $table_property_short_name_length; 14552 my $table_property_short_name 14553 = $table_property->short_name(\$table_property_short_name_length); 14554 my $table_property_full_name = $table_property->full_name; 14555 14556 # Get how much savings there is in the short name over the full one 14557 # (delta will always be <= 0) 14558 my $table_property_short_delta = $table_property_short_name_length 14559 - length($table_property_full_name); 14560 my @table_description = $table->description; 14561 my @table_note = $table->note; 14562 14563 # Generate an entry for each alias in this table. 14564 my $entry_for_first_alias; # saves the first one encountered. 14565 foreach my $alias ($table->aliases) { 14566 14567 # Skip if not to go in pod. 14568 next unless $alias->make_re_pod_entry; 14569 14570 # Start gathering all the components for the entry 14571 my $name = $alias->name; 14572 14573 # Skip if name is empty, as can't be accessed by regexes. 14574 next if $name eq ""; 14575 14576 my $entry; # Holds the left column, may include extras 14577 my $entry_ref; # To refer to the left column's contents from 14578 # another entry; has no extras 14579 14580 # First the left column of the pod entry. Tables for the $perl 14581 # property always use the single form. 14582 if ($table_property == $perl) { 14583 $entry = "\\p{$name}"; 14584 $entry_ref = "\\p{$name}"; 14585 } 14586 else { # Compound form. 14587 14588 # Only generate one entry for all the aliases that mean true 14589 # or false in binary properties. Append a '*' to indicate 14590 # some are missing. (The heading comment notes this.) 14591 my $rhs; 14592 if ($type == $BINARY) { 14593 next if $name ne 'N' && $name ne 'Y'; 14594 $rhs = "$name*"; 14595 } 14596 elsif ($type != $FORCED_BINARY) { 14597 $rhs = $name; 14598 } 14599 else { 14600 14601 # Forced binary properties require special handling. It 14602 # has two sets of tables, one set is true/false; and the 14603 # other set is everything else. Entries are generated for 14604 # each set. Use the Bidi_Mirrored property (which appears 14605 # in all Unicode versions) to get a list of the aliases 14606 # for the true/false tables. Of these, only output the N 14607 # and Y ones, the same as, a regular binary property. And 14608 # output all the rest, same as a non-binary property. 14609 my $bm = property_ref("Bidi_Mirrored"); 14610 if ($name eq 'N' || $name eq 'Y') { 14611 $rhs = "$name*"; 14612 } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases, 14613 $bm->table("N")->aliases) 14614 { 14615 next; 14616 } 14617 else { 14618 $rhs = $name; 14619 } 14620 } 14621 14622 # Colon-space is used to give a little more space to be easier 14623 # to read; 14624 $entry = "\\p{" 14625 . $table_property_full_name 14626 . ": $rhs}"; 14627 14628 # But for the reference to this entry, which will go in the 14629 # right column, where space is at a premium, use equals 14630 # without a space 14631 $entry_ref = "\\p{" . $table_property_full_name . "=$name}"; 14632 } 14633 14634 # Then the right (info) column. This is stored as components of 14635 # an array for the moment, then joined into a string later. For 14636 # non-internal only properties, begin the info with the entry for 14637 # the first table we encountered (if any), as things are ordered 14638 # so that that one is the most descriptive. This leads to the 14639 # info column of an entry being a more descriptive version of the 14640 # name column 14641 my @info; 14642 if ($name =~ /^_/) { 14643 push @info, 14644 '(For internal use by Perl, not necessarily stable)'; 14645 } 14646 elsif ($entry_for_first_alias) { 14647 push @info, $entry_for_first_alias; 14648 } 14649 14650 # If this entry is equivalent to another, add that to the info, 14651 # using the first such table we encountered 14652 if ($entry_for_first_table) { 14653 if (@info) { 14654 push @info, "(= $entry_for_first_table)"; 14655 } 14656 else { 14657 push @info, $entry_for_first_table; 14658 } 14659 } 14660 14661 # If the name is a large integer, add an equivalent with an 14662 # exponent for better readability 14663 if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) { 14664 push @info, sprintf "(= %.1e)", $name 14665 } 14666 14667 my $parenthesized = ""; 14668 if (! $entry_for_first_alias) { 14669 14670 # This is the first alias for the current table. The alias 14671 # array is ordered so that this is the fullest, most 14672 # descriptive alias, so it gets the fullest info. The other 14673 # aliases are mostly merely pointers to this one, using the 14674 # information already added above. 14675 14676 # Display any status message, but only on the parent table 14677 if ($status && ! $entry_for_first_table) { 14678 push @info, $status_info; 14679 } 14680 14681 # Put out any descriptive info 14682 if (@table_description || @table_note) { 14683 push @info, join "; ", @table_description, @table_note; 14684 } 14685 14686 # Look to see if there is a shorter name we can point people 14687 # at 14688 my $standard_name = standardize($name); 14689 my $short_name; 14690 my $proposed_short = $table->short_name; 14691 if (defined $proposed_short) { 14692 my $standard_short = standardize($proposed_short); 14693 14694 # If the short name is shorter than the standard one, or 14695 # even it it's not, but the combination of it and its 14696 # short property name (as in \p{prop=short} ($perl doesn't 14697 # have this form)) saves at least two characters, then, 14698 # cause it to be listed as a shorter synonym. 14699 if (length $standard_short < length $standard_name 14700 || ($table_property != $perl 14701 && (length($standard_short) 14702 - length($standard_name) 14703 + $table_property_short_delta) # (<= 0) 14704 < -2)) 14705 { 14706 $short_name = $proposed_short; 14707 if ($table_property != $perl) { 14708 $short_name = $table_property_short_name 14709 . "=$short_name"; 14710 } 14711 $short_name = "\\p{$short_name}"; 14712 } 14713 } 14714 14715 # And if this is a compound form name, see if there is a 14716 # single form equivalent 14717 my $single_form; 14718 if ($table_property != $perl) { 14719 14720 # Special case the binary N tables, so that will print 14721 # \P{single}, but use the Y table values to populate 14722 # 'single', as we haven't likewise populated the N table. 14723 # For forced binary tables, we can't just look at the N 14724 # table, but must see if this table is equivalent to the N 14725 # one, as there are two equivalent beasts in these 14726 # properties. 14727 my $test_table; 14728 my $p; 14729 if ( ($type == $BINARY 14730 && $input_table == $property->table('No')) 14731 || ($type == $FORCED_BINARY 14732 && $property->table('No')-> 14733 is_set_equivalent_to($input_table))) 14734 { 14735 $test_table = $property->table('Yes'); 14736 $p = 'P'; 14737 } 14738 else { 14739 $test_table = $input_table; 14740 $p = 'p'; 14741 } 14742 14743 # Look for a single form amongst all the children. 14744 foreach my $table ($test_table->children) { 14745 next if $table->property != $perl; 14746 my $proposed_name = $table->short_name; 14747 next if ! defined $proposed_name; 14748 14749 # Don't mention internal-only properties as a possible 14750 # single form synonym 14751 next if substr($proposed_name, 0, 1) eq '_'; 14752 14753 $proposed_name = "\\$p\{$proposed_name}"; 14754 if (! defined $single_form 14755 || length($proposed_name) < length $single_form) 14756 { 14757 $single_form = $proposed_name; 14758 14759 # The goal here is to find a single form; not the 14760 # shortest possible one. We've already found a 14761 # short name. So, stop at the first single form 14762 # found, which is likely to be closer to the 14763 # original. 14764 last; 14765 } 14766 } 14767 } 14768 14769 # Ouput both short and single in the same parenthesized 14770 # expression, but with only one of 'Single', 'Short' if there 14771 # are both items. 14772 if ($short_name || $single_form || $table->conflicting) { 14773 $parenthesized .= "Short: $short_name" if $short_name; 14774 if ($short_name && $single_form) { 14775 $parenthesized .= ', '; 14776 } 14777 elsif ($single_form) { 14778 $parenthesized .= 'Single: '; 14779 } 14780 $parenthesized .= $single_form if $single_form; 14781 } 14782 } 14783 14784 if ($caseless_equivalent != 0) { 14785 $parenthesized .= '; ' if $parenthesized ne ""; 14786 $parenthesized .= "/i= " . $caseless_equivalent->complete_name; 14787 } 14788 14789 14790 # Warn if this property isn't the same as one that a 14791 # semi-casual user might expect. The other components of this 14792 # parenthesized structure are calculated only for the first entry 14793 # for this table, but the conflicting is deemed important enough 14794 # to go on every entry. 14795 my $conflicting = join " NOR ", $table->conflicting; 14796 if ($conflicting) { 14797 $parenthesized .= '; ' if $parenthesized ne ""; 14798 $parenthesized .= "NOT $conflicting"; 14799 } 14800 14801 push @info, "($parenthesized)" if $parenthesized; 14802 14803 if ($name =~ /_$/ && $alias->loose_match) { 14804 push @info, "Note the trailing '_' matters in spite of loose matching rules."; 14805 } 14806 14807 if ($table_property != $perl && $table->perl_extension) { 14808 push @info, '(Perl extension)'; 14809 } 14810 push @info, "($string_count)"; 14811 14812 # Now, we have both the entry and info so add them to the 14813 # list of all the properties. 14814 push @match_properties, 14815 format_pod_line($indent_info_column, 14816 $entry, 14817 join( " ", @info), 14818 $alias->status, 14819 $alias->loose_match); 14820 14821 $entry_for_first_alias = $entry_ref unless $entry_for_first_alias; 14822 } # End of looping through the aliases for this table. 14823 14824 if (! $entry_for_first_table) { 14825 $entry_for_first_table = $entry_for_first_alias; 14826 } 14827 } # End of looping through all the related tables 14828 return; 14829} 14830 14831sub make_ucd_table_pod_entries { 14832 my $table = shift; 14833 14834 # Generate the entries for the UCD section of the pod for $table. This 14835 # also calculates if names are ambiguous, so has to be called even if the 14836 # pod is not being output 14837 14838 my $short_name = $table->name; 14839 my $standard_short_name = standardize($short_name); 14840 my $full_name = $table->full_name; 14841 my $standard_full_name = standardize($full_name); 14842 14843 my $full_info = ""; # Text of info column for full-name entries 14844 my $other_info = ""; # Text of info column for short-name entries 14845 my $short_info = ""; # Text of info column for other entries 14846 my $meaning = ""; # Synonym of this table 14847 14848 my $property = ($table->isa('Property')) 14849 ? $table 14850 : $table->parent->property; 14851 14852 my $perl_extension = $table->perl_extension; 14853 14854 # Get the more official name for for perl extensions that aren't 14855 # stand-alone properties 14856 if ($perl_extension && $property != $table) { 14857 if ($property == $perl ||$property->type == $BINARY) { 14858 $meaning = $table->complete_name; 14859 } 14860 else { 14861 $meaning = $property->full_name . "=$full_name"; 14862 } 14863 } 14864 14865 # There are three types of info column. One for the short name, one for 14866 # the full name, and one for everything else. They mostly are the same, 14867 # so initialize in the same loop. 14868 foreach my $info_ref (\$full_info, \$short_info, \$other_info) { 14869 if ($perl_extension && $property != $table) { 14870 14871 # Add the synonymous name for the non-full name entries; and to 14872 # the full-name entry if it adds extra information 14873 if ($info_ref == \$other_info 14874 || ($info_ref == \$short_info 14875 && $standard_short_name ne $standard_full_name) 14876 || standardize($meaning) ne $standard_full_name 14877 ) { 14878 $$info_ref .= "$meaning."; 14879 } 14880 } 14881 elsif ($info_ref != \$full_info) { 14882 14883 # Otherwise, the non-full name columns include the full name 14884 $$info_ref .= $full_name; 14885 } 14886 14887 # And the full-name entry includes the short name, if different 14888 if ($info_ref == \$full_info 14889 && $standard_short_name ne $standard_full_name) 14890 { 14891 $full_info =~ s/\.\Z//; 14892 $full_info .= " " if $full_info; 14893 $full_info .= "(Short: $short_name)"; 14894 } 14895 14896 if ($table->perl_extension) { 14897 $$info_ref =~ s/\.\Z//; 14898 $$info_ref .= ". " if $$info_ref; 14899 $$info_ref .= "(Perl extension)"; 14900 } 14901 } 14902 14903 # Add any extra annotations to the full name entry 14904 foreach my $more_info ($table->description, 14905 $table->note, 14906 $table->status_info) 14907 { 14908 next unless $more_info; 14909 $full_info =~ s/\.\Z//; 14910 $full_info .= ". " if $full_info; 14911 $full_info .= $more_info; 14912 } 14913 14914 # These keep track if have created full and short name pod entries for the 14915 # property 14916 my $done_full = 0; 14917 my $done_short = 0; 14918 14919 # Every possible name is kept track of, even those that aren't going to be 14920 # output. This way we can be sure to find the ambiguities. 14921 foreach my $alias ($table->aliases) { 14922 my $name = $alias->name; 14923 my $standard = standardize($name); 14924 my $info; 14925 my $output_this = $alias->ucd; 14926 14927 # If the full and short names are the same, we want to output the full 14928 # one's entry, so it has priority. 14929 if ($standard eq $standard_full_name) { 14930 next if $done_full; 14931 $done_full = 1; 14932 $info = $full_info; 14933 } 14934 elsif ($standard eq $standard_short_name) { 14935 next if $done_short; 14936 $done_short = 1; 14937 next if $standard_short_name eq $standard_full_name; 14938 $info = $short_info; 14939 } 14940 else { 14941 $info = $other_info; 14942 } 14943 14944 # Here, we have set up the two columns for this entry. But if an 14945 # entry already exists for this name, we have to decide which one 14946 # we're going to later output. 14947 if (exists $ucd_pod{$standard}) { 14948 14949 # If the two entries refer to the same property, it's not going to 14950 # be ambiguous. (Likely it's because the names when standardized 14951 # are the same.) But that means if they are different properties, 14952 # there is ambiguity. 14953 if ($ucd_pod{$standard}->{'property'} != $property) { 14954 14955 # Here, we have an ambiguity. This code assumes that one is 14956 # scheduled to be output and one not and that one is a perl 14957 # extension (which is not to be output) and the other isn't. 14958 # If those assumptions are wrong, things have to be rethought. 14959 if ($ucd_pod{$standard}{'output_this'} == $output_this 14960 || $ucd_pod{$standard}{'perl_extension'} == $perl_extension 14961 || $output_this == $perl_extension) 14962 { 14963 Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations. Proceeding anyway."); 14964 } 14965 14966 # We modifiy the info column of the one being output to 14967 # indicate the ambiguity. Set $which to point to that one's 14968 # info. 14969 my $which; 14970 if ($ucd_pod{$standard}{'output_this'}) { 14971 $which = \$ucd_pod{$standard}->{'info'}; 14972 } 14973 else { 14974 $which = \$info; 14975 $meaning = $ucd_pod{$standard}{'meaning'}; 14976 } 14977 14978 chomp $$which; 14979 $$which =~ s/\.\Z//; 14980 $$which .= "; NOT '$standard' meaning '$meaning'"; 14981 14982 $ambiguous_names{$standard} = 1; 14983 } 14984 14985 # Use the non-perl-extension variant 14986 next unless $ucd_pod{$standard}{'perl_extension'}; 14987 } 14988 14989 # Store enough information about this entry that we can later look for 14990 # ambiguities, and output it properly. 14991 $ucd_pod{$standard} = { 'name' => $name, 14992 'info' => $info, 14993 'meaning' => $meaning, 14994 'output_this' => $output_this, 14995 'perl_extension' => $perl_extension, 14996 'property' => $property, 14997 'status' => $alias->status, 14998 }; 14999 } # End of looping through all this table's aliases 15000 15001 return; 15002} 15003 15004sub pod_alphanumeric_sort { 15005 # Sort pod entries alphanumerically. 15006 15007 # The first few character columns are filler, plus the '\p{'; and get rid 15008 # of all the trailing stuff, starting with the trailing '}', so as to sort 15009 # on just 'Name=Value' 15010 (my $a = lc $a) =~ s/^ .*? { //x; 15011 $a =~ s/}.*//; 15012 (my $b = lc $b) =~ s/^ .*? { //x; 15013 $b =~ s/}.*//; 15014 15015 # Determine if the two operands are both internal only or both not. 15016 # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3 15017 # should be the underscore that begins internal only 15018 my $a_is_internal = (substr($a, 0, 1) eq '_'); 15019 my $b_is_internal = (substr($b, 0, 1) eq '_'); 15020 15021 # Sort so the internals come last in the table instead of first (which the 15022 # leading underscore would otherwise indicate). 15023 if ($a_is_internal != $b_is_internal) { 15024 return 1 if $a_is_internal; 15025 return -1 15026 } 15027 15028 # Determine if the two operands are numeric property values or not. 15029 # A numeric property will look like xyz: 3. But the number 15030 # can begin with an optional minus sign, and may have a 15031 # fraction or rational component, like xyz: 3/2. If either 15032 # isn't numeric, use alphabetic sort. 15033 my ($a_initial, $a_number) = 15034 ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix); 15035 return $a cmp $b unless defined $a_number; 15036 my ($b_initial, $b_number) = 15037 ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix); 15038 return $a cmp $b unless defined $b_number; 15039 15040 # Here they are both numeric, but use alphabetic sort if the 15041 # initial parts don't match 15042 return $a cmp $b if $a_initial ne $b_initial; 15043 15044 # Convert rationals to floating for the comparison. 15045 $a_number = eval $a_number if $a_number =~ qr{/}; 15046 $b_number = eval $b_number if $b_number =~ qr{/}; 15047 15048 return $a_number <=> $b_number; 15049} 15050 15051sub make_pod () { 15052 # Create the .pod file. This generates the various subsections and then 15053 # combines them in one big HERE document. 15054 15055 my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)"; 15056 15057 return unless defined $pod_directory; 15058 print "Making pod file\n" if $verbosity >= $PROGRESS; 15059 15060 my $exception_message = 15061 '(Any exceptions are individually noted beginning with the word NOT.)'; 15062 my @block_warning; 15063 if (-e 'Blocks.txt') { 15064 15065 # Add the line: '\p{In_*} \p{Block: *}', with the warning message 15066 # if the global $has_In_conflicts indicates we have them. 15067 push @match_properties, format_pod_line($indent_info_column, 15068 '\p{In_*}', 15069 '\p{Block: *}' 15070 . (($has_In_conflicts) 15071 ? " $exception_message" 15072 : "")); 15073 @block_warning = << "END"; 15074 15075Matches in the Block property have shortcuts that begin with "In_". For 15076example, C<\\p{Block=Latin1}> can be written as C<\\p{In_Latin1}>. For 15077backward compatibility, if there is no conflict with another shortcut, these 15078may also be written as C<\\p{Latin1}> or C<\\p{Is_Latin1}>. But, N.B., there 15079are numerous such conflicting shortcuts. Use of these forms for Block is 15080discouraged, and are flagged as such, not only because of the potential 15081confusion as to what is meant, but also because a later release of Unicode may 15082preempt the shortcut, and your program would no longer be correct. Use the 15083"In_" form instead to avoid this, or even more clearly, use the compound form, 15084e.g., C<\\p{blk:latin1}>. See L<perlunicode/"Blocks"> for more information 15085about this. 15086END 15087 } 15088 my $text = $Is_flags_text; 15089 $text = "$exception_message $text" if $has_Is_conflicts; 15090 15091 # And the 'Is_ line'; 15092 push @match_properties, format_pod_line($indent_info_column, 15093 '\p{Is_*}', 15094 "\\p{*} $text"); 15095 15096 # Sort the properties array for output. It is sorted alphabetically 15097 # except numerically for numeric properties, and only output unique lines. 15098 @match_properties = sort pod_alphanumeric_sort uniques @match_properties; 15099 15100 my $formatted_properties = simple_fold(\@match_properties, 15101 "", 15102 # indent succeeding lines by two extra 15103 # which looks better 15104 $indent_info_column + 2, 15105 15106 # shorten the line length by how much 15107 # the formatter indents, so the folded 15108 # line will fit in the space 15109 # presumably available 15110 $automatic_pod_indent); 15111 # Add column headings, indented to be a little more centered, but not 15112 # exactly 15113 $formatted_properties = format_pod_line($indent_info_column, 15114 ' NAME', 15115 ' INFO') 15116 . "\n" 15117 . $formatted_properties; 15118 15119 # Generate pod documentation lines for the tables that match nothing 15120 my $zero_matches = ""; 15121 if (@zero_match_tables) { 15122 @zero_match_tables = uniques(@zero_match_tables); 15123 $zero_matches = join "\n\n", 15124 map { $_ = '=item \p{' . $_->complete_name . "}" } 15125 sort { $a->complete_name cmp $b->complete_name } 15126 @zero_match_tables; 15127 15128 $zero_matches = <<END; 15129 15130=head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters 15131 15132Unicode has some property-value pairs that currently don't match anything. 15133This happens generally either because they are obsolete, or they exist for 15134symmetry with other forms, but no language has yet been encoded that uses 15135them. In this version of Unicode, the following match zero code points: 15136 15137=over 4 15138 15139$zero_matches 15140 15141=back 15142 15143END 15144 } 15145 15146 # Generate list of properties that we don't accept, grouped by the reasons 15147 # why. This is so only put out the 'why' once, and then list all the 15148 # properties that have that reason under it. 15149 15150 my %why_list; # The keys are the reasons; the values are lists of 15151 # properties that have the key as their reason 15152 15153 # For each property, add it to the list that are suppressed for its reason 15154 # The sort will cause the alphabetically first properties to be added to 15155 # each list first, so each list will be sorted. 15156 foreach my $property (sort keys %why_suppressed) { 15157 push @{$why_list{$why_suppressed{$property}}}, $property; 15158 } 15159 15160 # For each reason (sorted by the first property that has that reason)... 15161 my @bad_re_properties; 15162 foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] } 15163 keys %why_list) 15164 { 15165 # Add to the output, all the properties that have that reason. 15166 my $has_item = 0; # Flag if actually output anything. 15167 foreach my $name (@{$why_list{$why}}) { 15168 15169 # Split compound names into $property and $table components 15170 my $property = $name; 15171 my $table; 15172 if ($property =~ / (.*) = (.*) /x) { 15173 $property = $1; 15174 $table = $2; 15175 } 15176 15177 # This release of Unicode may not have a property that is 15178 # suppressed, so don't reference a non-existent one. 15179 $property = property_ref($property); 15180 next if ! defined $property; 15181 15182 # And since this list is only for match tables, don't list the 15183 # ones that don't have match tables. 15184 next if ! $property->to_create_match_tables; 15185 15186 # Find any abbreviation, and turn it into a compound name if this 15187 # is a property=value pair. 15188 my $short_name = $property->name; 15189 $short_name .= '=' . $property->table($table)->name if $table; 15190 15191 # Start with an empty line. 15192 push @bad_re_properties, "\n\n" unless $has_item; 15193 15194 # And add the property as an item for the reason. 15195 push @bad_re_properties, "\n=item I<$name> ($short_name)\n"; 15196 $has_item = 1; 15197 } 15198 15199 # And add the reason under the list of properties, if such a list 15200 # actually got generated. Note that the header got added 15201 # unconditionally before. But pod ignores extra blank lines, so no 15202 # harm. 15203 push @bad_re_properties, "\n$why\n" if $has_item; 15204 15205 } # End of looping through each reason. 15206 15207 if (! @bad_re_properties) { 15208 push @bad_re_properties, 15209 "*** This installation accepts ALL non-Unihan properties ***"; 15210 } 15211 else { 15212 # Add =over only if non-empty to avoid an empty =over/=back section, 15213 # which is considered bad form. 15214 unshift @bad_re_properties, "\n=over 4\n"; 15215 push @bad_re_properties, "\n=back\n"; 15216 } 15217 15218 # Similiarly, generate a list of files that we don't use, grouped by the 15219 # reasons why. First, create a hash whose keys are the reasons, and whose 15220 # values are anonymous arrays of all the files that share that reason. 15221 my %grouped_by_reason; 15222 foreach my $file (keys %ignored_files) { 15223 push @{$grouped_by_reason{$ignored_files{$file}}}, $file; 15224 } 15225 foreach my $file (keys %skipped_files) { 15226 push @{$grouped_by_reason{$skipped_files{$file}}}, $file; 15227 } 15228 15229 # Then, sort each group. 15230 foreach my $group (keys %grouped_by_reason) { 15231 @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b } 15232 @{$grouped_by_reason{$group}} ; 15233 } 15234 15235 # Finally, create the output text. For each reason (sorted by the 15236 # alphabetically first file that has that reason)... 15237 my @unused_files; 15238 foreach my $reason (sort { lc $grouped_by_reason{$a}->[0] 15239 cmp lc $grouped_by_reason{$b}->[0] 15240 } 15241 keys %grouped_by_reason) 15242 { 15243 # Add all the files that have that reason to the output. Start 15244 # with an empty line. 15245 push @unused_files, "\n\n"; 15246 push @unused_files, map { "\n=item F<$_> \n" } 15247 @{$grouped_by_reason{$reason}}; 15248 # And add the reason under the list of files 15249 push @unused_files, "\n$reason\n"; 15250 } 15251 15252 # Similarly, create the output text for the UCD section of the pod 15253 my @ucd_pod; 15254 foreach my $key (keys %ucd_pod) { 15255 next unless $ucd_pod{$key}->{'output_this'}; 15256 push @ucd_pod, format_pod_line($indent_info_column, 15257 $ucd_pod{$key}->{'name'}, 15258 $ucd_pod{$key}->{'info'}, 15259 $ucd_pod{$key}->{'status'}, 15260 ); 15261 } 15262 15263 # Sort alphabetically, and fold for output 15264 @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod; 15265 my $ucd_pod = simple_fold(\@ucd_pod, 15266 ' ', 15267 $indent_info_column, 15268 $automatic_pod_indent); 15269 $ucd_pod = format_pod_line($indent_info_column, 'NAME', ' INFO') 15270 . "\n" 15271 . $ucd_pod; 15272 local $" = ""; 15273 15274 # Everything is ready to assemble. 15275 my @OUT = << "END"; 15276=begin comment 15277 15278$HEADER 15279 15280To change this file, edit $0 instead. 15281 15282=end comment 15283 15284=head1 NAME 15285 15286$pod_file - Index of Unicode Version $string_version character properties in Perl 15287 15288=head1 DESCRIPTION 15289 15290This document provides information about the portion of the Unicode database 15291that deals with character properties, that is the portion that is defined on 15292single code points. (L</Other information in the Unicode data base> 15293below briefly mentions other data that Unicode provides.) 15294 15295Perl can provide access to all non-provisional Unicode character properties, 15296though not all are enabled by default. The omitted ones are the Unihan 15297properties (accessible via the CPAN module L<Unicode::Unihan>) and certain 15298deprecated or Unicode-internal properties. (An installation may choose to 15299recompile Perl's tables to change this. See L<Unicode character 15300properties that are NOT accepted by Perl>.) 15301 15302For most purposes, access to Unicode properties from the Perl core is through 15303regular expression matches, as described in the next section. 15304For some special purposes, and to access the properties that are not suitable 15305for regular expression matching, all the Unicode character properties that 15306Perl handles are accessible via the standard L<Unicode::UCD> module, as 15307described in the section L</Properties accessible through Unicode::UCD>. 15308 15309Perl also provides some additional extensions and short-cut synonyms 15310for Unicode properties. 15311 15312This document merely lists all available properties and does not attempt to 15313explain what each property really means. There is a brief description of each 15314Perl extension; see L<perlunicode/Other Properties> for more information on 15315these. There is some detail about Blocks, Scripts, General_Category, 15316and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the 15317official Unicode properties, refer to the Unicode standard. A good starting 15318place is L<$unicode_reference_url>. 15319 15320Note that you can define your own properties; see 15321L<perlunicode/"User-Defined Character Properties">. 15322 15323=head1 Properties accessible through C<\\p{}> and C<\\P{}> 15324 15325The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to 15326most of the Unicode character properties. The table below shows all these 15327constructs, both single and compound forms. 15328 15329B<Compound forms> consist of two components, separated by an equals sign or a 15330colon. The first component is the property name, and the second component is 15331the particular value of the property to match against, for example, 15332C<\\p{Script: Greek}> and C<\\p{Script=Greek}> both mean to match characters 15333whose Script property is Greek. 15334 15335B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for 15336their equivalent compound forms. The table shows these equivalences. (In our 15337example, C<\\p{Greek}> is a just a shortcut for C<\\p{Script=Greek}>.) 15338There are also a few Perl-defined single forms that are not shortcuts for a 15339compound form. One such is C<\\p{Word}>. These are also listed in the table. 15340 15341In parsing these constructs, Perl always ignores Upper/lower case differences 15342everywhere within the {braces}. Thus C<\\p{Greek}> means the same thing as 15343C<\\p{greek}>. But note that changing the case of the C<"p"> or C<"P"> before 15344the left brace completely changes the meaning of the construct, from "match" 15345(for C<\\p{}>) to "doesn't match" (for C<\\P{}>). Casing in this document is 15346for improved legibility. 15347 15348Also, white space, hyphens, and underscores are normally ignored 15349everywhere between the {braces}, and hence can be freely added or removed 15350even if the C</x> modifier hasn't been specified on the regular expression. 15351But $a_bold_stricter at the beginning of an entry in the table below 15352means that tighter (stricter) rules are used for that entry: 15353 15354=over 4 15355 15356=item Single form (C<\\p{name}>) tighter rules: 15357 15358White space, hyphens, and underscores ARE significant 15359except for: 15360 15361=over 4 15362 15363=item * white space adjacent to a non-word character 15364 15365=item * underscores separating digits in numbers 15366 15367=back 15368 15369That means, for example, that you can freely add or remove white space 15370adjacent to (but within) the braces without affecting the meaning. 15371 15372=item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules: 15373 15374The tighter rules given above for the single form apply to everything to the 15375right of the colon or equals; the looser rules still apply to everything to 15376the left. 15377 15378That means, for example, that you can freely add or remove white space 15379adjacent to (but within) the braces and the colon or equal sign. 15380 15381=back 15382 15383Some properties are considered obsolete by Unicode, but still available. 15384There are several varieties of obsolescence: 15385 15386=over 4 15387 15388=item Stabilized 15389 15390A property may be stabilized. Such a determination does not indicate 15391that the property should or should not be used; instead it is a declaration 15392that the property will not be maintained nor extended for newly encoded 15393characters. Such properties are marked with $a_bold_stabilized in the 15394table. 15395 15396=item Deprecated 15397 15398A property may be deprecated, perhaps because its original intent 15399has been replaced by another property, or because its specification was 15400somehow defective. This means that its use is strongly 15401discouraged, so much so that a warning will be issued if used, unless the 15402regular expression is in the scope of a C<S<no warnings 'deprecated'>> 15403statement. $A_bold_deprecated flags each such entry in the table, and 15404the entry there for the longest, most descriptive version of the property will 15405give the reason it is deprecated, and perhaps advice. Perl may issue such a 15406warning, even for properties that aren't officially deprecated by Unicode, 15407when there used to be characters or code points that were matched by them, but 15408no longer. This is to warn you that your program may not work like it did on 15409earlier Unicode releases. 15410 15411A deprecated property may be made unavailable in a future Perl version, so it 15412is best to move away from them. 15413 15414A deprecated property may also be stabilized, but this fact is not shown. 15415 15416=item Obsolete 15417 15418Properties marked with $a_bold_obsolete in the table are considered (plain) 15419obsolete. Generally this designation is given to properties that Unicode once 15420used for internal purposes (but not any longer). 15421 15422=back 15423 15424Some Perl extensions are present for backwards compatibility and are 15425discouraged from being used, but are not obsolete. $A_bold_discouraged 15426flags each such entry in the table. Future Unicode versions may force 15427some of these extensions to be removed without warning, replaced by another 15428property with the same name that means something different. Use the 15429equivalent shown instead. 15430 15431@block_warning 15432 15433The table below has two columns. The left column contains the C<\\p{}> 15434constructs to look up, possibly preceded by the flags mentioned above; and 15435the right column contains information about them, like a description, or 15436synonyms. It shows both the single and compound forms for each property that 15437has them. If the left column is a short name for a property, the right column 15438will give its longer, more descriptive name; and if the left column is the 15439longest name, the right column will show any equivalent shortest name, in both 15440single and compound forms if applicable. 15441 15442The right column will also caution you if a property means something different 15443than what might normally be expected. 15444 15445All single forms are Perl extensions; a few compound forms are as well, and 15446are noted as such. 15447 15448Numbers in (parentheses) indicate the total number of code points matched by 15449the property. For emphasis, those properties that match no code points at all 15450are listed as well in a separate section following the table. 15451 15452Most properties match the same code points regardless of whether C<"/i"> 15453case-insensitive matching is specified or not. But a few properties are 15454affected. These are shown with the notation 15455 15456 (/i= other_property) 15457 15458in the second column. Under case-insensitive matching they match the 15459same code pode points as the property "other_property". 15460 15461There is no description given for most non-Perl defined properties (See 15462L<$unicode_reference_url> for that). 15463 15464For compactness, 'B<*>' is used as a wildcard instead of showing all possible 15465combinations. For example, entries like: 15466 15467 \\p{Gc: *} \\p{General_Category: *} 15468 15469mean that 'Gc' is a synonym for 'General_Category', and anything that is valid 15470for the latter is also valid for the former. Similarly, 15471 15472 \\p{Is_*} \\p{*} 15473 15474means that if and only if, for example, C<\\p{Foo}> exists, then 15475C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing. 15476And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and 15477C<\\p{IsFoo=Bar}>. "*" here is restricted to something not beginning with an 15478underscore. 15479 15480Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'. 15481And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and 15482'N*' to indicate this, and doesn't have separate entries for the other 15483possibilities. Note that not all properties which have values 'Yes' and 'No' 15484are binary, and they have all their values spelled out without using this wild 15485card, and a C<NOT> clause in their description that highlights their not being 15486binary. These also require the compound form to match them, whereas true 15487binary properties have both single and compound forms available. 15488 15489Note that all non-essential underscores are removed in the display of the 15490short names below. 15491 15492B<Legend summary:> 15493 15494=over 4 15495 15496=item Z<>B<*> is a wild-card 15497 15498=item B<(\\d+)> in the info column gives the number of code points matched by 15499this property. 15500 15501=item B<$DEPRECATED> means this is deprecated. 15502 15503=item B<$OBSOLETE> means this is obsolete. 15504 15505=item B<$STABILIZED> means this is stabilized. 15506 15507=item B<$STRICTER> means tighter (stricter) name matching applies. 15508 15509=item B<$DISCOURAGED> means use of this form is discouraged, and may not be 15510stable. 15511 15512=back 15513 15514$formatted_properties 15515 15516$zero_matches 15517 15518=head1 Properties accessible through Unicode::UCD 15519 15520All the Unicode character properties mentioned above (except for those marked 15521as for internal use by Perl) are also accessible by 15522L<Unicode::UCD/prop_invlist()>. 15523 15524Due to their nature, not all Unicode character properties are suitable for 15525regular expression matches, nor C<prop_invlist()>. The remaining 15526non-provisional, non-internal ones are accessible via 15527L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation 15528hasn't included; see L<below for which those are|/Unicode character properties 15529that are NOT accepted by Perl>). 15530 15531For compatibility with other parts of Perl, all the single forms given in the 15532table in the L<section above|/Properties accessible through \\p{} and \\P{}> 15533are recognized. BUT, there are some ambiguities between some Perl extensions 15534and the Unicode properties, all of which are silently resolved in favor of the 15535official Unicode property. To avoid surprises, you should only use 15536C<prop_invmap()> for forms listed in the table below, which omits the 15537non-recommended ones. The affected forms are the Perl single form equivalents 15538of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of 15539C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property, 15540whose short name is C<sc>. The table indicates the current ambiguities in the 15541INFO column, beginning with the word C<"NOT">. 15542 15543The standard Unicode properties listed below are documented in 15544L<$unicode_reference_url>; Perl_Decimal_Digit is documented in 15545L<Unicode::UCD/prop_invmap()>. The other Perl extensions are in 15546L<perlunicode/Other Properties>; 15547 15548The first column in the table is a name for the property; the second column is 15549an alternative name, if any, plus possibly some annotations. The alternative 15550name is the property's full name, unless that would simply repeat the first 15551column, in which case the second column indicates the property's short name 15552(if different). The annotations are given only in the entry for the full 15553name. If a property is obsolete, etc, the entry will be flagged with the same 15554characters used in the table in the L<section above|/Properties accessible 15555through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>. 15556 15557$ucd_pod 15558 15559=head1 Properties accessible through other means 15560 15561Certain properties are accessible also via core function calls. These are: 15562 15563 Lowercase_Mapping lc() and lcfirst() 15564 Titlecase_Mapping ucfirst() 15565 Uppercase_Mapping uc() 15566 15567Also, Case_Folding is accessible through the C</i> modifier in regular 15568expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>> 15569operator. 15570 15571And, the Name and Name_Aliases properties are accessible through the C<\\N{}> 15572interpolation in double-quoted strings and regular expressions; and functions 15573C<charnames::viacode()>, C<charnames::vianame()>, and 15574C<charnames::string_vianame()> (which require a C<use charnames ();> to be 15575specified. 15576 15577Finally, most properties related to decomposition are accessible via 15578L<Unicode::Normalize>. 15579 15580=head1 Unicode character properties that are NOT accepted by Perl 15581 15582Perl will generate an error for a few character properties in Unicode when 15583used in a regular expression. The non-Unihan ones are listed below, with the 15584reasons they are not accepted, perhaps with work-arounds. The short names for 15585the properties are listed enclosed in (parentheses). 15586As described after the list, an installation can change the defaults and choose 15587to accept any of these. The list is machine generated based on the 15588choices made for the installation that generated this document. 15589 15590@bad_re_properties 15591 15592An installation can choose to allow any of these to be matched by downloading 15593the Unicode database from L<http://www.unicode.org/Public/> to 15594C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the 15595controlling lists contained in the program 15596C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing. 15597(C<\%Config> is available from the Config module). 15598 15599=head1 Other information in the Unicode data base 15600 15601The Unicode data base is delivered in two different formats. The XML version 15602is valid for more modern Unicode releases. The other version is a collection 15603of files. The two are intended to give equivalent information. Perl uses the 15604older form; this allows you to recompile Perl to use early Unicode releases. 15605 15606The only non-character property that Perl currently supports is Named 15607Sequences, in which a sequence of code points 15608is given a name and generally treated as a single entity. (Perl supports 15609these via the C<\\N{...}> double-quotish construct, 15610L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>. 15611 15612Below is a list of the files in the Unicode data base that Perl doesn't 15613currently use, along with very brief descriptions of their purposes. 15614Some of the names of the files have been shortened from those that Unicode 15615uses, in order to allow them to be distinguishable from similarly named files 15616on file systems for which only the first 8 characters of a name are 15617significant. 15618 15619=over 4 15620 15621@unused_files 15622 15623=back 15624 15625=head1 SEE ALSO 15626 15627L<$unicode_reference_url> 15628 15629L<perlrecharclass> 15630 15631L<perlunicode> 15632 15633END 15634 15635 # And write it. The 0 means no utf8. 15636 main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT); 15637 return; 15638} 15639 15640sub make_Heavy () { 15641 # Create and write Heavy.pl, which passes info about the tables to 15642 # utf8_heavy.pl 15643 15644 # Stringify structures for output 15645 my $loose_property_name_of 15646 = simple_dumper(\%loose_property_name_of, ' ' x 4); 15647 chomp $loose_property_name_of; 15648 15649 my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4); 15650 chomp $stricter_to_file_of; 15651 15652 my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4); 15653 chomp $loose_to_file_of; 15654 15655 my $nv_floating_to_rational 15656 = simple_dumper(\%nv_floating_to_rational, ' ' x 4); 15657 chomp $nv_floating_to_rational; 15658 15659 my $why_deprecated = simple_dumper(\%utf8::why_deprecated, ' ' x 4); 15660 chomp $why_deprecated; 15661 15662 # We set the key to the file when we associated files with tables, but we 15663 # couldn't do the same for the value then, as we might not have the file 15664 # for the alternate table figured out at that time. 15665 foreach my $cased (keys %caseless_equivalent_to) { 15666 my @path = $caseless_equivalent_to{$cased}->file_path; 15667 my $path = join '/', @path[1, -1]; 15668 $caseless_equivalent_to{$cased} = $path; 15669 } 15670 my $caseless_equivalent_to 15671 = simple_dumper(\%caseless_equivalent_to, ' ' x 4); 15672 chomp $caseless_equivalent_to; 15673 15674 my $loose_property_to_file_of 15675 = simple_dumper(\%loose_property_to_file_of, ' ' x 4); 15676 chomp $loose_property_to_file_of; 15677 15678 my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4); 15679 chomp $file_to_swash_name; 15680 15681 my @heavy = <<END; 15682$HEADER 15683$INTERNAL_ONLY_HEADER 15684 15685# This file is for the use of utf8_heavy.pl and Unicode::UCD 15686 15687# Maps Unicode (not Perl single-form extensions) property names in loose 15688# standard form to their corresponding standard names 15689\%utf8::loose_property_name_of = ( 15690$loose_property_name_of 15691); 15692 15693# Maps property, table to file for those using stricter matching 15694\%utf8::stricter_to_file_of = ( 15695$stricter_to_file_of 15696); 15697 15698# Maps property, table to file for those using loose matching 15699\%utf8::loose_to_file_of = ( 15700$loose_to_file_of 15701); 15702 15703# Maps floating point to fractional form 15704\%utf8::nv_floating_to_rational = ( 15705$nv_floating_to_rational 15706); 15707 15708# If a floating point number doesn't have enough digits in it to get this 15709# close to a fraction, it isn't considered to be that fraction even if all the 15710# digits it does have match. 15711\$utf8::max_floating_slop = $MAX_FLOATING_SLOP; 15712 15713# Deprecated tables to generate a warning for. The key is the file containing 15714# the table, so as to avoid duplication, as many property names can map to the 15715# file, but we only need one entry for all of them. 15716\%utf8::why_deprecated = ( 15717$why_deprecated 15718); 15719 15720# A few properties have different behavior under /i matching. This maps 15721# those to substitute files to use under /i. 15722\%utf8::caseless_equivalent = ( 15723$caseless_equivalent_to 15724); 15725 15726# Property names to mapping files 15727\%utf8::loose_property_to_file_of = ( 15728$loose_property_to_file_of 15729); 15730 15731# Files to the swash names within them. 15732\%utf8::file_to_swash_name = ( 15733$file_to_swash_name 15734); 15735 157361; 15737END 15738 15739 main::write("Heavy.pl", 0, \@heavy); # The 0 means no utf8. 15740 return; 15741} 15742 15743sub make_Name_pm () { 15744 # Create and write Name.pm, which contains subroutines and data to use in 15745 # conjunction with Name.pl 15746 15747 # Maybe there's nothing to do. 15748 return unless $has_hangul_syllables || @code_points_ending_in_code_point; 15749 15750 my @name = <<END; 15751$HEADER 15752$INTERNAL_ONLY_HEADER 15753END 15754 15755 # Convert these structures to output format. 15756 my $code_points_ending_in_code_point = 15757 main::simple_dumper(\@code_points_ending_in_code_point, 15758 ' ' x 8); 15759 my $names = main::simple_dumper(\%names_ending_in_code_point, 15760 ' ' x 8); 15761 my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point, 15762 ' ' x 8); 15763 15764 # Do the same with the Hangul names, 15765 my $jamo; 15766 my $jamo_l; 15767 my $jamo_v; 15768 my $jamo_t; 15769 my $jamo_re; 15770 if ($has_hangul_syllables) { 15771 15772 # Construct a regular expression of all the possible 15773 # combinations of the Hangul syllables. 15774 my @L_re; # Leading consonants 15775 for my $i ($LBase .. $LBase + $LCount - 1) { 15776 push @L_re, $Jamo{$i} 15777 } 15778 my @V_re; # Middle vowels 15779 for my $i ($VBase .. $VBase + $VCount - 1) { 15780 push @V_re, $Jamo{$i} 15781 } 15782 my @T_re; # Trailing consonants 15783 for my $i ($TBase + 1 .. $TBase + $TCount - 1) { 15784 push @T_re, $Jamo{$i} 15785 } 15786 15787 # The whole re is made up of the L V T combination. 15788 $jamo_re = '(' 15789 . join ('|', sort @L_re) 15790 . ')(' 15791 . join ('|', sort @V_re) 15792 . ')(' 15793 . join ('|', sort @T_re) 15794 . ')?'; 15795 15796 # These hashes needed by the algorithm were generated 15797 # during reading of the Jamo.txt file 15798 $jamo = main::simple_dumper(\%Jamo, ' ' x 8); 15799 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8); 15800 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8); 15801 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8); 15802 } 15803 15804 push @name, <<END; 15805 15806package charnames; 15807 15808# This module contains machine-generated tables and code for the 15809# algorithmically-determinable Unicode character names. The following 15810# routines can be used to translate between name and code point and vice versa 15811 15812{ # Closure 15813 15814 # Matches legal code point. 4-6 hex numbers, If there are 6, the first 15815 # two must be 10; if there are 5, the first must not be a 0. Written this 15816 # way to decrease backtracking. The first regex allows the code point to 15817 # be at the end of a word, but to work properly, the word shouldn't end 15818 # with a valid hex character. The second one won't match a code point at 15819 # the end of a word, and doesn't have the run-on issue 15820 my \$run_on_code_point_re = qr/$run_on_code_point_re/; 15821 my \$code_point_re = qr/$code_point_re/; 15822 15823 # In the following hash, the keys are the bases of names which include 15824 # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The value 15825 # of each key is another hash which is used to get the low and high ends 15826 # for each range of code points that apply to the name. 15827 my %names_ending_in_code_point = ( 15828$names 15829 ); 15830 15831 # The following hash is a copy of the previous one, except is for loose 15832 # matching, so each name has blanks and dashes squeezed out 15833 my %loose_names_ending_in_code_point = ( 15834$loose_names 15835 ); 15836 15837 # And the following array gives the inverse mapping from code points to 15838 # names. Lowest code points are first 15839 my \@code_points_ending_in_code_point = ( 15840$code_points_ending_in_code_point 15841 ); 15842END 15843 # Earlier releases didn't have Jamos. No sense outputting 15844 # them unless will be used. 15845 if ($has_hangul_syllables) { 15846 push @name, <<END; 15847 15848 # Convert from code point to Jamo short name for use in composing Hangul 15849 # syllable names 15850 my %Jamo = ( 15851$jamo 15852 ); 15853 15854 # Leading consonant (can be null) 15855 my %Jamo_L = ( 15856$jamo_l 15857 ); 15858 15859 # Vowel 15860 my %Jamo_V = ( 15861$jamo_v 15862 ); 15863 15864 # Optional trailing consonant 15865 my %Jamo_T = ( 15866$jamo_t 15867 ); 15868 15869 # Computed re that splits up a Hangul name into LVT or LV syllables 15870 my \$syllable_re = qr/$jamo_re/; 15871 15872 my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE "; 15873 my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE"; 15874 15875 # These constants names and values were taken from the Unicode standard, 15876 # version 5.1, section 3.12. They are used in conjunction with Hangul 15877 # syllables 15878 my \$SBase = $SBase_string; 15879 my \$LBase = $LBase_string; 15880 my \$VBase = $VBase_string; 15881 my \$TBase = $TBase_string; 15882 my \$SCount = $SCount; 15883 my \$LCount = $LCount; 15884 my \$VCount = $VCount; 15885 my \$TCount = $TCount; 15886 my \$NCount = \$VCount * \$TCount; 15887END 15888 } # End of has Jamos 15889 15890 push @name, << 'END'; 15891 15892 sub name_to_code_point_special { 15893 my ($name, $loose) = @_; 15894 15895 # Returns undef if not one of the specially handled names; otherwise 15896 # returns the code point equivalent to the input name 15897 # $loose is non-zero if to use loose matching, 'name' in that case 15898 # must be input as upper case with all blanks and dashes squeezed out. 15899END 15900 if ($has_hangul_syllables) { 15901 push @name, << 'END'; 15902 15903 if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//) 15904 || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//)) 15905 { 15906 return if $name !~ qr/^$syllable_re$/; 15907 my $L = $Jamo_L{$1}; 15908 my $V = $Jamo_V{$2}; 15909 my $T = (defined $3) ? $Jamo_T{$3} : 0; 15910 return ($L * $VCount + $V) * $TCount + $T + $SBase; 15911 } 15912END 15913 } 15914 push @name, << 'END'; 15915 15916 # Name must end in 'code_point' for this to handle. 15917 return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x) 15918 || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x)); 15919 15920 my $base = $1; 15921 my $code_point = CORE::hex $2; 15922 my $names_ref; 15923 15924 if ($loose) { 15925 $names_ref = \%loose_names_ending_in_code_point; 15926 } 15927 else { 15928 return if $base !~ s/-$//; 15929 $names_ref = \%names_ending_in_code_point; 15930 } 15931 15932 # Name must be one of the ones which has the code point in it. 15933 return if ! $names_ref->{$base}; 15934 15935 # Look through the list of ranges that apply to this name to see if 15936 # the code point is in one of them. 15937 for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) { 15938 return if $names_ref->{$base}{'low'}->[$i] > $code_point; 15939 next if $names_ref->{$base}{'high'}->[$i] < $code_point; 15940 15941 # Here, the code point is in the range. 15942 return $code_point; 15943 } 15944 15945 # Here, looked like the name had a code point number in it, but 15946 # did not match one of the valid ones. 15947 return; 15948 } 15949 15950 sub code_point_to_name_special { 15951 my $code_point = shift; 15952 15953 # Returns the name of a code point if algorithmically determinable; 15954 # undef if not 15955END 15956 if ($has_hangul_syllables) { 15957 push @name, << 'END'; 15958 15959 # If in the Hangul range, calculate the name based on Unicode's 15960 # algorithm 15961 if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) { 15962 use integer; 15963 my $SIndex = $code_point - $SBase; 15964 my $L = $LBase + $SIndex / $NCount; 15965 my $V = $VBase + ($SIndex % $NCount) / $TCount; 15966 my $T = $TBase + $SIndex % $TCount; 15967 $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}"; 15968 $name .= $Jamo{$T} if $T != $TBase; 15969 return $name; 15970 } 15971END 15972 } 15973 push @name, << 'END'; 15974 15975 # Look through list of these code points for one in range. 15976 foreach my $hash (@code_points_ending_in_code_point) { 15977 return if $code_point < $hash->{'low'}; 15978 if ($code_point <= $hash->{'high'}) { 15979 return sprintf("%s-%04X", $hash->{'name'}, $code_point); 15980 } 15981 } 15982 return; # None found 15983 } 15984} # End closure 15985 159861; 15987END 15988 15989 main::write("Name.pm", 0, \@name); # The 0 means no utf8. 15990 return; 15991} 15992 15993sub make_UCD () { 15994 # Create and write UCD.pl, which passes info about the tables to 15995 # Unicode::UCD 15996 15997 # Create a mapping from each alias of Perl single-form extensions to all 15998 # its equivalent aliases, for quick look-up. 15999 my %perlprop_to_aliases; 16000 foreach my $table ($perl->tables) { 16001 16002 # First create the list of the aliases of each extension 16003 my @aliases_list; # List of legal aliases for this extension 16004 16005 my $table_name = $table->name; 16006 my $standard_table_name = standardize($table_name); 16007 my $table_full_name = $table->full_name; 16008 my $standard_table_full_name = standardize($table_full_name); 16009 16010 # Make sure that the list has both the short and full names 16011 push @aliases_list, $table_name, $table_full_name; 16012 16013 my $found_ucd = 0; # ? Did we actually get an alias that should be 16014 # output for this table 16015 16016 # Go through all the aliases (including the two just added), and add 16017 # any new unique ones to the list 16018 foreach my $alias ($table->aliases) { 16019 16020 # Skip non-legal names 16021 next unless $alias->ok_as_filename; 16022 next unless $alias->ucd; 16023 16024 $found_ucd = 1; # have at least one legal name 16025 16026 my $name = $alias->name; 16027 my $standard = standardize($name); 16028 16029 # Don't repeat a name that is equivalent to one already on the 16030 # list 16031 next if $standard eq $standard_table_name; 16032 next if $standard eq $standard_table_full_name; 16033 16034 push @aliases_list, $name; 16035 } 16036 16037 # If there were no legal names, don't output anything. 16038 next unless $found_ucd; 16039 16040 # To conserve memory in the program reading these in, omit full names 16041 # that are identical to the short name, when those are the only two 16042 # aliases for the property. 16043 if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) { 16044 pop @aliases_list; 16045 } 16046 16047 # Here, @aliases_list is the list of all the aliases that this 16048 # extension legally has. Now can create a map to it from each legal 16049 # standardized alias 16050 foreach my $alias ($table->aliases) { 16051 next unless $alias->ucd; 16052 next unless $alias->ok_as_filename; 16053 push @{$perlprop_to_aliases{standardize($alias->name)}}, 16054 @aliases_list; 16055 } 16056 } 16057 16058 # Make a list of all combinations of properties/values that are suppressed. 16059 my @suppressed; 16060 if (! $debug_skip) { # This tends to fail in this debug mode 16061 foreach my $property_name (keys %why_suppressed) { 16062 16063 # Just the value 16064 my $value_name = $1 if $property_name =~ s/ = ( .* ) //x; 16065 16066 # The hash may contain properties not in this release of Unicode 16067 next unless defined (my $property = property_ref($property_name)); 16068 16069 # Find all combinations 16070 foreach my $prop_alias ($property->aliases) { 16071 my $prop_alias_name = standardize($prop_alias->name); 16072 16073 # If no =value, there's just one combination possibe for this 16074 if (! $value_name) { 16075 16076 # The property may be suppressed, but there may be a proxy 16077 # for it, so it shouldn't be listed as suppressed 16078 next if $prop_alias->ucd; 16079 push @suppressed, $prop_alias_name; 16080 } 16081 else { # Otherwise 16082 foreach my $value_alias 16083 ($property->table($value_name)->aliases) 16084 { 16085 next if $value_alias->ucd; 16086 16087 push @suppressed, "$prop_alias_name=" 16088 . standardize($value_alias->name); 16089 } 16090 } 16091 } 16092 } 16093 } 16094 @suppressed = sort @suppressed; # So doesn't change between runs of this 16095 # program 16096 16097 # Convert the structure below (designed for Name.pm) to a form that UCD 16098 # wants, so it doesn't have to modify it at all; i.e. so that it includes 16099 # an element for the Hangul syllables in the appropriate place, and 16100 # otherwise changes the name to include the "-<code point>" suffix. 16101 my @algorithm_names; 16102 my $done_hangul = 0; 16103 16104 # Copy it linearly. 16105 for my $i (0 .. @code_points_ending_in_code_point - 1) { 16106 16107 # Insert the hanguls in the correct place. 16108 if (! $done_hangul 16109 && $code_points_ending_in_code_point[$i]->{'low'} > $SBase) 16110 { 16111 $done_hangul = 1; 16112 push @algorithm_names, { low => $SBase, 16113 high => $SBase + $SCount - 1, 16114 name => '<hangul syllable>', 16115 }; 16116 } 16117 16118 # Copy the current entry, modified. 16119 push @algorithm_names, { 16120 low => $code_points_ending_in_code_point[$i]->{'low'}, 16121 high => $code_points_ending_in_code_point[$i]->{'high'}, 16122 name => 16123 "$code_points_ending_in_code_point[$i]->{'name'}-<code point>", 16124 }; 16125 } 16126 16127 # Serialize these structures for output. 16128 my $loose_to_standard_value 16129 = simple_dumper(\%loose_to_standard_value, ' ' x 4); 16130 chomp $loose_to_standard_value; 16131 16132 my $string_property_loose_to_name 16133 = simple_dumper(\%string_property_loose_to_name, ' ' x 4); 16134 chomp $string_property_loose_to_name; 16135 16136 my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4); 16137 chomp $perlprop_to_aliases; 16138 16139 my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4); 16140 chomp $prop_aliases; 16141 16142 my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4); 16143 chomp $prop_value_aliases; 16144 16145 my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : ""; 16146 chomp $suppressed; 16147 16148 my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4); 16149 chomp $algorithm_names; 16150 16151 my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4); 16152 chomp $ambiguous_names; 16153 16154 my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4); 16155 chomp $loose_defaults; 16156 16157 my @ucd = <<END; 16158$HEADER 16159$INTERNAL_ONLY_HEADER 16160 16161# This file is for the use of Unicode::UCD 16162 16163# Highest legal Unicode code point 16164\$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING; 16165 16166# Hangul syllables 16167\$Unicode::UCD::HANGUL_BEGIN = $SBase_string; 16168\$Unicode::UCD::HANGUL_COUNT = $SCount; 16169 16170# Keys are all the possible "prop=value" combinations, in loose form; values 16171# are the standard loose name for the 'value' part of the key 16172\%Unicode::UCD::loose_to_standard_value = ( 16173$loose_to_standard_value 16174); 16175 16176# String property loose names to standard loose name 16177\%Unicode::UCD::string_property_loose_to_name = ( 16178$string_property_loose_to_name 16179); 16180 16181# Keys are Perl extensions in loose form; values are each one's list of 16182# aliases 16183\%Unicode::UCD::loose_perlprop_to_name = ( 16184$perlprop_to_aliases 16185); 16186 16187# Keys are standard property name; values are each one's aliases 16188\%Unicode::UCD::prop_aliases = ( 16189$prop_aliases 16190); 16191 16192# Keys of top level are standard property name; values are keys to another 16193# hash, Each one is one of the property's values, in standard form. The 16194# values are that prop-val's aliases. If only one specified, the short and 16195# long alias are identical. 16196\%Unicode::UCD::prop_value_aliases = ( 16197$prop_value_aliases 16198); 16199 16200# Ordered (by code point ordinal) list of the ranges of code points whose 16201# names are algorithmically determined. Each range entry is an anonymous hash 16202# of the start and end points and a template for the names within it. 16203\@Unicode::UCD::algorithmic_named_code_points = ( 16204$algorithm_names 16205); 16206 16207# The properties that as-is have two meanings, and which must be disambiguated 16208\%Unicode::UCD::ambiguous_names = ( 16209$ambiguous_names 16210); 16211 16212# Keys are the prop-val combinations which are the default values for the 16213# given property, expressed in standard loose form 16214\%Unicode::UCD::loose_defaults = ( 16215$loose_defaults 16216); 16217 16218# All combinations of names that are suppressed. 16219# This is actually for UCD.t, so it knows which properties shouldn't have 16220# entries. If it got any bigger, would probably want to put it in its own 16221# file to use memory only when it was needed, in testing. 16222\@Unicode::UCD::suppressed_properties = ( 16223$suppressed 16224); 16225 162261; 16227END 16228 16229 main::write("UCD.pl", 0, \@ucd); # The 0 means no utf8. 16230 return; 16231} 16232 16233sub write_all_tables() { 16234 # Write out all the tables generated by this program to files, as well as 16235 # the supporting data structures, pod file, and .t file. 16236 16237 my @writables; # List of tables that actually get written 16238 my %match_tables_to_write; # Used to collapse identical match tables 16239 # into one file. Each key is a hash function 16240 # result to partition tables into buckets. 16241 # Each value is an array of the tables that 16242 # fit in the bucket. 16243 16244 # For each property ... 16245 # (sort so that if there is an immutable file name, it has precedence, so 16246 # some other property can't come in and take over its file name. (We 16247 # don't care if both defined, as they had better be different anyway.) 16248 # The property named 'Perl' needs to be first (it doesn't have any 16249 # immutable file name) because empty properties are defined in terms of 16250 # it's table named 'Any'.) We also sort by the property's name. This is 16251 # just for repeatability of the outputs between runs of this program, but 16252 # does not affect correctness. 16253 PROPERTY: 16254 foreach my $property ($perl, 16255 sort { return -1 if defined $a->file; 16256 return 1 if defined $b->file; 16257 return $a->name cmp $b->name; 16258 } grep { $_ != $perl } property_ref('*')) 16259 { 16260 my $type = $property->type; 16261 16262 # And for each table for that property, starting with the mapping 16263 # table for it ... 16264 TABLE: 16265 foreach my $table($property, 16266 16267 # and all the match tables for it (if any), sorted so 16268 # the ones with the shortest associated file name come 16269 # first. The length sorting prevents problems of a 16270 # longer file taking a name that might have to be used 16271 # by a shorter one. The alphabetic sorting prevents 16272 # differences between releases 16273 sort { my $ext_a = $a->external_name; 16274 return 1 if ! defined $ext_a; 16275 my $ext_b = $b->external_name; 16276 return -1 if ! defined $ext_b; 16277 16278 # But return the non-complement table before 16279 # the complement one, as the latter is defined 16280 # in terms of the former, and needs to have 16281 # the information for the former available. 16282 return 1 if $a->complement != 0; 16283 return -1 if $b->complement != 0; 16284 16285 # Similarly, return a subservient table after 16286 # a leader 16287 return 1 if $a->leader != $a; 16288 return -1 if $b->leader != $b; 16289 16290 my $cmp = length $ext_a <=> length $ext_b; 16291 16292 # Return result if lengths not equal 16293 return $cmp if $cmp; 16294 16295 # Alphabetic if lengths equal 16296 return $ext_a cmp $ext_b 16297 } $property->tables 16298 ) 16299 { 16300 16301 # Here we have a table associated with a property. It could be 16302 # the map table (done first for each property), or one of the 16303 # other tables. Determine which type. 16304 my $is_property = $table->isa('Property'); 16305 16306 my $name = $table->name; 16307 my $complete_name = $table->complete_name; 16308 16309 # See if should suppress the table if is empty, but warn if it 16310 # contains something. 16311 my $suppress_if_empty_warn_if_not 16312 = $why_suppress_if_empty_warn_if_not{$complete_name} || 0; 16313 16314 # Calculate if this table should have any code points associated 16315 # with it or not. 16316 my $expected_empty = 16317 16318 # $perl should be empty, as well as properties that we just 16319 # don't do anything with 16320 ($is_property 16321 && ($table == $perl 16322 || grep { $complete_name eq $_ } 16323 @unimplemented_properties 16324 ) 16325 ) 16326 16327 # Match tables in properties we skipped populating should be 16328 # empty 16329 || (! $is_property && ! $property->to_create_match_tables) 16330 16331 # Tables and properties that are expected to have no code 16332 # points should be empty 16333 || $suppress_if_empty_warn_if_not 16334 ; 16335 16336 # Set a boolean if this table is the complement of an empty binary 16337 # table 16338 my $is_complement_of_empty_binary = 16339 $type == $BINARY && 16340 (($table == $property->table('Y') 16341 && $property->table('N')->is_empty) 16342 || ($table == $property->table('N') 16343 && $property->table('Y')->is_empty)); 16344 16345 if ($table->is_empty) { 16346 16347 if ($suppress_if_empty_warn_if_not) { 16348 $table->set_fate($SUPPRESSED, 16349 $suppress_if_empty_warn_if_not); 16350 } 16351 16352 # Suppress (by skipping them) expected empty tables. 16353 next TABLE if $expected_empty; 16354 16355 # And setup to later output a warning for those that aren't 16356 # known to be allowed to be empty. Don't do the warning if 16357 # this table is a child of another one to avoid duplicating 16358 # the warning that should come from the parent one. 16359 if (($table == $property || $table->parent == $table) 16360 && $table->fate != $SUPPRESSED 16361 && $table->fate != $MAP_PROXIED 16362 && ! grep { $complete_name =~ /^$_$/ } 16363 @tables_that_may_be_empty) 16364 { 16365 push @unhandled_properties, "$table"; 16366 } 16367 16368 # An empty table is just the complement of everything. 16369 $table->set_complement($Any) if $table != $property; 16370 } 16371 elsif ($expected_empty) { 16372 my $because = ""; 16373 if ($suppress_if_empty_warn_if_not) { 16374 $because = " because $suppress_if_empty_warn_if_not"; 16375 } 16376 16377 Carp::my_carp("Not expecting property $table$because. Generating file for it anyway."); 16378 } 16379 16380 # Some tables should match everything 16381 my $expected_full = 16382 ($table->fate == $SUPPRESSED) 16383 ? 0 16384 : ($is_property) 16385 ? # All these types of map tables will be full because 16386 # they will have been populated with defaults 16387 ($type == $ENUM || $type == $FORCED_BINARY) 16388 16389 : # A match table should match everything if its method 16390 # shows it should 16391 ($table->matches_all 16392 16393 # The complement of an empty binary table will match 16394 # everything 16395 || $is_complement_of_empty_binary 16396 ) 16397 ; 16398 16399 my $count = $table->count; 16400 if ($expected_full) { 16401 if ($count != $MAX_UNICODE_CODEPOINTS) { 16402 Carp::my_carp("$table matches only " 16403 . clarify_number($count) 16404 . " Unicode code points but should match " 16405 . clarify_number($MAX_UNICODE_CODEPOINTS) 16406 . " (off by " 16407 . clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count)) 16408 . "). Proceeding anyway."); 16409 } 16410 16411 # Here is expected to be full. If it is because it is the 16412 # complement of an (empty) binary table that is to be 16413 # suppressed, then suppress this one as well. 16414 if ($is_complement_of_empty_binary) { 16415 my $opposing_name = ($name eq 'Y') ? 'N' : 'Y'; 16416 my $opposing = $property->table($opposing_name); 16417 my $opposing_status = $opposing->status; 16418 if ($opposing_status) { 16419 $table->set_status($opposing_status, 16420 $opposing->status_info); 16421 } 16422 } 16423 } 16424 elsif ($count == $MAX_UNICODE_CODEPOINTS 16425 && ($table == $property || $table->leader == $table) 16426 && $table->property->status ne $PLACEHOLDER) 16427 { 16428 Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway."); 16429 } 16430 16431 if ($table->fate >= $SUPPRESSED) { 16432 if (! $is_property) { 16433 my @children = $table->children; 16434 foreach my $child (@children) { 16435 if ($child->fate < $SUPPRESSED) { 16436 Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't"); 16437 } 16438 } 16439 } 16440 next TABLE; 16441 16442 } 16443 16444 if (! $is_property) { 16445 16446 make_ucd_table_pod_entries($table) if $table->property == $perl; 16447 16448 # Several things need to be done just once for each related 16449 # group of match tables. Do them on the parent. 16450 if ($table->parent == $table) { 16451 16452 # Add an entry in the pod file for the table; it also does 16453 # the children. 16454 make_re_pod_entries($table) if defined $pod_directory; 16455 16456 # See if the the table matches identical code points with 16457 # something that has already been output. In that case, 16458 # no need to have two files with the same code points in 16459 # them. We use the table's hash() method to store these 16460 # in buckets, so that it is quite likely that if two 16461 # tables are in the same bucket they will be identical, so 16462 # don't have to compare tables frequently. The tables 16463 # have to have the same status to share a file, so add 16464 # this to the bucket hash. (The reason for this latter is 16465 # that Heavy.pl associates a status with a file.) 16466 # We don't check tables that are inverses of others, as it 16467 # would lead to some coding complications, and checking 16468 # all the regular ones should find everything. 16469 if ($table->complement == 0) { 16470 my $hash = $table->hash . ';' . $table->status; 16471 16472 # Look at each table that is in the same bucket as 16473 # this one would be. 16474 foreach my $comparison 16475 (@{$match_tables_to_write{$hash}}) 16476 { 16477 if ($table->matches_identically_to($comparison)) { 16478 $table->set_equivalent_to($comparison, 16479 Related => 0); 16480 next TABLE; 16481 } 16482 } 16483 16484 # Here, not equivalent, add this table to the bucket. 16485 push @{$match_tables_to_write{$hash}}, $table; 16486 } 16487 } 16488 } 16489 else { 16490 16491 # Here is the property itself. 16492 # Don't write out or make references to the $perl property 16493 next if $table == $perl; 16494 16495 make_ucd_table_pod_entries($table); 16496 16497 # There is a mapping stored of the various synonyms to the 16498 # standardized name of the property for utf8_heavy.pl. 16499 # Also, the pod file contains entries of the form: 16500 # \p{alias: *} \p{full: *} 16501 # rather than show every possible combination of things. 16502 16503 my @property_aliases = $property->aliases; 16504 16505 my $full_property_name = $property->full_name; 16506 my $property_name = $property->name; 16507 my $standard_property_name = standardize($property_name); 16508 my $standard_property_full_name 16509 = standardize($full_property_name); 16510 16511 # We also create for Unicode::UCD a list of aliases for 16512 # the property. The list starts with the property name; 16513 # then its full name. 16514 my @property_list; 16515 my @standard_list; 16516 if ( $property->fate <= $MAP_PROXIED) { 16517 @property_list = ($property_name, $full_property_name); 16518 @standard_list = ($standard_property_name, 16519 $standard_property_full_name); 16520 } 16521 16522 # For each synonym ... 16523 for my $i (0 .. @property_aliases - 1) { 16524 my $alias = $property_aliases[$i]; 16525 my $alias_name = $alias->name; 16526 my $alias_standard = standardize($alias_name); 16527 16528 16529 # Add other aliases to the list of property aliases 16530 if ($property->fate <= $MAP_PROXIED 16531 && ! grep { $alias_standard eq $_ } @standard_list) 16532 { 16533 push @property_list, $alias_name; 16534 push @standard_list, $alias_standard; 16535 } 16536 16537 # For utf8_heavy, set the mapping of the alias to the 16538 # property 16539 if ($type == $STRING) { 16540 if ($property->fate <= $MAP_PROXIED) { 16541 $string_property_loose_to_name{$alias_standard} 16542 = $standard_property_name; 16543 } 16544 } 16545 else { 16546 if (exists ($loose_property_name_of{$alias_standard})) 16547 { 16548 Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}. Old name is retained"); 16549 } 16550 else { 16551 $loose_property_name_of{$alias_standard} 16552 = $standard_property_name; 16553 } 16554 16555 # Now for the re pod entry for this alias. Skip if not 16556 # outputting a pod; skip the first one, which is the 16557 # full name so won't have an entry like: '\p{full: *} 16558 # \p{full: *}', and skip if don't want an entry for 16559 # this one. 16560 next if $i == 0 16561 || ! defined $pod_directory 16562 || ! $alias->make_re_pod_entry; 16563 16564 my $rhs = "\\p{$full_property_name: *}"; 16565 if ($property != $perl && $table->perl_extension) { 16566 $rhs .= ' (Perl extension)'; 16567 } 16568 push @match_properties, 16569 format_pod_line($indent_info_column, 16570 '\p{' . $alias->name . ': *}', 16571 $rhs, 16572 $alias->status); 16573 } 16574 } 16575 16576 # The list of all possible names is attached to each alias, so 16577 # lookup is easy 16578 if (@property_list) { 16579 push @{$prop_aliases{$standard_list[0]}}, @property_list; 16580 } 16581 16582 if ($property->fate <= $MAP_PROXIED) { 16583 16584 # Similarly, we create for Unicode::UCD a list of 16585 # property-value aliases. 16586 16587 my $property_full_name = $property->full_name; 16588 16589 # Look at each table in the property... 16590 foreach my $table ($property->tables) { 16591 my @values_list; 16592 my $table_full_name = $table->full_name; 16593 my $standard_table_full_name 16594 = standardize($table_full_name); 16595 my $table_name = $table->name; 16596 my $standard_table_name = standardize($table_name); 16597 16598 # The list starts with the table name and its full 16599 # name. 16600 push @values_list, $table_name, $table_full_name; 16601 16602 # We add to the table each unique alias that isn't 16603 # discouraged from use. 16604 foreach my $alias ($table->aliases) { 16605 next if $alias->status 16606 && $alias->status eq $DISCOURAGED; 16607 my $name = $alias->name; 16608 my $standard = standardize($name); 16609 next if $standard eq $standard_table_name; 16610 next if $standard eq $standard_table_full_name; 16611 push @values_list, $name; 16612 } 16613 16614 # Here @values_list is a list of all the aliases for 16615 # the table. That is, all the property-values given 16616 # by this table. By agreement with Unicode::UCD, 16617 # if the name and full name are identical, and there 16618 # are no other names, drop the duplcate entry to save 16619 # memory. 16620 if (@values_list == 2 16621 && $values_list[0] eq $values_list[1]) 16622 { 16623 pop @values_list 16624 } 16625 16626 # To save memory, unlike the similar list for property 16627 # aliases above, only the standard forms hve the list. 16628 # This forces an extra step of converting from input 16629 # name to standard name, but the savings are 16630 # considerable. (There is only marginal savings if we 16631 # did this with the property aliases.) 16632 push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list; 16633 } 16634 } 16635 16636 # Don't write out a mapping file if not desired. 16637 next if ! $property->to_output_map; 16638 } 16639 16640 # Here, we know we want to write out the table, but don't do it 16641 # yet because there may be other tables that come along and will 16642 # want to share the file, and the file's comments will change to 16643 # mention them. So save for later. 16644 push @writables, $table; 16645 16646 } # End of looping through the property and all its tables. 16647 } # End of looping through all properties. 16648 16649 # Now have all the tables that will have files written for them. Do it. 16650 foreach my $table (@writables) { 16651 my @directory; 16652 my $filename; 16653 my $property = $table->property; 16654 my $is_property = ($table == $property); 16655 if (! $is_property) { 16656 16657 # Match tables for the property go in lib/$subdirectory, which is 16658 # the property's name. Don't use the standard file name for this, 16659 # as may get an unfamiliar alias 16660 @directory = ($matches_directory, $property->external_name); 16661 } 16662 else { 16663 16664 @directory = $table->directory; 16665 $filename = $table->file; 16666 } 16667 16668 # Use specified filename if available, or default to property's 16669 # shortest name. We need an 8.3 safe filename (which means "an 8 16670 # safe" filename, since after the dot is only 'pl', which is < 3) 16671 # The 2nd parameter is if the filename shouldn't be changed, and 16672 # it shouldn't iff there is a hard-coded name for this table. 16673 $filename = construct_filename( 16674 $filename || $table->external_name, 16675 ! $filename, # mutable if no filename 16676 \@directory); 16677 16678 register_file_for_name($table, \@directory, $filename); 16679 16680 # Only need to write one file when shared by more than one 16681 # property 16682 next if ! $is_property 16683 && ($table->leader != $table || $table->complement != 0); 16684 16685 # Construct a nice comment to add to the file 16686 $table->set_final_comment; 16687 16688 $table->write; 16689 } 16690 16691 16692 # Write out the pod file 16693 make_pod; 16694 16695 # And Heavy.pl, Name.pm, UCD.pl 16696 make_Heavy; 16697 make_Name_pm; 16698 make_UCD; 16699 16700 make_property_test_script() if $make_test_script; 16701 make_normalization_test_script() if $make_norm_test_script; 16702 return; 16703} 16704 16705my @white_space_separators = ( # This used only for making the test script. 16706 "", 16707 ' ', 16708 "\t", 16709 ' ' 16710 ); 16711 16712sub generate_separator($) { 16713 # This used only for making the test script. It generates the colon or 16714 # equal separator between the property and property value, with random 16715 # white space surrounding the separator 16716 16717 my $lhs = shift; 16718 16719 return "" if $lhs eq ""; # No separator if there's only one (the r) side 16720 16721 # Choose space before and after randomly 16722 my $spaces_before =$white_space_separators[rand(@white_space_separators)]; 16723 my $spaces_after = $white_space_separators[rand(@white_space_separators)]; 16724 16725 # And return the whole complex, half the time using a colon, half the 16726 # equals 16727 return $spaces_before 16728 . (rand() < 0.5) ? '=' : ':' 16729 . $spaces_after; 16730} 16731 16732sub generate_tests($$$$$) { 16733 # This used only for making the test script. It generates test cases that 16734 # are expected to compile successfully in perl. Note that the lhs and 16735 # rhs are assumed to already be as randomized as the caller wants. 16736 16737 my $lhs = shift; # The property: what's to the left of the colon 16738 # or equals separator 16739 my $rhs = shift; # The property value; what's to the right 16740 my $valid_code = shift; # A code point that's known to be in the 16741 # table given by lhs=rhs; undef if table is 16742 # empty 16743 my $invalid_code = shift; # A code point known to not be in the table; 16744 # undef if the table is all code points 16745 my $warning = shift; 16746 16747 # Get the colon or equal 16748 my $separator = generate_separator($lhs); 16749 16750 # The whole 'property=value' 16751 my $name = "$lhs$separator$rhs"; 16752 16753 my @output; 16754 # Create a complete set of tests, with complements. 16755 if (defined $valid_code) { 16756 push @output, <<"EOC" 16757Expect(1, $valid_code, '\\p{$name}', $warning); 16758Expect(0, $valid_code, '\\p{^$name}', $warning); 16759Expect(0, $valid_code, '\\P{$name}', $warning); 16760Expect(1, $valid_code, '\\P{^$name}', $warning); 16761EOC 16762 } 16763 if (defined $invalid_code) { 16764 push @output, <<"EOC" 16765Expect(0, $invalid_code, '\\p{$name}', $warning); 16766Expect(1, $invalid_code, '\\p{^$name}', $warning); 16767Expect(1, $invalid_code, '\\P{$name}', $warning); 16768Expect(0, $invalid_code, '\\P{^$name}', $warning); 16769EOC 16770 } 16771 return @output; 16772} 16773 16774sub generate_error($$$) { 16775 # This used only for making the test script. It generates test cases that 16776 # are expected to not only not match, but to be syntax or similar errors 16777 16778 my $lhs = shift; # The property: what's to the left of the 16779 # colon or equals separator 16780 my $rhs = shift; # The property value; what's to the right 16781 my $already_in_error = shift; # Boolean; if true it's known that the 16782 # unmodified lhs and rhs will cause an error. 16783 # This routine should not force another one 16784 # Get the colon or equal 16785 my $separator = generate_separator($lhs); 16786 16787 # Since this is an error only, don't bother to randomly decide whether to 16788 # put the error on the left or right side; and assume that the rhs is 16789 # loosely matched, again for convenience rather than rigor. 16790 $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error; 16791 16792 my $property = $lhs . $separator . $rhs; 16793 16794 return <<"EOC"; 16795Error('\\p{$property}'); 16796Error('\\P{$property}'); 16797EOC 16798} 16799 16800# These are used only for making the test script 16801# XXX Maybe should also have a bad strict seps, which includes underscore. 16802 16803my @good_loose_seps = ( 16804 " ", 16805 "-", 16806 "\t", 16807 "", 16808 "_", 16809 ); 16810my @bad_loose_seps = ( 16811 "/a/", 16812 ':=', 16813 ); 16814 16815sub randomize_stricter_name { 16816 # This used only for making the test script. Take the input name and 16817 # return a randomized, but valid version of it under the stricter matching 16818 # rules. 16819 16820 my $name = shift; 16821 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 16822 16823 # If the name looks like a number (integer, floating, or rational), do 16824 # some extra work 16825 if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) { 16826 my $sign = $1; 16827 my $number = $2; 16828 my $separator = $3; 16829 16830 # If there isn't a sign, part of the time add a plus 16831 # Note: Not testing having any denominator having a minus sign 16832 if (! $sign) { 16833 $sign = '+' if rand() <= .3; 16834 } 16835 16836 # And add 0 or more leading zeros. 16837 $name = $sign . ('0' x int rand(10)) . $number; 16838 16839 if (defined $separator) { 16840 my $extra_zeros = '0' x int rand(10); 16841 16842 if ($separator eq '.') { 16843 16844 # Similarly, add 0 or more trailing zeros after a decimal 16845 # point 16846 $name .= $extra_zeros; 16847 } 16848 else { 16849 16850 # Or, leading zeros before the denominator 16851 $name =~ s,/,/$extra_zeros,; 16852 } 16853 } 16854 } 16855 16856 # For legibility of the test, only change the case of whole sections at a 16857 # time. To do this, first split into sections. The split returns the 16858 # delimiters 16859 my @sections; 16860 for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) { 16861 trace $section if main::DEBUG && $to_trace; 16862 16863 if (length $section > 1 && $section !~ /\D/) { 16864 16865 # If the section is a sequence of digits, about half the time 16866 # randomly add underscores between some of them. 16867 if (rand() > .5) { 16868 16869 # Figure out how many underscores to add. max is 1 less than 16870 # the number of digits. (But add 1 at the end to make sure 16871 # result isn't 0, and compensate earlier by subtracting 2 16872 # instead of 1) 16873 my $num_underscores = int rand(length($section) - 2) + 1; 16874 16875 # And add them evenly throughout, for convenience, not rigor 16876 use integer; 16877 my $spacing = (length($section) - 1)/ $num_underscores; 16878 my $temp = $section; 16879 $section = ""; 16880 for my $i (1 .. $num_underscores) { 16881 $section .= substr($temp, 0, $spacing, "") . '_'; 16882 } 16883 $section .= $temp; 16884 } 16885 push @sections, $section; 16886 } 16887 else { 16888 16889 # Here not a sequence of digits. Change the case of the section 16890 # randomly 16891 my $switch = int rand(4); 16892 if ($switch == 0) { 16893 push @sections, uc $section; 16894 } 16895 elsif ($switch == 1) { 16896 push @sections, lc $section; 16897 } 16898 elsif ($switch == 2) { 16899 push @sections, ucfirst $section; 16900 } 16901 else { 16902 push @sections, $section; 16903 } 16904 } 16905 } 16906 trace "returning", join "", @sections if main::DEBUG && $to_trace; 16907 return join "", @sections; 16908} 16909 16910sub randomize_loose_name($;$) { 16911 # This used only for making the test script 16912 16913 my $name = shift; 16914 my $want_error = shift; # if true, make an error 16915 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 16916 16917 $name = randomize_stricter_name($name); 16918 16919 my @parts; 16920 push @parts, $good_loose_seps[rand(@good_loose_seps)]; 16921 16922 # Preserve trailing ones for the sake of not stripping the underscore from 16923 # 'L_' 16924 for my $part (split /[-\s_]+ (?= . )/, $name) { 16925 if (@parts) { 16926 if ($want_error and rand() < 0.3) { 16927 push @parts, $bad_loose_seps[rand(@bad_loose_seps)]; 16928 $want_error = 0; 16929 } 16930 else { 16931 push @parts, $good_loose_seps[rand(@good_loose_seps)]; 16932 } 16933 } 16934 push @parts, $part; 16935 } 16936 my $new = join("", @parts); 16937 trace "$name => $new" if main::DEBUG && $to_trace; 16938 16939 if ($want_error) { 16940 if (rand() >= 0.5) { 16941 $new .= $bad_loose_seps[rand(@bad_loose_seps)]; 16942 } 16943 else { 16944 $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new; 16945 } 16946 } 16947 return $new; 16948} 16949 16950# Used to make sure don't generate duplicate test cases. 16951my %test_generated; 16952 16953sub make_property_test_script() { 16954 # This used only for making the test script 16955 # this written directly -- it's huge. 16956 16957 print "Making test script\n" if $verbosity >= $PROGRESS; 16958 16959 # This uses randomness to test different possibilities without testing all 16960 # possibilities. To ensure repeatability, set the seed to 0. But if 16961 # tests are added, it will perturb all later ones in the .t file 16962 srand 0; 16963 16964 $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name 16965 16966 # Keep going down an order of magnitude 16967 # until find that adding this quantity to 16968 # 1 remains 1; but put an upper limit on 16969 # this so in case this algorithm doesn't 16970 # work properly on some platform, that we 16971 # won't loop forever. 16972 my $digits = 0; 16973 my $min_floating_slop = 1; 16974 while (1+ $min_floating_slop != 1 16975 && $digits++ < 50) 16976 { 16977 my $next = $min_floating_slop / 10; 16978 last if $next == 0; # If underflows, 16979 # use previous one 16980 $min_floating_slop = $next; 16981 } 16982 16983 # It doesn't matter whether the elements of this array contain single lines 16984 # or multiple lines. main::write doesn't count the lines. 16985 my @output; 16986 16987 # Sort these so get results in same order on different runs of this 16988 # program 16989 foreach my $property (sort { $a->name cmp $b->name } property_ref('*')) { 16990 foreach my $table (sort { $a->name cmp $b->name } $property->tables) { 16991 16992 # Find code points that match, and don't match this table. 16993 my $valid = $table->get_valid_code_point; 16994 my $invalid = $table->get_invalid_code_point; 16995 my $warning = ($table->status eq $DEPRECATED) 16996 ? "'deprecated'" 16997 : '""'; 16998 16999 # Test each possible combination of the property's aliases with 17000 # the table's. If this gets to be too many, could do what is done 17001 # in the set_final_comment() for Tables 17002 my @table_aliases = $table->aliases; 17003 my @property_aliases = $table->property->aliases; 17004 17005 # Every property can be optionally be prefixed by 'Is_', so test 17006 # that those work, by creating such a new alias for each 17007 # pre-existing one. 17008 push @property_aliases, map { Alias->new("Is_" . $_->name, 17009 $_->loose_match, 17010 $_->make_re_pod_entry, 17011 $_->ok_as_filename, 17012 $_->status, 17013 $_->ucd, 17014 ) 17015 } @property_aliases; 17016 my $max = max(scalar @table_aliases, scalar @property_aliases); 17017 for my $j (0 .. $max - 1) { 17018 17019 # The current alias for property is the next one on the list, 17020 # or if beyond the end, start over. Similarly for table 17021 my $property_name 17022 = $property_aliases[$j % @property_aliases]->name; 17023 17024 $property_name = "" if $table->property == $perl; 17025 my $table_alias = $table_aliases[$j % @table_aliases]; 17026 my $table_name = $table_alias->name; 17027 my $loose_match = $table_alias->loose_match; 17028 17029 # If the table doesn't have a file, any test for it is 17030 # already guaranteed to be in error 17031 my $already_error = ! $table->file_path; 17032 17033 # Generate error cases for this alias. 17034 push @output, generate_error($property_name, 17035 $table_name, 17036 $already_error); 17037 17038 # If the table is guaranteed to always generate an error, 17039 # quit now without generating success cases. 17040 next if $already_error; 17041 17042 # Now for the success cases. 17043 my $random; 17044 if ($loose_match) { 17045 17046 # For loose matching, create an extra test case for the 17047 # standard name. 17048 my $standard = standardize($table_name); 17049 17050 # $test_name should be a unique combination for each test 17051 # case; used just to avoid duplicate tests 17052 my $test_name = "$property_name=$standard"; 17053 17054 # Don't output duplicate test cases. 17055 if (! exists $test_generated{$test_name}) { 17056 $test_generated{$test_name} = 1; 17057 push @output, generate_tests($property_name, 17058 $standard, 17059 $valid, 17060 $invalid, 17061 $warning, 17062 ); 17063 } 17064 $random = randomize_loose_name($table_name) 17065 } 17066 else { # Stricter match 17067 $random = randomize_stricter_name($table_name); 17068 } 17069 17070 # Now for the main test case for this alias. 17071 my $test_name = "$property_name=$random"; 17072 if (! exists $test_generated{$test_name}) { 17073 $test_generated{$test_name} = 1; 17074 push @output, generate_tests($property_name, 17075 $random, 17076 $valid, 17077 $invalid, 17078 $warning, 17079 ); 17080 17081 # If the name is a rational number, add tests for the 17082 # floating point equivalent. 17083 if ($table_name =~ qr{/}) { 17084 17085 # Calculate the float, and find just the fraction. 17086 my $float = eval $table_name; 17087 my ($whole, $fraction) 17088 = $float =~ / (.*) \. (.*) /x; 17089 17090 # Starting with one digit after the decimal point, 17091 # create a test for each possible precision (number of 17092 # digits past the decimal point) until well beyond the 17093 # native number found on this machine. (If we started 17094 # with 0 digits, it would be an integer, which could 17095 # well match an unrelated table) 17096 PLACE: 17097 for my $i (1 .. $min_floating_slop + 3) { 17098 my $table_name = sprintf("%.*f", $i, $float); 17099 if ($i < $MIN_FRACTION_LENGTH) { 17100 17101 # If the test case has fewer digits than the 17102 # minimum acceptable precision, it shouldn't 17103 # succeed, so we expect an error for it. 17104 # E.g., 2/3 = .7 at one decimal point, and we 17105 # shouldn't say it matches .7. We should make 17106 # it be .667 at least before agreeing that the 17107 # intent was to match 2/3. But at the 17108 # less-than- acceptable level of precision, it 17109 # might actually match an unrelated number. 17110 # So don't generate a test case if this 17111 # conflating is possible. In our example, we 17112 # don't want 2/3 matching 7/10, if there is 17113 # a 7/10 code point. 17114 for my $existing 17115 (keys %nv_floating_to_rational) 17116 { 17117 next PLACE 17118 if abs($table_name - $existing) 17119 < $MAX_FLOATING_SLOP; 17120 } 17121 push @output, generate_error($property_name, 17122 $table_name, 17123 1 # 1 => already an error 17124 ); 17125 } 17126 else { 17127 17128 # Here the number of digits exceeds the 17129 # minimum we think is needed. So generate a 17130 # success test case for it. 17131 push @output, generate_tests($property_name, 17132 $table_name, 17133 $valid, 17134 $invalid, 17135 $warning, 17136 ); 17137 } 17138 } 17139 } 17140 } 17141 } 17142 } 17143 } 17144 17145 &write($t_path, 17146 0, # Not utf8; 17147 [<DATA>, 17148 @output, 17149 (map {"Test_X('$_');\n"} @backslash_X_tests), 17150 "Finished();\n"]); 17151 return; 17152} 17153 17154sub make_normalization_test_script() { 17155 print "Making normalization test script\n" if $verbosity >= $PROGRESS; 17156 17157 my $n_path = 'TestNorm.pl'; 17158 17159 unshift @normalization_tests, <<'END'; 17160use utf8; 17161use Test::More; 17162 17163sub ord_string { # Convert packed ords to printable string 17164 use charnames (); 17165 return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' } 17166 unpack "U*", shift) . "'"; 17167 #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) . "'"; 17168} 17169 17170sub Test_N { 17171 my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_; 17172 my $display_source = ord_string($source); 17173 my $display_nfc = ord_string($nfc); 17174 my $display_nfd = ord_string($nfd); 17175 my $display_nfkc = ord_string($nfkc); 17176 my $display_nfkd = ord_string($nfkd); 17177 17178 use Unicode::Normalize; 17179 # NFC 17180 # nfc == toNFC(source) == toNFC(nfc) == toNFC(nfd) 17181 # nfkc == toNFC(nfkc) == toNFC(nfkd) 17182 # 17183 # NFD 17184 # nfd == toNFD(source) == toNFD(nfc) == toNFD(nfd) 17185 # nfkd == toNFD(nfkc) == toNFD(nfkd) 17186 # 17187 # NFKC 17188 # nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) == 17189 # toNFKC(nfkc) == toNFKC(nfkd) 17190 # 17191 # NFKD 17192 # nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) == 17193 # toNFKD(nfkc) == toNFKD(nfkd) 17194 17195 is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc"); 17196 is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc"); 17197 is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc"); 17198 is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc"); 17199 is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc"); 17200 17201 is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd"); 17202 is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd"); 17203 is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd"); 17204 is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd"); 17205 is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd"); 17206 17207 is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc"); 17208 is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc"); 17209 is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc"); 17210 is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc"); 17211 is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc"); 17212 17213 is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd"); 17214 is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd"); 17215 is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd"); 17216 is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd"); 17217 is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd"); 17218} 17219END 17220 17221 &write($n_path, 17222 1, # Is utf8; 17223 [ 17224 @normalization_tests, 17225 'done_testing();' 17226 ]); 17227 return; 17228} 17229 17230# This is a list of the input files and how to handle them. The files are 17231# processed in their order in this list. Some reordering is possible if 17232# desired, but the v0 files should be first, and the extracted before the 17233# others except DAge.txt (as data in an extracted file can be over-ridden by 17234# the non-extracted. Some other files depend on data derived from an earlier 17235# file, like UnicodeData requires data from Jamo, and the case changing and 17236# folding requires data from Unicode. Mostly, it is safest to order by first 17237# version releases in (except the Jamo). DAge.txt is read before the 17238# extracted ones because of the rarely used feature $compare_versions. In the 17239# unlikely event that there were ever an extracted file that contained the Age 17240# property information, it would have to go in front of DAge. 17241# 17242# The version strings allow the program to know whether to expect a file or 17243# not, but if a file exists in the directory, it will be processed, even if it 17244# is in a version earlier than expected, so you can copy files from a later 17245# release into an earlier release's directory. 17246my @input_file_objects = ( 17247 Input_file->new('PropertyAliases.txt', v0, 17248 Handler => \&process_PropertyAliases, 17249 ), 17250 Input_file->new(undef, v0, # No file associated with this 17251 Progress_Message => 'Finishing property setup', 17252 Handler => \&finish_property_setup, 17253 ), 17254 Input_file->new('PropValueAliases.txt', v0, 17255 Handler => \&process_PropValueAliases, 17256 Has_Missings_Defaults => $NOT_IGNORED, 17257 ), 17258 Input_file->new('DAge.txt', v3.2.0, 17259 Has_Missings_Defaults => $NOT_IGNORED, 17260 Property => 'Age' 17261 ), 17262 Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0, 17263 Property => 'General_Category', 17264 ), 17265 Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0, 17266 Property => 'Canonical_Combining_Class', 17267 Has_Missings_Defaults => $NOT_IGNORED, 17268 ), 17269 Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0, 17270 Property => 'Numeric_Type', 17271 Has_Missings_Defaults => $NOT_IGNORED, 17272 ), 17273 Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0, 17274 Property => 'East_Asian_Width', 17275 Has_Missings_Defaults => $NOT_IGNORED, 17276 ), 17277 Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0, 17278 Property => 'Line_Break', 17279 Has_Missings_Defaults => $NOT_IGNORED, 17280 ), 17281 Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1, 17282 Property => 'Bidi_Class', 17283 Has_Missings_Defaults => $NOT_IGNORED, 17284 ), 17285 Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0, 17286 Property => 'Decomposition_Type', 17287 Has_Missings_Defaults => $NOT_IGNORED, 17288 ), 17289 Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0), 17290 Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0, 17291 Property => 'Numeric_Value', 17292 Each_Line_Handler => \&filter_numeric_value_line, 17293 Has_Missings_Defaults => $NOT_IGNORED, 17294 ), 17295 Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0, 17296 Property => 'Joining_Group', 17297 Has_Missings_Defaults => $NOT_IGNORED, 17298 ), 17299 17300 Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0, 17301 Property => 'Joining_Type', 17302 Has_Missings_Defaults => $NOT_IGNORED, 17303 ), 17304 Input_file->new('Jamo.txt', v2.0.0, 17305 Property => 'Jamo_Short_Name', 17306 Each_Line_Handler => \&filter_jamo_line, 17307 ), 17308 Input_file->new('UnicodeData.txt', v1.1.5, 17309 Pre_Handler => \&setup_UnicodeData, 17310 17311 # We clean up this file for some early versions. 17312 Each_Line_Handler => [ (($v_version lt v2.0.0 ) 17313 ? \&filter_v1_ucd 17314 : ($v_version eq v2.1.5) 17315 ? \&filter_v2_1_5_ucd 17316 17317 # And for 5.14 Perls with 6.0, 17318 # have to also make changes 17319 : ($v_version ge v6.0.0 17320 && $^V lt v5.17.0) 17321 ? \&filter_v6_ucd 17322 : undef), 17323 17324 # Early versions did not have the 17325 # proper Unicode_1 names for the 17326 # controls 17327 (($v_version lt v3.0.0) 17328 ? \&filter_early_U1_names 17329 : undef), 17330 17331 # Early versions did not correctly 17332 # use the later method for giving 17333 # decimal digit values 17334 (($v_version le v3.2.0) 17335 ? \&filter_bad_Nd_ucd 17336 : undef), 17337 17338 # And the main filter 17339 \&filter_UnicodeData_line, 17340 ], 17341 EOF_Handler => \&EOF_UnicodeData, 17342 ), 17343 Input_file->new('ArabicShaping.txt', v2.0.0, 17344 Each_Line_Handler => 17345 [ ($v_version lt 4.1.0) 17346 ? \&filter_old_style_arabic_shaping 17347 : undef, 17348 \&filter_arabic_shaping_line, 17349 ], 17350 Has_Missings_Defaults => $NOT_IGNORED, 17351 ), 17352 Input_file->new('Blocks.txt', v2.0.0, 17353 Property => 'Block', 17354 Has_Missings_Defaults => $NOT_IGNORED, 17355 Each_Line_Handler => \&filter_blocks_lines 17356 ), 17357 Input_file->new('PropList.txt', v2.0.0, 17358 Each_Line_Handler => (($v_version lt v3.1.0) 17359 ? \&filter_old_style_proplist 17360 : undef), 17361 ), 17362 Input_file->new('Unihan.txt', v2.0.0, 17363 Pre_Handler => \&setup_unihan, 17364 Optional => 1, 17365 Each_Line_Handler => \&filter_unihan_line, 17366 ), 17367 Input_file->new('SpecialCasing.txt', v2.1.8, 17368 Each_Line_Handler => ($v_version eq 2.1.8) 17369 ? \&filter_2_1_8_special_casing_line 17370 : \&filter_special_casing_line, 17371 Pre_Handler => \&setup_special_casing, 17372 Has_Missings_Defaults => $IGNORED, 17373 ), 17374 Input_file->new( 17375 'LineBreak.txt', v3.0.0, 17376 Has_Missings_Defaults => $NOT_IGNORED, 17377 Property => 'Line_Break', 17378 # Early versions had problematic syntax 17379 Each_Line_Handler => (($v_version lt v3.1.0) 17380 ? \&filter_early_ea_lb 17381 : undef), 17382 ), 17383 Input_file->new('EastAsianWidth.txt', v3.0.0, 17384 Property => 'East_Asian_Width', 17385 Has_Missings_Defaults => $NOT_IGNORED, 17386 # Early versions had problematic syntax 17387 Each_Line_Handler => (($v_version lt v3.1.0) 17388 ? \&filter_early_ea_lb 17389 : undef), 17390 ), 17391 Input_file->new('CompositionExclusions.txt', v3.0.0, 17392 Property => 'Composition_Exclusion', 17393 ), 17394 Input_file->new('BidiMirroring.txt', v3.0.1, 17395 Property => 'Bidi_Mirroring_Glyph', 17396 Has_Missings_Defaults => ($v_version lt v6.2.0) 17397 ? $NO_DEFAULTS 17398 # Is <none> which doesn't mean 17399 # anything to us, we will use the 17400 # null string 17401 : $IGNORED, 17402 17403 ), 17404 Input_file->new("NormTest.txt", v3.0.0, 17405 Handler => \&process_NormalizationsTest, 17406 Skip => ($make_norm_test_script) ? 0 : 'Validation Tests', 17407 ), 17408 Input_file->new('CaseFolding.txt', v3.0.1, 17409 Pre_Handler => \&setup_case_folding, 17410 Each_Line_Handler => 17411 [ ($v_version lt v3.1.0) 17412 ? \&filter_old_style_case_folding 17413 : undef, 17414 \&filter_case_folding_line 17415 ], 17416 Has_Missings_Defaults => $IGNORED, 17417 ), 17418 Input_file->new('DCoreProperties.txt', v3.1.0, 17419 # 5.2 changed this file 17420 Has_Missings_Defaults => (($v_version ge v5.2.0) 17421 ? $NOT_IGNORED 17422 : $NO_DEFAULTS), 17423 ), 17424 Input_file->new('Scripts.txt', v3.1.0, 17425 Property => 'Script', 17426 Has_Missings_Defaults => $NOT_IGNORED, 17427 ), 17428 Input_file->new('DNormalizationProps.txt', v3.1.0, 17429 Has_Missings_Defaults => $NOT_IGNORED, 17430 Each_Line_Handler => (($v_version lt v4.0.1) 17431 ? \&filter_old_style_normalization_lines 17432 : undef), 17433 ), 17434 Input_file->new('HangulSyllableType.txt', v0, 17435 Has_Missings_Defaults => $NOT_IGNORED, 17436 Property => 'Hangul_Syllable_Type', 17437 Pre_Handler => ($v_version lt v4.0.0) 17438 ? \&generate_hst 17439 : undef, 17440 ), 17441 Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0, 17442 Property => 'Word_Break', 17443 Has_Missings_Defaults => $NOT_IGNORED, 17444 ), 17445 Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v0, 17446 Property => 'Grapheme_Cluster_Break', 17447 Has_Missings_Defaults => $NOT_IGNORED, 17448 Pre_Handler => ($v_version lt v4.1.0) 17449 ? \&generate_GCB 17450 : undef, 17451 ), 17452 Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0, 17453 Handler => \&process_GCB_test, 17454 ), 17455 Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0, 17456 Skip => 'Validation Tests', 17457 ), 17458 Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0, 17459 Skip => 'Validation Tests', 17460 ), 17461 Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0, 17462 Skip => 'Validation Tests', 17463 ), 17464 Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0, 17465 Property => 'Sentence_Break', 17466 Has_Missings_Defaults => $NOT_IGNORED, 17467 ), 17468 Input_file->new('NamedSequences.txt', v4.1.0, 17469 Handler => \&process_NamedSequences 17470 ), 17471 Input_file->new('NameAliases.txt', v0, 17472 Property => 'Name_Alias', 17473 Pre_Handler => ($v_version le v6.0.0) 17474 ? \&setup_early_name_alias 17475 : undef, 17476 Each_Line_Handler => ($v_version le v6.0.0) 17477 ? \&filter_early_version_name_alias_line 17478 : \&filter_later_version_name_alias_line, 17479 ), 17480 Input_file->new("BidiTest.txt", v5.2.0, 17481 Skip => 'Validation Tests', 17482 ), 17483 Input_file->new('UnihanIndicesDictionary.txt', v5.2.0, 17484 Optional => 1, 17485 Each_Line_Handler => \&filter_unihan_line, 17486 ), 17487 Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0, 17488 Optional => 1, 17489 Each_Line_Handler => \&filter_unihan_line, 17490 ), 17491 Input_file->new('UnihanIRGSources.txt', v5.2.0, 17492 Optional => 1, 17493 Pre_Handler => \&setup_unihan, 17494 Each_Line_Handler => \&filter_unihan_line, 17495 ), 17496 Input_file->new('UnihanNumericValues.txt', v5.2.0, 17497 Optional => 1, 17498 Each_Line_Handler => \&filter_unihan_line, 17499 ), 17500 Input_file->new('UnihanOtherMappings.txt', v5.2.0, 17501 Optional => 1, 17502 Each_Line_Handler => \&filter_unihan_line, 17503 ), 17504 Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0, 17505 Optional => 1, 17506 Each_Line_Handler => \&filter_unihan_line, 17507 ), 17508 Input_file->new('UnihanReadings.txt', v5.2.0, 17509 Optional => 1, 17510 Each_Line_Handler => \&filter_unihan_line, 17511 ), 17512 Input_file->new('UnihanVariants.txt', v5.2.0, 17513 Optional => 1, 17514 Each_Line_Handler => \&filter_unihan_line, 17515 ), 17516 Input_file->new('ScriptExtensions.txt', v6.0.0, 17517 Property => 'Script_Extensions', 17518 Pre_Handler => \&setup_script_extensions, 17519 Each_Line_Handler => \&filter_script_extensions_line, 17520 Has_Missings_Defaults => (($v_version le v6.0.0) 17521 ? $NO_DEFAULTS 17522 : $IGNORED), 17523 ), 17524 # The two Indic files are actually available starting in v6.0.0, but their 17525 # property values are missing from PropValueAliases.txt in that release, 17526 # so that further work would have to be done to get them to work properly 17527 # for that release. 17528 Input_file->new('IndicMatraCategory.txt', v6.1.0, 17529 Property => 'Indic_Matra_Category', 17530 Has_Missings_Defaults => $NOT_IGNORED, 17531 Skip => "Provisional; for the analysis and processing of Indic scripts", 17532 ), 17533 Input_file->new('IndicSyllabicCategory.txt', v6.1.0, 17534 Property => 'Indic_Syllabic_Category', 17535 Has_Missings_Defaults => $NOT_IGNORED, 17536 Skip => "Provisional; for the analysis and processing of Indic scripts", 17537 ), 17538); 17539 17540# End of all the preliminaries. 17541# Do it... 17542 17543if ($compare_versions) { 17544 Carp::my_carp(<<END 17545Warning. \$compare_versions is set. Output is not suitable for production 17546END 17547 ); 17548} 17549 17550# Put into %potential_files a list of all the files in the directory structure 17551# that could be inputs to this program, excluding those that we should ignore. 17552# Use absolute file names because it makes it easier across machine types. 17553my @ignored_files_full_names = map { File::Spec->rel2abs( 17554 internal_file_to_platform($_)) 17555 } keys %ignored_files; 17556File::Find::find({ 17557 wanted=>sub { 17558 return unless /\.txt$/i; # Some platforms change the name's case 17559 my $full = lc(File::Spec->rel2abs($_)); 17560 $potential_files{$full} = 1 17561 if ! grep { $full eq lc($_) } @ignored_files_full_names; 17562 return; 17563 } 17564}, File::Spec->curdir()); 17565 17566my @mktables_list_output_files; 17567my $old_start_time = 0; 17568 17569if (! -e $file_list) { 17570 print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE; 17571 $write_unchanged_files = 1; 17572} elsif ($write_unchanged_files) { 17573 print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE; 17574} 17575else { 17576 print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE; 17577 my $file_handle; 17578 if (! open $file_handle, "<", $file_list) { 17579 Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!"); 17580 $glob_list = 1; 17581 } 17582 else { 17583 my @input; 17584 17585 # Read and parse mktables.lst, placing the results from the first part 17586 # into @input, and the second part into @mktables_list_output_files 17587 for my $list ( \@input, \@mktables_list_output_files ) { 17588 while (<$file_handle>) { 17589 s/^ \s+ | \s+ $//xg; 17590 if (/^ \s* \# .* Autogenerated\ starting\ on\ (\d+)/x) { 17591 $old_start_time = $1; 17592 } 17593 next if /^ \s* (?: \# .* )? $/x; 17594 last if /^ =+ $/x; 17595 my ( $file ) = split /\t/; 17596 push @$list, $file; 17597 } 17598 @$list = uniques(@$list); 17599 next; 17600 } 17601 17602 # Look through all the input files 17603 foreach my $input (@input) { 17604 next if $input eq 'version'; # Already have checked this. 17605 17606 # Ignore if doesn't exist. The checking about whether we care or 17607 # not is done via the Input_file object. 17608 next if ! file_exists($input); 17609 17610 # The paths are stored with relative names, and with '/' as the 17611 # delimiter; convert to absolute on this machine 17612 my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input))); 17613 $potential_files{lc $full} = 1 17614 if ! grep { lc($full) eq lc($_) } @ignored_files_full_names; 17615 } 17616 } 17617 17618 close $file_handle; 17619} 17620 17621if ($glob_list) { 17622 17623 # Here wants to process all .txt files in the directory structure. 17624 # Convert them to full path names. They are stored in the platform's 17625 # relative style 17626 my @known_files; 17627 foreach my $object (@input_file_objects) { 17628 my $file = $object->file; 17629 next unless defined $file; 17630 push @known_files, File::Spec->rel2abs($file); 17631 } 17632 17633 my @unknown_input_files; 17634 foreach my $file (keys %potential_files) { # The keys are stored in lc 17635 next if grep { $file eq lc($_) } @known_files; 17636 17637 # Here, the file is unknown to us. Get relative path name 17638 $file = File::Spec->abs2rel($file); 17639 push @unknown_input_files, $file; 17640 17641 # What will happen is we create a data structure for it, and add it to 17642 # the list of input files to process. First get the subdirectories 17643 # into an array 17644 my (undef, $directories, undef) = File::Spec->splitpath($file); 17645 $directories =~ s;/$;;; # Can have extraneous trailing '/' 17646 my @directories = File::Spec->splitdir($directories); 17647 17648 # If the file isn't extracted (meaning none of the directories is the 17649 # extracted one), just add it to the end of the list of inputs. 17650 if (! grep { $EXTRACTED_DIR eq $_ } @directories) { 17651 push @input_file_objects, Input_file->new($file, v0); 17652 } 17653 else { 17654 17655 # Here, the file is extracted. It needs to go ahead of most other 17656 # processing. Search for the first input file that isn't a 17657 # special required property (that is, find one whose first_release 17658 # is non-0), and isn't extracted. Also, the Age property file is 17659 # processed before the extracted ones, just in case 17660 # $compare_versions is set. 17661 for (my $i = 0; $i < @input_file_objects; $i++) { 17662 if ($input_file_objects[$i]->first_released ne v0 17663 && lc($input_file_objects[$i]->file) ne 'dage.txt' 17664 && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i) 17665 { 17666 splice @input_file_objects, $i, 0, 17667 Input_file->new($file, v0); 17668 last; 17669 } 17670 } 17671 17672 } 17673 } 17674 if (@unknown_input_files) { 17675 print STDERR simple_fold(join_lines(<<END 17676 17677The following files are unknown as to how to handle. Assuming they are 17678typical property files. You'll know by later error messages if it worked or 17679not: 17680END 17681 ) . " " . join(", ", @unknown_input_files) . "\n\n"); 17682 } 17683} # End of looking through directory structure for more .txt files. 17684 17685# Create the list of input files from the objects we have defined, plus 17686# version 17687my @input_files = qw(version Makefile); 17688foreach my $object (@input_file_objects) { 17689 my $file = $object->file; 17690 next if ! defined $file; # Not all objects have files 17691 next if $object->optional && ! -e $file; 17692 push @input_files, $file; 17693} 17694 17695if ( $verbosity >= $VERBOSE ) { 17696 print "Expecting ".scalar( @input_files )." input files. ", 17697 "Checking ".scalar( @mktables_list_output_files )." output files.\n"; 17698} 17699 17700# We set $most_recent to be the most recently changed input file, including 17701# this program itself (done much earlier in this file) 17702foreach my $in (@input_files) { 17703 next unless -e $in; # Keep going even if missing a file 17704 my $mod_time = (stat $in)[9]; 17705 $most_recent = $mod_time if $mod_time > $most_recent; 17706 17707 # See that the input files have distinct names, to warn someone if they 17708 # are adding a new one 17709 if ($make_list) { 17710 my ($volume, $directories, $file ) = File::Spec->splitpath($in); 17711 $directories =~ s;/$;;; # Can have extraneous trailing '/' 17712 my @directories = File::Spec->splitdir($directories); 17713 my $base = $file =~ s/\.txt$//; 17714 construct_filename($file, 'mutable', \@directories); 17715 } 17716} 17717 17718# We use 'Makefile' just to see if it has changed since the last time we 17719# rebuilt. Now discard it. 17720@input_files = grep { $_ ne 'Makefile' } @input_files; 17721 17722my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild 17723 || ! scalar @mktables_list_output_files # or if no outputs known 17724 || $old_start_time < $most_recent; # or out-of-date 17725 17726# Now we check to see if any output files are older than youngest, if 17727# they are, we need to continue on, otherwise we can presumably bail. 17728if (! $rebuild) { 17729 foreach my $out (@mktables_list_output_files) { 17730 if ( ! file_exists($out)) { 17731 print "'$out' is missing.\n" if $verbosity >= $VERBOSE; 17732 $rebuild = 1; 17733 last; 17734 } 17735 #local $to_trace = 1 if main::DEBUG; 17736 trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace; 17737 if ( (stat $out)[9] <= $most_recent ) { 17738 #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace; 17739 print "'$out' is too old.\n" if $verbosity >= $VERBOSE; 17740 $rebuild = 1; 17741 last; 17742 } 17743 } 17744} 17745if (! $rebuild) { 17746 print "Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n"; 17747 exit(0); 17748} 17749print "Must rebuild tables.\n" if $verbosity >= $VERBOSE; 17750 17751# Ready to do the major processing. First create the perl pseudo-property. 17752$perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1); 17753 17754# Process each input file 17755foreach my $file (@input_file_objects) { 17756 $file->run; 17757} 17758 17759# Finish the table generation. 17760 17761print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS; 17762finish_Unicode(); 17763 17764print "Compiling Perl properties\n" if $verbosity >= $PROGRESS; 17765compile_perl(); 17766 17767print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS; 17768add_perl_synonyms(); 17769 17770print "Writing tables\n" if $verbosity >= $PROGRESS; 17771write_all_tables(); 17772 17773# Write mktables.lst 17774if ( $file_list and $make_list ) { 17775 17776 print "Updating '$file_list'\n" if $verbosity >= $PROGRESS; 17777 foreach my $file (@input_files, @files_actually_output) { 17778 my (undef, $directories, $file) = File::Spec->splitpath($file); 17779 my @directories = File::Spec->splitdir($directories); 17780 $file = join '/', @directories, $file; 17781 } 17782 17783 my $ofh; 17784 if (! open $ofh,">",$file_list) { 17785 Carp::my_carp("Can't write to '$file_list'. Skipping: $!"); 17786 return 17787 } 17788 else { 17789 my $localtime = localtime $start_time; 17790 print $ofh <<"END"; 17791# 17792# $file_list -- File list for $0. 17793# 17794# Autogenerated starting on $start_time ($localtime) 17795# 17796# - First section is input files 17797# ($0 itself is not listed but is automatically considered an input) 17798# - Section separator is /^=+\$/ 17799# - Second section is a list of output files. 17800# - Lines matching /^\\s*#/ are treated as comments 17801# which along with blank lines are ignored. 17802# 17803 17804# Input files: 17805 17806END 17807 print $ofh "$_\n" for sort(@input_files); 17808 print $ofh "\n=================================\n# Output files:\n\n"; 17809 print $ofh "$_\n" for sort @files_actually_output; 17810 print $ofh "\n# ",scalar(@input_files)," input files\n", 17811 "# ",scalar(@files_actually_output)+1," output files\n\n", 17812 "# End list\n"; 17813 close $ofh 17814 or Carp::my_carp("Failed to close $ofh: $!"); 17815 17816 print "Filelist has ",scalar(@input_files)," input files and ", 17817 scalar(@files_actually_output)+1," output files\n" 17818 if $verbosity >= $VERBOSE; 17819 } 17820} 17821 17822# Output these warnings unless -q explicitly specified. 17823if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) { 17824 if (@unhandled_properties) { 17825 print "\nProperties and tables that unexpectedly have no code points\n"; 17826 foreach my $property (sort @unhandled_properties) { 17827 print $property, "\n"; 17828 } 17829 } 17830 17831 if (%potential_files) { 17832 print "\nInput files that are not considered:\n"; 17833 foreach my $file (sort keys %potential_files) { 17834 print File::Spec->abs2rel($file), "\n"; 17835 } 17836 } 17837 print "\nAll done\n" if $verbosity >= $VERBOSE; 17838} 17839exit(0); 17840 17841# TRAILING CODE IS USED BY make_property_test_script() 17842__DATA__ 17843 17844use strict; 17845use warnings; 17846 17847# If run outside the normal test suite on an ASCII platform, you can 17848# just create a latin1_to_native() function that just returns its 17849# inputs, because that's the only function used from test.pl 17850require "test.pl"; 17851 17852# Test qr/\X/ and the \p{} regular expression constructs. This file is 17853# constructed by mktables from the tables it generates, so if mktables is 17854# buggy, this won't necessarily catch those bugs. Tests are generated for all 17855# feasible properties; a few aren't currently feasible; see 17856# is_code_point_usable() in mktables for details. 17857 17858# Standard test packages are not used because this manipulates SIG_WARN. It 17859# exits 0 if every non-skipped test succeeded; -1 if any failed. 17860 17861my $Tests = 0; 17862my $Fails = 0; 17863 17864sub Expect($$$$) { 17865 my $expected = shift; 17866 my $ord = shift; 17867 my $regex = shift; 17868 my $warning_type = shift; # Type of warning message, like 'deprecated' 17869 # or empty if none 17870 my $line = (caller)[2]; 17871 $ord = ord(latin1_to_native(chr($ord))); 17872 17873 # Convert the code point to hex form 17874 my $string = sprintf "\"\\x{%04X}\"", $ord; 17875 17876 my @tests = ""; 17877 17878 # The first time through, use all warnings. If the input should generate 17879 # a warning, add another time through with them turned off 17880 push @tests, "no warnings '$warning_type';" if $warning_type; 17881 17882 foreach my $no_warnings (@tests) { 17883 17884 # Store any warning messages instead of outputting them 17885 local $SIG{__WARN__} = $SIG{__WARN__}; 17886 my $warning_message; 17887 $SIG{__WARN__} = sub { $warning_message = $_[0] }; 17888 17889 $Tests++; 17890 17891 # A string eval is needed because of the 'no warnings'. 17892 # Assumes no parens in the regular expression 17893 my $result = eval "$no_warnings 17894 my \$RegObj = qr($regex); 17895 $string =~ \$RegObj ? 1 : 0"; 17896 if (not defined $result) { 17897 print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n"; 17898 $Fails++; 17899 } 17900 elsif ($result ^ $expected) { 17901 print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n"; 17902 $Fails++; 17903 } 17904 elsif ($warning_message) { 17905 if (! $warning_type || ($warning_type && $no_warnings)) { 17906 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n"; 17907 $Fails++; 17908 } 17909 else { 17910 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n"; 17911 } 17912 } 17913 elsif ($warning_type && ! $no_warnings) { 17914 print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n"; 17915 $Fails++; 17916 } 17917 else { 17918 print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n"; 17919 } 17920 } 17921 return; 17922} 17923 17924sub Error($) { 17925 my $regex = shift; 17926 $Tests++; 17927 if (eval { 'x' =~ qr/$regex/; 1 }) { 17928 $Fails++; 17929 my $line = (caller)[2]; 17930 print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n"; 17931 } 17932 else { 17933 my $line = (caller)[2]; 17934 print "ok $Tests - got and expected error for qr/$regex/; line $line\n"; 17935 } 17936 return; 17937} 17938 17939# GCBTest.txt character that separates grapheme clusters 17940my $breakable_utf8 = my $breakable = chr(0xF7); 17941utf8::upgrade($breakable_utf8); 17942 17943# GCBTest.txt character that indicates that the adjoining code points are part 17944# of the same grapheme cluster 17945my $nobreak_utf8 = my $nobreak = chr(0xD7); 17946utf8::upgrade($nobreak_utf8); 17947 17948sub Test_X($) { 17949 # Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt 17950 # Each such line is a sequence of code points given by their hex numbers, 17951 # separated by the two characters defined just before this subroutine that 17952 # indicate that either there can or cannot be a break between the adjacent 17953 # code points. If there isn't a break, that means the sequence forms an 17954 # extended grapheme cluster, which means that \X should match the whole 17955 # thing. If there is a break, \X should stop there. This is all 17956 # converted by this routine into a match: 17957 # $string =~ /(\X)/, 17958 # Each \X should match the next cluster; and that is what is checked. 17959 17960 my $template = shift; 17961 17962 my $line = (caller)[2]; 17963 17964 # The line contains characters above the ASCII range, but in Latin1. It 17965 # may or may not be in utf8, and if it is, it may or may not know it. So, 17966 # convert these characters to 8 bits. If knows is in utf8, simply 17967 # downgrade. 17968 if (utf8::is_utf8($template)) { 17969 utf8::downgrade($template); 17970 } else { 17971 17972 # Otherwise, if it is in utf8, but doesn't know it, the next lines 17973 # convert the two problematic characters to their 8-bit equivalents. 17974 # If it isn't in utf8, they don't harm anything. 17975 use bytes; 17976 $template =~ s/$nobreak_utf8/$nobreak/g; 17977 $template =~ s/$breakable_utf8/$breakable/g; 17978 } 17979 17980 # Get rid of the leading and trailing breakables 17981 $template =~ s/^ \s* $breakable \s* //x; 17982 $template =~ s/ \s* $breakable \s* $ //x; 17983 17984 # And no-breaks become just a space. 17985 $template =~ s/ \s* $nobreak \s* / /xg; 17986 17987 # Split the input into segments that are breakable between them. 17988 my @segments = split /\s*$breakable\s*/, $template; 17989 17990 my $string = ""; 17991 my $display_string = ""; 17992 my @should_match; 17993 my @should_display; 17994 17995 # Convert the code point sequence in each segment into a Perl string of 17996 # characters 17997 foreach my $segment (@segments) { 17998 my @code_points = split /\s+/, $segment; 17999 my $this_string = ""; 18000 my $this_display = ""; 18001 foreach my $code_point (@code_points) { 18002 $this_string .= latin1_to_native(chr(hex $code_point)); 18003 $this_display .= "\\x{$code_point}"; 18004 } 18005 18006 # The next cluster should match the string in this segment. 18007 push @should_match, $this_string; 18008 push @should_display, $this_display; 18009 $string .= $this_string; 18010 $display_string .= $this_display; 18011 } 18012 18013 # If a string can be represented in both non-ut8 and utf8, test both cases 18014 UPGRADE: 18015 for my $to_upgrade (0 .. 1) { 18016 18017 if ($to_upgrade) { 18018 18019 # If already in utf8, would just be a repeat 18020 next UPGRADE if utf8::is_utf8($string); 18021 18022 utf8::upgrade($string); 18023 } 18024 18025 # Finally, do the \X match. 18026 my @matches = $string =~ /(\X)/g; 18027 18028 # Look through each matched cluster to verify that it matches what we 18029 # expect. 18030 my $min = (@matches < @should_match) ? @matches : @should_match; 18031 for my $i (0 .. $min - 1) { 18032 $Tests++; 18033 if ($matches[$i] eq $should_match[$i]) { 18034 print "ok $Tests - "; 18035 if ($i == 0) { 18036 print "In \"$display_string\" =~ /(\\X)/g, \\X #1"; 18037 } else { 18038 print "And \\X #", $i + 1, 18039 } 18040 print " correctly matched $should_display[$i]; line $line\n"; 18041 } else { 18042 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ } 18043 unpack("U*", $matches[$i])); 18044 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #", 18045 $i + 1, 18046 " should have matched $should_display[$i]", 18047 " but instead matched $matches[$i]", 18048 ". Abandoning rest of line $line\n"; 18049 next UPGRADE; 18050 } 18051 } 18052 18053 # And the number of matches should equal the number of expected matches. 18054 $Tests++; 18055 if (@matches == @should_match) { 18056 print "ok $Tests - Nothing was left over; line $line\n"; 18057 } else { 18058 print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n"; 18059 } 18060 } 18061 18062 return; 18063} 18064 18065sub Finished() { 18066 print "1..$Tests\n"; 18067 exit($Fails ? -1 : 0); 18068} 18069 18070Error('\p{Script=InGreek}'); # Bug #69018 18071Test_X("1100 $nobreak 1161"); # Bug #70940 18072Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722 18073Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722 18074Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726 18075