1#!/usr/bin/perl -w 2 3# !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!! 4# Any files created or read by this program should be listed in 'mktables.lst' 5# Use -makelist to regenerate it. 6 7# Needs 'no overloading' to run faster on miniperl. Code commented out at the 8# subroutine objaddr can be used instead to work as far back (untested) as 9# 5.8: needs pack "U". But almost all occurrences of objaddr have been 10# removed in favor of using 'no overloading'. You also would have to go 11# through and replace occurrences like: 12# my $addr = do { no overloading; pack 'J', $self; } 13# with 14# my $addr = main::objaddr $self; 15# (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b 16# that instituted the change to main::objaddr, and subsequent commits that 17# changed 0+$self to pack 'J', $self.) 18 19my $start_time; 20BEGIN { # Get the time the script started running; do it at compilation to 21 # get it as close as possible 22 $start_time= time; 23} 24 25require 5.010_001; 26use strict; 27use warnings; 28use Carp; 29use Config; 30use File::Find; 31use File::Path; 32use File::Spec; 33use Text::Tabs; 34use re "/aa"; 35 36sub DEBUG () { 0 } # Set to 0 for production; 1 for development 37my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/; 38 39########################################################################## 40# 41# mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl), 42# from the Unicode database files (lib/unicore/.../*.txt), It also generates 43# a pod file and a .t file 44# 45# The structure of this file is: 46# First these introductory comments; then 47# code needed for everywhere, such as debugging stuff; then 48# code to handle input parameters; then 49# data structures likely to be of external interest (some of which depend on 50# the input parameters, so follows them; then 51# more data structures and subroutine and package (class) definitions; then 52# the small actual loop to process the input files and finish up; then 53# a __DATA__ section, for the .t tests 54# 55# This program works on all releases of Unicode through at least 6.0. The 56# outputs have been scrutinized most intently for release 5.1. The others 57# have been checked for somewhat more than just sanity. It can handle all 58# existing Unicode character properties in those releases. 59# 60# This program is mostly about Unicode character (or code point) properties. 61# A property describes some attribute or quality of a code point, like if it 62# is lowercase or not, its name, what version of Unicode it was first defined 63# in, or what its uppercase equivalent is. Unicode deals with these disparate 64# possibilities by making all properties into mappings from each code point 65# into some corresponding value. In the case of it being lowercase or not, 66# the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each 67# property maps each Unicode code point to a single value, called a "property 68# value". (Hence each Unicode property is a true mathematical function with 69# exactly one value per code point.) 70# 71# When using a property in a regular expression, what is desired isn't the 72# mapping of the code point to its property's value, but the reverse (or the 73# mathematical "inverse relation"): starting with the property value, "Does a 74# code point map to it?" These are written in a "compound" form: 75# \p{property=value}, e.g., \p{category=punctuation}. This program generates 76# files containing the lists of code points that map to each such regular 77# expression property value, one file per list 78# 79# There is also a single form shortcut that Perl adds for many of the commonly 80# used properties. This happens for all binary properties, plus script, 81# general_category, and block properties. 82# 83# Thus the outputs of this program are files. There are map files, mostly in 84# the 'To' directory; and there are list files for use in regular expression 85# matching, all in subdirectories of the 'lib' directory, with each 86# subdirectory being named for the property that the lists in it are for. 87# Bookkeeping, test, and documentation files are also generated. 88 89my $matches_directory = 'lib'; # Where match (\p{}) files go. 90my $map_directory = 'To'; # Where map files go. 91 92# DATA STRUCTURES 93# 94# The major data structures of this program are Property, of course, but also 95# Table. There are two kinds of tables, very similar to each other. 96# "Match_Table" is the data structure giving the list of code points that have 97# a particular property value, mentioned above. There is also a "Map_Table" 98# data structure which gives the property's mapping from code point to value. 99# There are two structures because the match tables need to be combined in 100# various ways, such as constructing unions, intersections, complements, etc., 101# and the map ones don't. And there would be problems, perhaps subtle, if 102# a map table were inadvertently operated on in some of those ways. 103# The use of separate classes with operations defined on one but not the other 104# prevents accidentally confusing the two. 105# 106# At the heart of each table's data structure is a "Range_List", which is just 107# an ordered list of "Ranges", plus ancillary information, and methods to 108# operate on them. A Range is a compact way to store property information. 109# Each range has a starting code point, an ending code point, and a value that 110# is meant to apply to all the code points between the two end points, 111# inclusive. For a map table, this value is the property value for those 112# code points. Two such ranges could be written like this: 113# 0x41 .. 0x5A, 'Upper', 114# 0x61 .. 0x7A, 'Lower' 115# 116# Each range also has a type used as a convenience to classify the values. 117# Most ranges in this program will be Type 0, or normal, but there are some 118# ranges that have a non-zero type. These are used only in map tables, and 119# are for mappings that don't fit into the normal scheme of things. Mappings 120# that require a hash entry to communicate with utf8.c are one example; 121# another example is mappings for charnames.pm to use which indicate a name 122# that is algorithmically determinable from its code point (and vice-versa). 123# These are used to significantly compact these tables, instead of listing 124# each one of the tens of thousands individually. 125# 126# In a match table, the value of a range is irrelevant (and hence the type as 127# well, which will always be 0), and arbitrarily set to the null string. 128# Using the example above, there would be two match tables for those two 129# entries, one named Upper would contain the 0x41..0x5A range, and the other 130# named Lower would contain 0x61..0x7A. 131# 132# Actually, there are two types of range lists, "Range_Map" is the one 133# associated with map tables, and "Range_List" with match tables. 134# Again, this is so that methods can be defined on one and not the other so as 135# to prevent operating on them in incorrect ways. 136# 137# Eventually, most tables are written out to files to be read by utf8_heavy.pl 138# in the perl core. All tables could in theory be written, but some are 139# suppressed because there is no current practical use for them. It is easy 140# to change which get written by changing various lists that are near the top 141# of the actual code in this file. The table data structures contain enough 142# ancillary information to allow them to be treated as separate entities for 143# writing, such as the path to each one's file. There is a heading in each 144# map table that gives the format of its entries, and what the map is for all 145# the code points missing from it. (This allows tables to be more compact.) 146# 147# The Property data structure contains one or more tables. All properties 148# contain a map table (except the $perl property which is a 149# pseudo-property containing only match tables), and any properties that 150# are usable in regular expression matches also contain various matching 151# tables, one for each value the property can have. A binary property can 152# have two values, True and False (or Y and N, which are preferred by Unicode 153# terminology). Thus each of these properties will have a map table that 154# takes every code point and maps it to Y or N (but having ranges cuts the 155# number of entries in that table way down), and two match tables, one 156# which has a list of all the code points that map to Y, and one for all the 157# code points that map to N. (For each of these, a third table is also 158# generated for the pseudo Perl property. It contains the identical code 159# points as the Y table, but can be written, not in the compound form, but in 160# a "single" form like \p{IsUppercase}.) Many properties are binary, but some 161# properties have several possible values, some have many, and properties like 162# Name have a different value for every named code point. Those will not, 163# unless the controlling lists are changed, have their match tables written 164# out. But all the ones which can be used in regular expression \p{} and \P{} 165# constructs will. Prior to 5.14, generally a property would have either its 166# map table or its match tables written but not both. Again, what gets 167# written is controlled by lists which can easily be changed. Starting in 168# 5.14, advantage was taken of this, and all the map tables needed to 169# reconstruct the Unicode db are now written out, while suppressing the 170# Unicode .txt files that contain the data. Our tables are much more compact 171# than the .txt files, so a significant space savings was achieved. 172 173# Properties have a 'Type', like binary, or string, or enum depending on how 174# many match tables there are and the content of the maps. This 'Type' is 175# different than a range 'Type', so don't get confused by the two concepts 176# having the same name. 177# 178# For information about the Unicode properties, see Unicode's UAX44 document: 179 180my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/'; 181 182# As stated earlier, this program will work on any release of Unicode so far. 183# Most obvious problems in earlier data have NOT been corrected except when 184# necessary to make Perl or this program work reasonably. For example, no 185# folding information was given in early releases, so this program substitutes 186# lower case instead, just so that a regular expression with the /i option 187# will do something that actually gives the right results in many cases. 188# There are also a couple other corrections for version 1.1.5, commented at 189# the point they are made. As an example of corrections that weren't made 190# (but could be) is this statement from DerivedAge.txt: "The supplementary 191# private use code points and the non-character code points were assigned in 192# version 2.0, but not specifically listed in the UCD until versions 3.0 and 193# 3.1 respectively." (To be precise it was 3.0.1 not 3.0.0) More information 194# on Unicode version glitches is further down in these introductory comments. 195# 196# This program works on all non-provisional properties as of 6.0, though the 197# files for some are suppressed from apparent lack of demand for them. You 198# can change which are output by changing lists in this program. 199# 200# The old version of mktables emphasized the term "Fuzzy" to mean Unicode's 201# loose matchings rules (from Unicode TR18): 202# 203# The recommended names for UCD properties and property values are in 204# PropertyAliases.txt [Prop] and PropertyValueAliases.txt 205# [PropValue]. There are both abbreviated names and longer, more 206# descriptive names. It is strongly recommended that both names be 207# recognized, and that loose matching of property names be used, 208# whereby the case distinctions, whitespace, hyphens, and underbar 209# are ignored. 210# The program still allows Fuzzy to override its determination of if loose 211# matching should be used, but it isn't currently used, as it is no longer 212# needed; the calculations it makes are good enough. 213# 214# SUMMARY OF HOW IT WORKS: 215# 216# Process arguments 217# 218# A list is constructed containing each input file that is to be processed 219# 220# Each file on the list is processed in a loop, using the associated handler 221# code for each: 222# The PropertyAliases.txt and PropValueAliases.txt files are processed 223# first. These files name the properties and property values. 224# Objects are created of all the property and property value names 225# that the rest of the input should expect, including all synonyms. 226# The other input files give mappings from properties to property 227# values. That is, they list code points and say what the mapping 228# is under the given property. Some files give the mappings for 229# just one property; and some for many. This program goes through 230# each file and populates the properties from them. Some properties 231# are listed in more than one file, and Unicode has set up a 232# precedence as to which has priority if there is a conflict. Thus 233# the order of processing matters, and this program handles the 234# conflict possibility by processing the overriding input files 235# last, so that if necessary they replace earlier values. 236# After this is all done, the program creates the property mappings not 237# furnished by Unicode, but derivable from what it does give. 238# The tables of code points that match each property value in each 239# property that is accessible by regular expressions are created. 240# The Perl-defined properties are created and populated. Many of these 241# require data determined from the earlier steps 242# Any Perl-defined synonyms are created, and name clashes between Perl 243# and Unicode are reconciled and warned about. 244# All the properties are written to files 245# Any other files are written, and final warnings issued. 246# 247# For clarity, a number of operators have been overloaded to work on tables: 248# ~ means invert (take all characters not in the set). The more 249# conventional '!' is not used because of the possibility of confusing 250# it with the actual boolean operation. 251# + means union 252# - means subtraction 253# & means intersection 254# The precedence of these is the order listed. Parentheses should be 255# copiously used. These are not a general scheme. The operations aren't 256# defined for a number of things, deliberately, to avoid getting into trouble. 257# Operations are done on references and affect the underlying structures, so 258# that the copy constructors for them have been overloaded to not return a new 259# clone, but the input object itself. 260# 261# The bool operator is deliberately not overloaded to avoid confusion with 262# "should it mean if the object merely exists, or also is non-empty?". 263# 264# WHY CERTAIN DESIGN DECISIONS WERE MADE 265# 266# This program needs to be able to run under miniperl. Therefore, it uses a 267# minimum of other modules, and hence implements some things itself that could 268# be gotten from CPAN 269# 270# This program uses inputs published by the Unicode Consortium. These can 271# change incompatibly between releases without the Perl maintainers realizing 272# it. Therefore this program is now designed to try to flag these. It looks 273# at the directories where the inputs are, and flags any unrecognized files. 274# It keeps track of all the properties in the files it handles, and flags any 275# that it doesn't know how to handle. It also flags any input lines that 276# don't match the expected syntax, among other checks. 277# 278# It is also designed so if a new input file matches one of the known 279# templates, one hopefully just needs to add it to a list to have it 280# processed. 281# 282# As mentioned earlier, some properties are given in more than one file. In 283# particular, the files in the extracted directory are supposedly just 284# reformattings of the others. But they contain information not easily 285# derivable from the other files, including results for Unihan, which this 286# program doesn't ordinarily look at, and for unassigned code points. They 287# also have historically had errors or been incomplete. In an attempt to 288# create the best possible data, this program thus processes them first to 289# glean information missing from the other files; then processes those other 290# files to override any errors in the extracted ones. Much of the design was 291# driven by this need to store things and then possibly override them. 292# 293# It tries to keep fatal errors to a minimum, to generate something usable for 294# testing purposes. It always looks for files that could be inputs, and will 295# warn about any that it doesn't know how to handle (the -q option suppresses 296# the warning). 297# 298# Why is there more than one type of range? 299# This simplified things. There are some very specialized code points that 300# have to be handled specially for output, such as Hangul syllable names. 301# By creating a range type (done late in the development process), it 302# allowed this to be stored with the range, and overridden by other input. 303# Originally these were stored in another data structure, and it became a 304# mess trying to decide if a second file that was for the same property was 305# overriding the earlier one or not. 306# 307# Why are there two kinds of tables, match and map? 308# (And there is a base class shared by the two as well.) As stated above, 309# they actually are for different things. Development proceeded much more 310# smoothly when I (khw) realized the distinction. Map tables are used to 311# give the property value for every code point (actually every code point 312# that doesn't map to a default value). Match tables are used for regular 313# expression matches, and are essentially the inverse mapping. Separating 314# the two allows more specialized methods, and error checks so that one 315# can't just take the intersection of two map tables, for example, as that 316# is nonsensical. 317# 318# DEBUGGING 319# 320# This program is written so it will run under miniperl. Occasionally changes 321# will cause an error where the backtrace doesn't work well under miniperl. 322# To diagnose the problem, you can instead run it under regular perl, if you 323# have one compiled. 324# 325# There is a good trace facility. To enable it, first sub DEBUG must be set 326# to return true. Then a line like 327# 328# local $to_trace = 1 if main::DEBUG; 329# 330# can be added to enable tracing in its lexical scope or until you insert 331# another line: 332# 333# local $to_trace = 0 if main::DEBUG; 334# 335# then use a line like "trace $a, @b, %c, ...; 336# 337# Some of the more complex subroutines already have trace statements in them. 338# Permanent trace statements should be like: 339# 340# trace ... if main::DEBUG && $to_trace; 341# 342# If there is just one or a few files that you're debugging, you can easily 343# cause most everything else to be skipped. Change the line 344# 345# my $debug_skip = 0; 346# 347# to 1, and every file whose object is in @input_file_objects and doesn't have 348# a, 'non_skip => 1,' in its constructor will be skipped. 349# 350# To compare the output tables, it may be useful to specify the -annotate 351# flag. This causes the tables to expand so there is one entry for each 352# non-algorithmically named code point giving, currently its name, and its 353# graphic representation if printable (and you have a font that knows about 354# it). This makes it easier to see what the particular code points are in 355# each output table. The tables are usable, but because they don't have 356# ranges (for the most part), a Perl using them will run slower. Non-named 357# code points are annotated with a description of their status, and contiguous 358# ones with the same description will be output as a range rather than 359# individually. Algorithmically named characters are also output as ranges, 360# except when there are just a few contiguous ones. 361# 362# FUTURE ISSUES 363# 364# The program would break if Unicode were to change its names so that 365# interior white space, underscores, or dashes differences were significant 366# within property and property value names. 367# 368# It might be easier to use the xml versions of the UCD if this program ever 369# would need heavy revision, and the ability to handle old versions was not 370# required. 371# 372# There is the potential for name collisions, in that Perl has chosen names 373# that Unicode could decide it also likes. There have been such collisions in 374# the past, with mostly Perl deciding to adopt the Unicode definition of the 375# name. However in the 5.2 Unicode beta testing, there were a number of such 376# collisions, which were withdrawn before the final release, because of Perl's 377# and other's protests. These all involved new properties which began with 378# 'Is'. Based on the protests, Unicode is unlikely to try that again. Also, 379# many of the Perl-defined synonyms, like Any, Word, etc, are listed in a 380# Unicode document, so they are unlikely to be used by Unicode for another 381# purpose. However, they might try something beginning with 'In', or use any 382# of the other Perl-defined properties. This program will warn you of name 383# collisions, and refuse to generate tables with them, but manual intervention 384# will be required in this event. One scheme that could be implemented, if 385# necessary, would be to have this program generate another file, or add a 386# field to mktables.lst that gives the date of first definition of a property. 387# Each new release of Unicode would use that file as a basis for the next 388# iteration. And the Perl synonym addition code could sort based on the age 389# of the property, so older properties get priority, and newer ones that clash 390# would be refused; hence existing code would not be impacted, and some other 391# synonym would have to be used for the new property. This is ugly, and 392# manual intervention would certainly be easier to do in the short run; lets 393# hope it never comes to this. 394# 395# A NOTE ON UNIHAN 396# 397# This program can generate tables from the Unihan database. But it doesn't 398# by default, letting the CPAN module Unicode::Unihan handle them. Prior to 399# version 5.2, this database was in a single file, Unihan.txt. In 5.2 the 400# database was split into 8 different files, all beginning with the letters 401# 'Unihan'. This program will read those file(s) if present, but it needs to 402# know which of the many properties in the file(s) should have tables created 403# for them. It will create tables for any properties listed in 404# PropertyAliases.txt and PropValueAliases.txt, plus any listed in the 405# @cjk_properties array and the @cjk_property_values array. Thus, if a 406# property you want is not in those files of the release you are building 407# against, you must add it to those two arrays. Starting in 4.0, the 408# Unicode_Radical_Stroke was listed in those files, so if the Unihan database 409# is present in the directory, a table will be generated for that property. 410# In 5.2, several more properties were added. For your convenience, the two 411# arrays are initialized with all the 6.0 listed properties that are also in 412# earlier releases. But these are commented out. You can just uncomment the 413# ones you want, or use them as a template for adding entries for other 414# properties. 415# 416# You may need to adjust the entries to suit your purposes. setup_unihan(), 417# and filter_unihan_line() are the functions where this is done. This program 418# already does some adjusting to make the lines look more like the rest of the 419# Unicode DB; You can see what that is in filter_unihan_line() 420# 421# There is a bug in the 3.2 data file in which some values for the 422# kPrimaryNumeric property have commas and an unexpected comment. A filter 423# could be added for these; or for a particular installation, the Unihan.txt 424# file could be edited to fix them. 425# 426# HOW TO ADD A FILE TO BE PROCESSED 427# 428# A new file from Unicode needs to have an object constructed for it in 429# @input_file_objects, probably at the end or at the end of the extracted 430# ones. The program should warn you if its name will clash with others on 431# restrictive file systems, like DOS. If so, figure out a better name, and 432# add lines to the README.perl file giving that. If the file is a character 433# property, it should be in the format that Unicode has by default 434# standardized for such files for the more recently introduced ones. 435# If so, the Input_file constructor for @input_file_objects can just be the 436# file name and release it first appeared in. If not, then it should be 437# possible to construct an each_line_handler() to massage the line into the 438# standardized form. 439# 440# For non-character properties, more code will be needed. You can look at 441# the existing entries for clues. 442# 443# UNICODE VERSIONS NOTES 444# 445# The Unicode UCD has had a number of errors in it over the versions. And 446# these remain, by policy, in the standard for that version. Therefore it is 447# risky to correct them, because code may be expecting the error. So this 448# program doesn't generally make changes, unless the error breaks the Perl 449# core. As an example, some versions of 2.1.x Jamo.txt have the wrong value 450# for U+1105, which causes real problems for the algorithms for Jamo 451# calculations, so it is changed here. 452# 453# But it isn't so clear cut as to what to do about concepts that are 454# introduced in a later release; should they extend back to earlier releases 455# where the concept just didn't exist? It was easier to do this than to not, 456# so that's what was done. For example, the default value for code points not 457# in the files for various properties was probably undefined until changed by 458# some version. No_Block for blocks is such an example. This program will 459# assign No_Block even in Unicode versions that didn't have it. This has the 460# benefit that code being written doesn't have to special case earlier 461# versions; and the detriment that it doesn't match the Standard precisely for 462# the affected versions. 463# 464# Here are some observations about some of the issues in early versions: 465# 466# The number of code points in \p{alpha} halved in 2.1.9. It turns out that 467# the reason is that the CJK block starting at 4E00 was removed from PropList, 468# and was not put back in until 3.1.0 469# 470# Unicode introduced the synonym Space for White_Space in 4.1. Perl has 471# always had a \p{Space}. In release 3.2 only, they are not synonymous. The 472# reason is that 3.2 introduced U+205F=medium math space, which was not 473# classed as white space, but Perl figured out that it should have been. 4.0 474# reclassified it correctly. 475# 476# Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2 477# this was erroneously a synonym for 202. In 4.0, ATB became 202, and ATBL 478# was left with no code points, as all the ones that mapped to 202 stayed 479# mapped to 202. Thus if your program used the numeric name for the class, 480# it would not have been affected, but if it used the mnemonic, it would have 481# been. 482# 483# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that code 484# points which eventually came to have this script property value, instead 485# mapped to "Unknown". But in the next release all these code points were 486# moved to \p{sc=common} instead. 487# 488# The default for missing code points for BidiClass is complicated. Starting 489# in 3.1.1, the derived file DBidiClass.txt handles this, but this program 490# tries to do the best it can for earlier releases. It is done in 491# process_PropertyAliases() 492# 493############################################################################## 494 495my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing 496 # and errors 497my $MAX_LINE_WIDTH = 78; 498 499# Debugging aid to skip most files so as to not be distracted by them when 500# concentrating on the ones being debugged. Add 501# non_skip => 1, 502# to the constructor for those files you want processed when you set this. 503# Files with a first version number of 0 are special: they are always 504# processed regardless of the state of this flag. Generally, Jamo.txt and 505# UnicodeData.txt must not be skipped if you want this program to not die 506# before normal completion. 507my $debug_skip = 0; 508 509# Set to 1 to enable tracing. 510our $to_trace = 0; 511 512{ # Closure for trace: debugging aid 513 my $print_caller = 1; # ? Include calling subroutine name 514 my $main_with_colon = 'main::'; 515 my $main_colon_length = length($main_with_colon); 516 517 sub trace { 518 return unless $to_trace; # Do nothing if global flag not set 519 520 my @input = @_; 521 522 local $DB::trace = 0; 523 $DB::trace = 0; # Quiet 'used only once' message 524 525 my $line_number; 526 527 # Loop looking up the stack to get the first non-trace caller 528 my $caller_line; 529 my $caller_name; 530 my $i = 0; 531 do { 532 $line_number = $caller_line; 533 (my $pkg, my $file, $caller_line, my $caller) = caller $i++; 534 $caller = $main_with_colon unless defined $caller; 535 536 $caller_name = $caller; 537 538 # get rid of pkg 539 $caller_name =~ s/.*:://; 540 if (substr($caller_name, 0, $main_colon_length) 541 eq $main_with_colon) 542 { 543 $caller_name = substr($caller_name, $main_colon_length); 544 } 545 546 } until ($caller_name ne 'trace'); 547 548 # If the stack was empty, we were called from the top level 549 $caller_name = 'main' if ($caller_name eq "" 550 || $caller_name eq 'trace'); 551 552 my $output = ""; 553 foreach my $string (@input) { 554 #print STDERR __LINE__, ": ", join ", ", @input, "\n"; 555 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') { 556 $output .= simple_dumper($string); 557 } 558 else { 559 $string = "$string" if ref $string; 560 $string = $UNDEF unless defined $string; 561 chomp $string; 562 $string = '""' if $string eq ""; 563 $output .= " " if $output ne "" 564 && $string ne "" 565 && substr($output, -1, 1) ne " " 566 && substr($string, 0, 1) ne " "; 567 $output .= $string; 568 } 569 } 570 571 print STDERR sprintf "%4d: ", $line_number if defined $line_number; 572 print STDERR "$caller_name: " if $print_caller; 573 print STDERR $output, "\n"; 574 return; 575 } 576} 577 578# This is for a rarely used development feature that allows you to compare two 579# versions of the Unicode standard without having to deal with changes caused 580# by the code points introduced in the later version. Change the 0 to a 581# string containing a SINGLE dotted Unicode release number (e.g. "2.1"). Only 582# code points introduced in that release and earlier will be used; later ones 583# are thrown away. You use the version number of the earliest one you want to 584# compare; then run this program on directory structures containing each 585# release, and compare the outputs. These outputs will therefore include only 586# the code points common to both releases, and you can see the changes caused 587# just by the underlying release semantic changes. For versions earlier than 588# 3.2, you must copy a version of DAge.txt into the directory. 589my $string_compare_versions = DEBUG && 0; # e.g., "2.1"; 590my $compare_versions = DEBUG 591 && $string_compare_versions 592 && pack "C*", split /\./, $string_compare_versions; 593 594sub uniques { 595 # Returns non-duplicated input values. From "Perl Best Practices: 596 # Encapsulated Cleverness". p. 455 in first edition. 597 598 my %seen; 599 # Arguably this breaks encapsulation, if the goal is to permit multiple 600 # distinct objects to stringify to the same value, and be interchangeable. 601 # However, for this program, no two objects stringify identically, and all 602 # lists passed to this function are either objects or strings. So this 603 # doesn't affect correctness, but it does give a couple of percent speedup. 604 no overloading; 605 return grep { ! $seen{$_}++ } @_; 606} 607 608$0 = File::Spec->canonpath($0); 609 610my $make_test_script = 0; # ? Should we output a test script 611my $write_unchanged_files = 0; # ? Should we update the output files even if 612 # we don't think they have changed 613my $use_directory = ""; # ? Should we chdir somewhere. 614my $pod_directory; # input directory to store the pod file. 615my $pod_file = 'perluniprops'; 616my $t_path; # Path to the .t test file 617my $file_list = 'mktables.lst'; # File to store input and output file names. 618 # This is used to speed up the build, by not 619 # executing the main body of the program if 620 # nothing on the list has changed since the 621 # previous build 622my $make_list = 1; # ? Should we write $file_list. Set to always 623 # make a list so that when the pumpking is 624 # preparing a release, s/he won't have to do 625 # special things 626my $glob_list = 0; # ? Should we try to include unknown .txt files 627 # in the input. 628my $output_range_counts = $debugging_build; # ? Should we include the number 629 # of code points in ranges in 630 # the output 631my $annotate = 0; # ? Should character names be in the output 632 633# Verbosity levels; 0 is quiet 634my $NORMAL_VERBOSITY = 1; 635my $PROGRESS = 2; 636my $VERBOSE = 3; 637 638my $verbosity = $NORMAL_VERBOSITY; 639 640# Process arguments 641while (@ARGV) { 642 my $arg = shift @ARGV; 643 if ($arg eq '-v') { 644 $verbosity = $VERBOSE; 645 } 646 elsif ($arg eq '-p') { 647 $verbosity = $PROGRESS; 648 $| = 1; # Flush buffers as we go. 649 } 650 elsif ($arg eq '-q') { 651 $verbosity = 0; 652 } 653 elsif ($arg eq '-w') { 654 $write_unchanged_files = 1; # update the files even if havent changed 655 } 656 elsif ($arg eq '-check') { 657 my $this = shift @ARGV; 658 my $ok = shift @ARGV; 659 if ($this ne $ok) { 660 print "Skipping as check params are not the same.\n"; 661 exit(0); 662 } 663 } 664 elsif ($arg eq '-P' && defined ($pod_directory = shift)) { 665 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist"; 666 } 667 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift))) 668 { 669 $make_test_script = 1; 670 } 671 elsif ($arg eq '-makelist') { 672 $make_list = 1; 673 } 674 elsif ($arg eq '-C' && defined ($use_directory = shift)) { 675 -d $use_directory or croak "Unknown directory '$use_directory'"; 676 } 677 elsif ($arg eq '-L') { 678 679 # Existence not tested until have chdir'd 680 $file_list = shift; 681 } 682 elsif ($arg eq '-globlist') { 683 $glob_list = 1; 684 } 685 elsif ($arg eq '-c') { 686 $output_range_counts = ! $output_range_counts 687 } 688 elsif ($arg eq '-annotate') { 689 $annotate = 1; 690 $debugging_build = 1; 691 $output_range_counts = 1; 692 } 693 else { 694 my $with_c = 'with'; 695 $with_c .= 'out' if $output_range_counts; # Complements the state 696 croak <<END; 697usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ] 698 [ -T test_file_path ] [-globlist] [-makelist] [-maketest] 699 [-check A B ] 700 -c : Output comments $with_c number of code points in ranges 701 -q : Quiet Mode: Only output serious warnings. 702 -p : Set verbosity level to normal plus show progress. 703 -v : Set Verbosity level high: Show progress and non-serious 704 warnings 705 -w : Write files regardless 706 -C dir : Change to this directory before proceeding. All relative paths 707 except those specified by the -P and -T options will be done 708 with respect to this directory. 709 -P dir : Output $pod_file file to directory 'dir'. 710 -T path : Create a test script as 'path'; overrides -maketest 711 -L filelist : Use alternate 'filelist' instead of standard one 712 -globlist : Take as input all non-Test *.txt files in current and sub 713 directories 714 -maketest : Make test script 'TestProp.pl' in current (or -C directory), 715 overrides -T 716 -makelist : Rewrite the file list $file_list based on current setup 717 -annotate : Output an annotation for each character in the table files; 718 useful for debugging mktables, looking at diffs; but is slow, 719 memory intensive; resulting tables are usable but are slow and 720 very large (and currently fail the Unicode::UCD.t tests). 721 -check A B : Executes $0 only if A and B are the same 722END 723 } 724} 725 726# Stores the most-recently changed file. If none have changed, can skip the 727# build 728my $most_recent = (stat $0)[9]; # Do this before the chdir! 729 730# Change directories now, because need to read 'version' early. 731if ($use_directory) { 732 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) { 733 $pod_directory = File::Spec->rel2abs($pod_directory); 734 } 735 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) { 736 $t_path = File::Spec->rel2abs($t_path); 737 } 738 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!"; 739 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) { 740 $pod_directory = File::Spec->abs2rel($pod_directory); 741 } 742 if ($t_path && File::Spec->file_name_is_absolute($t_path)) { 743 $t_path = File::Spec->abs2rel($t_path); 744 } 745} 746 747# Get Unicode version into regular and v-string. This is done now because 748# various tables below get populated based on it. These tables are populated 749# here to be near the top of the file, and so easily seeable by those needing 750# to modify things. 751open my $VERSION, "<", "version" 752 or croak "$0: can't open required file 'version': $!\n"; 753my $string_version = <$VERSION>; 754close $VERSION; 755chomp $string_version; 756my $v_version = pack "C*", split /\./, $string_version; # v string 757 758# The following are the complete names of properties with property values that 759# are known to not match any code points in some versions of Unicode, but that 760# may change in the future so they should be matchable, hence an empty file is 761# generated for them. 762my @tables_that_may_be_empty = ( 763 'Joining_Type=Left_Joining', 764 ); 765push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1; 766push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0; 767push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana' 768 if $v_version ge v4.1.0; 769push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana' 770 if $v_version ge v6.0.0; 771push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend' 772 if $v_version ge v6.1.0; 773 774# The lists below are hashes, so the key is the item in the list, and the 775# value is the reason why it is in the list. This makes generation of 776# documentation easier. 777 778my %why_suppressed; # No file generated for these. 779 780# Files aren't generated for empty extraneous properties. This is arguable. 781# Extraneous properties generally come about because a property is no longer 782# used in a newer version of Unicode. If we generated a file without code 783# points, programs that used to work on that property will still execute 784# without errors. It just won't ever match (or will always match, with \P{}). 785# This means that the logic is now likely wrong. I (khw) think its better to 786# find this out by getting an error message. Just move them to the table 787# above to change this behavior 788my %why_suppress_if_empty_warn_if_not = ( 789 790 # It is the only property that has ever officially been removed from the 791 # Standard. The database never contained any code points for it. 792 'Special_Case_Condition' => 'Obsolete', 793 794 # Apparently never official, but there were code points in some versions of 795 # old-style PropList.txt 796 'Non_Break' => 'Obsolete', 797); 798 799# These would normally go in the warn table just above, but they were changed 800# a long time before this program was written, so warnings about them are 801# moot. 802if ($v_version gt v3.2.0) { 803 push @tables_that_may_be_empty, 804 'Canonical_Combining_Class=Attached_Below_Left' 805} 806 807# These are listed in the Property aliases file in 6.0, but Unihan is ignored 808# unless explicitly added. 809if ($v_version ge v5.2.0) { 810 my $unihan = 'Unihan; remove from list if using Unihan'; 811 foreach my $table (qw ( 812 kAccountingNumeric 813 kOtherNumeric 814 kPrimaryNumeric 815 kCompatibilityVariant 816 kIICore 817 kIRG_GSource 818 kIRG_HSource 819 kIRG_JSource 820 kIRG_KPSource 821 kIRG_MSource 822 kIRG_KSource 823 kIRG_TSource 824 kIRG_USource 825 kIRG_VSource 826 kRSUnicode 827 )) 828 { 829 $why_suppress_if_empty_warn_if_not{$table} = $unihan; 830 } 831} 832 833# Enum values for to_output_map() method in the Map_Table package. 834my $EXTERNAL_MAP = 1; 835my $INTERNAL_MAP = 2; 836my $OUTPUT_ADJUSTED = 3; 837 838# To override computed values for writing the map tables for these properties. 839# The default for enum map tables is to write them out, so that the Unicode 840# .txt files can be removed, but all the data to compute any property value 841# for any code point is available in a more compact form. 842my %global_to_output_map = ( 843 # Needed by UCD.pm, but don't want to publicize that it exists, so won't 844 # get stuck supporting it if things change. Since it is a STRING 845 # property, it normally would be listed in the pod, but INTERNAL_MAP 846 # suppresses that. 847 Unicode_1_Name => $INTERNAL_MAP, 848 849 Present_In => 0, # Suppress, as easily computed from Age 850 Block => 0, # Suppress, as Blocks.txt is retained. 851 852 # Suppress, as mapping can be found instead from the 853 # Perl_Decomposition_Mapping file 854 Decomposition_Type => 0, 855); 856 857# Properties that this program ignores. 858my @unimplemented_properties; 859 860# With this release, it is automatically handled if the Unihan db is 861# downloaded 862push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version le v5.2.0; 863 864# There are several types of obsolete properties defined by Unicode. These 865# must be hand-edited for every new Unicode release. 866my %why_deprecated; # Generates a deprecated warning message if used. 867my %why_stabilized; # Documentation only 868my %why_obsolete; # Documentation only 869 870{ # Closure 871 my $simple = 'Perl uses the more complete version of this property'; 872 my $unihan = 'Unihan properties are by default not enabled in the Perl core. Instead use CPAN: Unicode::Unihan'; 873 874 my $other_properties = 'other properties'; 875 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone"; 876 my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character."; 877 878 %why_deprecated = ( 879 'Grapheme_Link' => 'Deprecated by Unicode: Duplicates ccc=vr (Canonical_Combining_Class=Virama)', 880 'Jamo_Short_Name' => $contributory, 881 'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking', 882 'Other_Alphabetic' => $contributory, 883 'Other_Default_Ignorable_Code_Point' => $contributory, 884 'Other_Grapheme_Extend' => $contributory, 885 'Other_ID_Continue' => $contributory, 886 'Other_ID_Start' => $contributory, 887 'Other_Lowercase' => $contributory, 888 'Other_Math' => $contributory, 889 'Other_Uppercase' => $contributory, 890 'Expands_On_NFC' => $why_no_expand, 891 'Expands_On_NFD' => $why_no_expand, 892 'Expands_On_NFKC' => $why_no_expand, 893 'Expands_On_NFKD' => $why_no_expand, 894 ); 895 896 %why_suppressed = ( 897 # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which 898 # contains the same information, but without the algorithmically 899 # determinable Hangul syllables'. This file is not published, so it's 900 # existence is not noted in the comment. 901 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()', 902 903 'Indic_Matra_Category' => "Provisional", 904 'Indic_Syllabic_Category' => "Provisional", 905 906 # Don't suppress ISO_Comment, as otherwise special handling is needed 907 # to differentiate between it and gc=c, which can be written as 'isc', 908 # which is the same characters as ISO_Comment's short name. 909 910 'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()", 911 912 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()", 913 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()", 914 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()", 915 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()", 916 917 FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful', 918 ); 919 920 foreach my $property ( 921 922 # The following are suppressed because they were made contributory 923 # or deprecated by Unicode before Perl ever thought about 924 # supporting them. 925 'Jamo_Short_Name', 926 'Grapheme_Link', 927 'Expands_On_NFC', 928 'Expands_On_NFD', 929 'Expands_On_NFKC', 930 'Expands_On_NFKD', 931 932 # The following are suppressed because they have been marked 933 # as deprecated for a sufficient amount of time 934 'Other_Alphabetic', 935 'Other_Default_Ignorable_Code_Point', 936 'Other_Grapheme_Extend', 937 'Other_ID_Continue', 938 'Other_ID_Start', 939 'Other_Lowercase', 940 'Other_Math', 941 'Other_Uppercase', 942 ) { 943 $why_suppressed{$property} = $why_deprecated{$property}; 944 } 945 946 # Customize the message for all the 'Other_' properties 947 foreach my $property (keys %why_deprecated) { 948 next if (my $main_property = $property) !~ s/^Other_//; 949 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/; 950 } 951} 952 953if ($v_version ge 4.0.0) { 954 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14'; 955 if ($v_version ge 6.0.0) { 956 $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14'; 957 } 958} 959if ($v_version ge 5.2.0 && $v_version lt 6.0.0) { 960 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed'; 961 if ($v_version ge 6.0.0) { 962 $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed'; 963 } 964} 965 966# Probably obsolete forever 967if ($v_version ge v4.1.0) { 968 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common".'; 969} 970if ($v_version ge v6.0.0) { 971 $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana (or both)"'; 972 $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"'; 973} 974 975# This program can create files for enumerated-like properties, such as 976# 'Numeric_Type'. This file would be the same format as for a string 977# property, with a mapping from code point to its value, so you could look up, 978# for example, the script a code point is in. But no one so far wants this 979# mapping, or they have found another way to get it since this is a new 980# feature. So no file is generated except if it is in this list. 981my @output_mapped_properties = split "\n", <<END; 982END 983 984# If you are using the Unihan database in a Unicode version before 5.2, you 985# need to add the properties that you want to extract from it to this table. 986# For your convenience, the properties in the 6.0 PropertyAliases.txt file are 987# listed, commented out 988my @cjk_properties = split "\n", <<'END'; 989#cjkAccountingNumeric; kAccountingNumeric 990#cjkOtherNumeric; kOtherNumeric 991#cjkPrimaryNumeric; kPrimaryNumeric 992#cjkCompatibilityVariant; kCompatibilityVariant 993#cjkIICore ; kIICore 994#cjkIRG_GSource; kIRG_GSource 995#cjkIRG_HSource; kIRG_HSource 996#cjkIRG_JSource; kIRG_JSource 997#cjkIRG_KPSource; kIRG_KPSource 998#cjkIRG_KSource; kIRG_KSource 999#cjkIRG_TSource; kIRG_TSource 1000#cjkIRG_USource; kIRG_USource 1001#cjkIRG_VSource; kIRG_VSource 1002#cjkRSUnicode; kRSUnicode ; Unicode_Radical_Stroke; URS 1003END 1004 1005# Similarly for the property values. For your convenience, the lines in the 1006# 6.0 PropertyAliases.txt file are listed. Just remove the first BUT NOT both 1007# '#' marks (for Unicode versions before 5.2) 1008my @cjk_property_values = split "\n", <<'END'; 1009## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN 1010## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point> 1011## @missing: 0000..10FFFF; cjkIICore; <none> 1012## @missing: 0000..10FFFF; cjkIRG_GSource; <none> 1013## @missing: 0000..10FFFF; cjkIRG_HSource; <none> 1014## @missing: 0000..10FFFF; cjkIRG_JSource; <none> 1015## @missing: 0000..10FFFF; cjkIRG_KPSource; <none> 1016## @missing: 0000..10FFFF; cjkIRG_KSource; <none> 1017## @missing: 0000..10FFFF; cjkIRG_TSource; <none> 1018## @missing: 0000..10FFFF; cjkIRG_USource; <none> 1019## @missing: 0000..10FFFF; cjkIRG_VSource; <none> 1020## @missing: 0000..10FFFF; cjkOtherNumeric; NaN 1021## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN 1022## @missing: 0000..10FFFF; cjkRSUnicode; <none> 1023END 1024 1025# The input files don't list every code point. Those not listed are to be 1026# defaulted to some value. Below are hard-coded what those values are for 1027# non-binary properties as of 5.1. Starting in 5.0, there are 1028# machine-parsable comment lines in the files the give the defaults; so this 1029# list shouldn't have to be extended. The claim is that all missing entries 1030# for binary properties will default to 'N'. Unicode tried to change that in 1031# 5.2, but the beta period produced enough protest that they backed off. 1032# 1033# The defaults for the fields that appear in UnicodeData.txt in this hash must 1034# be in the form that it expects. The others may be synonyms. 1035my $CODE_POINT = '<code point>'; 1036my %default_mapping = ( 1037 Age => "Unassigned", 1038 # Bidi_Class => Complicated; set in code 1039 Bidi_Mirroring_Glyph => "", 1040 Block => 'No_Block', 1041 Canonical_Combining_Class => 0, 1042 Case_Folding => $CODE_POINT, 1043 Decomposition_Mapping => $CODE_POINT, 1044 Decomposition_Type => 'None', 1045 East_Asian_Width => "Neutral", 1046 FC_NFKC_Closure => $CODE_POINT, 1047 General_Category => 'Cn', 1048 Grapheme_Cluster_Break => 'Other', 1049 Hangul_Syllable_Type => 'NA', 1050 ISO_Comment => "", 1051 Jamo_Short_Name => "", 1052 Joining_Group => "No_Joining_Group", 1053 # Joining_Type => Complicated; set in code 1054 kIICore => 'N', # Is converted to binary 1055 #Line_Break => Complicated; set in code 1056 Lowercase_Mapping => $CODE_POINT, 1057 Name => "", 1058 Name_Alias => "", 1059 NFC_QC => 'Yes', 1060 NFD_QC => 'Yes', 1061 NFKC_QC => 'Yes', 1062 NFKD_QC => 'Yes', 1063 Numeric_Type => 'None', 1064 Numeric_Value => 'NaN', 1065 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown', 1066 Sentence_Break => 'Other', 1067 Simple_Case_Folding => $CODE_POINT, 1068 Simple_Lowercase_Mapping => $CODE_POINT, 1069 Simple_Titlecase_Mapping => $CODE_POINT, 1070 Simple_Uppercase_Mapping => $CODE_POINT, 1071 Titlecase_Mapping => $CODE_POINT, 1072 Unicode_1_Name => "", 1073 Unicode_Radical_Stroke => "", 1074 Uppercase_Mapping => $CODE_POINT, 1075 Word_Break => 'Other', 1076); 1077 1078# Below are files that Unicode furnishes, but this program ignores, and why 1079my %ignored_files = ( 1080 'CJKRadicals.txt' => 'Maps the kRSUnicode property values to corresponding code points', 1081 'Index.txt' => 'Alphabetical index of Unicode characters', 1082 'NamedSqProv.txt' => 'Named sequences proposed for inclusion in a later version of the Unicode Standard; if you need them now, you can append this file to F<NamedSequences.txt> and recompile perl', 1083 'NamesList.txt' => 'Annotated list of characters', 1084 'NormalizationCorrections.txt' => 'Documentation of corrections already incorporated into the Unicode data base', 1085 'Props.txt' => 'Only in very early releases; is a subset of F<PropList.txt> (which is used instead)', 1086 'ReadMe.txt' => 'Documentation', 1087 'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized. This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate Unicode data base L<http://www.unicode.org/ivd>', 1088 'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values', 1089 'auxiliary/WordBreakTest.html' => 'Documentation of validation tests', 1090 'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests', 1091 'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests', 1092 'auxiliary/LineBreakTest.html' => 'Documentation of validation tests', 1093); 1094 1095my %skipped_files; # List of files that we skip 1096 1097### End of externally interesting definitions, except for @input_file_objects 1098 1099my $HEADER=<<"EOF"; 1100# !!!!!!! DO NOT EDIT THIS FILE !!!!!!! 1101# This file is machine-generated by $0 from the Unicode 1102# database, Version $string_version. Any changes made here will be lost! 1103EOF 1104 1105my $INTERNAL_ONLY_HEADER = <<"EOF"; 1106 1107# !!!!!!! INTERNAL PERL USE ONLY !!!!!!! 1108# This file is for internal use by core Perl only. The format and even the 1109# name or existence of this file are subject to change without notice. Don't 1110# use it directly. 1111EOF 1112 1113my $DEVELOPMENT_ONLY=<<"EOF"; 1114# !!!!!!! DEVELOPMENT USE ONLY !!!!!!! 1115# This file contains information artificially constrained to code points 1116# present in Unicode release $string_compare_versions. 1117# IT CANNOT BE RELIED ON. It is for use during development only and should 1118# not be used for production. 1119 1120EOF 1121 1122my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF"; 1123my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING; 1124my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1; 1125 1126# Matches legal code point. 4-6 hex numbers, If there are 6, the first 1127# two must be 10; if there are 5, the first must not be a 0. Written this way 1128# to decrease backtracking. The first regex allows the code point to be at 1129# the end of a word, but to work properly, the word shouldn't end with a valid 1130# hex character. The second one won't match a code point at the end of a 1131# word, and doesn't have the run-on issue 1132my $run_on_code_point_re = 1133 qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x; 1134my $code_point_re = qr/\b$run_on_code_point_re/; 1135 1136# This matches the beginning of the line in the Unicode db files that give the 1137# defaults for code points not listed (i.e., missing) in the file. The code 1138# depends on this ending with a semi-colon, so it can assume it is a valid 1139# field when the line is split() by semi-colons 1140my $missing_defaults_prefix = 1141 qr/^#\s+\@missing:\s+0000\.\.$MAX_UNICODE_CODEPOINT_STRING\s*;/; 1142 1143# Property types. Unicode has more types, but these are sufficient for our 1144# purposes. 1145my $UNKNOWN = -1; # initialized to illegal value 1146my $NON_STRING = 1; # Either binary or enum 1147my $BINARY = 2; 1148my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal 1149 # tables, additional true and false tables are 1150 # generated so that false is anything matching the 1151 # default value, and true is everything else. 1152my $ENUM = 4; # Include catalog 1153my $STRING = 5; # Anything else: string or misc 1154 1155# Some input files have lines that give default values for code points not 1156# contained in the file. Sometimes these should be ignored. 1157my $NO_DEFAULTS = 0; # Must evaluate to false 1158my $NOT_IGNORED = 1; 1159my $IGNORED = 2; 1160 1161# Range types. Each range has a type. Most ranges are type 0, for normal, 1162# and will appear in the main body of the tables in the output files, but 1163# there are other types of ranges as well, listed below, that are specially 1164# handled. There are pseudo-types as well that will never be stored as a 1165# type, but will affect the calculation of the type. 1166 1167# 0 is for normal, non-specials 1168my $MULTI_CP = 1; # Sequence of more than code point 1169my $HANGUL_SYLLABLE = 2; 1170my $CP_IN_NAME = 3; # The NAME contains the code point appended to it. 1171my $NULL = 4; # The map is to the null string; utf8.c can't 1172 # handle these, nor is there an accepted syntax 1173 # for them in \p{} constructs 1174my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would 1175 # otherwise be $MULTI_CP type are instead type 0 1176 1177# process_generic_property_file() can accept certain overrides in its input. 1178# Each of these must begin AND end with $CMD_DELIM. 1179my $CMD_DELIM = "\a"; 1180my $REPLACE_CMD = 'replace'; # Override the Replace 1181my $MAP_TYPE_CMD = 'map_type'; # Override the Type 1182 1183my $NO = 0; 1184my $YES = 1; 1185 1186# Values for the Replace argument to add_range. 1187# $NO # Don't replace; add only the code points not 1188 # already present. 1189my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in 1190 # the comments at the subroutine definition. 1191my $UNCONDITIONALLY = 2; # Replace without conditions. 1192my $MULTIPLE_BEFORE = 4; # Don't replace, but add a duplicate record if 1193 # already there 1194my $MULTIPLE_AFTER = 5; # Don't replace, but add a duplicate record if 1195 # already there 1196my $CROAK = 6; # Die with an error if is already there 1197 1198# Flags to give property statuses. The phrases are to remind maintainers that 1199# if the flag is changed, the indefinite article referring to it in the 1200# documentation may need to be as well. 1201my $NORMAL = ""; 1202my $DEPRECATED = 'D'; 1203my $a_bold_deprecated = "a 'B<$DEPRECATED>'"; 1204my $A_bold_deprecated = "A 'B<$DEPRECATED>'"; 1205my $DISCOURAGED = 'X'; 1206my $a_bold_discouraged = "an 'B<$DISCOURAGED>'"; 1207my $A_bold_discouraged = "An 'B<$DISCOURAGED>'"; 1208my $STRICTER = 'T'; 1209my $a_bold_stricter = "a 'B<$STRICTER>'"; 1210my $A_bold_stricter = "A 'B<$STRICTER>'"; 1211my $STABILIZED = 'S'; 1212my $a_bold_stabilized = "an 'B<$STABILIZED>'"; 1213my $A_bold_stabilized = "An 'B<$STABILIZED>'"; 1214my $OBSOLETE = 'O'; 1215my $a_bold_obsolete = "an 'B<$OBSOLETE>'"; 1216my $A_bold_obsolete = "An 'B<$OBSOLETE>'"; 1217 1218my %status_past_participles = ( 1219 $DISCOURAGED => 'discouraged', 1220 $STABILIZED => 'stabilized', 1221 $OBSOLETE => 'obsolete', 1222 $DEPRECATED => 'deprecated', 1223); 1224 1225# Table fates. These are somewhat ordered, so that fates < $MAP_PROXIED should be 1226# externally documented. 1227my $ORDINARY = 0; # The normal fate. 1228my $MAP_PROXIED = 1; # The map table for the property isn't written out, 1229 # but there is a file written that can be used to 1230 # reconstruct this table 1231my $SUPPRESSED = 3; # The file for this table is not written out. 1232my $INTERNAL_ONLY = 4; # The file for this table is written out, but it is 1233 # for Perl's internal use only 1234my $PLACEHOLDER = 5; # A property that is defined as a placeholder in a 1235 # Unicode version that doesn't have it, but we need it 1236 # to be defined, if empty, to have things work. 1237 # Implies no pod entry generated 1238 1239# The format of the values of the tables: 1240my $EMPTY_FORMAT = ""; 1241my $BINARY_FORMAT = 'b'; 1242my $DECIMAL_FORMAT = 'd'; 1243my $FLOAT_FORMAT = 'f'; 1244my $INTEGER_FORMAT = 'i'; 1245my $HEX_FORMAT = 'x'; 1246my $RATIONAL_FORMAT = 'r'; 1247my $STRING_FORMAT = 's'; 1248my $ADJUST_FORMAT = 'a'; 1249my $DECOMP_STRING_FORMAT = 'c'; 1250my $STRING_WHITE_SPACE_LIST = 'sw'; 1251 1252my %map_table_formats = ( 1253 $BINARY_FORMAT => 'binary', 1254 $DECIMAL_FORMAT => 'single decimal digit', 1255 $FLOAT_FORMAT => 'floating point number', 1256 $INTEGER_FORMAT => 'integer', 1257 $HEX_FORMAT => 'non-negative hex whole number; a code point', 1258 $RATIONAL_FORMAT => 'rational: an integer or a fraction', 1259 $STRING_FORMAT => 'string', 1260 $ADJUST_FORMAT => 'some entries need adjustment', 1261 $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping', 1262 $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators' 1263); 1264 1265# Unicode didn't put such derived files in a separate directory at first. 1266my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : ""; 1267my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : ""; 1268my $AUXILIARY = 'auxiliary'; 1269 1270# Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl 1271# and into UCD.pl for the use of UCD.pm 1272my %loose_to_file_of; # loosely maps table names to their respective 1273 # files 1274my %stricter_to_file_of; # same; but for stricter mapping. 1275my %loose_property_to_file_of; # Maps a loose property name to its map file 1276my %file_to_swash_name; # Maps the file name to its corresponding key name 1277 # in the hash %utf8::SwashInfo 1278my %nv_floating_to_rational; # maps numeric values floating point numbers to 1279 # their rational equivalent 1280my %loose_property_name_of; # Loosely maps (non_string) property names to 1281 # standard form 1282my %string_property_loose_to_name; # Same, for string properties. 1283my %loose_defaults; # keys are of form "prop=value", where 'prop' is 1284 # the property name in standard loose form, and 1285 # 'value' is the default value for that property, 1286 # also in standard loose form. 1287my %loose_to_standard_value; # loosely maps table names to the canonical 1288 # alias for them 1289my %ambiguous_names; # keys are alias names (in standard form) that 1290 # have more than one possible meaning. 1291my %prop_aliases; # Keys are standard property name; values are each 1292 # one's aliases 1293my %prop_value_aliases; # Keys of top level are standard property name; 1294 # values are keys to another hash, Each one is 1295 # one of the property's values, in standard form. 1296 # The values are that prop-val's aliases. 1297my %ucd_pod; # Holds entries that will go into the UCD section of the pod 1298 1299# Most properties are immune to caseless matching, otherwise you would get 1300# nonsensical results, as properties are a function of a code point, not 1301# everything that is caselessly equivalent to that code point. For example, 1302# Changes_When_Case_Folded('s') should be false, whereas caselessly it would 1303# be true because 's' and 'S' are equivalent caselessly. However, 1304# traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we 1305# extend that concept to those very few properties that are like this. Each 1306# such property will match the full range caselessly. They are hard-coded in 1307# the program; it's not worth trying to make it general as it's extremely 1308# unlikely that they will ever change. 1309my %caseless_equivalent_to; 1310 1311# These constants names and values were taken from the Unicode standard, 1312# version 5.1, section 3.12. They are used in conjunction with Hangul 1313# syllables. The '_string' versions are so generated tables can retain the 1314# hex format, which is the more familiar value 1315my $SBase_string = "0xAC00"; 1316my $SBase = CORE::hex $SBase_string; 1317my $LBase_string = "0x1100"; 1318my $LBase = CORE::hex $LBase_string; 1319my $VBase_string = "0x1161"; 1320my $VBase = CORE::hex $VBase_string; 1321my $TBase_string = "0x11A7"; 1322my $TBase = CORE::hex $TBase_string; 1323my $SCount = 11172; 1324my $LCount = 19; 1325my $VCount = 21; 1326my $TCount = 28; 1327my $NCount = $VCount * $TCount; 1328 1329# For Hangul syllables; These store the numbers from Jamo.txt in conjunction 1330# with the above published constants. 1331my %Jamo; 1332my %Jamo_L; # Leading consonants 1333my %Jamo_V; # Vowels 1334my %Jamo_T; # Trailing consonants 1335 1336# For code points whose name contains its ordinal as a '-ABCD' suffix. 1337# The key is the base name of the code point, and the value is an 1338# array giving all the ranges that use this base name. Each range 1339# is actually a hash giving the 'low' and 'high' values of it. 1340my %names_ending_in_code_point; 1341my %loose_names_ending_in_code_point; # Same as above, but has blanks, dashes 1342 # removed from the names 1343# Inverse mapping. The list of ranges that have these kinds of 1344# names. Each element contains the low, high, and base names in an 1345# anonymous hash. 1346my @code_points_ending_in_code_point; 1347 1348# Boolean: does this Unicode version have the hangul syllables, and are we 1349# writing out a table for them? 1350my $has_hangul_syllables = 0; 1351 1352# Does this Unicode version have code points whose names end in their 1353# respective code points, and are we writing out a table for them? 0 for no; 1354# otherwise points to first property that a table is needed for them, so that 1355# if multiple tables are needed, we don't create duplicates 1356my $needing_code_points_ending_in_code_point = 0; 1357 1358my @backslash_X_tests; # List of tests read in for testing \X 1359my @unhandled_properties; # Will contain a list of properties found in 1360 # the input that we didn't process. 1361my @match_properties; # Properties that have match tables, to be 1362 # listed in the pod 1363my @map_properties; # Properties that get map files written 1364my @named_sequences; # NamedSequences.txt contents. 1365my %potential_files; # Generated list of all .txt files in the directory 1366 # structure so we can warn if something is being 1367 # ignored. 1368my @files_actually_output; # List of files we generated. 1369my @more_Names; # Some code point names are compound; this is used 1370 # to store the extra components of them. 1371my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at 1372 # the minimum before we consider it equivalent to a 1373 # candidate rational 1374my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms 1375 1376# These store references to certain commonly used property objects 1377my $gc; 1378my $perl; 1379my $block; 1380my $perl_charname; 1381my $print; 1382my $Any; 1383my $script; 1384 1385# Are there conflicting names because of beginning with 'In_', or 'Is_' 1386my $has_In_conflicts = 0; 1387my $has_Is_conflicts = 0; 1388 1389sub internal_file_to_platform ($) { 1390 # Convert our file paths which have '/' separators to those of the 1391 # platform. 1392 1393 my $file = shift; 1394 return undef unless defined $file; 1395 1396 return File::Spec->join(split '/', $file); 1397} 1398 1399sub file_exists ($) { # platform independent '-e'. This program internally 1400 # uses slash as a path separator. 1401 my $file = shift; 1402 return 0 if ! defined $file; 1403 return -e internal_file_to_platform($file); 1404} 1405 1406sub objaddr($) { 1407 # Returns the address of the blessed input object. 1408 # It doesn't check for blessedness because that would do a string eval 1409 # every call, and the program is structured so that this is never called 1410 # for a non-blessed object. 1411 1412 no overloading; # If overloaded, numifying below won't work. 1413 1414 # Numifying a ref gives its address. 1415 return pack 'J', $_[0]; 1416} 1417 1418# These are used only if $annotate is true. 1419# The entire range of Unicode characters is examined to populate these 1420# after all the input has been processed. But most can be skipped, as they 1421# have the same descriptive phrases, such as being unassigned 1422my @viacode; # Contains the 1 million character names 1423my @printable; # boolean: And are those characters printable? 1424my @annotate_char_type; # Contains a type of those characters, specifically 1425 # for the purposes of annotation. 1426my $annotate_ranges; # A map of ranges of code points that have the same 1427 # name for the purposes of annotation. They map to the 1428 # upper edge of the range, so that the end point can 1429 # be immediately found. This is used to skip ahead to 1430 # the end of a range, and avoid processing each 1431 # individual code point in it. 1432my $unassigned_sans_noncharacters; # A Range_List of the unassigned 1433 # characters, but excluding those which are 1434 # also noncharacter code points 1435 1436# The annotation types are an extension of the regular range types, though 1437# some of the latter are folded into one. Make the new types negative to 1438# avoid conflicting with the regular types 1439my $SURROGATE_TYPE = -1; 1440my $UNASSIGNED_TYPE = -2; 1441my $PRIVATE_USE_TYPE = -3; 1442my $NONCHARACTER_TYPE = -4; 1443my $CONTROL_TYPE = -5; 1444my $UNKNOWN_TYPE = -6; # Used only if there is a bug in this program 1445 1446sub populate_char_info ($) { 1447 # Used only with the $annotate option. Populates the arrays with the 1448 # input code point's info that are needed for outputting more detailed 1449 # comments. If calling context wants a return, it is the end point of 1450 # any contiguous range of characters that share essentially the same info 1451 1452 my $i = shift; 1453 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 1454 1455 $viacode[$i] = $perl_charname->value_of($i) || ""; 1456 1457 # A character is generally printable if Unicode says it is, 1458 # but below we make sure that most Unicode general category 'C' types 1459 # aren't. 1460 $printable[$i] = $print->contains($i); 1461 1462 $annotate_char_type[$i] = $perl_charname->type_of($i) || 0; 1463 1464 # Only these two regular types are treated specially for annotations 1465 # purposes 1466 $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME 1467 && $annotate_char_type[$i] != $HANGUL_SYLLABLE; 1468 1469 # Give a generic name to all code points that don't have a real name. 1470 # We output ranges, if applicable, for these. Also calculate the end 1471 # point of the range. 1472 my $end; 1473 if (! $viacode[$i]) { 1474 if ($gc-> table('Surrogate')->contains($i)) { 1475 $viacode[$i] = 'Surrogate'; 1476 $annotate_char_type[$i] = $SURROGATE_TYPE; 1477 $printable[$i] = 0; 1478 $end = $gc->table('Surrogate')->containing_range($i)->end; 1479 } 1480 elsif ($gc-> table('Private_use')->contains($i)) { 1481 $viacode[$i] = 'Private Use'; 1482 $annotate_char_type[$i] = $PRIVATE_USE_TYPE; 1483 $printable[$i] = 0; 1484 $end = $gc->table('Private_Use')->containing_range($i)->end; 1485 } 1486 elsif (Property::property_ref('Noncharacter_Code_Point')-> table('Y')-> 1487 contains($i)) 1488 { 1489 $viacode[$i] = 'Noncharacter'; 1490 $annotate_char_type[$i] = $NONCHARACTER_TYPE; 1491 $printable[$i] = 0; 1492 $end = property_ref('Noncharacter_Code_Point')->table('Y')-> 1493 containing_range($i)->end; 1494 } 1495 elsif ($gc-> table('Control')->contains($i)) { 1496 $viacode[$i] = 'Control'; 1497 $annotate_char_type[$i] = $CONTROL_TYPE; 1498 $printable[$i] = 0; 1499 $end = 0x81 if $i == 0x80; # Hard-code this one known case 1500 } 1501 elsif ($gc-> table('Unassigned')->contains($i)) { 1502 $viacode[$i] = 'Unassigned, block=' . $block-> value_of($i); 1503 $annotate_char_type[$i] = $UNASSIGNED_TYPE; 1504 $printable[$i] = 0; 1505 1506 # Because we name the unassigned by the blocks they are in, it 1507 # can't go past the end of that block, and it also can't go past 1508 # the unassigned range it is in. The special table makes sure 1509 # that the non-characters, which are unassigned, are separated 1510 # out. 1511 $end = min($block->containing_range($i)->end, 1512 $unassigned_sans_noncharacters-> containing_range($i)-> 1513 end); 1514 } 1515 else { 1516 Carp::my_carp_bug("Can't figure out how to annotate " 1517 . sprintf("U+%04X", $i) 1518 . ". Proceeding anyway."); 1519 $viacode[$i] = 'UNKNOWN'; 1520 $annotate_char_type[$i] = $UNKNOWN_TYPE; 1521 $printable[$i] = 0; 1522 } 1523 } 1524 1525 # Here, has a name, but if it's one in which the code point number is 1526 # appended to the name, do that. 1527 elsif ($annotate_char_type[$i] == $CP_IN_NAME) { 1528 $viacode[$i] .= sprintf("-%04X", $i); 1529 $end = $perl_charname->containing_range($i)->end; 1530 } 1531 1532 # And here, has a name, but if it's a hangul syllable one, replace it with 1533 # the correct name from the Unicode algorithm 1534 elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) { 1535 use integer; 1536 my $SIndex = $i - $SBase; 1537 my $L = $LBase + $SIndex / $NCount; 1538 my $V = $VBase + ($SIndex % $NCount) / $TCount; 1539 my $T = $TBase + $SIndex % $TCount; 1540 $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}"; 1541 $viacode[$i] .= $Jamo{$T} if $T != $TBase; 1542 $end = $perl_charname->containing_range($i)->end; 1543 } 1544 1545 return if ! defined wantarray; 1546 return $i if ! defined $end; # If not a range, return the input 1547 1548 # Save this whole range so can find the end point quickly 1549 $annotate_ranges->add_map($i, $end, $end); 1550 1551 return $end; 1552} 1553 1554# Commented code below should work on Perl 5.8. 1555## This 'require' doesn't necessarily work in miniperl, and even if it does, 1556## the native perl version of it (which is what would operate under miniperl) 1557## is extremely slow, as it does a string eval every call. 1558#my $has_fast_scalar_util = $ !~ /miniperl/ 1559# && defined eval "require Scalar::Util"; 1560# 1561#sub objaddr($) { 1562# # Returns the address of the blessed input object. Uses the XS version if 1563# # available. It doesn't check for blessedness because that would do a 1564# # string eval every call, and the program is structured so that this is 1565# # never called for a non-blessed object. 1566# 1567# return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util; 1568# 1569# # Check at least that is a ref. 1570# my $pkg = ref($_[0]) or return undef; 1571# 1572# # Change to a fake package to defeat any overloaded stringify 1573# bless $_[0], 'main::Fake'; 1574# 1575# # Numifying a ref gives its address. 1576# my $addr = pack 'J', $_[0]; 1577# 1578# # Return to original class 1579# bless $_[0], $pkg; 1580# return $addr; 1581#} 1582 1583sub max ($$) { 1584 my $a = shift; 1585 my $b = shift; 1586 return $a if $a >= $b; 1587 return $b; 1588} 1589 1590sub min ($$) { 1591 my $a = shift; 1592 my $b = shift; 1593 return $a if $a <= $b; 1594 return $b; 1595} 1596 1597sub clarify_number ($) { 1598 # This returns the input number with underscores inserted every 3 digits 1599 # in large (5 digits or more) numbers. Input must be entirely digits, not 1600 # checked. 1601 1602 my $number = shift; 1603 my $pos = length($number) - 3; 1604 return $number if $pos <= 1; 1605 while ($pos > 0) { 1606 substr($number, $pos, 0) = '_'; 1607 $pos -= 3; 1608 } 1609 return $number; 1610} 1611 1612 1613package Carp; 1614 1615# These routines give a uniform treatment of messages in this program. They 1616# are placed in the Carp package to cause the stack trace to not include them, 1617# although an alternative would be to use another package and set @CARP_NOT 1618# for it. 1619 1620our $Verbose = 1 if main::DEBUG; # Useful info when debugging 1621 1622# This is a work-around suggested by Nicholas Clark to fix a problem with Carp 1623# and overload trying to load Scalar:Util under miniperl. See 1624# http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html 1625undef $overload::VERSION; 1626 1627sub my_carp { 1628 my $message = shift || ""; 1629 my $nofold = shift || 0; 1630 1631 if ($message) { 1632 $message = main::join_lines($message); 1633 $message =~ s/^$0: *//; # Remove initial program name 1634 $message =~ s/[.;,]+$//; # Remove certain ending punctuation 1635 $message = "\n$0: $message;"; 1636 1637 # Fold the message with program name, semi-colon end punctuation 1638 # (which looks good with the message that carp appends to it), and a 1639 # hanging indent for continuation lines. 1640 $message = main::simple_fold($message, "", 4) unless $nofold; 1641 $message =~ s/\n$//; # Remove the trailing nl so what carp 1642 # appends is to the same line 1643 } 1644 1645 return $message if defined wantarray; # If a caller just wants the msg 1646 1647 carp $message; 1648 return; 1649} 1650 1651sub my_carp_bug { 1652 # This is called when it is clear that the problem is caused by a bug in 1653 # this program. 1654 1655 my $message = shift; 1656 $message =~ s/^$0: *//; 1657 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message"); 1658 carp $message; 1659 return; 1660} 1661 1662sub carp_too_few_args { 1663 if (@_ != 2) { 1664 my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'. No action taken."); 1665 return; 1666 } 1667 1668 my $args_ref = shift; 1669 my $count = shift; 1670 1671 my_carp_bug("Need at least $count arguments to " 1672 . (caller 1)[3] 1673 . ". Instead got: '" 1674 . join ', ', @$args_ref 1675 . "'. No action taken."); 1676 return; 1677} 1678 1679sub carp_extra_args { 1680 my $args_ref = shift; 1681 my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . "); Extras ignored.") if @_; 1682 1683 unless (ref $args_ref) { 1684 my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments."); 1685 return; 1686 } 1687 my ($package, $file, $line) = caller; 1688 my $subroutine = (caller 1)[3]; 1689 1690 my $list; 1691 if (ref $args_ref eq 'HASH') { 1692 foreach my $key (keys %$args_ref) { 1693 $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key}; 1694 } 1695 $list = join ', ', each %{$args_ref}; 1696 } 1697 elsif (ref $args_ref eq 'ARRAY') { 1698 foreach my $arg (@$args_ref) { 1699 $arg = $UNDEF unless defined $arg; 1700 } 1701 $list = join ', ', @$args_ref; 1702 } 1703 else { 1704 my_carp_bug("Can't cope with ref " 1705 . ref($args_ref) 1706 . " . argument to 'carp_extra_args'. Not checking arguments."); 1707 return; 1708 } 1709 1710 my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped."); 1711 return; 1712} 1713 1714package main; 1715 1716{ # Closure 1717 1718 # This program uses the inside-out method for objects, as recommended in 1719 # "Perl Best Practices". This closure aids in generating those. There 1720 # are two routines. setup_package() is called once per package to set 1721 # things up, and then set_access() is called for each hash representing a 1722 # field in the object. These routines arrange for the object to be 1723 # properly destroyed when no longer used, and for standard accessor 1724 # functions to be generated. If you need more complex accessors, just 1725 # write your own and leave those accesses out of the call to set_access(). 1726 # More details below. 1727 1728 my %constructor_fields; # fields that are to be used in constructors; see 1729 # below 1730 1731 # The values of this hash will be the package names as keys to other 1732 # hashes containing the name of each field in the package as keys, and 1733 # references to their respective hashes as values. 1734 my %package_fields; 1735 1736 sub setup_package { 1737 # Sets up the package, creating standard DESTROY and dump methods 1738 # (unless already defined). The dump method is used in debugging by 1739 # simple_dumper(). 1740 # The optional parameters are: 1741 # a) a reference to a hash, that gets populated by later 1742 # set_access() calls with one of the accesses being 1743 # 'constructor'. The caller can then refer to this, but it is 1744 # not otherwise used by these two routines. 1745 # b) a reference to a callback routine to call during destruction 1746 # of the object, before any fields are actually destroyed 1747 1748 my %args = @_; 1749 my $constructor_ref = delete $args{'Constructor_Fields'}; 1750 my $destroy_callback = delete $args{'Destroy_Callback'}; 1751 Carp::carp_extra_args(\@_) if main::DEBUG && %args; 1752 1753 my %fields; 1754 my $package = (caller)[0]; 1755 1756 $package_fields{$package} = \%fields; 1757 $constructor_fields{$package} = $constructor_ref; 1758 1759 unless ($package->can('DESTROY')) { 1760 my $destroy_name = "${package}::DESTROY"; 1761 no strict "refs"; 1762 1763 # Use typeglob to give the anonymous subroutine the name we want 1764 *$destroy_name = sub { 1765 my $self = shift; 1766 my $addr = do { no overloading; pack 'J', $self; }; 1767 1768 $self->$destroy_callback if $destroy_callback; 1769 foreach my $field (keys %{$package_fields{$package}}) { 1770 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n"; 1771 delete $package_fields{$package}{$field}{$addr}; 1772 } 1773 return; 1774 } 1775 } 1776 1777 unless ($package->can('dump')) { 1778 my $dump_name = "${package}::dump"; 1779 no strict "refs"; 1780 *$dump_name = sub { 1781 my $self = shift; 1782 return dump_inside_out($self, $package_fields{$package}, @_); 1783 } 1784 } 1785 return; 1786 } 1787 1788 sub set_access { 1789 # Arrange for the input field to be garbage collected when no longer 1790 # needed. Also, creates standard accessor functions for the field 1791 # based on the optional parameters-- none if none of these parameters: 1792 # 'addable' creates an 'add_NAME()' accessor function. 1793 # 'readable' or 'readable_array' creates a 'NAME()' accessor 1794 # function. 1795 # 'settable' creates a 'set_NAME()' accessor function. 1796 # 'constructor' doesn't create an accessor function, but adds the 1797 # field to the hash that was previously passed to 1798 # setup_package(); 1799 # Any of the accesses can be abbreviated down, so that 'a', 'ad', 1800 # 'add' etc. all mean 'addable'. 1801 # The read accessor function will work on both array and scalar 1802 # values. If another accessor in the parameter list is 'a', the read 1803 # access assumes an array. You can also force it to be array access 1804 # by specifying 'readable_array' instead of 'readable' 1805 # 1806 # A sort-of 'protected' access can be set-up by preceding the addable, 1807 # readable or settable with some initial portion of 'protected_' (but, 1808 # the underscore is required), like 'p_a', 'pro_set', etc. The 1809 # "protection" is only by convention. All that happens is that the 1810 # accessor functions' names begin with an underscore. So instead of 1811 # calling set_foo, the call is _set_foo. (Real protection could be 1812 # accomplished by having a new subroutine, end_package, called at the 1813 # end of each package, and then storing the __LINE__ ranges and 1814 # checking them on every accessor. But that is way overkill.) 1815 1816 # We create anonymous subroutines as the accessors and then use 1817 # typeglobs to assign them to the proper package and name 1818 1819 my $name = shift; # Name of the field 1820 my $field = shift; # Reference to the inside-out hash containing the 1821 # field 1822 1823 my $package = (caller)[0]; 1824 1825 if (! exists $package_fields{$package}) { 1826 croak "$0: Must call 'setup_package' before 'set_access'"; 1827 } 1828 1829 # Stash the field so DESTROY can get it. 1830 $package_fields{$package}{$name} = $field; 1831 1832 # Remaining arguments are the accessors. For each... 1833 foreach my $access (@_) { 1834 my $access = lc $access; 1835 1836 my $protected = ""; 1837 1838 # Match the input as far as it goes. 1839 if ($access =~ /^(p[^_]*)_/) { 1840 $protected = $1; 1841 if (substr('protected_', 0, length $protected) 1842 eq $protected) 1843 { 1844 1845 # Add 1 for the underscore not included in $protected 1846 $access = substr($access, length($protected) + 1); 1847 $protected = '_'; 1848 } 1849 else { 1850 $protected = ""; 1851 } 1852 } 1853 1854 if (substr('addable', 0, length $access) eq $access) { 1855 my $subname = "${package}::${protected}add_$name"; 1856 no strict "refs"; 1857 1858 # add_ accessor. Don't add if already there, which we 1859 # determine using 'eq' for scalars and '==' otherwise. 1860 *$subname = sub { 1861 use strict "refs"; 1862 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2; 1863 my $self = shift; 1864 my $value = shift; 1865 my $addr = do { no overloading; pack 'J', $self; }; 1866 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 1867 if (ref $value) { 1868 return if grep { $value == $_ } @{$field->{$addr}}; 1869 } 1870 else { 1871 return if grep { $value eq $_ } @{$field->{$addr}}; 1872 } 1873 push @{$field->{$addr}}, $value; 1874 return; 1875 } 1876 } 1877 elsif (substr('constructor', 0, length $access) eq $access) { 1878 if ($protected) { 1879 Carp::my_carp_bug("Can't set-up 'protected' constructors") 1880 } 1881 else { 1882 $constructor_fields{$package}{$name} = $field; 1883 } 1884 } 1885 elsif (substr('readable_array', 0, length $access) eq $access) { 1886 1887 # Here has read access. If one of the other parameters for 1888 # access is array, or this one specifies array (by being more 1889 # than just 'readable_'), then create a subroutine that 1890 # assumes the data is an array. Otherwise just a scalar 1891 my $subname = "${package}::${protected}$name"; 1892 if (grep { /^a/i } @_ 1893 or length($access) > length('readable_')) 1894 { 1895 no strict "refs"; 1896 *$subname = sub { 1897 use strict "refs"; 1898 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1; 1899 my $addr = do { no overloading; pack 'J', $_[0]; }; 1900 if (ref $field->{$addr} ne 'ARRAY') { 1901 my $type = ref $field->{$addr}; 1902 $type = 'scalar' unless $type; 1903 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems."); 1904 return; 1905 } 1906 return scalar @{$field->{$addr}} unless wantarray; 1907 1908 # Make a copy; had problems with caller modifying the 1909 # original otherwise 1910 my @return = @{$field->{$addr}}; 1911 return @return; 1912 } 1913 } 1914 else { 1915 1916 # Here not an array value, a simpler function. 1917 no strict "refs"; 1918 *$subname = sub { 1919 use strict "refs"; 1920 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1; 1921 no overloading; 1922 return $field->{pack 'J', $_[0]}; 1923 } 1924 } 1925 } 1926 elsif (substr('settable', 0, length $access) eq $access) { 1927 my $subname = "${package}::${protected}set_$name"; 1928 no strict "refs"; 1929 *$subname = sub { 1930 use strict "refs"; 1931 if (main::DEBUG) { 1932 return Carp::carp_too_few_args(\@_, 2) if @_ < 2; 1933 Carp::carp_extra_args(\@_) if @_ > 2; 1934 } 1935 # $self is $_[0]; $value is $_[1] 1936 no overloading; 1937 $field->{pack 'J', $_[0]} = $_[1]; 1938 return; 1939 } 1940 } 1941 else { 1942 Carp::my_carp_bug("Unknown accessor type $access. No accessor set."); 1943 } 1944 } 1945 return; 1946 } 1947} 1948 1949package Input_file; 1950 1951# All input files use this object, which stores various attributes about them, 1952# and provides for convenient, uniform handling. The run method wraps the 1953# processing. It handles all the bookkeeping of opening, reading, and closing 1954# the file, returning only significant input lines. 1955# 1956# Each object gets a handler which processes the body of the file, and is 1957# called by run(). Most should use the generic, default handler, which has 1958# code scrubbed to handle things you might not expect. A handler should 1959# basically be a while(next_line()) {...} loop. 1960# 1961# You can also set up handlers to 1962# 1) call before the first line is read for pre processing 1963# 2) call to adjust each line of the input before the main handler gets them 1964# 3) call upon EOF before the main handler exits its loop 1965# 4) call at the end for post processing 1966# 1967# $_ is used to store the input line, and is to be filtered by the 1968# each_line_handler()s. So, if the format of the line is not in the desired 1969# format for the main handler, these are used to do that adjusting. They can 1970# be stacked (by enclosing them in an [ anonymous array ] in the constructor, 1971# so the $_ output of one is used as the input to the next. None of the other 1972# handlers are stackable, but could easily be changed to be so. 1973# 1974# Most of the handlers can call insert_lines() or insert_adjusted_lines() 1975# which insert the parameters as lines to be processed before the next input 1976# file line is read. This allows the EOF handler to flush buffers, for 1977# example. The difference between the two routines is that the lines inserted 1978# by insert_lines() are subjected to the each_line_handler()s. (So if you 1979# called it from such a handler, you would get infinite recursion.) Lines 1980# inserted by insert_adjusted_lines() go directly to the main handler without 1981# any adjustments. If the post-processing handler calls any of these, there 1982# will be no effect. Some error checking for these conditions could be added, 1983# but it hasn't been done. 1984# 1985# carp_bad_line() should be called to warn of bad input lines, which clears $_ 1986# to prevent further processing of the line. This routine will output the 1987# message as a warning once, and then keep a count of the lines that have the 1988# same message, and output that count at the end of the file's processing. 1989# This keeps the number of messages down to a manageable amount. 1990# 1991# get_missings() should be called to retrieve any @missing input lines. 1992# Messages will be raised if this isn't done if the options aren't to ignore 1993# missings. 1994 1995sub trace { return main::trace(@_); } 1996 1997{ # Closure 1998 # Keep track of fields that are to be put into the constructor. 1999 my %constructor_fields; 2000 2001 main::setup_package(Constructor_Fields => \%constructor_fields); 2002 2003 my %file; # Input file name, required 2004 main::set_access('file', \%file, qw{ c r }); 2005 2006 my %first_released; # Unicode version file was first released in, required 2007 main::set_access('first_released', \%first_released, qw{ c r }); 2008 2009 my %handler; # Subroutine to process the input file, defaults to 2010 # 'process_generic_property_file' 2011 main::set_access('handler', \%handler, qw{ c }); 2012 2013 my %property; 2014 # name of property this file is for. defaults to none, meaning not 2015 # applicable, or is otherwise determinable, for example, from each line. 2016 main::set_access('property', \%property, qw{ c }); 2017 2018 my %optional; 2019 # If this is true, the file is optional. If not present, no warning is 2020 # output. If it is present, the string given by this parameter is 2021 # evaluated, and if false the file is not processed. 2022 main::set_access('optional', \%optional, 'c', 'r'); 2023 2024 my %non_skip; 2025 # This is used for debugging, to skip processing of all but a few input 2026 # files. Add 'non_skip => 1' to the constructor for those files you want 2027 # processed when you set the $debug_skip global. 2028 main::set_access('non_skip', \%non_skip, 'c'); 2029 2030 my %skip; 2031 # This is used to skip processing of this input file semi-permanently, 2032 # when it evaluates to true. The value should be the reason the file is 2033 # being skipped. It is used for files that we aren't planning to process 2034 # anytime soon, but want to allow to be in the directory and not raise a 2035 # message that we are not handling. Mostly for test files. This is in 2036 # contrast to the non_skip element, which is supposed to be used very 2037 # temporarily for debugging. Sets 'optional' to 1. Also, files that we 2038 # pretty much will never look at can be placed in the global 2039 # %ignored_files instead. Ones used here will be added to %skipped files 2040 main::set_access('skip', \%skip, 'c'); 2041 2042 my %each_line_handler; 2043 # list of subroutines to look at and filter each non-comment line in the 2044 # file. defaults to none. The subroutines are called in order, each is 2045 # to adjust $_ for the next one, and the final one adjusts it for 2046 # 'handler' 2047 main::set_access('each_line_handler', \%each_line_handler, 'c'); 2048 2049 my %has_missings_defaults; 2050 # ? Are there lines in the file giving default values for code points 2051 # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is 2052 # the norm, but IGNORED means it has such lines, but the handler doesn't 2053 # use them. Having these three states allows us to catch changes to the 2054 # UCD that this program should track 2055 main::set_access('has_missings_defaults', 2056 \%has_missings_defaults, qw{ c r }); 2057 2058 my %pre_handler; 2059 # Subroutine to call before doing anything else in the file. If undef, no 2060 # such handler is called. 2061 main::set_access('pre_handler', \%pre_handler, qw{ c }); 2062 2063 my %eof_handler; 2064 # Subroutine to call upon getting an EOF on the input file, but before 2065 # that is returned to the main handler. This is to allow buffers to be 2066 # flushed. The handler is expected to call insert_lines() or 2067 # insert_adjusted() with the buffered material 2068 main::set_access('eof_handler', \%eof_handler, qw{ c r }); 2069 2070 my %post_handler; 2071 # Subroutine to call after all the lines of the file are read in and 2072 # processed. If undef, no such handler is called. 2073 main::set_access('post_handler', \%post_handler, qw{ c }); 2074 2075 my %progress_message; 2076 # Message to print to display progress in lieu of the standard one 2077 main::set_access('progress_message', \%progress_message, qw{ c }); 2078 2079 my %handle; 2080 # cache open file handle, internal. Is undef if file hasn't been 2081 # processed at all, empty if has; 2082 main::set_access('handle', \%handle); 2083 2084 my %added_lines; 2085 # cache of lines added virtually to the file, internal 2086 main::set_access('added_lines', \%added_lines); 2087 2088 my %errors; 2089 # cache of errors found, internal 2090 main::set_access('errors', \%errors); 2091 2092 my %missings; 2093 # storage of '@missing' defaults lines 2094 main::set_access('missings', \%missings); 2095 2096 sub new { 2097 my $class = shift; 2098 2099 my $self = bless \do{ my $anonymous_scalar }, $class; 2100 my $addr = do { no overloading; pack 'J', $self; }; 2101 2102 # Set defaults 2103 $handler{$addr} = \&main::process_generic_property_file; 2104 $non_skip{$addr} = 0; 2105 $skip{$addr} = 0; 2106 $has_missings_defaults{$addr} = $NO_DEFAULTS; 2107 $handle{$addr} = undef; 2108 $added_lines{$addr} = [ ]; 2109 $each_line_handler{$addr} = [ ]; 2110 $errors{$addr} = { }; 2111 $missings{$addr} = [ ]; 2112 2113 # Two positional parameters. 2114 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2; 2115 $file{$addr} = main::internal_file_to_platform(shift); 2116 $first_released{$addr} = shift; 2117 2118 # The rest of the arguments are key => value pairs 2119 # %constructor_fields has been set up earlier to list all possible 2120 # ones. Either set or push, depending on how the default has been set 2121 # up just above. 2122 my %args = @_; 2123 foreach my $key (keys %args) { 2124 my $argument = $args{$key}; 2125 2126 # Note that the fields are the lower case of the constructor keys 2127 my $hash = $constructor_fields{lc $key}; 2128 if (! defined $hash) { 2129 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped"); 2130 next; 2131 } 2132 if (ref $hash->{$addr} eq 'ARRAY') { 2133 if (ref $argument eq 'ARRAY') { 2134 foreach my $argument (@{$argument}) { 2135 next if ! defined $argument; 2136 push @{$hash->{$addr}}, $argument; 2137 } 2138 } 2139 else { 2140 push @{$hash->{$addr}}, $argument if defined $argument; 2141 } 2142 } 2143 else { 2144 $hash->{$addr} = $argument; 2145 } 2146 delete $args{$key}; 2147 }; 2148 2149 # If the file has a property for it, it means that the property is not 2150 # listed in the file's entries. So add a handler to the list of line 2151 # handlers to insert the property name into the lines, to provide a 2152 # uniform interface to the final processing subroutine. 2153 # the final code doesn't have to worry about that. 2154 if ($property{$addr}) { 2155 push @{$each_line_handler{$addr}}, \&_insert_property_into_line; 2156 } 2157 2158 if ($non_skip{$addr} && ! $debug_skip && $verbosity) { 2159 print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n"; 2160 } 2161 2162 # If skipping, set to optional, and add to list of ignored files, 2163 # including its reason 2164 if ($skip{$addr}) { 2165 $optional{$addr} = 1; 2166 $skipped_files{$file{$addr}} = $skip{$addr} 2167 } 2168 2169 return $self; 2170 } 2171 2172 2173 use overload 2174 fallback => 0, 2175 qw("") => "_operator_stringify", 2176 "." => \&main::_operator_dot, 2177 ; 2178 2179 sub _operator_stringify { 2180 my $self = shift; 2181 2182 return __PACKAGE__ . " object for " . $self->file; 2183 } 2184 2185 # flag to make sure extracted files are processed early 2186 my $seen_non_extracted_non_age = 0; 2187 2188 sub run { 2189 # Process the input object $self. This opens and closes the file and 2190 # calls all the handlers for it. Currently, this can only be called 2191 # once per file, as it destroy's the EOF handler 2192 2193 my $self = shift; 2194 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 2195 2196 my $addr = do { no overloading; pack 'J', $self; }; 2197 2198 my $file = $file{$addr}; 2199 2200 # Don't process if not expecting this file (because released later 2201 # than this Unicode version), and isn't there. This means if someone 2202 # copies it into an earlier version's directory, we will go ahead and 2203 # process it. 2204 return if $first_released{$addr} gt $v_version && ! -e $file; 2205 2206 # If in debugging mode and this file doesn't have the non-skip 2207 # flag set, and isn't one of the critical files, skip it. 2208 if ($debug_skip 2209 && $first_released{$addr} ne v0 2210 && ! $non_skip{$addr}) 2211 { 2212 print "Skipping $file in debugging\n" if $verbosity; 2213 return; 2214 } 2215 2216 # File could be optional 2217 if ($optional{$addr}) { 2218 return unless -e $file; 2219 my $result = eval $optional{$addr}; 2220 if (! defined $result) { 2221 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}. $file Skipped."); 2222 return; 2223 } 2224 if (! $result) { 2225 if ($verbosity) { 2226 print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n"; 2227 } 2228 return; 2229 } 2230 } 2231 2232 if (! defined $file || ! -e $file) { 2233 2234 # If the file doesn't exist, see if have internal data for it 2235 # (based on first_released being 0). 2236 if ($first_released{$addr} eq v0) { 2237 $handle{$addr} = 'pretend_is_open'; 2238 } 2239 else { 2240 if (! $optional{$addr} # File could be optional 2241 && $v_version ge $first_released{$addr}) 2242 { 2243 print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr}; 2244 } 2245 return; 2246 } 2247 } 2248 else { 2249 2250 # Here, the file exists. Some platforms may change the case of 2251 # its name 2252 if ($seen_non_extracted_non_age) { 2253 if ($file =~ /$EXTRACTED/i) { 2254 Carp::my_carp_bug(main::join_lines(<<END 2255$file should be processed just after the 'Prop...Alias' files, and before 2256anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may 2257have subtle problems 2258END 2259 )); 2260 } 2261 } 2262 elsif ($EXTRACTED_DIR 2263 && $first_released{$addr} ne v0 2264 && $file !~ /$EXTRACTED/i 2265 && lc($file) ne 'dage.txt') 2266 { 2267 # We don't set this (by the 'if' above) if we have no 2268 # extracted directory, so if running on an early version, 2269 # this test won't work. Not worth worrying about. 2270 $seen_non_extracted_non_age = 1; 2271 } 2272 2273 # And mark the file as having being processed, and warn if it 2274 # isn't a file we are expecting. As we process the files, 2275 # they are deleted from the hash, so any that remain at the 2276 # end of the program are files that we didn't process. 2277 my $fkey = File::Spec->rel2abs($file); 2278 my $expecting = delete $potential_files{lc($fkey)}; 2279 2280 Carp::my_carp("Was not expecting '$file'.") if 2281 ! $expecting 2282 && ! defined $handle{$addr}; 2283 2284 # Having deleted from expected files, we can quit if not to do 2285 # anything. Don't print progress unless really want verbosity 2286 if ($skip{$addr}) { 2287 print "Skipping $file.\n" if $verbosity >= $VERBOSE; 2288 return; 2289 } 2290 2291 # Open the file, converting the slashes used in this program 2292 # into the proper form for the OS 2293 my $file_handle; 2294 if (not open $file_handle, "<", $file) { 2295 Carp::my_carp("Can't open $file. Skipping: $!"); 2296 return 0; 2297 } 2298 $handle{$addr} = $file_handle; # Cache the open file handle 2299 } 2300 2301 if ($verbosity >= $PROGRESS) { 2302 if ($progress_message{$addr}) { 2303 print "$progress_message{$addr}\n"; 2304 } 2305 else { 2306 # If using a virtual file, say so. 2307 print "Processing ", (-e $file) 2308 ? $file 2309 : "substitute $file", 2310 "\n"; 2311 } 2312 } 2313 2314 2315 # Call any special handler for before the file. 2316 &{$pre_handler{$addr}}($self) if $pre_handler{$addr}; 2317 2318 # Then the main handler 2319 &{$handler{$addr}}($self); 2320 2321 # Then any special post-file handler. 2322 &{$post_handler{$addr}}($self) if $post_handler{$addr}; 2323 2324 # If any errors have been accumulated, output the counts (as the first 2325 # error message in each class was output when it was encountered). 2326 if ($errors{$addr}) { 2327 my $total = 0; 2328 my $types = 0; 2329 foreach my $error (keys %{$errors{$addr}}) { 2330 $total += $errors{$addr}->{$error}; 2331 delete $errors{$addr}->{$error}; 2332 $types++; 2333 } 2334 if ($total > 1) { 2335 my $message 2336 = "A total of $total lines had errors in $file. "; 2337 2338 $message .= ($types == 1) 2339 ? '(Only the first one was displayed.)' 2340 : '(Only the first of each type was displayed.)'; 2341 Carp::my_carp($message); 2342 } 2343 } 2344 2345 if (@{$missings{$addr}}) { 2346 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong"); 2347 } 2348 2349 # If a real file handle, close it. 2350 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if 2351 ref $handle{$addr}; 2352 $handle{$addr} = ""; # Uses empty to indicate that has already seen 2353 # the file, as opposed to undef 2354 return; 2355 } 2356 2357 sub next_line { 2358 # Sets $_ to be the next logical input line, if any. Returns non-zero 2359 # if such a line exists. 'logical' means that any lines that have 2360 # been added via insert_lines() will be returned in $_ before the file 2361 # is read again. 2362 2363 my $self = shift; 2364 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 2365 2366 my $addr = do { no overloading; pack 'J', $self; }; 2367 2368 # Here the file is open (or if the handle is not a ref, is an open 2369 # 'virtual' file). Get the next line; any inserted lines get priority 2370 # over the file itself. 2371 my $adjusted; 2372 2373 LINE: 2374 while (1) { # Loop until find non-comment, non-empty line 2375 #local $to_trace = 1 if main::DEBUG; 2376 my $inserted_ref = shift @{$added_lines{$addr}}; 2377 if (defined $inserted_ref) { 2378 ($adjusted, $_) = @{$inserted_ref}; 2379 trace $adjusted, $_ if main::DEBUG && $to_trace; 2380 return 1 if $adjusted; 2381 } 2382 else { 2383 last if ! ref $handle{$addr}; # Don't read unless is real file 2384 last if ! defined ($_ = readline $handle{$addr}); 2385 } 2386 chomp; 2387 trace $_ if main::DEBUG && $to_trace; 2388 2389 # See if this line is the comment line that defines what property 2390 # value that code points that are not listed in the file should 2391 # have. The format or existence of these lines is not guaranteed 2392 # by Unicode since they are comments, but the documentation says 2393 # that this was added for machine-readability, so probably won't 2394 # change. This works starting in Unicode Version 5.0. They look 2395 # like: 2396 # 2397 # @missing: 0000..10FFFF; Not_Reordered 2398 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point> 2399 # @missing: 0000..10FFFF; ; NaN 2400 # 2401 # Save the line for a later get_missings() call. 2402 if (/$missing_defaults_prefix/) { 2403 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) { 2404 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries"); 2405 } 2406 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) { 2407 my @defaults = split /\s* ; \s*/x, $_; 2408 2409 # The first field is the @missing, which ends in a 2410 # semi-colon, so can safely shift. 2411 shift @defaults; 2412 2413 # Some of these lines may have empty field placeholders 2414 # which get in the way. An example is: 2415 # @missing: 0000..10FFFF; ; NaN 2416 # Remove them. Process starting from the top so the 2417 # splice doesn't affect things still to be looked at. 2418 for (my $i = @defaults - 1; $i >= 0; $i--) { 2419 next if $defaults[$i] ne ""; 2420 splice @defaults, $i, 1; 2421 } 2422 2423 # What's left should be just the property (maybe) and the 2424 # default. Having only one element means it doesn't have 2425 # the property. 2426 my $default; 2427 my $property; 2428 if (@defaults >= 1) { 2429 if (@defaults == 1) { 2430 $default = $defaults[0]; 2431 } 2432 else { 2433 $property = $defaults[0]; 2434 $default = $defaults[1]; 2435 } 2436 } 2437 2438 if (@defaults < 1 2439 || @defaults > 2 2440 || ($default =~ /^</ 2441 && $default !~ /^<code *point>$/i 2442 && $default !~ /^<none>$/i 2443 && $default !~ /^<script>$/i)) 2444 { 2445 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries"); 2446 } 2447 else { 2448 2449 # If the property is missing from the line, it should 2450 # be the one for the whole file 2451 $property = $property{$addr} if ! defined $property; 2452 2453 # Change <none> to the null string, which is what it 2454 # really means. If the default is the code point 2455 # itself, set it to <code point>, which is what 2456 # Unicode uses (but sometimes they've forgotten the 2457 # space) 2458 if ($default =~ /^<none>$/i) { 2459 $default = ""; 2460 } 2461 elsif ($default =~ /^<code *point>$/i) { 2462 $default = $CODE_POINT; 2463 } 2464 elsif ($default =~ /^<script>$/i) { 2465 2466 # Special case this one. Currently is from 2467 # ScriptExtensions.txt, and means for all unlisted 2468 # code points, use their Script property values. 2469 # For the code points not listed in that file, the 2470 # default value is 'Unknown'. 2471 $default = "Unknown"; 2472 } 2473 2474 # Store them as a sub-arrays with both components. 2475 push @{$missings{$addr}}, [ $default, $property ]; 2476 } 2477 } 2478 2479 # There is nothing for the caller to process on this comment 2480 # line. 2481 next; 2482 } 2483 2484 # Remove comments and trailing space, and skip this line if the 2485 # result is empty 2486 s/#.*//; 2487 s/\s+$//; 2488 next if /^$/; 2489 2490 # Call any handlers for this line, and skip further processing of 2491 # the line if the handler sets the line to null. 2492 foreach my $sub_ref (@{$each_line_handler{$addr}}) { 2493 &{$sub_ref}($self); 2494 next LINE if /^$/; 2495 } 2496 2497 # Here the line is ok. return success. 2498 return 1; 2499 } # End of looping through lines. 2500 2501 # If there is an EOF handler, call it (only once) and if it generates 2502 # more lines to process go back in the loop to handle them. 2503 if ($eof_handler{$addr}) { 2504 &{$eof_handler{$addr}}($self); 2505 $eof_handler{$addr} = ""; # Currently only get one shot at it. 2506 goto LINE if $added_lines{$addr}; 2507 } 2508 2509 # Return failure -- no more lines. 2510 return 0; 2511 2512 } 2513 2514# Not currently used, not fully tested. 2515# sub peek { 2516# # Non-destructive look-ahead one non-adjusted, non-comment, non-blank 2517# # record. Not callable from an each_line_handler(), nor does it call 2518# # an each_line_handler() on the line. 2519# 2520# my $self = shift; 2521# my $addr = do { no overloading; pack 'J', $self; }; 2522# 2523# foreach my $inserted_ref (@{$added_lines{$addr}}) { 2524# my ($adjusted, $line) = @{$inserted_ref}; 2525# next if $adjusted; 2526# 2527# # Remove comments and trailing space, and return a non-empty 2528# # resulting line 2529# $line =~ s/#.*//; 2530# $line =~ s/\s+$//; 2531# return $line if $line ne ""; 2532# } 2533# 2534# return if ! ref $handle{$addr}; # Don't read unless is real file 2535# while (1) { # Loop until find non-comment, non-empty line 2536# local $to_trace = 1 if main::DEBUG; 2537# trace $_ if main::DEBUG && $to_trace; 2538# return if ! defined (my $line = readline $handle{$addr}); 2539# chomp $line; 2540# push @{$added_lines{$addr}}, [ 0, $line ]; 2541# 2542# $line =~ s/#.*//; 2543# $line =~ s/\s+$//; 2544# return $line if $line ne ""; 2545# } 2546# 2547# return; 2548# } 2549 2550 2551 sub insert_lines { 2552 # Lines can be inserted so that it looks like they were in the input 2553 # file at the place it was when this routine is called. See also 2554 # insert_adjusted_lines(). Lines inserted via this routine go through 2555 # any each_line_handler() 2556 2557 my $self = shift; 2558 2559 # Each inserted line is an array, with the first element being 0 to 2560 # indicate that this line hasn't been adjusted, and needs to be 2561 # processed. 2562 no overloading; 2563 push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_; 2564 return; 2565 } 2566 2567 sub insert_adjusted_lines { 2568 # Lines can be inserted so that it looks like they were in the input 2569 # file at the place it was when this routine is called. See also 2570 # insert_lines(). Lines inserted via this routine are already fully 2571 # adjusted, ready to be processed; each_line_handler()s handlers will 2572 # not be called. This means this is not a completely general 2573 # facility, as only the last each_line_handler on the stack should 2574 # call this. It could be made more general, by passing to each of the 2575 # line_handlers their position on the stack, which they would pass on 2576 # to this routine, and that would replace the boolean first element in 2577 # the anonymous array pushed here, so that the next_line routine could 2578 # use that to call only those handlers whose index is after it on the 2579 # stack. But this is overkill for what is needed now. 2580 2581 my $self = shift; 2582 trace $_[0] if main::DEBUG && $to_trace; 2583 2584 # Each inserted line is an array, with the first element being 1 to 2585 # indicate that this line has been adjusted 2586 no overloading; 2587 push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_; 2588 return; 2589 } 2590 2591 sub get_missings { 2592 # Returns the stored up @missings lines' values, and clears the list. 2593 # The values are in an array, consisting of the default in the first 2594 # element, and the property in the 2nd. However, since these lines 2595 # can be stacked up, the return is an array of all these arrays. 2596 2597 my $self = shift; 2598 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 2599 2600 my $addr = do { no overloading; pack 'J', $self; }; 2601 2602 # If not accepting a list return, just return the first one. 2603 return shift @{$missings{$addr}} unless wantarray; 2604 2605 my @return = @{$missings{$addr}}; 2606 undef @{$missings{$addr}}; 2607 return @return; 2608 } 2609 2610 sub _insert_property_into_line { 2611 # Add a property field to $_, if this file requires it. 2612 2613 my $self = shift; 2614 my $addr = do { no overloading; pack 'J', $self; }; 2615 my $property = $property{$addr}; 2616 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 2617 2618 $_ =~ s/(;|$)/; $property$1/; 2619 return; 2620 } 2621 2622 sub carp_bad_line { 2623 # Output consistent error messages, using either a generic one, or the 2624 # one given by the optional parameter. To avoid gazillions of the 2625 # same message in case the syntax of a file is way off, this routine 2626 # only outputs the first instance of each message, incrementing a 2627 # count so the totals can be output at the end of the file. 2628 2629 my $self = shift; 2630 my $message = shift; 2631 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 2632 2633 my $addr = do { no overloading; pack 'J', $self; }; 2634 2635 $message = 'Unexpected line' unless $message; 2636 2637 # No trailing punctuation so as to fit with our addenda. 2638 $message =~ s/[.:;,]$//; 2639 2640 # If haven't seen this exact message before, output it now. Otherwise 2641 # increment the count of how many times it has occurred 2642 unless ($errors{$addr}->{$message}) { 2643 Carp::my_carp("$message in '$_' in " 2644 . $file{$addr} 2645 . " at line $.. Skipping this line;"); 2646 $errors{$addr}->{$message} = 1; 2647 } 2648 else { 2649 $errors{$addr}->{$message}++; 2650 } 2651 2652 # Clear the line to prevent any further (meaningful) processing of it. 2653 $_ = ""; 2654 2655 return; 2656 } 2657} # End closure 2658 2659package Multi_Default; 2660 2661# Certain properties in early versions of Unicode had more than one possible 2662# default for code points missing from the files. In these cases, one 2663# default applies to everything left over after all the others are applied, 2664# and for each of the others, there is a description of which class of code 2665# points applies to it. This object helps implement this by storing the 2666# defaults, and for all but that final default, an eval string that generates 2667# the class that it applies to. 2668 2669 2670{ # Closure 2671 2672 main::setup_package(); 2673 2674 my %class_defaults; 2675 # The defaults structure for the classes 2676 main::set_access('class_defaults', \%class_defaults); 2677 2678 my %other_default; 2679 # The default that applies to everything left over. 2680 main::set_access('other_default', \%other_default, 'r'); 2681 2682 2683 sub new { 2684 # The constructor is called with default => eval pairs, terminated by 2685 # the left-over default. e.g. 2686 # Multi_Default->new( 2687 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C 2688 # - 0x200D', 2689 # 'R' => 'some other expression that evaluates to code points', 2690 # . 2691 # . 2692 # . 2693 # 'U')); 2694 2695 my $class = shift; 2696 2697 my $self = bless \do{my $anonymous_scalar}, $class; 2698 my $addr = do { no overloading; pack 'J', $self; }; 2699 2700 while (@_ > 1) { 2701 my $default = shift; 2702 my $eval = shift; 2703 $class_defaults{$addr}->{$default} = $eval; 2704 } 2705 2706 $other_default{$addr} = shift; 2707 2708 return $self; 2709 } 2710 2711 sub get_next_defaults { 2712 # Iterates and returns the next class of defaults. 2713 my $self = shift; 2714 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 2715 2716 my $addr = do { no overloading; pack 'J', $self; }; 2717 2718 return each %{$class_defaults{$addr}}; 2719 } 2720} 2721 2722package Alias; 2723 2724# An alias is one of the names that a table goes by. This class defines them 2725# including some attributes. Everything is currently setup in the 2726# constructor. 2727 2728 2729{ # Closure 2730 2731 main::setup_package(); 2732 2733 my %name; 2734 main::set_access('name', \%name, 'r'); 2735 2736 my %loose_match; 2737 # Should this name match loosely or not. 2738 main::set_access('loose_match', \%loose_match, 'r'); 2739 2740 my %make_re_pod_entry; 2741 # Some aliases should not get their own entries in the re section of the 2742 # pod, because they are covered by a wild-card, and some we want to 2743 # discourage use of. Binary 2744 main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's'); 2745 2746 my %ucd; 2747 # Is this documented to be accessible via Unicode::UCD 2748 main::set_access('ucd', \%ucd, 'r', 's'); 2749 2750 my %status; 2751 # Aliases have a status, like deprecated, or even suppressed (which means 2752 # they don't appear in documentation). Enum 2753 main::set_access('status', \%status, 'r'); 2754 2755 my %ok_as_filename; 2756 # Similarly, some aliases should not be considered as usable ones for 2757 # external use, such as file names, or we don't want documentation to 2758 # recommend them. Boolean 2759 main::set_access('ok_as_filename', \%ok_as_filename, 'r'); 2760 2761 sub new { 2762 my $class = shift; 2763 2764 my $self = bless \do { my $anonymous_scalar }, $class; 2765 my $addr = do { no overloading; pack 'J', $self; }; 2766 2767 $name{$addr} = shift; 2768 $loose_match{$addr} = shift; 2769 $make_re_pod_entry{$addr} = shift; 2770 $ok_as_filename{$addr} = shift; 2771 $status{$addr} = shift; 2772 $ucd{$addr} = shift; 2773 2774 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 2775 2776 # Null names are never ok externally 2777 $ok_as_filename{$addr} = 0 if $name{$addr} eq ""; 2778 2779 return $self; 2780 } 2781} 2782 2783package Range; 2784 2785# A range is the basic unit for storing code points, and is described in the 2786# comments at the beginning of the program. Each range has a starting code 2787# point; an ending code point (not less than the starting one); a value 2788# that applies to every code point in between the two end-points, inclusive; 2789# and an enum type that applies to the value. The type is for the user's 2790# convenience, and has no meaning here, except that a non-zero type is 2791# considered to not obey the normal Unicode rules for having standard forms. 2792# 2793# The same structure is used for both map and match tables, even though in the 2794# latter, the value (and hence type) is irrelevant and could be used as a 2795# comment. In map tables, the value is what all the code points in the range 2796# map to. Type 0 values have the standardized version of the value stored as 2797# well, so as to not have to recalculate it a lot. 2798 2799sub trace { return main::trace(@_); } 2800 2801{ # Closure 2802 2803 main::setup_package(); 2804 2805 my %start; 2806 main::set_access('start', \%start, 'r', 's'); 2807 2808 my %end; 2809 main::set_access('end', \%end, 'r', 's'); 2810 2811 my %value; 2812 main::set_access('value', \%value, 'r'); 2813 2814 my %type; 2815 main::set_access('type', \%type, 'r'); 2816 2817 my %standard_form; 2818 # The value in internal standard form. Defined only if the type is 0. 2819 main::set_access('standard_form', \%standard_form); 2820 2821 # Note that if these fields change, the dump() method should as well 2822 2823 sub new { 2824 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3; 2825 my $class = shift; 2826 2827 my $self = bless \do { my $anonymous_scalar }, $class; 2828 my $addr = do { no overloading; pack 'J', $self; }; 2829 2830 $start{$addr} = shift; 2831 $end{$addr} = shift; 2832 2833 my %args = @_; 2834 2835 my $value = delete $args{'Value'}; # Can be 0 2836 $value = "" unless defined $value; 2837 $value{$addr} = $value; 2838 2839 $type{$addr} = delete $args{'Type'} || 0; 2840 2841 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 2842 2843 if (! $type{$addr}) { 2844 $standard_form{$addr} = main::standardize($value); 2845 } 2846 2847 return $self; 2848 } 2849 2850 use overload 2851 fallback => 0, 2852 qw("") => "_operator_stringify", 2853 "." => \&main::_operator_dot, 2854 ; 2855 2856 sub _operator_stringify { 2857 my $self = shift; 2858 my $addr = do { no overloading; pack 'J', $self; }; 2859 2860 # Output it like '0041..0065 (value)' 2861 my $return = sprintf("%04X", $start{$addr}) 2862 . '..' 2863 . sprintf("%04X", $end{$addr}); 2864 my $value = $value{$addr}; 2865 my $type = $type{$addr}; 2866 $return .= ' ('; 2867 $return .= "$value"; 2868 $return .= ", Type=$type" if $type != 0; 2869 $return .= ')'; 2870 2871 return $return; 2872 } 2873 2874 sub standard_form { 2875 # The standard form is the value itself if the standard form is 2876 # undefined (that is if the value is special) 2877 2878 my $self = shift; 2879 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 2880 2881 my $addr = do { no overloading; pack 'J', $self; }; 2882 2883 return $standard_form{$addr} if defined $standard_form{$addr}; 2884 return $value{$addr}; 2885 } 2886 2887 sub dump { 2888 # Human, not machine readable. For machine readable, comment out this 2889 # entire routine and let the standard one take effect. 2890 my $self = shift; 2891 my $indent = shift; 2892 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 2893 2894 my $addr = do { no overloading; pack 'J', $self; }; 2895 2896 my $return = $indent 2897 . sprintf("%04X", $start{$addr}) 2898 . '..' 2899 . sprintf("%04X", $end{$addr}) 2900 . " '$value{$addr}';"; 2901 if (! defined $standard_form{$addr}) { 2902 $return .= "(type=$type{$addr})"; 2903 } 2904 elsif ($standard_form{$addr} ne $value{$addr}) { 2905 $return .= "(standard '$standard_form{$addr}')"; 2906 } 2907 return $return; 2908 } 2909} # End closure 2910 2911package _Range_List_Base; 2912 2913# Base class for range lists. A range list is simply an ordered list of 2914# ranges, so that the ranges with the lowest starting numbers are first in it. 2915# 2916# When a new range is added that is adjacent to an existing range that has the 2917# same value and type, it merges with it to form a larger range. 2918# 2919# Ranges generally do not overlap, except that there can be multiple entries 2920# of single code point ranges. This is because of NameAliases.txt. 2921# 2922# In this program, there is a standard value such that if two different 2923# values, have the same standard value, they are considered equivalent. This 2924# value was chosen so that it gives correct results on Unicode data 2925 2926# There are a number of methods to manipulate range lists, and some operators 2927# are overloaded to handle them. 2928 2929sub trace { return main::trace(@_); } 2930 2931{ # Closure 2932 2933 our $addr; 2934 2935 main::setup_package(); 2936 2937 my %ranges; 2938 # The list of ranges 2939 main::set_access('ranges', \%ranges, 'readable_array'); 2940 2941 my %max; 2942 # The highest code point in the list. This was originally a method, but 2943 # actual measurements said it was used a lot. 2944 main::set_access('max', \%max, 'r'); 2945 2946 my %each_range_iterator; 2947 # Iterator position for each_range() 2948 main::set_access('each_range_iterator', \%each_range_iterator); 2949 2950 my %owner_name_of; 2951 # Name of parent this is attached to, if any. Solely for better error 2952 # messages. 2953 main::set_access('owner_name_of', \%owner_name_of, 'p_r'); 2954 2955 my %_search_ranges_cache; 2956 # A cache of the previous result from _search_ranges(), for better 2957 # performance 2958 main::set_access('_search_ranges_cache', \%_search_ranges_cache); 2959 2960 sub new { 2961 my $class = shift; 2962 my %args = @_; 2963 2964 # Optional initialization data for the range list. 2965 my $initialize = delete $args{'Initialize'}; 2966 2967 my $self; 2968 2969 # Use _union() to initialize. _union() returns an object of this 2970 # class, which means that it will call this constructor recursively. 2971 # But it won't have this $initialize parameter so that it won't 2972 # infinitely loop on this. 2973 return _union($class, $initialize, %args) if defined $initialize; 2974 2975 $self = bless \do { my $anonymous_scalar }, $class; 2976 my $addr = do { no overloading; pack 'J', $self; }; 2977 2978 # Optional parent object, only for debug info. 2979 $owner_name_of{$addr} = delete $args{'Owner'}; 2980 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr}; 2981 2982 # Stringify, in case it is an object. 2983 $owner_name_of{$addr} = "$owner_name_of{$addr}"; 2984 2985 # This is used only for error messages, and so a colon is added 2986 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne ""; 2987 2988 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 2989 2990 # Max is initialized to a negative value that isn't adjacent to 0, 2991 # for simpler tests 2992 $max{$addr} = -2; 2993 2994 $_search_ranges_cache{$addr} = 0; 2995 $ranges{$addr} = []; 2996 2997 return $self; 2998 } 2999 3000 use overload 3001 fallback => 0, 3002 qw("") => "_operator_stringify", 3003 "." => \&main::_operator_dot, 3004 ; 3005 3006 sub _operator_stringify { 3007 my $self = shift; 3008 my $addr = do { no overloading; pack 'J', $self; }; 3009 3010 return "Range_List attached to '$owner_name_of{$addr}'" 3011 if $owner_name_of{$addr}; 3012 return "anonymous Range_List " . \$self; 3013 } 3014 3015 sub _union { 3016 # Returns the union of the input code points. It can be called as 3017 # either a constructor or a method. If called as a method, the result 3018 # will be a new() instance of the calling object, containing the union 3019 # of that object with the other parameter's code points; if called as 3020 # a constructor, the first parameter gives the class that the new object 3021 # should be, and the second parameter gives the code points to go into 3022 # it. 3023 # In either case, there are two parameters looked at by this routine; 3024 # any additional parameters are passed to the new() constructor. 3025 # 3026 # The code points can come in the form of some object that contains 3027 # ranges, and has a conventionally named method to access them; or 3028 # they can be an array of individual code points (as integers); or 3029 # just a single code point. 3030 # 3031 # If they are ranges, this routine doesn't make any effort to preserve 3032 # the range values and types of one input over the other. Therefore 3033 # this base class should not allow _union to be called from other than 3034 # initialization code, so as to prevent two tables from being added 3035 # together where the range values matter. The general form of this 3036 # routine therefore belongs in a derived class, but it was moved here 3037 # to avoid duplication of code. The failure to overload this in this 3038 # class keeps it safe. 3039 # 3040 # It does make the effort during initialization to accept tables with 3041 # multiple values for the same code point, and to preserve the order 3042 # of these. If there is only one input range or range set, it doesn't 3043 # sort (as it should already be sorted to the desired order), and will 3044 # accept multiple values per code point. Otherwise it will merge 3045 # multiple values into a single one. 3046 3047 my $self; 3048 my @args; # Arguments to pass to the constructor 3049 3050 my $class = shift; 3051 3052 # If a method call, will start the union with the object itself, and 3053 # the class of the new object will be the same as self. 3054 if (ref $class) { 3055 $self = $class; 3056 $class = ref $self; 3057 push @args, $self; 3058 } 3059 3060 # Add the other required parameter. 3061 push @args, shift; 3062 # Rest of parameters are passed on to the constructor 3063 3064 # Accumulate all records from both lists. 3065 my @records; 3066 my $input_count = 0; 3067 for my $arg (@args) { 3068 #local $to_trace = 0 if main::DEBUG; 3069 trace "argument = $arg" if main::DEBUG && $to_trace; 3070 if (! defined $arg) { 3071 my $message = ""; 3072 if (defined $self) { 3073 no overloading; 3074 $message .= $owner_name_of{pack 'J', $self}; 3075 } 3076 Carp::my_carp_bug($message .= "Undefined argument to _union. No union done."); 3077 return; 3078 } 3079 3080 $arg = [ $arg ] if ! ref $arg; 3081 my $type = ref $arg; 3082 if ($type eq 'ARRAY') { 3083 foreach my $element (@$arg) { 3084 push @records, Range->new($element, $element); 3085 $input_count++; 3086 } 3087 } 3088 elsif ($arg->isa('Range')) { 3089 push @records, $arg; 3090 $input_count++; 3091 } 3092 elsif ($arg->can('ranges')) { 3093 push @records, $arg->ranges; 3094 $input_count++; 3095 } 3096 else { 3097 my $message = ""; 3098 if (defined $self) { 3099 no overloading; 3100 $message .= $owner_name_of{pack 'J', $self}; 3101 } 3102 Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done."); 3103 return; 3104 } 3105 } 3106 3107 # Sort with the range containing the lowest ordinal first, but if 3108 # two ranges start at the same code point, sort with the bigger range 3109 # of the two first, because it takes fewer cycles. 3110 if ($input_count > 1) { 3111 @records = sort { ($a->start <=> $b->start) 3112 or 3113 # if b is shorter than a, b->end will be 3114 # less than a->end, and we want to select 3115 # a, so want to return -1 3116 ($b->end <=> $a->end) 3117 } @records; 3118 } 3119 3120 my $new = $class->new(@_); 3121 3122 # Fold in records so long as they add new information. 3123 for my $set (@records) { 3124 my $start = $set->start; 3125 my $end = $set->end; 3126 my $value = $set->value; 3127 my $type = $set->type; 3128 if ($start > $new->max) { 3129 $new->_add_delete('+', $start, $end, $value, Type => $type); 3130 } 3131 elsif ($end > $new->max) { 3132 $new->_add_delete('+', $new->max +1, $end, $value, 3133 Type => $type); 3134 } 3135 elsif ($input_count == 1) { 3136 # Here, overlaps existing range, but is from a single input, 3137 # so preserve the multiple values from that input. 3138 $new->_add_delete('+', $start, $end, $value, Type => $type, 3139 Replace => $MULTIPLE_AFTER); 3140 } 3141 } 3142 3143 return $new; 3144 } 3145 3146 sub range_count { # Return the number of ranges in the range list 3147 my $self = shift; 3148 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3149 3150 no overloading; 3151 return scalar @{$ranges{pack 'J', $self}}; 3152 } 3153 3154 sub min { 3155 # Returns the minimum code point currently in the range list, or if 3156 # the range list is empty, 2 beyond the max possible. This is a 3157 # method because used so rarely, that not worth saving between calls, 3158 # and having to worry about changing it as ranges are added and 3159 # deleted. 3160 3161 my $self = shift; 3162 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3163 3164 my $addr = do { no overloading; pack 'J', $self; }; 3165 3166 # If the range list is empty, return a large value that isn't adjacent 3167 # to any that could be in the range list, for simpler tests 3168 return $MAX_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}}; 3169 return $ranges{$addr}->[0]->start; 3170 } 3171 3172 sub contains { 3173 # Boolean: Is argument in the range list? If so returns $i such that: 3174 # range[$i]->end < $codepoint <= range[$i+1]->end 3175 # which is one beyond what you want; this is so that the 0th range 3176 # doesn't return false 3177 my $self = shift; 3178 my $codepoint = shift; 3179 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3180 3181 my $i = $self->_search_ranges($codepoint); 3182 return 0 unless defined $i; 3183 3184 # The search returns $i, such that 3185 # range[$i-1]->end < $codepoint <= range[$i]->end 3186 # So is in the table if and only iff it is at least the start position 3187 # of range $i. 3188 no overloading; 3189 return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint; 3190 return $i + 1; 3191 } 3192 3193 sub containing_range { 3194 # Returns the range object that contains the code point, undef if none 3195 3196 my $self = shift; 3197 my $codepoint = shift; 3198 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3199 3200 my $i = $self->contains($codepoint); 3201 return unless $i; 3202 3203 # contains() returns 1 beyond where we should look 3204 no overloading; 3205 return $ranges{pack 'J', $self}->[$i-1]; 3206 } 3207 3208 sub value_of { 3209 # Returns the value associated with the code point, undef if none 3210 3211 my $self = shift; 3212 my $codepoint = shift; 3213 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3214 3215 my $range = $self->containing_range($codepoint); 3216 return unless defined $range; 3217 3218 return $range->value; 3219 } 3220 3221 sub type_of { 3222 # Returns the type of the range containing the code point, undef if 3223 # the code point is not in the table 3224 3225 my $self = shift; 3226 my $codepoint = shift; 3227 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3228 3229 my $range = $self->containing_range($codepoint); 3230 return unless defined $range; 3231 3232 return $range->type; 3233 } 3234 3235 sub _search_ranges { 3236 # Find the range in the list which contains a code point, or where it 3237 # should go if were to add it. That is, it returns $i, such that: 3238 # range[$i-1]->end < $codepoint <= range[$i]->end 3239 # Returns undef if no such $i is possible (e.g. at end of table), or 3240 # if there is an error. 3241 3242 my $self = shift; 3243 my $code_point = shift; 3244 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3245 3246 my $addr = do { no overloading; pack 'J', $self; }; 3247 3248 return if $code_point > $max{$addr}; 3249 my $r = $ranges{$addr}; # The current list of ranges 3250 my $range_list_size = scalar @$r; 3251 my $i; 3252 3253 use integer; # want integer division 3254 3255 # Use the cached result as the starting guess for this one, because, 3256 # an experiment on 5.1 showed that 90% of the time the cache was the 3257 # same as the result on the next call (and 7% it was one less). 3258 $i = $_search_ranges_cache{$addr}; 3259 $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob. 3260 # from an intervening deletion 3261 #local $to_trace = 1 if main::DEBUG; 3262 trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point); 3263 return $i if $code_point <= $r->[$i]->end 3264 && ($i == 0 || $r->[$i-1]->end < $code_point); 3265 3266 # Here the cache doesn't yield the correct $i. Try adding 1. 3267 if ($i < $range_list_size - 1 3268 && $r->[$i]->end < $code_point && 3269 $code_point <= $r->[$i+1]->end) 3270 { 3271 $i++; 3272 trace "next \$i is correct: $i" if main::DEBUG && $to_trace; 3273 $_search_ranges_cache{$addr} = $i; 3274 return $i; 3275 } 3276 3277 # Here, adding 1 also didn't work. We do a binary search to 3278 # find the correct position, starting with current $i 3279 my $lower = 0; 3280 my $upper = $range_list_size - 1; 3281 while (1) { 3282 trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace; 3283 3284 if ($code_point <= $r->[$i]->end) { 3285 3286 # Here we have met the upper constraint. We can quit if we 3287 # also meet the lower one. 3288 last if $i == 0 || $r->[$i-1]->end < $code_point; 3289 3290 $upper = $i; # Still too high. 3291 3292 } 3293 else { 3294 3295 # Here, $r[$i]->end < $code_point, so look higher up. 3296 $lower = $i; 3297 } 3298 3299 # Split search domain in half to try again. 3300 my $temp = ($upper + $lower) / 2; 3301 3302 # No point in continuing unless $i changes for next time 3303 # in the loop. 3304 if ($temp == $i) { 3305 3306 # We can't reach the highest element because of the averaging. 3307 # So if one below the upper edge, force it there and try one 3308 # more time. 3309 if ($i == $range_list_size - 2) { 3310 3311 trace "Forcing to upper edge" if main::DEBUG && $to_trace; 3312 $i = $range_list_size - 1; 3313 3314 # Change $lower as well so if fails next time through, 3315 # taking the average will yield the same $i, and we will 3316 # quit with the error message just below. 3317 $lower = $i; 3318 next; 3319 } 3320 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken."); 3321 return; 3322 } 3323 $i = $temp; 3324 } # End of while loop 3325 3326 if (main::DEBUG && $to_trace) { 3327 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i; 3328 trace "i= [ $i ]", $r->[$i]; 3329 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1; 3330 } 3331 3332 # Here we have found the offset. Cache it as a starting point for the 3333 # next call. 3334 $_search_ranges_cache{$addr} = $i; 3335 return $i; 3336 } 3337 3338 sub _add_delete { 3339 # Add, replace or delete ranges to or from a list. The $type 3340 # parameter gives which: 3341 # '+' => insert or replace a range, returning a list of any changed 3342 # ranges. 3343 # '-' => delete a range, returning a list of any deleted ranges. 3344 # 3345 # The next three parameters give respectively the start, end, and 3346 # value associated with the range. 'value' should be null unless the 3347 # operation is '+'; 3348 # 3349 # The range list is kept sorted so that the range with the lowest 3350 # starting position is first in the list, and generally, adjacent 3351 # ranges with the same values are merged into a single larger one (see 3352 # exceptions below). 3353 # 3354 # There are more parameters; all are key => value pairs: 3355 # Type gives the type of the value. It is only valid for '+'. 3356 # All ranges have types; if this parameter is omitted, 0 is 3357 # assumed. Ranges with type 0 are assumed to obey the 3358 # Unicode rules for casing, etc; ranges with other types are 3359 # not. Otherwise, the type is arbitrary, for the caller's 3360 # convenience, and looked at only by this routine to keep 3361 # adjacent ranges of different types from being merged into 3362 # a single larger range, and when Replace => 3363 # $IF_NOT_EQUIVALENT is specified (see just below). 3364 # Replace determines what to do if the range list already contains 3365 # ranges which coincide with all or portions of the input 3366 # range. It is only valid for '+': 3367 # => $NO means that the new value is not to replace 3368 # any existing ones, but any empty gaps of the 3369 # range list coinciding with the input range 3370 # will be filled in with the new value. 3371 # => $UNCONDITIONALLY means to replace the existing values with 3372 # this one unconditionally. However, if the 3373 # new and old values are identical, the 3374 # replacement is skipped to save cycles 3375 # => $IF_NOT_EQUIVALENT means to replace the existing values 3376 # (the default) with this one if they are not equivalent. 3377 # Ranges are equivalent if their types are the 3378 # same, and they are the same string; or if 3379 # both are type 0 ranges, if their Unicode 3380 # standard forms are identical. In this last 3381 # case, the routine chooses the more "modern" 3382 # one to use. This is because some of the 3383 # older files are formatted with values that 3384 # are, for example, ALL CAPs, whereas the 3385 # derived files have a more modern style, 3386 # which looks better. By looking for this 3387 # style when the pre-existing and replacement 3388 # standard forms are the same, we can move to 3389 # the modern style 3390 # => $MULTIPLE_BEFORE means that if this range duplicates an 3391 # existing one, but has a different value, 3392 # don't replace the existing one, but insert 3393 # this, one so that the same range can occur 3394 # multiple times. They are stored LIFO, so 3395 # that the final one inserted is the first one 3396 # returned in an ordered search of the table. 3397 # If this is an exact duplicate, including the 3398 # value, the original will be moved to be 3399 # first, before any other duplicate ranges 3400 # with different values. 3401 # => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored 3402 # FIFO, so that this one is inserted after all 3403 # others that currently exist. If this is an 3404 # exact duplicate, including value, of an 3405 # existing range, this one is discarded 3406 # (leaving the existing one in its original, 3407 # higher priority position 3408 # => anything else is the same as => $IF_NOT_EQUIVALENT 3409 # 3410 # "same value" means identical for non-type-0 ranges, and it means 3411 # having the same standard forms for type-0 ranges. 3412 3413 return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5; 3414 3415 my $self = shift; 3416 my $operation = shift; # '+' for add/replace; '-' for delete; 3417 my $start = shift; 3418 my $end = shift; 3419 my $value = shift; 3420 3421 my %args = @_; 3422 3423 $value = "" if not defined $value; # warning: $value can be "0" 3424 3425 my $replace = delete $args{'Replace'}; 3426 $replace = $IF_NOT_EQUIVALENT unless defined $replace; 3427 3428 my $type = delete $args{'Type'}; 3429 $type = 0 unless defined $type; 3430 3431 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 3432 3433 my $addr = do { no overloading; pack 'J', $self; }; 3434 3435 if ($operation ne '+' && $operation ne '-') { 3436 Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken."); 3437 return; 3438 } 3439 unless (defined $start && defined $end) { 3440 Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken."); 3441 return; 3442 } 3443 unless ($end >= $start) { 3444 Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken."); 3445 return; 3446 } 3447 #local $to_trace = 1 if main::DEBUG; 3448 3449 if ($operation eq '-') { 3450 if ($replace != $IF_NOT_EQUIVALENT) { 3451 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT."); 3452 $replace = $IF_NOT_EQUIVALENT; 3453 } 3454 if ($type) { 3455 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0."); 3456 $type = 0; 3457 } 3458 if ($value ne "") { 3459 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\"."); 3460 $value = ""; 3461 } 3462 } 3463 3464 my $r = $ranges{$addr}; # The current list of ranges 3465 my $range_list_size = scalar @$r; # And its size 3466 my $max = $max{$addr}; # The current high code point in 3467 # the list of ranges 3468 3469 # Do a special case requiring fewer machine cycles when the new range 3470 # starts after the current highest point. The Unicode input data is 3471 # structured so this is common. 3472 if ($start > $max) { 3473 3474 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) type=$type" if main::DEBUG && $to_trace; 3475 return if $operation eq '-'; # Deleting a non-existing range is a 3476 # no-op 3477 3478 # If the new range doesn't logically extend the current final one 3479 # in the range list, create a new range at the end of the range 3480 # list. (max cleverly is initialized to a negative number not 3481 # adjacent to 0 if the range list is empty, so even adding a range 3482 # to an empty range list starting at 0 will have this 'if' 3483 # succeed.) 3484 if ($start > $max + 1 # non-adjacent means can't extend. 3485 || @{$r}[-1]->value ne $value # values differ, can't extend. 3486 || @{$r}[-1]->type != $type # types differ, can't extend. 3487 ) { 3488 push @$r, Range->new($start, $end, 3489 Value => $value, 3490 Type => $type); 3491 } 3492 else { 3493 3494 # Here, the new range starts just after the current highest in 3495 # the range list, and they have the same type and value. 3496 # Extend the current range to incorporate the new one. 3497 @{$r}[-1]->set_end($end); 3498 } 3499 3500 # This becomes the new maximum. 3501 $max{$addr} = $end; 3502 3503 return; 3504 } 3505 #local $to_trace = 0 if main::DEBUG; 3506 3507 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace; 3508 3509 # Here, the input range isn't after the whole rest of the range list. 3510 # Most likely 'splice' will be needed. The rest of the routine finds 3511 # the needed splice parameters, and if necessary, does the splice. 3512 # First, find the offset parameter needed by the splice function for 3513 # the input range. Note that the input range may span multiple 3514 # existing ones, but we'll worry about that later. For now, just find 3515 # the beginning. If the input range is to be inserted starting in a 3516 # position not currently in the range list, it must (obviously) come 3517 # just after the range below it, and just before the range above it. 3518 # Slightly less obviously, it will occupy the position currently 3519 # occupied by the range that is to come after it. More formally, we 3520 # are looking for the position, $i, in the array of ranges, such that: 3521 # 3522 # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end 3523 # 3524 # (The ordered relationships within existing ranges are also shown in 3525 # the equation above). However, if the start of the input range is 3526 # within an existing range, the splice offset should point to that 3527 # existing range's position in the list; that is $i satisfies a 3528 # somewhat different equation, namely: 3529 # 3530 #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end 3531 # 3532 # More briefly, $start can come before or after r[$i]->start, and at 3533 # this point, we don't know which it will be. However, these 3534 # two equations share these constraints: 3535 # 3536 # r[$i-1]->end < $start <= r[$i]->end 3537 # 3538 # And that is good enough to find $i. 3539 3540 my $i = $self->_search_ranges($start); 3541 if (! defined $i) { 3542 Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed"); 3543 return; 3544 } 3545 3546 # The search function returns $i such that: 3547 # 3548 # r[$i-1]->end < $start <= r[$i]->end 3549 # 3550 # That means that $i points to the first range in the range list 3551 # that could possibly be affected by this operation. We still don't 3552 # know if the start of the input range is within r[$i], or if it 3553 # points to empty space between r[$i-1] and r[$i]. 3554 trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace; 3555 3556 # Special case the insertion of data that is not to replace any 3557 # existing data. 3558 if ($replace == $NO) { # If $NO, has to be operation '+' 3559 #local $to_trace = 1 if main::DEBUG; 3560 trace "Doesn't replace" if main::DEBUG && $to_trace; 3561 3562 # Here, the new range is to take effect only on those code points 3563 # that aren't already in an existing range. This can be done by 3564 # looking through the existing range list and finding the gaps in 3565 # the ranges that this new range affects, and then calling this 3566 # function recursively on each of those gaps, leaving untouched 3567 # anything already in the list. Gather up a list of the changed 3568 # gaps first so that changes to the internal state as new ranges 3569 # are added won't be a problem. 3570 my @gap_list; 3571 3572 # First, if the starting point of the input range is outside an 3573 # existing one, there is a gap from there to the beginning of the 3574 # existing range -- add a span to fill the part that this new 3575 # range occupies 3576 if ($start < $r->[$i]->start) { 3577 push @gap_list, Range->new($start, 3578 main::min($end, 3579 $r->[$i]->start - 1), 3580 Type => $type); 3581 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace; 3582 } 3583 3584 # Then look through the range list for other gaps until we reach 3585 # the highest range affected by the input one. 3586 my $j; 3587 for ($j = $i+1; $j < $range_list_size; $j++) { 3588 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace; 3589 last if $end < $r->[$j]->start; 3590 3591 # If there is a gap between when this range starts and the 3592 # previous one ends, add a span to fill it. Note that just 3593 # because there are two ranges doesn't mean there is a 3594 # non-zero gap between them. It could be that they have 3595 # different values or types 3596 if ($r->[$j-1]->end + 1 != $r->[$j]->start) { 3597 push @gap_list, 3598 Range->new($r->[$j-1]->end + 1, 3599 $r->[$j]->start - 1, 3600 Type => $type); 3601 trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace; 3602 } 3603 } 3604 3605 # Here, we have either found an existing range in the range list, 3606 # beyond the area affected by the input one, or we fell off the 3607 # end of the loop because the input range affects the whole rest 3608 # of the range list. In either case, $j is 1 higher than the 3609 # highest affected range. If $j == $i, it means that there are no 3610 # affected ranges, that the entire insertion is in the gap between 3611 # r[$i-1], and r[$i], which we already have taken care of before 3612 # the loop. 3613 # On the other hand, if there are affected ranges, it might be 3614 # that there is a gap that needs filling after the final such 3615 # range to the end of the input range 3616 if ($r->[$j-1]->end < $end) { 3617 push @gap_list, Range->new(main::max($start, 3618 $r->[$j-1]->end + 1), 3619 $end, 3620 Type => $type); 3621 trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace; 3622 } 3623 3624 # Call recursively to fill in all the gaps. 3625 foreach my $gap (@gap_list) { 3626 $self->_add_delete($operation, 3627 $gap->start, 3628 $gap->end, 3629 $value, 3630 Type => $type); 3631 } 3632 3633 return; 3634 } 3635 3636 # Here, we have taken care of the case where $replace is $NO. 3637 # Remember that here, r[$i-1]->end < $start <= r[$i]->end 3638 # If inserting a multiple record, this is where it goes, before the 3639 # first (if any) existing one if inserting LIFO. (If this is to go 3640 # afterwards, FIFO, we below move the pointer to there.) These imply 3641 # an insertion, and no change to any existing ranges. Note that $i 3642 # can be -1 if this new range doesn't actually duplicate any existing, 3643 # and comes at the beginning of the list. 3644 if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) { 3645 3646 if ($start != $end) { 3647 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point. No action taken."); 3648 return; 3649 } 3650 3651 # If the new code point is within a current range ... 3652 if ($end >= $r->[$i]->start) { 3653 3654 # Don't add an exact duplicate, as it isn't really a multiple 3655 my $existing_value = $r->[$i]->value; 3656 my $existing_type = $r->[$i]->type; 3657 return if $value eq $existing_value && $type eq $existing_type; 3658 3659 # If the multiple value is part of an existing range, we want 3660 # to split up that range, so that only the single code point 3661 # is affected. To do this, we first call ourselves 3662 # recursively to delete that code point from the table, having 3663 # preserved its current data above. Then we call ourselves 3664 # recursively again to add the new multiple, which we know by 3665 # the test just above is different than the current code 3666 # point's value, so it will become a range containing a single 3667 # code point: just itself. Finally, we add back in the 3668 # pre-existing code point, which will again be a single code 3669 # point range. Because 'i' likely will have changed as a 3670 # result of these operations, we can't just continue on, but 3671 # do this operation recursively as well. If we are inserting 3672 # LIFO, the pre-existing code point needs to go after the new 3673 # one, so use MULTIPLE_AFTER; and vice versa. 3674 if ($r->[$i]->start != $r->[$i]->end) { 3675 $self->_add_delete('-', $start, $end, ""); 3676 $self->_add_delete('+', $start, $end, $value, Type => $type); 3677 return $self->_add_delete('+', 3678 $start, $end, 3679 $existing_value, 3680 Type => $existing_type, 3681 Replace => ($replace == $MULTIPLE_BEFORE) 3682 ? $MULTIPLE_AFTER 3683 : $MULTIPLE_BEFORE); 3684 } 3685 } 3686 3687 # If to place this new record after, move to beyond all existing 3688 # ones; but don't add this one if identical to any of them, as it 3689 # isn't really a multiple. This leaves the original order, so 3690 # that the current request is ignored. The reasoning is that the 3691 # previous request that wanted this record to have high priority 3692 # should have precedence. 3693 if ($replace == $MULTIPLE_AFTER) { 3694 while ($i < @$r && $r->[$i]->start == $start) { 3695 return if $value eq $r->[$i]->value 3696 && $type eq $r->[$i]->type; 3697 $i++; 3698 } 3699 } 3700 else { 3701 # If instead we are to place this new record before any 3702 # existing ones, remove any identical ones that come after it. 3703 # This changes the existing order so that the new one is 3704 # first, as is being requested. 3705 for (my $j = $i + 1; 3706 $j < @$r && $r->[$j]->start == $start; 3707 $j++) 3708 { 3709 if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) { 3710 splice @$r, $j, 1; 3711 last; # There should only be one instance, so no 3712 # need to keep looking 3713 } 3714 } 3715 } 3716 3717 trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace; 3718 my @return = splice @$r, 3719 $i, 3720 0, 3721 Range->new($start, 3722 $end, 3723 Value => $value, 3724 Type => $type); 3725 if (main::DEBUG && $to_trace) { 3726 trace "After splice:"; 3727 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2; 3728 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1; 3729 trace "i =[", $i, "]", $r->[$i] if $i >= 0; 3730 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1; 3731 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2; 3732 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3; 3733 } 3734 return @return; 3735 } 3736 3737 # Here, we have taken care of $NO and $MULTIPLE_foo replaces. This 3738 # leaves delete, insert, and replace either unconditionally or if not 3739 # equivalent. $i still points to the first potential affected range. 3740 # Now find the highest range affected, which will determine the length 3741 # parameter to splice. (The input range can span multiple existing 3742 # ones.) If this isn't a deletion, while we are looking through the 3743 # range list, see also if this is a replacement rather than a clean 3744 # insertion; that is if it will change the values of at least one 3745 # existing range. Start off assuming it is an insert, until find it 3746 # isn't. 3747 my $clean_insert = $operation eq '+'; 3748 my $j; # This will point to the highest affected range 3749 3750 # For non-zero types, the standard form is the value itself; 3751 my $standard_form = ($type) ? $value : main::standardize($value); 3752 3753 for ($j = $i; $j < $range_list_size; $j++) { 3754 trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace; 3755 3756 # If find a range that it doesn't overlap into, we can stop 3757 # searching 3758 last if $end < $r->[$j]->start; 3759 3760 # Here, overlaps the range at $j. If the values don't match, 3761 # and so far we think this is a clean insertion, it becomes a 3762 # non-clean insertion, i.e., a 'change' or 'replace' instead. 3763 if ($clean_insert) { 3764 if ($r->[$j]->standard_form ne $standard_form) { 3765 $clean_insert = 0; 3766 if ($replace == $CROAK) { 3767 main::croak("The range to add " 3768 . sprintf("%04X", $start) 3769 . '-' 3770 . sprintf("%04X", $end) 3771 . " with value '$value' overlaps an existing range $r->[$j]"); 3772 } 3773 } 3774 else { 3775 3776 # Here, the two values are essentially the same. If the 3777 # two are actually identical, replacing wouldn't change 3778 # anything so skip it. 3779 my $pre_existing = $r->[$j]->value; 3780 if ($pre_existing ne $value) { 3781 3782 # Here the new and old standardized values are the 3783 # same, but the non-standardized values aren't. If 3784 # replacing unconditionally, then replace 3785 if( $replace == $UNCONDITIONALLY) { 3786 $clean_insert = 0; 3787 } 3788 else { 3789 3790 # Here, are replacing conditionally. Decide to 3791 # replace or not based on which appears to look 3792 # the "nicest". If one is mixed case and the 3793 # other isn't, choose the mixed case one. 3794 my $new_mixed = $value =~ /[A-Z]/ 3795 && $value =~ /[a-z]/; 3796 my $old_mixed = $pre_existing =~ /[A-Z]/ 3797 && $pre_existing =~ /[a-z]/; 3798 3799 if ($old_mixed != $new_mixed) { 3800 $clean_insert = 0 if $new_mixed; 3801 if (main::DEBUG && $to_trace) { 3802 if ($clean_insert) { 3803 trace "Retaining $pre_existing over $value"; 3804 } 3805 else { 3806 trace "Replacing $pre_existing with $value"; 3807 } 3808 } 3809 } 3810 else { 3811 3812 # Here casing wasn't different between the two. 3813 # If one has hyphens or underscores and the 3814 # other doesn't, choose the one with the 3815 # punctuation. 3816 my $new_punct = $value =~ /[-_]/; 3817 my $old_punct = $pre_existing =~ /[-_]/; 3818 3819 if ($old_punct != $new_punct) { 3820 $clean_insert = 0 if $new_punct; 3821 if (main::DEBUG && $to_trace) { 3822 if ($clean_insert) { 3823 trace "Retaining $pre_existing over $value"; 3824 } 3825 else { 3826 trace "Replacing $pre_existing with $value"; 3827 } 3828 } 3829 } # else existing one is just as "good"; 3830 # retain it to save cycles. 3831 } 3832 } 3833 } 3834 } 3835 } 3836 } # End of loop looking for highest affected range. 3837 3838 # Here, $j points to one beyond the highest range that this insertion 3839 # affects (hence to beyond the range list if that range is the final 3840 # one in the range list). 3841 3842 # The splice length is all the affected ranges. Get it before 3843 # subtracting, for efficiency, so we don't have to later add 1. 3844 my $length = $j - $i; 3845 3846 $j--; # $j now points to the highest affected range. 3847 trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace; 3848 3849 # Here, have taken care of $NO and $MULTIPLE_foo replaces. 3850 # $j points to the highest affected range. But it can be < $i or even 3851 # -1. These happen only if the insertion is entirely in the gap 3852 # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop 3853 # above exited first time through with $end < $r->[$i]->start. (And 3854 # then we subtracted one from j) This implies also that $start < 3855 # $r->[$i]->start, but we know from above that $r->[$i-1]->end < 3856 # $start, so the entire input range is in the gap. 3857 if ($j < $i) { 3858 3859 # Here the entire input range is in the gap before $i. 3860 3861 if (main::DEBUG && $to_trace) { 3862 if ($i) { 3863 trace "Entire range is between $r->[$i-1] and $r->[$i]"; 3864 } 3865 else { 3866 trace "Entire range is before $r->[$i]"; 3867 } 3868 } 3869 return if $operation ne '+'; # Deletion of a non-existent range is 3870 # a no-op 3871 } 3872 else { 3873 3874 # Here part of the input range is not in the gap before $i. Thus, 3875 # there is at least one affected one, and $j points to the highest 3876 # such one. 3877 3878 # At this point, here is the situation: 3879 # This is not an insertion of a multiple, nor of tentative ($NO) 3880 # data. 3881 # $i points to the first element in the current range list that 3882 # may be affected by this operation. In fact, we know 3883 # that the range at $i is affected because we are in 3884 # the else branch of this 'if' 3885 # $j points to the highest affected range. 3886 # In other words, 3887 # r[$i-1]->end < $start <= r[$i]->end 3888 # And: 3889 # r[$i-1]->end < $start <= $end <= r[$j]->end 3890 # 3891 # Also: 3892 # $clean_insert is a boolean which is set true if and only if 3893 # this is a "clean insertion", i.e., not a change nor a 3894 # deletion (multiple was handled above). 3895 3896 # We now have enough information to decide if this call is a no-op 3897 # or not. It is a no-op if this is an insertion of already 3898 # existing data. 3899 3900 if (main::DEBUG && $to_trace && $clean_insert 3901 && $i == $j 3902 && $start >= $r->[$i]->start) 3903 { 3904 trace "no-op"; 3905 } 3906 return if $clean_insert 3907 && $i == $j # more than one affected range => not no-op 3908 3909 # Here, r[$i-1]->end < $start <= $end <= r[$i]->end 3910 # Further, $start and/or $end is >= r[$i]->start 3911 # The test below hence guarantees that 3912 # r[$i]->start < $start <= $end <= r[$i]->end 3913 # This means the input range is contained entirely in 3914 # the one at $i, so is a no-op 3915 && $start >= $r->[$i]->start; 3916 } 3917 3918 # Here, we know that some action will have to be taken. We have 3919 # calculated the offset and length (though adjustments may be needed) 3920 # for the splice. Now start constructing the replacement list. 3921 my @replacement; 3922 my $splice_start = $i; 3923 3924 my $extends_below; 3925 my $extends_above; 3926 3927 # See if should extend any adjacent ranges. 3928 if ($operation eq '-') { # Don't extend deletions 3929 $extends_below = $extends_above = 0; 3930 } 3931 else { # Here, should extend any adjacent ranges. See if there are 3932 # any. 3933 $extends_below = ($i > 0 3934 # can't extend unless adjacent 3935 && $r->[$i-1]->end == $start -1 3936 # can't extend unless are same standard value 3937 && $r->[$i-1]->standard_form eq $standard_form 3938 # can't extend unless share type 3939 && $r->[$i-1]->type == $type); 3940 $extends_above = ($j+1 < $range_list_size 3941 && $r->[$j+1]->start == $end +1 3942 && $r->[$j+1]->standard_form eq $standard_form 3943 && $r->[$j+1]->type == $type); 3944 } 3945 if ($extends_below && $extends_above) { # Adds to both 3946 $splice_start--; # start replace at element below 3947 $length += 2; # will replace on both sides 3948 trace "Extends both below and above ranges" if main::DEBUG && $to_trace; 3949 3950 # The result will fill in any gap, replacing both sides, and 3951 # create one large range. 3952 @replacement = Range->new($r->[$i-1]->start, 3953 $r->[$j+1]->end, 3954 Value => $value, 3955 Type => $type); 3956 } 3957 else { 3958 3959 # Here we know that the result won't just be the conglomeration of 3960 # a new range with both its adjacent neighbors. But it could 3961 # extend one of them. 3962 3963 if ($extends_below) { 3964 3965 # Here the new element adds to the one below, but not to the 3966 # one above. If inserting, and only to that one range, can 3967 # just change its ending to include the new one. 3968 if ($length == 0 && $clean_insert) { 3969 $r->[$i-1]->set_end($end); 3970 trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace; 3971 return; 3972 } 3973 else { 3974 trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace; 3975 $splice_start--; # start replace at element below 3976 $length++; # will replace the element below 3977 $start = $r->[$i-1]->start; 3978 } 3979 } 3980 elsif ($extends_above) { 3981 3982 # Here the new element adds to the one above, but not below. 3983 # Mirror the code above 3984 if ($length == 0 && $clean_insert) { 3985 $r->[$j+1]->set_start($start); 3986 trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace; 3987 return; 3988 } 3989 else { 3990 trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace; 3991 $length++; # will replace the element above 3992 $end = $r->[$j+1]->end; 3993 } 3994 } 3995 3996 trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace; 3997 3998 # Finally, here we know there will have to be a splice. 3999 # If the change or delete affects only the highest portion of the 4000 # first affected range, the range will have to be split. The 4001 # splice will remove the whole range, but will replace it by a new 4002 # range containing just the unaffected part. So, in this case, 4003 # add to the replacement list just this unaffected portion. 4004 if (! $extends_below 4005 && $start > $r->[$i]->start && $start <= $r->[$i]->end) 4006 { 4007 push @replacement, 4008 Range->new($r->[$i]->start, 4009 $start - 1, 4010 Value => $r->[$i]->value, 4011 Type => $r->[$i]->type); 4012 } 4013 4014 # In the case of an insert or change, but not a delete, we have to 4015 # put in the new stuff; this comes next. 4016 if ($operation eq '+') { 4017 push @replacement, Range->new($start, 4018 $end, 4019 Value => $value, 4020 Type => $type); 4021 } 4022 4023 trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i; 4024 #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace; 4025 4026 # And finally, if we're changing or deleting only a portion of the 4027 # highest affected range, it must be split, as the lowest one was. 4028 if (! $extends_above 4029 && $j >= 0 # Remember that j can be -1 if before first 4030 # current element 4031 && $end >= $r->[$j]->start 4032 && $end < $r->[$j]->end) 4033 { 4034 push @replacement, 4035 Range->new($end + 1, 4036 $r->[$j]->end, 4037 Value => $r->[$j]->value, 4038 Type => $r->[$j]->type); 4039 } 4040 } 4041 4042 # And do the splice, as calculated above 4043 if (main::DEBUG && $to_trace) { 4044 trace "replacing $length element(s) at $i with "; 4045 foreach my $replacement (@replacement) { 4046 trace " $replacement"; 4047 } 4048 trace "Before splice:"; 4049 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2; 4050 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1; 4051 trace "i =[", $i, "]", $r->[$i]; 4052 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1; 4053 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2; 4054 } 4055 4056 my @return = splice @$r, $splice_start, $length, @replacement; 4057 4058 if (main::DEBUG && $to_trace) { 4059 trace "After splice:"; 4060 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2; 4061 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1; 4062 trace "i =[", $i, "]", $r->[$i]; 4063 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1; 4064 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2; 4065 trace "removed ", @return if @return; 4066 } 4067 4068 # An actual deletion could have changed the maximum in the list. 4069 # There was no deletion if the splice didn't return something, but 4070 # otherwise recalculate it. This is done too rarely to worry about 4071 # performance. 4072 if ($operation eq '-' && @return) { 4073 $max{$addr} = $r->[-1]->end; 4074 } 4075 return @return; 4076 } 4077 4078 sub reset_each_range { # reset the iterator for each_range(); 4079 my $self = shift; 4080 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4081 4082 no overloading; 4083 undef $each_range_iterator{pack 'J', $self}; 4084 return; 4085 } 4086 4087 sub each_range { 4088 # Iterate over each range in a range list. Results are undefined if 4089 # the range list is changed during the iteration. 4090 4091 my $self = shift; 4092 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4093 4094 my $addr = do { no overloading; pack 'J', $self; }; 4095 4096 return if $self->is_empty; 4097 4098 $each_range_iterator{$addr} = -1 4099 if ! defined $each_range_iterator{$addr}; 4100 $each_range_iterator{$addr}++; 4101 return $ranges{$addr}->[$each_range_iterator{$addr}] 4102 if $each_range_iterator{$addr} < @{$ranges{$addr}}; 4103 undef $each_range_iterator{$addr}; 4104 return; 4105 } 4106 4107 sub count { # Returns count of code points in range list 4108 my $self = shift; 4109 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4110 4111 my $addr = do { no overloading; pack 'J', $self; }; 4112 4113 my $count = 0; 4114 foreach my $range (@{$ranges{$addr}}) { 4115 $count += $range->end - $range->start + 1; 4116 } 4117 return $count; 4118 } 4119 4120 sub delete_range { # Delete a range 4121 my $self = shift; 4122 my $start = shift; 4123 my $end = shift; 4124 4125 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4126 4127 return $self->_add_delete('-', $start, $end, ""); 4128 } 4129 4130 sub is_empty { # Returns boolean as to if a range list is empty 4131 my $self = shift; 4132 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4133 4134 no overloading; 4135 return scalar @{$ranges{pack 'J', $self}} == 0; 4136 } 4137 4138 sub hash { 4139 # Quickly returns a scalar suitable for separating tables into 4140 # buckets, i.e. it is a hash function of the contents of a table, so 4141 # there are relatively few conflicts. 4142 4143 my $self = shift; 4144 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4145 4146 my $addr = do { no overloading; pack 'J', $self; }; 4147 4148 # These are quickly computable. Return looks like 'min..max;count' 4149 return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}}; 4150 } 4151} # End closure for _Range_List_Base 4152 4153package Range_List; 4154use base '_Range_List_Base'; 4155 4156# A Range_List is a range list for match tables; i.e. the range values are 4157# not significant. Thus a number of operations can be safely added to it, 4158# such as inversion, intersection. Note that union is also an unsafe 4159# operation when range values are cared about, and that method is in the base 4160# class, not here. But things are set up so that that method is callable only 4161# during initialization. Only in this derived class, is there an operation 4162# that combines two tables. A Range_Map can thus be used to initialize a 4163# Range_List, and its mappings will be in the list, but are not significant to 4164# this class. 4165 4166sub trace { return main::trace(@_); } 4167 4168{ # Closure 4169 4170 use overload 4171 fallback => 0, 4172 '+' => sub { my $self = shift; 4173 my $other = shift; 4174 4175 return $self->_union($other) 4176 }, 4177 '&' => sub { my $self = shift; 4178 my $other = shift; 4179 4180 return $self->_intersect($other, 0); 4181 }, 4182 '~' => "_invert", 4183 '-' => "_subtract", 4184 ; 4185 4186 sub _invert { 4187 # Returns a new Range_List that gives all code points not in $self. 4188 4189 my $self = shift; 4190 4191 my $new = Range_List->new; 4192 4193 # Go through each range in the table, finding the gaps between them 4194 my $max = -1; # Set so no gap before range beginning at 0 4195 for my $range ($self->ranges) { 4196 my $start = $range->start; 4197 my $end = $range->end; 4198 4199 # If there is a gap before this range, the inverse will contain 4200 # that gap. 4201 if ($start > $max + 1) { 4202 $new->add_range($max + 1, $start - 1); 4203 } 4204 $max = $end; 4205 } 4206 4207 # And finally, add the gap from the end of the table to the max 4208 # possible code point 4209 if ($max < $MAX_UNICODE_CODEPOINT) { 4210 $new->add_range($max + 1, $MAX_UNICODE_CODEPOINT); 4211 } 4212 return $new; 4213 } 4214 4215 sub _subtract { 4216 # Returns a new Range_List with the argument deleted from it. The 4217 # argument can be a single code point, a range, or something that has 4218 # a range, with the _range_list() method on it returning them 4219 4220 my $self = shift; 4221 my $other = shift; 4222 my $reversed = shift; 4223 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4224 4225 if ($reversed) { 4226 Carp::my_carp_bug("Can't cope with a " 4227 . __PACKAGE__ 4228 . " being the second parameter in a '-'. Subtraction ignored."); 4229 return $self; 4230 } 4231 4232 my $new = Range_List->new(Initialize => $self); 4233 4234 if (! ref $other) { # Single code point 4235 $new->delete_range($other, $other); 4236 } 4237 elsif ($other->isa('Range')) { 4238 $new->delete_range($other->start, $other->end); 4239 } 4240 elsif ($other->can('_range_list')) { 4241 foreach my $range ($other->_range_list->ranges) { 4242 $new->delete_range($range->start, $range->end); 4243 } 4244 } 4245 else { 4246 Carp::my_carp_bug("Can't cope with a " 4247 . ref($other) 4248 . " argument to '-'. Subtraction ignored." 4249 ); 4250 return $self; 4251 } 4252 4253 return $new; 4254 } 4255 4256 sub _intersect { 4257 # Returns either a boolean giving whether the two inputs' range lists 4258 # intersect (overlap), or a new Range_List containing the intersection 4259 # of the two lists. The optional final parameter being true indicates 4260 # to do the check instead of the intersection. 4261 4262 my $a_object = shift; 4263 my $b_object = shift; 4264 my $check_if_overlapping = shift; 4265 $check_if_overlapping = 0 unless defined $check_if_overlapping; 4266 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4267 4268 if (! defined $b_object) { 4269 my $message = ""; 4270 $message .= $a_object->_owner_name_of if defined $a_object; 4271 Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done."); 4272 return; 4273 } 4274 4275 # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b ) 4276 # Thus the intersection could be much more simply be written: 4277 # return ~(~$a_object + ~$b_object); 4278 # But, this is slower, and when taking the inverse of a large 4279 # range_size_1 table, back when such tables were always stored that 4280 # way, it became prohibitively slow, hence the code was changed to the 4281 # below 4282 4283 if ($b_object->isa('Range')) { 4284 $b_object = Range_List->new(Initialize => $b_object, 4285 Owner => $a_object->_owner_name_of); 4286 } 4287 $b_object = $b_object->_range_list if $b_object->can('_range_list'); 4288 4289 my @a_ranges = $a_object->ranges; 4290 my @b_ranges = $b_object->ranges; 4291 4292 #local $to_trace = 1 if main::DEBUG; 4293 trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace; 4294 4295 # Start with the first range in each list 4296 my $a_i = 0; 4297 my $range_a = $a_ranges[$a_i]; 4298 my $b_i = 0; 4299 my $range_b = $b_ranges[$b_i]; 4300 4301 my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of) 4302 if ! $check_if_overlapping; 4303 4304 # If either list is empty, there is no intersection and no overlap 4305 if (! defined $range_a || ! defined $range_b) { 4306 return $check_if_overlapping ? 0 : $new; 4307 } 4308 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace; 4309 4310 # Otherwise, must calculate the intersection/overlap. Start with the 4311 # very first code point in each list 4312 my $a = $range_a->start; 4313 my $b = $range_b->start; 4314 4315 # Loop through all the ranges of each list; in each iteration, $a and 4316 # $b are the current code points in their respective lists 4317 while (1) { 4318 4319 # If $a and $b are the same code point, ... 4320 if ($a == $b) { 4321 4322 # it means the lists overlap. If just checking for overlap 4323 # know the answer now, 4324 return 1 if $check_if_overlapping; 4325 4326 # The intersection includes this code point plus anything else 4327 # common to both current ranges. 4328 my $start = $a; 4329 my $end = main::min($range_a->end, $range_b->end); 4330 if (! $check_if_overlapping) { 4331 trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace; 4332 $new->add_range($start, $end); 4333 } 4334 4335 # Skip ahead to the end of the current intersect 4336 $a = $b = $end; 4337 4338 # If the current intersect ends at the end of either range (as 4339 # it must for at least one of them), the next possible one 4340 # will be the beginning code point in it's list's next range. 4341 if ($a == $range_a->end) { 4342 $range_a = $a_ranges[++$a_i]; 4343 last unless defined $range_a; 4344 $a = $range_a->start; 4345 } 4346 if ($b == $range_b->end) { 4347 $range_b = $b_ranges[++$b_i]; 4348 last unless defined $range_b; 4349 $b = $range_b->start; 4350 } 4351 4352 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace; 4353 } 4354 elsif ($a < $b) { 4355 4356 # Not equal, but if the range containing $a encompasses $b, 4357 # change $a to be the middle of the range where it does equal 4358 # $b, so the next iteration will get the intersection 4359 if ($range_a->end >= $b) { 4360 $a = $b; 4361 } 4362 else { 4363 4364 # Here, the current range containing $a is entirely below 4365 # $b. Go try to find a range that could contain $b. 4366 $a_i = $a_object->_search_ranges($b); 4367 4368 # If no range found, quit. 4369 last unless defined $a_i; 4370 4371 # The search returns $a_i, such that 4372 # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end 4373 # Set $a to the beginning of this new range, and repeat. 4374 $range_a = $a_ranges[$a_i]; 4375 $a = $range_a->start; 4376 } 4377 } 4378 else { # Here, $b < $a. 4379 4380 # Mirror image code to the leg just above 4381 if ($range_b->end >= $a) { 4382 $b = $a; 4383 } 4384 else { 4385 $b_i = $b_object->_search_ranges($a); 4386 last unless defined $b_i; 4387 $range_b = $b_ranges[$b_i]; 4388 $b = $range_b->start; 4389 } 4390 } 4391 } # End of looping through ranges. 4392 4393 # Intersection fully computed, or now know that there is no overlap 4394 return $check_if_overlapping ? 0 : $new; 4395 } 4396 4397 sub overlaps { 4398 # Returns boolean giving whether the two arguments overlap somewhere 4399 4400 my $self = shift; 4401 my $other = shift; 4402 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4403 4404 return $self->_intersect($other, 1); 4405 } 4406 4407 sub add_range { 4408 # Add a range to the list. 4409 4410 my $self = shift; 4411 my $start = shift; 4412 my $end = shift; 4413 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4414 4415 return $self->_add_delete('+', $start, $end, ""); 4416 } 4417 4418 sub matches_identically_to { 4419 # Return a boolean as to whether or not two Range_Lists match identical 4420 # sets of code points. 4421 4422 my $self = shift; 4423 my $other = shift; 4424 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4425 4426 # These are ordered in increasing real time to figure out (at least 4427 # until a patch changes that and doesn't change this) 4428 return 0 if $self->max != $other->max; 4429 return 0 if $self->min != $other->min; 4430 return 0 if $self->range_count != $other->range_count; 4431 return 0 if $self->count != $other->count; 4432 4433 # Here they could be identical because all the tests above passed. 4434 # The loop below is somewhat simpler since we know they have the same 4435 # number of elements. Compare range by range, until reach the end or 4436 # find something that differs. 4437 my @a_ranges = $self->ranges; 4438 my @b_ranges = $other->ranges; 4439 for my $i (0 .. @a_ranges - 1) { 4440 my $a = $a_ranges[$i]; 4441 my $b = $b_ranges[$i]; 4442 trace "self $a; other $b" if main::DEBUG && $to_trace; 4443 return 0 if ! defined $b 4444 || $a->start != $b->start 4445 || $a->end != $b->end; 4446 } 4447 return 1; 4448 } 4449 4450 sub is_code_point_usable { 4451 # This used only for making the test script. See if the input 4452 # proposed trial code point is one that Perl will handle. If second 4453 # parameter is 0, it won't select some code points for various 4454 # reasons, noted below. 4455 4456 my $code = shift; 4457 my $try_hard = shift; 4458 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4459 4460 return 0 if $code < 0; # Never use a negative 4461 4462 # shun null. I'm (khw) not sure why this was done, but NULL would be 4463 # the character very frequently used. 4464 return $try_hard if $code == 0x0000; 4465 4466 # shun non-character code points. 4467 return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF; 4468 return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF 4469 4470 return $try_hard if $code > $MAX_UNICODE_CODEPOINT; # keep in range 4471 return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate 4472 4473 return 1; 4474 } 4475 4476 sub get_valid_code_point { 4477 # Return a code point that's part of the range list. Returns nothing 4478 # if the table is empty or we can't find a suitable code point. This 4479 # used only for making the test script. 4480 4481 my $self = shift; 4482 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4483 4484 my $addr = do { no overloading; pack 'J', $self; }; 4485 4486 # On first pass, don't choose less desirable code points; if no good 4487 # one is found, repeat, allowing a less desirable one to be selected. 4488 for my $try_hard (0, 1) { 4489 4490 # Look through all the ranges for a usable code point. 4491 for my $set (reverse $self->ranges) { 4492 4493 # Try the edge cases first, starting with the end point of the 4494 # range. 4495 my $end = $set->end; 4496 return $end if is_code_point_usable($end, $try_hard); 4497 4498 # End point didn't, work. Start at the beginning and try 4499 # every one until find one that does work. 4500 for my $trial ($set->start .. $end - 1) { 4501 return $trial if is_code_point_usable($trial, $try_hard); 4502 } 4503 } 4504 } 4505 return (); # If none found, give up. 4506 } 4507 4508 sub get_invalid_code_point { 4509 # Return a code point that's not part of the table. Returns nothing 4510 # if the table covers all code points or a suitable code point can't 4511 # be found. This used only for making the test script. 4512 4513 my $self = shift; 4514 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4515 4516 # Just find a valid code point of the inverse, if any. 4517 return Range_List->new(Initialize => ~ $self)->get_valid_code_point; 4518 } 4519} # end closure for Range_List 4520 4521package Range_Map; 4522use base '_Range_List_Base'; 4523 4524# A Range_Map is a range list in which the range values (called maps) are 4525# significant, and hence shouldn't be manipulated by our other code, which 4526# could be ambiguous or lose things. For example, in taking the union of two 4527# lists, which share code points, but which have differing values, which one 4528# has precedence in the union? 4529# It turns out that these operations aren't really necessary for map tables, 4530# and so this class was created to make sure they aren't accidentally 4531# applied to them. 4532 4533{ # Closure 4534 4535 sub add_map { 4536 # Add a range containing a mapping value to the list 4537 4538 my $self = shift; 4539 # Rest of parameters passed on 4540 4541 return $self->_add_delete('+', @_); 4542 } 4543 4544 sub add_duplicate { 4545 # Adds entry to a range list which can duplicate an existing entry 4546 4547 my $self = shift; 4548 my $code_point = shift; 4549 my $value = shift; 4550 my %args = @_; 4551 my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE; 4552 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 4553 4554 return $self->add_map($code_point, $code_point, 4555 $value, Replace => $replace); 4556 } 4557} # End of closure for package Range_Map 4558 4559package _Base_Table; 4560 4561# A table is the basic data structure that gets written out into a file for 4562# use by the Perl core. This is the abstract base class implementing the 4563# common elements from the derived ones. A list of the methods to be 4564# furnished by an implementing class is just after the constructor. 4565 4566sub standardize { return main::standardize($_[0]); } 4567sub trace { return main::trace(@_); } 4568 4569{ # Closure 4570 4571 main::setup_package(); 4572 4573 my %range_list; 4574 # Object containing the ranges of the table. 4575 main::set_access('range_list', \%range_list, 'p_r', 'p_s'); 4576 4577 my %full_name; 4578 # The full table name. 4579 main::set_access('full_name', \%full_name, 'r'); 4580 4581 my %name; 4582 # The table name, almost always shorter 4583 main::set_access('name', \%name, 'r'); 4584 4585 my %short_name; 4586 # The shortest of all the aliases for this table, with underscores removed 4587 main::set_access('short_name', \%short_name); 4588 4589 my %nominal_short_name_length; 4590 # The length of short_name before removing underscores 4591 main::set_access('nominal_short_name_length', 4592 \%nominal_short_name_length); 4593 4594 my %complete_name; 4595 # The complete name, including property. 4596 main::set_access('complete_name', \%complete_name, 'r'); 4597 4598 my %property; 4599 # Parent property this table is attached to. 4600 main::set_access('property', \%property, 'r'); 4601 4602 my %aliases; 4603 # Ordered list of alias objects of the table's name. The first ones in 4604 # the list are output first in comments 4605 main::set_access('aliases', \%aliases, 'readable_array'); 4606 4607 my %comment; 4608 # A comment associated with the table for human readers of the files 4609 main::set_access('comment', \%comment, 's'); 4610 4611 my %description; 4612 # A comment giving a short description of the table's meaning for human 4613 # readers of the files. 4614 main::set_access('description', \%description, 'readable_array'); 4615 4616 my %note; 4617 # A comment giving a short note about the table for human readers of the 4618 # files. 4619 main::set_access('note', \%note, 'readable_array'); 4620 4621 my %fate; 4622 # Enum; there are a number of possibilities for what happens to this 4623 # table: it could be normal, or suppressed, or not for external use. See 4624 # values at definition for $SUPPRESSED. 4625 main::set_access('fate', \%fate, 'r'); 4626 4627 my %find_table_from_alias; 4628 # The parent property passes this pointer to a hash which this class adds 4629 # all its aliases to, so that the parent can quickly take an alias and 4630 # find this table. 4631 main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r'); 4632 4633 my %locked; 4634 # After this table is made equivalent to another one; we shouldn't go 4635 # changing the contents because that could mean it's no longer equivalent 4636 main::set_access('locked', \%locked, 'r'); 4637 4638 my %file_path; 4639 # This gives the final path to the file containing the table. Each 4640 # directory in the path is an element in the array 4641 main::set_access('file_path', \%file_path, 'readable_array'); 4642 4643 my %status; 4644 # What is the table's status, normal, $OBSOLETE, etc. Enum 4645 main::set_access('status', \%status, 'r'); 4646 4647 my %status_info; 4648 # A comment about its being obsolete, or whatever non normal status it has 4649 main::set_access('status_info', \%status_info, 'r'); 4650 4651 my %caseless_equivalent; 4652 # The table this is equivalent to under /i matching, if any. 4653 main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's'); 4654 4655 my %range_size_1; 4656 # Is the table to be output with each range only a single code point? 4657 # This is done to avoid breaking existing code that may have come to rely 4658 # on this behavior in previous versions of this program.) 4659 main::set_access('range_size_1', \%range_size_1, 'r', 's'); 4660 4661 my %perl_extension; 4662 # A boolean set iff this table is a Perl extension to the Unicode 4663 # standard. 4664 main::set_access('perl_extension', \%perl_extension, 'r'); 4665 4666 my %output_range_counts; 4667 # A boolean set iff this table is to have comments written in the 4668 # output file that contain the number of code points in the range. 4669 # The constructor can override the global flag of the same name. 4670 main::set_access('output_range_counts', \%output_range_counts, 'r'); 4671 4672 my %format; 4673 # The format of the entries of the table. This is calculated from the 4674 # data in the table (or passed in the constructor). This is an enum e.g., 4675 # $STRING_FORMAT. It is marked protected as it should not be generally 4676 # used to override calculations. 4677 main::set_access('format', \%format, 'r', 'p_s'); 4678 4679 sub new { 4680 # All arguments are key => value pairs, which you can see below, most 4681 # of which match fields documented above. Otherwise: Re_Pod_Entry, 4682 # OK_as_Filename, and Fuzzy apply to the names of the table, and are 4683 # documented in the Alias package 4684 4685 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2; 4686 4687 my $class = shift; 4688 4689 my $self = bless \do { my $anonymous_scalar }, $class; 4690 my $addr = do { no overloading; pack 'J', $self; }; 4691 4692 my %args = @_; 4693 4694 $name{$addr} = delete $args{'Name'}; 4695 $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'}; 4696 $full_name{$addr} = delete $args{'Full_Name'}; 4697 my $complete_name = $complete_name{$addr} 4698 = delete $args{'Complete_Name'}; 4699 $format{$addr} = delete $args{'Format'}; 4700 $output_range_counts{$addr} = delete $args{'Output_Range_Counts'}; 4701 $property{$addr} = delete $args{'_Property'}; 4702 $range_list{$addr} = delete $args{'_Range_List'}; 4703 $status{$addr} = delete $args{'Status'} || $NORMAL; 4704 $status_info{$addr} = delete $args{'_Status_Info'} || ""; 4705 $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0; 4706 $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0; 4707 $fate{$addr} = delete $args{'Fate'} || $ORDINARY; 4708 my $ucd = delete $args{'UCD'}; 4709 4710 my $description = delete $args{'Description'}; 4711 my $ok_as_filename = delete $args{'OK_as_Filename'}; 4712 my $loose_match = delete $args{'Fuzzy'}; 4713 my $note = delete $args{'Note'}; 4714 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'}; 4715 my $perl_extension = delete $args{'Perl_Extension'}; 4716 4717 # Shouldn't have any left over 4718 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 4719 4720 # Can't use || above because conceivably the name could be 0, and 4721 # can't use // operator in case this program gets used in Perl 5.8 4722 $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr}; 4723 $output_range_counts{$addr} = $output_range_counts if 4724 ! defined $output_range_counts{$addr}; 4725 4726 $aliases{$addr} = [ ]; 4727 $comment{$addr} = [ ]; 4728 $description{$addr} = [ ]; 4729 $note{$addr} = [ ]; 4730 $file_path{$addr} = [ ]; 4731 $locked{$addr} = ""; 4732 4733 push @{$description{$addr}}, $description if $description; 4734 push @{$note{$addr}}, $note if $note; 4735 4736 if ($fate{$addr} == $PLACEHOLDER) { 4737 4738 # A placeholder table doesn't get documented, is a perl extension, 4739 # and quite likely will be empty 4740 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry; 4741 $perl_extension = 1 if ! defined $perl_extension; 4742 $ucd = 0 if ! defined $ucd; 4743 push @tables_that_may_be_empty, $complete_name{$addr}; 4744 $self->add_comment(<<END); 4745This is a placeholder because it is not in Version $string_version of Unicode, 4746but is needed by the Perl core to work gracefully. Because it is not in this 4747version of Unicode, it will not be listed in $pod_file.pod 4748END 4749 } 4750 elsif (exists $why_suppressed{$complete_name} 4751 # Don't suppress if overridden 4752 && ! grep { $_ eq $complete_name{$addr} } 4753 @output_mapped_properties) 4754 { 4755 $fate{$addr} = $SUPPRESSED; 4756 } 4757 elsif ($fate{$addr} == $SUPPRESSED 4758 && ! exists $why_suppressed{$property{$addr}->complete_name}) 4759 { 4760 Carp::my_carp_bug("There is no current capability to set the reason for suppressing."); 4761 # perhaps Fate => [ $SUPPRESSED, "reason" ] 4762 } 4763 4764 # If hasn't set its status already, see if it is on one of the 4765 # lists of properties or tables that have particular statuses; if 4766 # not, is normal. The lists are prioritized so the most serious 4767 # ones are checked first 4768 if (! $status{$addr}) { 4769 if (exists $why_deprecated{$complete_name}) { 4770 $status{$addr} = $DEPRECATED; 4771 } 4772 elsif (exists $why_stabilized{$complete_name}) { 4773 $status{$addr} = $STABILIZED; 4774 } 4775 elsif (exists $why_obsolete{$complete_name}) { 4776 $status{$addr} = $OBSOLETE; 4777 } 4778 4779 # Existence above doesn't necessarily mean there is a message 4780 # associated with it. Use the most serious message. 4781 if ($status{$addr}) { 4782 if ($why_deprecated{$complete_name}) { 4783 $status_info{$addr} 4784 = $why_deprecated{$complete_name}; 4785 } 4786 elsif ($why_stabilized{$complete_name}) { 4787 $status_info{$addr} 4788 = $why_stabilized{$complete_name}; 4789 } 4790 elsif ($why_obsolete{$complete_name}) { 4791 $status_info{$addr} 4792 = $why_obsolete{$complete_name}; 4793 } 4794 } 4795 } 4796 4797 $perl_extension{$addr} = $perl_extension || 0; 4798 4799 # Don't list a property by default that is internal only 4800 if ($fate{$addr} > $MAP_PROXIED) { 4801 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry; 4802 $ucd = 0 if ! defined $ucd; 4803 } 4804 else { 4805 $ucd = 1 if ! defined $ucd; 4806 } 4807 4808 # By convention what typically gets printed only or first is what's 4809 # first in the list, so put the full name there for good output 4810 # clarity. Other routines rely on the full name being first on the 4811 # list 4812 $self->add_alias($full_name{$addr}, 4813 OK_as_Filename => $ok_as_filename, 4814 Fuzzy => $loose_match, 4815 Re_Pod_Entry => $make_re_pod_entry, 4816 Status => $status{$addr}, 4817 UCD => $ucd, 4818 ); 4819 4820 # Then comes the other name, if meaningfully different. 4821 if (standardize($full_name{$addr}) ne standardize($name{$addr})) { 4822 $self->add_alias($name{$addr}, 4823 OK_as_Filename => $ok_as_filename, 4824 Fuzzy => $loose_match, 4825 Re_Pod_Entry => $make_re_pod_entry, 4826 Status => $status{$addr}, 4827 UCD => $ucd, 4828 ); 4829 } 4830 4831 return $self; 4832 } 4833 4834 # Here are the methods that are required to be defined by any derived 4835 # class 4836 for my $sub (qw( 4837 handle_special_range 4838 append_to_body 4839 pre_body 4840 )) 4841 # write() knows how to write out normal ranges, but it calls 4842 # handle_special_range() when it encounters a non-normal one. 4843 # append_to_body() is called by it after it has handled all 4844 # ranges to add anything after the main portion of the table. 4845 # And finally, pre_body() is called after all this to build up 4846 # anything that should appear before the main portion of the 4847 # table. Doing it this way allows things in the middle to 4848 # affect what should appear before the main portion of the 4849 # table. 4850 { 4851 no strict "refs"; 4852 *$sub = sub { 4853 Carp::my_carp_bug( __LINE__ 4854 . ": Must create method '$sub()' for " 4855 . ref shift); 4856 return; 4857 } 4858 } 4859 4860 use overload 4861 fallback => 0, 4862 "." => \&main::_operator_dot, 4863 '!=' => \&main::_operator_not_equal, 4864 '==' => \&main::_operator_equal, 4865 ; 4866 4867 sub ranges { 4868 # Returns the array of ranges associated with this table. 4869 4870 no overloading; 4871 return $range_list{pack 'J', shift}->ranges; 4872 } 4873 4874 sub add_alias { 4875 # Add a synonym for this table. 4876 4877 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3; 4878 4879 my $self = shift; 4880 my $name = shift; # The name to add. 4881 my $pointer = shift; # What the alias hash should point to. For 4882 # map tables, this is the parent property; 4883 # for match tables, it is the table itself. 4884 4885 my %args = @_; 4886 my $loose_match = delete $args{'Fuzzy'}; 4887 4888 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'}; 4889 $make_re_pod_entry = $YES unless defined $make_re_pod_entry; 4890 4891 my $ok_as_filename = delete $args{'OK_as_Filename'}; 4892 $ok_as_filename = 1 unless defined $ok_as_filename; 4893 4894 my $status = delete $args{'Status'}; 4895 $status = $NORMAL unless defined $status; 4896 4897 # An internal name does not get documented, unless overridden by the 4898 # input. 4899 my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1); 4900 4901 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 4902 4903 # Capitalize the first letter of the alias unless it is one of the CJK 4904 # ones which specifically begins with a lower 'k'. Do this because 4905 # Unicode has varied whether they capitalize first letters or not, and 4906 # have later changed their minds and capitalized them, but not the 4907 # other way around. So do it always and avoid changes from release to 4908 # release 4909 $name = ucfirst($name) unless $name =~ /^k[A-Z]/; 4910 4911 my $addr = do { no overloading; pack 'J', $self; }; 4912 4913 # Figure out if should be loosely matched if not already specified. 4914 if (! defined $loose_match) { 4915 4916 # Is a loose_match if isn't null, and doesn't begin with an 4917 # underscore and isn't just a number 4918 if ($name ne "" 4919 && substr($name, 0, 1) ne '_' 4920 && $name !~ qr{^[0-9_.+-/]+$}) 4921 { 4922 $loose_match = 1; 4923 } 4924 else { 4925 $loose_match = 0; 4926 } 4927 } 4928 4929 # If this alias has already been defined, do nothing. 4930 return if defined $find_table_from_alias{$addr}->{$name}; 4931 4932 # That includes if it is standardly equivalent to an existing alias, 4933 # in which case, add this name to the list, so won't have to search 4934 # for it again. 4935 my $standard_name = main::standardize($name); 4936 if (defined $find_table_from_alias{$addr}->{$standard_name}) { 4937 $find_table_from_alias{$addr}->{$name} 4938 = $find_table_from_alias{$addr}->{$standard_name}; 4939 return; 4940 } 4941 4942 # Set the index hash for this alias for future quick reference. 4943 $find_table_from_alias{$addr}->{$name} = $pointer; 4944 $find_table_from_alias{$addr}->{$standard_name} = $pointer; 4945 local $to_trace = 0 if main::DEBUG; 4946 trace "adding alias $name to $pointer" if main::DEBUG && $to_trace; 4947 trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace; 4948 4949 4950 # Put the new alias at the end of the list of aliases unless the final 4951 # element begins with an underscore (meaning it is for internal perl 4952 # use) or is all numeric, in which case, put the new one before that 4953 # one. This floats any all-numeric or underscore-beginning aliases to 4954 # the end. This is done so that they are listed last in output lists, 4955 # to encourage the user to use a better name (either more descriptive 4956 # or not an internal-only one) instead. This ordering is relied on 4957 # implicitly elsewhere in this program, like in short_name() 4958 my $list = $aliases{$addr}; 4959 my $insert_position = (@$list == 0 4960 || (substr($list->[-1]->name, 0, 1) ne '_' 4961 && $list->[-1]->name =~ /\D/)) 4962 ? @$list 4963 : @$list - 1; 4964 splice @$list, 4965 $insert_position, 4966 0, 4967 Alias->new($name, $loose_match, $make_re_pod_entry, 4968 $ok_as_filename, $status, $ucd); 4969 4970 # This name may be shorter than any existing ones, so clear the cache 4971 # of the shortest, so will have to be recalculated. 4972 no overloading; 4973 undef $short_name{pack 'J', $self}; 4974 return; 4975 } 4976 4977 sub short_name { 4978 # Returns a name suitable for use as the base part of a file name. 4979 # That is, shorter wins. It can return undef if there is no suitable 4980 # name. The name has all non-essential underscores removed. 4981 4982 # The optional second parameter is a reference to a scalar in which 4983 # this routine will store the length the returned name had before the 4984 # underscores were removed, or undef if the return is undef. 4985 4986 # The shortest name can change if new aliases are added. So using 4987 # this should be deferred until after all these are added. The code 4988 # that does that should clear this one's cache. 4989 # Any name with alphabetics is preferred over an all numeric one, even 4990 # if longer. 4991 4992 my $self = shift; 4993 my $nominal_length_ptr = shift; 4994 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4995 4996 my $addr = do { no overloading; pack 'J', $self; }; 4997 4998 # For efficiency, don't recalculate, but this means that adding new 4999 # aliases could change what the shortest is, so the code that does 5000 # that needs to undef this. 5001 if (defined $short_name{$addr}) { 5002 if ($nominal_length_ptr) { 5003 $$nominal_length_ptr = $nominal_short_name_length{$addr}; 5004 } 5005 return $short_name{$addr}; 5006 } 5007 5008 # Look at each alias 5009 foreach my $alias ($self->aliases()) { 5010 5011 # Don't use an alias that isn't ok to use for an external name. 5012 next if ! $alias->ok_as_filename; 5013 5014 my $name = main::Standardize($alias->name); 5015 trace $self, $name if main::DEBUG && $to_trace; 5016 5017 # Take the first one, or a shorter one that isn't numeric. This 5018 # relies on numeric aliases always being last in the array 5019 # returned by aliases(). Any alpha one will have precedence. 5020 if (! defined $short_name{$addr} 5021 || ($name =~ /\D/ 5022 && length($name) < length($short_name{$addr}))) 5023 { 5024 # Remove interior underscores. 5025 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg; 5026 5027 $nominal_short_name_length{$addr} = length $name; 5028 } 5029 } 5030 5031 # If the short name isn't a nice one, perhaps an equivalent table has 5032 # a better one. 5033 if (! defined $short_name{$addr} 5034 || $short_name{$addr} eq "" 5035 || $short_name{$addr} eq "_") 5036 { 5037 my $return; 5038 foreach my $follower ($self->children) { # All equivalents 5039 my $follower_name = $follower->short_name; 5040 next unless defined $follower_name; 5041 5042 # Anything (except undefined) is better than underscore or 5043 # empty 5044 if (! defined $return || $return eq "_") { 5045 $return = $follower_name; 5046 next; 5047 } 5048 5049 # If the new follower name isn't "_" and is shorter than the 5050 # current best one, prefer the new one. 5051 next if $follower_name eq "_"; 5052 next if length $follower_name > length $return; 5053 $return = $follower_name; 5054 } 5055 $short_name{$addr} = $return if defined $return; 5056 } 5057 5058 # If no suitable external name return undef 5059 if (! defined $short_name{$addr}) { 5060 $$nominal_length_ptr = undef if $nominal_length_ptr; 5061 return; 5062 } 5063 5064 # Don't allow a null short name. 5065 if ($short_name{$addr} eq "") { 5066 $short_name{$addr} = '_'; 5067 $nominal_short_name_length{$addr} = 1; 5068 } 5069 5070 trace $self, $short_name{$addr} if main::DEBUG && $to_trace; 5071 5072 if ($nominal_length_ptr) { 5073 $$nominal_length_ptr = $nominal_short_name_length{$addr}; 5074 } 5075 return $short_name{$addr}; 5076 } 5077 5078 sub external_name { 5079 # Returns the external name that this table should be known by. This 5080 # is usually the short_name, but not if the short_name is undefined, 5081 # in which case the external_name is arbitrarily set to the 5082 # underscore. 5083 5084 my $self = shift; 5085 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5086 5087 my $short = $self->short_name; 5088 return $short if defined $short; 5089 5090 return '_'; 5091 } 5092 5093 sub add_description { # Adds the parameter as a short description. 5094 5095 my $self = shift; 5096 my $description = shift; 5097 chomp $description; 5098 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5099 5100 no overloading; 5101 push @{$description{pack 'J', $self}}, $description; 5102 5103 return; 5104 } 5105 5106 sub add_note { # Adds the parameter as a short note. 5107 5108 my $self = shift; 5109 my $note = shift; 5110 chomp $note; 5111 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5112 5113 no overloading; 5114 push @{$note{pack 'J', $self}}, $note; 5115 5116 return; 5117 } 5118 5119 sub add_comment { # Adds the parameter as a comment. 5120 5121 return unless $debugging_build; 5122 5123 my $self = shift; 5124 my $comment = shift; 5125 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5126 5127 chomp $comment; 5128 5129 no overloading; 5130 push @{$comment{pack 'J', $self}}, $comment; 5131 5132 return; 5133 } 5134 5135 sub comment { 5136 # Return the current comment for this table. If called in list 5137 # context, returns the array of comments. In scalar, returns a string 5138 # of each element joined together with a period ending each. 5139 5140 my $self = shift; 5141 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5142 5143 my $addr = do { no overloading; pack 'J', $self; }; 5144 my @list = @{$comment{$addr}}; 5145 return @list if wantarray; 5146 my $return = ""; 5147 foreach my $sentence (@list) { 5148 $return .= '. ' if $return; 5149 $return .= $sentence; 5150 $return =~ s/\.$//; 5151 } 5152 $return .= '.' if $return; 5153 return $return; 5154 } 5155 5156 sub initialize { 5157 # Initialize the table with the argument which is any valid 5158 # initialization for range lists. 5159 5160 my $self = shift; 5161 my $addr = do { no overloading; pack 'J', $self; }; 5162 my $initialization = shift; 5163 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5164 5165 # Replace the current range list with a new one of the same exact 5166 # type. 5167 my $class = ref $range_list{$addr}; 5168 $range_list{$addr} = $class->new(Owner => $self, 5169 Initialize => $initialization); 5170 return; 5171 5172 } 5173 5174 sub header { 5175 # The header that is output for the table in the file it is written 5176 # in. 5177 5178 my $self = shift; 5179 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5180 5181 my $return = ""; 5182 $return .= $DEVELOPMENT_ONLY if $compare_versions; 5183 $return .= $HEADER; 5184 return $return; 5185 } 5186 5187 sub write { 5188 # Write a representation of the table to its file. It calls several 5189 # functions furnished by sub-classes of this abstract base class to 5190 # handle non-normal ranges, to add stuff before the table, and at its 5191 # end. If the table is to be written so that adjustments are 5192 # required, this does that conversion. 5193 5194 my $self = shift; 5195 my $use_adjustments = shift; # ? output in adjusted format or not 5196 my $tab_stops = shift; # The number of tab stops over to put any 5197 # comment. 5198 my $suppress_value = shift; # Optional, if the value associated with 5199 # a range equals this one, don't write 5200 # the range 5201 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5202 5203 my $addr = do { no overloading; pack 'J', $self; }; 5204 5205 # Start with the header 5206 my @HEADER = $self->header; 5207 5208 # Then the comments 5209 push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n" 5210 if $comment{$addr}; 5211 5212 # Things discovered processing the main body of the document may 5213 # affect what gets output before it, therefore pre_body() isn't called 5214 # until after all other processing of the table is done. 5215 5216 # The main body looks like a 'here' document. If annotating, get rid 5217 # of the comments before passing to the caller, as some callers, such 5218 # as charnames.pm, can't cope with them. (Outputting range counts 5219 # also introduces comments, but these don't show up in the tables that 5220 # can't cope with comments, and there aren't that many of them that 5221 # it's worth the extra real time to get rid of them). 5222 my @OUT; 5223 if ($annotate) { 5224 # Use the line below in Perls that don't have /r 5225 #push @OUT, 'return join "\n", map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n"; 5226 push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n"; 5227 } else { 5228 push @OUT, "return <<'END';\n"; 5229 } 5230 5231 if ($range_list{$addr}->is_empty) { 5232 5233 # This is a kludge for empty tables to silence a warning in 5234 # utf8.c, which can't really deal with empty tables, but it can 5235 # deal with a table that matches nothing, as the inverse of 'Any' 5236 # does. 5237 push @OUT, "!utf8::Any\n"; 5238 } 5239 elsif ($self->name eq 'N' 5240 5241 # To save disk space and table cache space, avoid putting out 5242 # binary N tables, but instead create a file which just inverts 5243 # the Y table. Since the file will still exist and occupy a 5244 # certain number of blocks, might as well output the whole 5245 # thing if it all will fit in one block. The number of 5246 # ranges below is an approximate number for that. 5247 && ($self->property->type == $BINARY 5248 || $self->property->type == $FORCED_BINARY) 5249 # && $self->property->tables == 2 Can't do this because the 5250 # non-binary properties, like NFDQC aren't specifiable 5251 # by the notation 5252 && $range_list{$addr}->ranges > 15 5253 && ! $annotate) # Under --annotate, want to see everything 5254 { 5255 push @OUT, "!utf8::" . $self->property->name . "\n"; 5256 } 5257 else { 5258 my $range_size_1 = $range_size_1{$addr}; 5259 my $format; # Used only in $annotate option 5260 my $include_name; # Used only in $annotate option 5261 5262 if ($annotate) { 5263 5264 # If annotating each code point, must print 1 per line. 5265 # The variable could point to a subroutine, and we don't want 5266 # to lose that fact, so only set if not set already 5267 $range_size_1 = 1 if ! $range_size_1; 5268 5269 $format = $self->format; 5270 5271 # The name of the character is output only for tables that 5272 # don't already include the name in the output. 5273 my $property = $self->property; 5274 $include_name = 5275 ! ($property == $perl_charname 5276 || $property == main::property_ref('Unicode_1_Name') 5277 || $property == main::property_ref('Name') 5278 || $property == main::property_ref('Name_Alias') 5279 ); 5280 } 5281 5282 # Values for previous time through the loop. Initialize to 5283 # something that won't be adjacent to the first iteration; 5284 # only $previous_end matters for that. 5285 my $previous_start; 5286 my $previous_end = -2; 5287 my $previous_value; 5288 5289 # Values for next time through the portion of the loop that splits 5290 # the range. 0 in $next_start means there is no remaining portion 5291 # to deal with. 5292 my $next_start = 0; 5293 my $next_end; 5294 my $next_value; 5295 my $offset = 0; 5296 5297 # Output each range as part of the here document. 5298 RANGE: 5299 for my $set ($range_list{$addr}->ranges) { 5300 if ($set->type != 0) { 5301 $self->handle_special_range($set); 5302 next RANGE; 5303 } 5304 my $start = $set->start; 5305 my $end = $set->end; 5306 my $value = $set->value; 5307 5308 # Don't output ranges whose value is the one to suppress 5309 next RANGE if defined $suppress_value 5310 && $value eq $suppress_value; 5311 5312 { # This bare block encloses the scope where we may need to 5313 # split a range (when outputting adjusteds), and each time 5314 # through we handle the next portion of the original by 5315 # ending the block with a 'redo'. The values to use for 5316 # that next time through are set up just below in the 5317 # scalars whose names begin with '$next_'. 5318 5319 if ($use_adjustments) { 5320 5321 # When converting to use adjustments, we can handle 5322 # only single element ranges. Set up so that this 5323 # time through the loop, we look at the first element, 5324 # and the next time through, we start off with the 5325 # remainder. Thus each time through we look at the 5326 # first element of the range 5327 if ($end != $start) { 5328 $next_start = $start + 1; 5329 $next_end = $end; 5330 $next_value = $value; 5331 $end = $start; 5332 } 5333 5334 # The values for some of these tables are stored as 5335 # hex strings. Convert those to decimal 5336 $value = hex($value) 5337 if $self->default_map eq $CODE_POINT 5338 && $value =~ / ^ [A-Fa-f0-9]+ $ /x; 5339 5340 # If this range is adjacent to the previous one, and 5341 # the values in each are integers that are also 5342 # adjacent (differ by 1), then this range really 5343 # extends the previous one that is already in element 5344 # $OUT[-1]. So we pop that element, and pretend that 5345 # the range starts with whatever it started with. 5346 # $offset is incremented by 1 each time so that it 5347 # gives the current offset from the first element in 5348 # the accumulating range, and we keep in $value the 5349 # value of that first element. 5350 if ($start == $previous_end + 1 5351 && $value =~ /^ -? \d+ $/xa 5352 && $previous_value =~ /^ -? \d+ $/xa 5353 && ($value == ($previous_value + ++$offset))) 5354 { 5355 pop @OUT; 5356 $start = $previous_start; 5357 $value = $previous_value; 5358 } 5359 else { 5360 $offset = 0; 5361 } 5362 5363 # Save the current values for the next time through 5364 # the loop. 5365 $previous_start = $start; 5366 $previous_end = $end; 5367 $previous_value = $value; 5368 } 5369 5370 # If there is a range and doesn't need a single point range 5371 # output 5372 if ($start != $end && ! $range_size_1) { 5373 push @OUT, sprintf "%04X\t%04X", $start, $end; 5374 $OUT[-1] .= "\t$value" if $value ne ""; 5375 5376 # Add a comment with the size of the range, if 5377 # requested. Expand Tabs to make sure they all start 5378 # in the same column, and then unexpand to use mostly 5379 # tabs. 5380 if (! $output_range_counts{$addr}) { 5381 $OUT[-1] .= "\n"; 5382 } 5383 else { 5384 $OUT[-1] = Text::Tabs::expand($OUT[-1]); 5385 my $count = main::clarify_number($end - $start + 1); 5386 use integer; 5387 5388 my $width = $tab_stops * 8 - 1; 5389 $OUT[-1] = sprintf("%-*s # [%s]\n", 5390 $width, 5391 $OUT[-1], 5392 $count); 5393 $OUT[-1] = Text::Tabs::unexpand($OUT[-1]); 5394 } 5395 } 5396 5397 # Here to output a single code point per line. 5398 # If not to annotate, use the simple formats 5399 elsif (! $annotate) { 5400 5401 # Use any passed in subroutine to output. 5402 if (ref $range_size_1 eq 'CODE') { 5403 for my $i ($start .. $end) { 5404 push @OUT, &{$range_size_1}($i, $value); 5405 } 5406 } 5407 else { 5408 5409 # Here, caller is ok with default output. 5410 for (my $i = $start; $i <= $end; $i++) { 5411 push @OUT, sprintf "%04X\t\t%s\n", $i, $value; 5412 } 5413 } 5414 } 5415 else { 5416 5417 # Here, wants annotation. 5418 for (my $i = $start; $i <= $end; $i++) { 5419 5420 # Get character information if don't have it already 5421 main::populate_char_info($i) 5422 if ! defined $viacode[$i]; 5423 my $type = $annotate_char_type[$i]; 5424 5425 # Figure out if should output the next code points 5426 # as part of a range or not. If this is not in an 5427 # annotation range, then won't output as a range, 5428 # so returns $i. Otherwise use the end of the 5429 # annotation range, but no further than the 5430 # maximum possible end point of the loop. 5431 my $range_end = main::min( 5432 $annotate_ranges->value_of($i) || $i, 5433 $end); 5434 5435 # Use a range if it is a range, and either is one 5436 # of the special annotation ranges, or the range 5437 # is at most 3 long. This last case causes the 5438 # algorithmically named code points to be output 5439 # individually in spans of at most 3, as they are 5440 # the ones whose $type is > 0. 5441 if ($range_end != $i 5442 && ( $type < 0 || $range_end - $i > 2)) 5443 { 5444 # Here is to output a range. We don't allow a 5445 # caller-specified output format--just use the 5446 # standard one. 5447 push @OUT, sprintf "%04X\t%04X\t%s\t#", $i, 5448 $range_end, 5449 $value; 5450 my $range_name = $viacode[$i]; 5451 5452 # For the code points which end in their hex 5453 # value, we eliminate that from the output 5454 # annotation, and capitalize only the first 5455 # letter of each word. 5456 if ($type == $CP_IN_NAME) { 5457 my $hex = sprintf "%04X", $i; 5458 $range_name =~ s/-$hex$//; 5459 my @words = split " ", $range_name; 5460 for my $word (@words) { 5461 $word = 5462 ucfirst(lc($word)) if $word ne 'CJK'; 5463 } 5464 $range_name = join " ", @words; 5465 } 5466 elsif ($type == $HANGUL_SYLLABLE) { 5467 $range_name = "Hangul Syllable"; 5468 } 5469 5470 $OUT[-1] .= " $range_name" if $range_name; 5471 5472 # Include the number of code points in the 5473 # range 5474 my $count = 5475 main::clarify_number($range_end - $i + 1); 5476 $OUT[-1] .= " [$count]\n"; 5477 5478 # Skip to the end of the range 5479 $i = $range_end; 5480 } 5481 else { # Not in a range. 5482 my $comment = ""; 5483 5484 # When outputting the names of each character, 5485 # use the character itself if printable 5486 $comment .= "'" . chr($i) . "' " 5487 if $printable[$i]; 5488 5489 # To make it more readable, use a minimum 5490 # indentation 5491 my $comment_indent; 5492 5493 # Determine the annotation 5494 if ($format eq $DECOMP_STRING_FORMAT) { 5495 5496 # This is very specialized, with the type 5497 # of decomposition beginning the line 5498 # enclosed in <...>, and the code points 5499 # that the code point decomposes to 5500 # separated by blanks. Create two 5501 # strings, one of the printable 5502 # characters, and one of their official 5503 # names. 5504 (my $map = $value) =~ s/ \ * < .*? > \ +//x; 5505 my $tostr = ""; 5506 my $to_name = ""; 5507 my $to_chr = ""; 5508 foreach my $to (split " ", $map) { 5509 $to = CORE::hex $to; 5510 $to_name .= " + " if $to_name; 5511 $to_chr .= chr($to); 5512 main::populate_char_info($to) 5513 if ! defined $viacode[$to]; 5514 $to_name .= $viacode[$to]; 5515 } 5516 5517 $comment .= 5518 "=> '$to_chr'; $viacode[$i] => $to_name"; 5519 $comment_indent = 25; # Determined by 5520 # experiment 5521 } 5522 else { 5523 5524 # Assume that any table that has hex 5525 # format is a mapping of one code point to 5526 # another. 5527 if ($format eq $HEX_FORMAT) { 5528 my $decimal_value = CORE::hex $value; 5529 main::populate_char_info($decimal_value) 5530 if ! defined $viacode[$decimal_value]; 5531 $comment .= "=> '" 5532 . chr($decimal_value) 5533 . "'; " if $printable[$decimal_value]; 5534 } 5535 $comment .= $viacode[$i] if $include_name 5536 && $viacode[$i]; 5537 if ($format eq $HEX_FORMAT) { 5538 my $decimal_value = CORE::hex $value; 5539 $comment .= 5540 " => $viacode[$decimal_value]" 5541 if $viacode[$decimal_value]; 5542 } 5543 5544 # If including the name, no need to 5545 # indent, as the name will already be way 5546 # across the line. 5547 $comment_indent = ($include_name) ? 0 : 60; 5548 } 5549 5550 # Use any passed in routine to output the base 5551 # part of the line. 5552 if (ref $range_size_1 eq 'CODE') { 5553 my $base_part=&{$range_size_1}($i, $value); 5554 chomp $base_part; 5555 push @OUT, $base_part; 5556 } 5557 else { 5558 push @OUT, sprintf "%04X\t\t%s", $i, $value; 5559 } 5560 5561 # And add the annotation. 5562 $OUT[-1] = sprintf "%-*s\t# %s", 5563 $comment_indent, 5564 $OUT[-1], 5565 $comment 5566 if $comment; 5567 $OUT[-1] .= "\n"; 5568 } 5569 } 5570 } 5571 5572 # If we split the range, set up so the next time through 5573 # we get the remainder, and redo. 5574 if ($next_start) { 5575 $start = $next_start; 5576 $end = $next_end; 5577 $value = $next_value; 5578 $next_start = 0; 5579 redo; 5580 } 5581 } 5582 } # End of loop through all the table's ranges 5583 } 5584 5585 # Add anything that goes after the main body, but within the here 5586 # document, 5587 my $append_to_body = $self->append_to_body; 5588 push @OUT, $append_to_body if $append_to_body; 5589 5590 # And finish the here document. 5591 push @OUT, "END\n"; 5592 5593 # Done with the main portion of the body. Can now figure out what 5594 # should appear before it in the file. 5595 my $pre_body = $self->pre_body; 5596 push @HEADER, $pre_body, "\n" if $pre_body; 5597 5598 # All these files should have a .pl suffix added to them. 5599 my @file_with_pl = @{$file_path{$addr}}; 5600 $file_with_pl[-1] .= '.pl'; 5601 5602 main::write(\@file_with_pl, 5603 $annotate, # utf8 iff annotating 5604 \@HEADER, 5605 \@OUT); 5606 return; 5607 } 5608 5609 sub set_status { # Set the table's status 5610 my $self = shift; 5611 my $status = shift; # The status enum value 5612 my $info = shift; # Any message associated with it. 5613 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5614 5615 my $addr = do { no overloading; pack 'J', $self; }; 5616 5617 $status{$addr} = $status; 5618 $status_info{$addr} = $info; 5619 return; 5620 } 5621 5622 sub set_fate { # Set the fate of a table 5623 my $self = shift; 5624 my $fate = shift; 5625 my $reason = shift; 5626 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5627 5628 my $addr = do { no overloading; pack 'J', $self; }; 5629 5630 return if $fate{$addr} == $fate; # If no-op 5631 5632 # Can only change the ordinary fate, except if going to $MAP_PROXIED 5633 return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED; 5634 5635 $fate{$addr} = $fate; 5636 5637 # Don't document anything to do with a non-normal fated table 5638 if ($fate != $ORDINARY) { 5639 my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0; 5640 foreach my $alias ($self->aliases) { 5641 $alias->set_ucd($put_in_pod); 5642 5643 # MAP_PROXIED doesn't affect the match tables 5644 next if $fate == $MAP_PROXIED; 5645 $alias->set_make_re_pod_entry($put_in_pod); 5646 } 5647 } 5648 5649 # Save the reason for suppression for output 5650 if ($fate == $SUPPRESSED && defined $reason) { 5651 $why_suppressed{$complete_name{$addr}} = $reason; 5652 } 5653 5654 return; 5655 } 5656 5657 sub lock { 5658 # Don't allow changes to the table from now on. This stores a stack 5659 # trace of where it was called, so that later attempts to modify it 5660 # can immediately show where it got locked. 5661 5662 my $self = shift; 5663 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5664 5665 my $addr = do { no overloading; pack 'J', $self; }; 5666 5667 $locked{$addr} = ""; 5668 5669 my $line = (caller(0))[2]; 5670 my $i = 1; 5671 5672 # Accumulate the stack trace 5673 while (1) { 5674 my ($pkg, $file, $caller_line, $caller) = caller $i++; 5675 5676 last unless defined $caller; 5677 5678 $locked{$addr} .= " called from $caller() at line $line\n"; 5679 $line = $caller_line; 5680 } 5681 $locked{$addr} .= " called from main at line $line\n"; 5682 5683 return; 5684 } 5685 5686 sub carp_if_locked { 5687 # Return whether a table is locked or not, and, by the way, complain 5688 # if is locked 5689 5690 my $self = shift; 5691 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5692 5693 my $addr = do { no overloading; pack 'J', $self; }; 5694 5695 return 0 if ! $locked{$addr}; 5696 Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n"); 5697 return 1; 5698 } 5699 5700 sub set_file_path { # Set the final directory path for this table 5701 my $self = shift; 5702 # Rest of parameters passed on 5703 5704 no overloading; 5705 @{$file_path{pack 'J', $self}} = @_; 5706 return 5707 } 5708 5709 # Accessors for the range list stored in this table. First for 5710 # unconditional 5711 for my $sub (qw( 5712 containing_range 5713 contains 5714 count 5715 each_range 5716 hash 5717 is_empty 5718 matches_identically_to 5719 max 5720 min 5721 range_count 5722 reset_each_range 5723 type_of 5724 value_of 5725 )) 5726 { 5727 no strict "refs"; 5728 *$sub = sub { 5729 use strict "refs"; 5730 my $self = shift; 5731 return $self->_range_list->$sub(@_); 5732 } 5733 } 5734 5735 # Then for ones that should fail if locked 5736 for my $sub (qw( 5737 delete_range 5738 )) 5739 { 5740 no strict "refs"; 5741 *$sub = sub { 5742 use strict "refs"; 5743 my $self = shift; 5744 5745 return if $self->carp_if_locked; 5746 no overloading; 5747 return $self->_range_list->$sub(@_); 5748 } 5749 } 5750 5751} # End closure 5752 5753package Map_Table; 5754use base '_Base_Table'; 5755 5756# A Map Table is a table that contains the mappings from code points to 5757# values. There are two weird cases: 5758# 1) Anomalous entries are ones that aren't maps of ranges of code points, but 5759# are written in the table's file at the end of the table nonetheless. It 5760# requires specially constructed code to handle these; utf8.c can not read 5761# these in, so they should not go in $map_directory. As of this writing, 5762# the only case that these happen is for named sequences used in 5763# charnames.pm. But this code doesn't enforce any syntax on these, so 5764# something else could come along that uses it. 5765# 2) Specials are anything that doesn't fit syntactically into the body of the 5766# table. The ranges for these have a map type of non-zero. The code below 5767# knows about and handles each possible type. In most cases, these are 5768# written as part of the header. 5769# 5770# A map table deliberately can't be manipulated at will unlike match tables. 5771# This is because of the ambiguities having to do with what to do with 5772# overlapping code points. And there just isn't a need for those things; 5773# what one wants to do is just query, add, replace, or delete mappings, plus 5774# write the final result. 5775# However, there is a method to get the list of possible ranges that aren't in 5776# this table to use for defaulting missing code point mappings. And, 5777# map_add_or_replace_non_nulls() does allow one to add another table to this 5778# one, but it is clearly very specialized, and defined that the other's 5779# non-null values replace this one's if there is any overlap. 5780 5781sub trace { return main::trace(@_); } 5782 5783{ # Closure 5784 5785 main::setup_package(); 5786 5787 my %default_map; 5788 # Many input files omit some entries; this gives what the mapping for the 5789 # missing entries should be 5790 main::set_access('default_map', \%default_map, 'r'); 5791 5792 my %anomalous_entries; 5793 # Things that go in the body of the table which don't fit the normal 5794 # scheme of things, like having a range. Not much can be done with these 5795 # once there except to output them. This was created to handle named 5796 # sequences. 5797 main::set_access('anomalous_entry', \%anomalous_entries, 'a'); 5798 main::set_access('anomalous_entries', # Append singular, read plural 5799 \%anomalous_entries, 5800 'readable_array'); 5801 5802 my %to_output_map; 5803 # Enum as to whether or not to write out this map table, and how: 5804 # 0 don't output 5805 # $EXTERNAL_MAP means its existence is noted in the documentation, and 5806 # it should not be removed nor its format changed. This 5807 # is done for those files that have traditionally been 5808 # output. 5809 # $INTERNAL_MAP means Perl reserves the right to do anything it wants 5810 # with this file 5811 # $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of 5812 # outputting the actual mappings as-is, we adjust things 5813 # to create a much more compact table. Only those few 5814 # tables where the mapping is convertible at least to an 5815 # integer and compacting makes a big difference should 5816 # have this. Hence, the default is to not do this 5817 # unless the table's default mapping is to $CODE_POINT, 5818 # and the range size is not 1. 5819 main::set_access('to_output_map', \%to_output_map, 's'); 5820 5821 sub new { 5822 my $class = shift; 5823 my $name = shift; 5824 5825 my %args = @_; 5826 5827 # Optional initialization data for the table. 5828 my $initialize = delete $args{'Initialize'}; 5829 5830 my $default_map = delete $args{'Default_Map'}; 5831 my $property = delete $args{'_Property'}; 5832 my $full_name = delete $args{'Full_Name'}; 5833 my $to_output_map = delete $args{'To_Output_Map'}; 5834 5835 # Rest of parameters passed on 5836 5837 my $range_list = Range_Map->new(Owner => $property); 5838 5839 my $self = $class->SUPER::new( 5840 Name => $name, 5841 Complete_Name => $full_name, 5842 Full_Name => $full_name, 5843 _Property => $property, 5844 _Range_List => $range_list, 5845 %args); 5846 5847 my $addr = do { no overloading; pack 'J', $self; }; 5848 5849 $anomalous_entries{$addr} = []; 5850 $default_map{$addr} = $default_map; 5851 $to_output_map{$addr} = $to_output_map; 5852 5853 $self->initialize($initialize) if defined $initialize; 5854 5855 return $self; 5856 } 5857 5858 use overload 5859 fallback => 0, 5860 qw("") => "_operator_stringify", 5861 ; 5862 5863 sub _operator_stringify { 5864 my $self = shift; 5865 5866 my $name = $self->property->full_name; 5867 $name = '""' if $name eq ""; 5868 return "Map table for Property '$name'"; 5869 } 5870 5871 sub add_alias { 5872 # Add a synonym for this table (which means the property itself) 5873 my $self = shift; 5874 my $name = shift; 5875 # Rest of parameters passed on. 5876 5877 $self->SUPER::add_alias($name, $self->property, @_); 5878 return; 5879 } 5880 5881 sub add_map { 5882 # Add a range of code points to the list of specially-handled code 5883 # points. $MULTI_CP is assumed if the type of special is not passed 5884 # in. 5885 5886 my $self = shift; 5887 my $lower = shift; 5888 my $upper = shift; 5889 my $string = shift; 5890 my %args = @_; 5891 5892 my $type = delete $args{'Type'} || 0; 5893 # Rest of parameters passed on 5894 5895 # Can't change the table if locked. 5896 return if $self->carp_if_locked; 5897 5898 my $addr = do { no overloading; pack 'J', $self; }; 5899 5900 $self->_range_list->add_map($lower, $upper, 5901 $string, 5902 @_, 5903 Type => $type); 5904 return; 5905 } 5906 5907 sub append_to_body { 5908 # Adds to the written HERE document of the table's body any anomalous 5909 # entries in the table.. 5910 5911 my $self = shift; 5912 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5913 5914 my $addr = do { no overloading; pack 'J', $self; }; 5915 5916 return "" unless @{$anomalous_entries{$addr}}; 5917 return join("\n", @{$anomalous_entries{$addr}}) . "\n"; 5918 } 5919 5920 sub map_add_or_replace_non_nulls { 5921 # This adds the mappings in the table $other to $self. Non-null 5922 # mappings from $other override those in $self. It essentially merges 5923 # the two tables, with the second having priority except for null 5924 # mappings. 5925 5926 my $self = shift; 5927 my $other = shift; 5928 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5929 5930 return if $self->carp_if_locked; 5931 5932 if (! $other->isa(__PACKAGE__)) { 5933 Carp::my_carp_bug("$other should be a " 5934 . __PACKAGE__ 5935 . ". Not a '" 5936 . ref($other) 5937 . "'. Not added;"); 5938 return; 5939 } 5940 5941 my $addr = do { no overloading; pack 'J', $self; }; 5942 my $other_addr = do { no overloading; pack 'J', $other; }; 5943 5944 local $to_trace = 0 if main::DEBUG; 5945 5946 my $self_range_list = $self->_range_list; 5947 my $other_range_list = $other->_range_list; 5948 foreach my $range ($other_range_list->ranges) { 5949 my $value = $range->value; 5950 next if $value eq ""; 5951 $self_range_list->_add_delete('+', 5952 $range->start, 5953 $range->end, 5954 $value, 5955 Type => $range->type, 5956 Replace => $UNCONDITIONALLY); 5957 } 5958 5959 return; 5960 } 5961 5962 sub set_default_map { 5963 # Define what code points that are missing from the input files should 5964 # map to 5965 5966 my $self = shift; 5967 my $map = shift; 5968 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5969 5970 my $addr = do { no overloading; pack 'J', $self; }; 5971 5972 # Convert the input to the standard equivalent, if any (won't have any 5973 # for $STRING properties) 5974 my $standard = $self->_find_table_from_alias->{$map}; 5975 $map = $standard->name if defined $standard; 5976 5977 # Warn if there already is a non-equivalent default map for this 5978 # property. Note that a default map can be a ref, which means that 5979 # what it actually means is delayed until later in the program, and it 5980 # IS permissible to override it here without a message. 5981 my $default_map = $default_map{$addr}; 5982 if (defined $default_map 5983 && ! ref($default_map) 5984 && $default_map ne $map 5985 && main::Standardize($map) ne $default_map) 5986 { 5987 my $property = $self->property; 5988 my $map_table = $property->table($map); 5989 my $default_table = $property->table($default_map); 5990 if (defined $map_table 5991 && defined $default_table 5992 && $map_table != $default_table) 5993 { 5994 Carp::my_carp("Changing the default mapping for " 5995 . $property 5996 . " from $default_map to $map'"); 5997 } 5998 } 5999 6000 $default_map{$addr} = $map; 6001 6002 # Don't also create any missing table for this map at this point, 6003 # because if we did, it could get done before the main table add is 6004 # done for PropValueAliases.txt; instead the caller will have to make 6005 # sure it exists, if desired. 6006 return; 6007 } 6008 6009 sub to_output_map { 6010 # Returns boolean: should we write this map table? 6011 6012 my $self = shift; 6013 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 6014 6015 my $addr = do { no overloading; pack 'J', $self; }; 6016 6017 # If overridden, use that 6018 return $to_output_map{$addr} if defined $to_output_map{$addr}; 6019 6020 my $full_name = $self->full_name; 6021 return $global_to_output_map{$full_name} 6022 if defined $global_to_output_map{$full_name}; 6023 6024 # If table says to output, do so; if says to suppress it, do so. 6025 my $fate = $self->fate; 6026 return $INTERNAL_MAP if $fate == $INTERNAL_ONLY; 6027 return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties; 6028 return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED; 6029 6030 my $type = $self->property->type; 6031 6032 # Don't want to output binary map tables even for debugging. 6033 return 0 if $type == $BINARY; 6034 6035 # But do want to output string ones. All the ones that remain to 6036 # be dealt with (i.e. which haven't explicitly been set to external) 6037 # are for internal Perl use only. The default for those that map to 6038 # $CODE_POINT and haven't been restricted to a single element range 6039 # is to use the adjusted form. 6040 if ($type == $STRING) { 6041 return $INTERNAL_MAP if $self->range_size_1 6042 || $default_map{$addr} ne $CODE_POINT; 6043 return $OUTPUT_ADJUSTED; 6044 } 6045 6046 # Otherwise is an $ENUM, do output it, for Perl's purposes 6047 return $INTERNAL_MAP; 6048 } 6049 6050 sub inverse_list { 6051 # Returns a Range_List that is gaps of the current table. That is, 6052 # the inversion 6053 6054 my $self = shift; 6055 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 6056 6057 my $current = Range_List->new(Initialize => $self->_range_list, 6058 Owner => $self->property); 6059 return ~ $current; 6060 } 6061 6062 sub header { 6063 my $self = shift; 6064 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 6065 6066 my $return = $self->SUPER::header(); 6067 6068 if ($self->to_output_map >= $INTERNAL_MAP) { 6069 $return .= $INTERNAL_ONLY_HEADER; 6070 } 6071 else { 6072 my $property_name = $self->property->full_name =~ s/Legacy_//r; 6073 $return .= <<END; 6074 6075# !!!!!!! IT IS DEPRECATED TO USE THIS FILE !!!!!!! 6076 6077# This file is for internal use by core Perl only. It is retained for 6078# backwards compatibility with applications that may have come to rely on it, 6079# but its format and even its name or existence are subject to change without 6080# notice in a future Perl version. Don't use it directly. Instead, its 6081# contents are now retrievable through a stable API in the Unicode::UCD 6082# module: Unicode::UCD::prop_invmap('$property_name'). 6083END 6084 } 6085 return $return; 6086 } 6087 6088 sub set_final_comment { 6089 # Just before output, create the comment that heads the file 6090 # containing this table. 6091 6092 return unless $debugging_build; 6093 6094 my $self = shift; 6095 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 6096 6097 # No sense generating a comment if aren't going to write it out. 6098 return if ! $self->to_output_map; 6099 6100 my $addr = do { no overloading; pack 'J', $self; }; 6101 6102 my $property = $self->property; 6103 6104 # Get all the possible names for this property. Don't use any that 6105 # aren't ok for use in a file name, etc. This is perhaps causing that 6106 # flag to do double duty, and may have to be changed in the future to 6107 # have our own flag for just this purpose; but it works now to exclude 6108 # Perl generated synonyms from the lists for properties, where the 6109 # name is always the proper Unicode one. 6110 my @property_aliases = grep { $_->ok_as_filename } $self->aliases; 6111 6112 my $count = $self->count; 6113 my $default_map = $default_map{$addr}; 6114 6115 # The ranges that map to the default aren't output, so subtract that 6116 # to get those actually output. A property with matching tables 6117 # already has the information calculated. 6118 if ($property->type != $STRING) { 6119 $count -= $property->table($default_map)->count; 6120 } 6121 elsif (defined $default_map) { 6122 6123 # But for $STRING properties, must calculate now. Subtract the 6124 # count from each range that maps to the default. 6125 foreach my $range ($self->_range_list->ranges) { 6126 if ($range->value eq $default_map) { 6127 $count -= $range->end +1 - $range->start; 6128 } 6129 } 6130 6131 } 6132 6133 # Get a string version of $count with underscores in large numbers, 6134 # for clarity. 6135 my $string_count = main::clarify_number($count); 6136 6137 my $code_points = ($count == 1) 6138 ? 'single code point' 6139 : "$string_count code points"; 6140 6141 my $mapping; 6142 my $these_mappings; 6143 my $are; 6144 if (@property_aliases <= 1) { 6145 $mapping = 'mapping'; 6146 $these_mappings = 'this mapping'; 6147 $are = 'is' 6148 } 6149 else { 6150 $mapping = 'synonymous mappings'; 6151 $these_mappings = 'these mappings'; 6152 $are = 'are' 6153 } 6154 my $cp; 6155 if ($count >= $MAX_UNICODE_CODEPOINTS) { 6156 $cp = "any code point in Unicode Version $string_version"; 6157 } 6158 else { 6159 my $map_to; 6160 if ($default_map eq "") { 6161 $map_to = 'the null string'; 6162 } 6163 elsif ($default_map eq $CODE_POINT) { 6164 $map_to = "itself"; 6165 } 6166 else { 6167 $map_to = "'$default_map'"; 6168 } 6169 if ($count == 1) { 6170 $cp = "the single code point"; 6171 } 6172 else { 6173 $cp = "one of the $code_points"; 6174 } 6175 $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to"; 6176 } 6177 6178 my $comment = ""; 6179 6180 my $status = $self->status; 6181 if ($status) { 6182 my $warn = uc $status_past_participles{$status}; 6183 $comment .= <<END; 6184 6185!!!!!!! $warn !!!!!!!!!!!!!!!!!!! 6186 All property or property=value combinations contained in this file are $warn. 6187 See $unicode_reference_url for what this means. 6188 6189END 6190 } 6191 $comment .= "This file returns the $mapping:\n"; 6192 6193 my $ucd_accessible_name = ""; 6194 my $full_name = $self->property->full_name; 6195 for my $i (0 .. @property_aliases - 1) { 6196 my $name = $property_aliases[$i]->name; 6197 $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)'); 6198 if ($property_aliases[$i]->ucd) { 6199 if ($name eq $full_name) { 6200 $ucd_accessible_name = $full_name; 6201 } 6202 elsif (! $ucd_accessible_name) { 6203 $ucd_accessible_name = $name; 6204 } 6205 } 6206 } 6207 $comment .= "\nwhere 'cp' is $cp."; 6208 if ($ucd_accessible_name) { 6209 $comment .= " Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD"; 6210 } 6211 6212 # And append any commentary already set from the actual property. 6213 $comment .= "\n\n" . $self->comment if $self->comment; 6214 if ($self->description) { 6215 $comment .= "\n\n" . join " ", $self->description; 6216 } 6217 if ($self->note) { 6218 $comment .= "\n\n" . join " ", $self->note; 6219 } 6220 $comment .= "\n"; 6221 6222 if (! $self->perl_extension) { 6223 $comment .= <<END; 6224 6225For information about what this property really means, see: 6226$unicode_reference_url 6227END 6228 } 6229 6230 if ($count) { # Format differs for empty table 6231 $comment.= "\nThe format of the "; 6232 if ($self->range_size_1) { 6233 $comment.= <<END; 6234main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT 6235is in hex; MAPPING is what CODE_POINT maps to. 6236END 6237 } 6238 else { 6239 6240 # There are tables which end up only having one element per 6241 # range, but it is not worth keeping track of for making just 6242 # this comment a little better. 6243 $comment.= <<END; 6244non-comment portions of the main body of lines of this file is: 6245START\\tSTOP\\tMAPPING where START is the starting code point of the 6246range, in hex; STOP is the ending point, or if omitted, the range has just one 6247code point; MAPPING is what each code point between START and STOP maps to. 6248END 6249 if ($self->output_range_counts) { 6250 $comment .= <<END; 6251Numbers in comments in [brackets] indicate how many code points are in the 6252range (omitted when the range is a single code point or if the mapping is to 6253the null string). 6254END 6255 } 6256 } 6257 } 6258 $self->set_comment(main::join_lines($comment)); 6259 return; 6260 } 6261 6262 my %swash_keys; # Makes sure don't duplicate swash names. 6263 6264 # The remaining variables are temporaries used while writing each table, 6265 # to output special ranges. 6266 my @multi_code_point_maps; # Map is to more than one code point. 6267 6268 sub handle_special_range { 6269 # Called in the middle of write when it finds a range it doesn't know 6270 # how to handle. 6271 6272 my $self = shift; 6273 my $range = shift; 6274 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 6275 6276 my $addr = do { no overloading; pack 'J', $self; }; 6277 6278 my $type = $range->type; 6279 6280 my $low = $range->start; 6281 my $high = $range->end; 6282 my $map = $range->value; 6283 6284 # No need to output the range if it maps to the default. 6285 return if $map eq $default_map{$addr}; 6286 6287 my $property = $self->property; 6288 6289 # Switch based on the map type... 6290 if ($type == $HANGUL_SYLLABLE) { 6291 6292 # These are entirely algorithmically determinable based on 6293 # some constants furnished by Unicode; for now, just set a 6294 # flag to indicate that have them. After everything is figured 6295 # out, we will output the code that does the algorithm. (Don't 6296 # output them if not needed because we are suppressing this 6297 # property.) 6298 $has_hangul_syllables = 1 if $property->to_output_map; 6299 } 6300 elsif ($type == $CP_IN_NAME) { 6301 6302 # Code points whose name ends in their code point are also 6303 # algorithmically determinable, but need information about the map 6304 # to do so. Both the map and its inverse are stored in data 6305 # structures output in the file. They are stored in the mean time 6306 # in global lists The lists will be written out later into Name.pm, 6307 # which is created only if needed. In order to prevent duplicates 6308 # in the list, only add to them for one property, should multiple 6309 # ones need them. 6310 if ($needing_code_points_ending_in_code_point == 0) { 6311 $needing_code_points_ending_in_code_point = $property; 6312 } 6313 if ($property == $needing_code_points_ending_in_code_point) { 6314 push @{$names_ending_in_code_point{$map}->{'low'}}, $low; 6315 push @{$names_ending_in_code_point{$map}->{'high'}}, $high; 6316 6317 my $squeezed = $map =~ s/[-\s]+//gr; 6318 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}}, 6319 $low; 6320 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}}, 6321 $high; 6322 6323 push @code_points_ending_in_code_point, { low => $low, 6324 high => $high, 6325 name => $map 6326 }; 6327 } 6328 } 6329 elsif ($range->type == $MULTI_CP || $range->type == $NULL) { 6330 6331 # Multi-code point maps and null string maps have an entry 6332 # for each code point in the range. They use the same 6333 # output format. 6334 for my $code_point ($low .. $high) { 6335 6336 # The pack() below can't cope with surrogates. XXX This may 6337 # no longer be true 6338 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) { 6339 Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self. No map created"); 6340 next; 6341 } 6342 6343 # Generate the hash entries for these in the form that 6344 # utf8.c understands. 6345 my $tostr = ""; 6346 my $to_name = ""; 6347 my $to_chr = ""; 6348 foreach my $to (split " ", $map) { 6349 if ($to !~ /^$code_point_re$/) { 6350 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created"); 6351 next; 6352 } 6353 $tostr .= sprintf "\\x{%s}", $to; 6354 $to = CORE::hex $to; 6355 if ($annotate) { 6356 $to_name .= " + " if $to_name; 6357 $to_chr .= chr($to); 6358 main::populate_char_info($to) 6359 if ! defined $viacode[$to]; 6360 $to_name .= $viacode[$to]; 6361 } 6362 } 6363 6364 # I (khw) have never waded through this line to 6365 # understand it well enough to comment it. 6366 my $utf8 = sprintf(qq["%s" => "$tostr",], 6367 join("", map { sprintf "\\x%02X", $_ } 6368 unpack("U0C*", pack("U", $code_point)))); 6369 6370 # Add a comment so that a human reader can more easily 6371 # see what's going on. 6372 push @multi_code_point_maps, 6373 sprintf("%-45s # U+%04X", $utf8, $code_point); 6374 if (! $annotate) { 6375 $multi_code_point_maps[-1] .= " => $map"; 6376 } 6377 else { 6378 main::populate_char_info($code_point) 6379 if ! defined $viacode[$code_point]; 6380 $multi_code_point_maps[-1] .= " '" 6381 . chr($code_point) 6382 . "' => '$to_chr'; $viacode[$code_point] => $to_name"; 6383 } 6384 } 6385 } 6386 else { 6387 Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Not written"); 6388 } 6389 6390 return; 6391 } 6392 6393 sub pre_body { 6394 # Returns the string that should be output in the file before the main 6395 # body of this table. It isn't called until the main body is 6396 # calculated, saving a pass. The string includes some hash entries 6397 # identifying the format of the body, and what the single value should 6398 # be for all ranges missing from it. It also includes any code points 6399 # which have map_types that don't go in the main table. 6400 6401 my $self = shift; 6402 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 6403 6404 my $addr = do { no overloading; pack 'J', $self; }; 6405 6406 my $name = $self->property->swash_name; 6407 6408 # Currently there is nothing in the pre_body unless a swash is being 6409 # generated. 6410 return unless defined $name; 6411 6412 if (defined $swash_keys{$name}) { 6413 Carp::my_carp(main::join_lines(<<END 6414Already created a swash name '$name' for $swash_keys{$name}. This means that 6415the same name desired for $self shouldn't be used. Bad News. This must be 6416fixed before production use, but proceeding anyway 6417END 6418 )); 6419 } 6420 $swash_keys{$name} = "$self"; 6421 6422 my $pre_body = ""; 6423 6424 # Here we assume we were called after have gone through the whole 6425 # file. If we actually generated anything for each map type, add its 6426 # respective header and trailer 6427 my $specials_name = ""; 6428 if (@multi_code_point_maps) { 6429 $specials_name = "utf8::ToSpec$name"; 6430 $pre_body .= <<END; 6431 6432# Some code points require special handling because their mappings are each to 6433# multiple code points. These do not appear in the main body, but are defined 6434# in the hash below. 6435 6436# Each key is the string of N bytes that together make up the UTF-8 encoding 6437# for the code point. (i.e. the same as looking at the code point's UTF-8 6438# under "use bytes"). Each value is the UTF-8 of the translation, for speed. 6439\%$specials_name = ( 6440END 6441 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n"; 6442 } 6443 6444 my $format = $self->format; 6445 6446 my $return = ""; 6447 6448 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED); 6449 if ($output_adjusted) { 6450 if ($specials_name) { 6451 $return .= <<END; 6452# The mappings in the non-hash portion of this file must be modified to get the 6453# correct values by adding the code point ordinal number to each one that is 6454# numeric. 6455END 6456 } 6457 else { 6458 $return .= <<END; 6459# The mappings must be modified to get the correct values by adding the code 6460# point ordinal number to each one that is numeric. 6461END 6462 } 6463 } 6464 6465 $return .= <<END; 6466 6467# The name this swash is to be known by, with the format of the mappings in 6468# the main body of the table, and what all code points missing from this file 6469# map to. 6470\$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format} 6471END 6472 if ($specials_name) { 6473 $return .= <<END; 6474\$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings 6475END 6476 } 6477 my $default_map = $default_map{$addr}; 6478 6479 # For $CODE_POINT default maps and using adjustments, instead the default 6480 # becomes zero. 6481 $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '" 6482 . (($output_adjusted && $default_map eq $CODE_POINT) 6483 ? "0" 6484 : $default_map) 6485 . "';"; 6486 6487 if ($default_map eq $CODE_POINT) { 6488 $return .= ' # code point maps to itself'; 6489 } 6490 elsif ($default_map eq "") { 6491 $return .= ' # code point maps to the null string'; 6492 } 6493 $return .= "\n"; 6494 6495 $return .= $pre_body; 6496 6497 return $return; 6498 } 6499 6500 sub write { 6501 # Write the table to the file. 6502 6503 my $self = shift; 6504 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 6505 6506 my $addr = do { no overloading; pack 'J', $self; }; 6507 6508 # Clear the temporaries 6509 undef @multi_code_point_maps; 6510 6511 # Calculate the format of the table if not already done. 6512 my $format = $self->format; 6513 my $type = $self->property->type; 6514 my $default_map = $self->default_map; 6515 if (! defined $format) { 6516 if ($type == $BINARY) { 6517 6518 # Don't bother checking the values, because we elsewhere 6519 # verify that a binary table has only 2 values. 6520 $format = $BINARY_FORMAT; 6521 } 6522 else { 6523 my @ranges = $self->_range_list->ranges; 6524 6525 # default an empty table based on its type and default map 6526 if (! @ranges) { 6527 6528 # But it turns out that the only one we can say is a 6529 # non-string (besides binary, handled above) is when the 6530 # table is a string and the default map is to a code point 6531 if ($type == $STRING && $default_map eq $CODE_POINT) { 6532 $format = $HEX_FORMAT; 6533 } 6534 else { 6535 $format = $STRING_FORMAT; 6536 } 6537 } 6538 else { 6539 6540 # Start with the most restrictive format, and as we find 6541 # something that doesn't fit with that, change to the next 6542 # most restrictive, and so on. 6543 $format = $DECIMAL_FORMAT; 6544 foreach my $range (@ranges) { 6545 next if $range->type != 0; # Non-normal ranges don't 6546 # affect the main body 6547 my $map = $range->value; 6548 if ($map ne $default_map) { 6549 last if $format eq $STRING_FORMAT; # already at 6550 # least 6551 # restrictive 6552 $format = $INTEGER_FORMAT 6553 if $format eq $DECIMAL_FORMAT 6554 && $map !~ / ^ [0-9] $ /x; 6555 $format = $FLOAT_FORMAT 6556 if $format eq $INTEGER_FORMAT 6557 && $map !~ / ^ -? [0-9]+ $ /x; 6558 $format = $RATIONAL_FORMAT 6559 if $format eq $FLOAT_FORMAT 6560 && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x; 6561 $format = $HEX_FORMAT 6562 if ($format eq $RATIONAL_FORMAT 6563 && $map !~ 6564 m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x) 6565 # Assume a leading zero means hex, 6566 # even if all digits are 0-9 6567 || ($format eq $INTEGER_FORMAT 6568 && $map =~ /^0[0-9A-F]/); 6569 $format = $STRING_FORMAT if $format eq $HEX_FORMAT 6570 && $map =~ /[^0-9A-F]/; 6571 } 6572 } 6573 } 6574 } 6575 } # end of calculating format 6576 6577 if ($default_map eq $CODE_POINT 6578 && $format ne $HEX_FORMAT 6579 && ! defined $self->format) # manual settings are always 6580 # considered ok 6581 { 6582 Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'") 6583 } 6584 6585 # If the output is to be adjusted, the format of the table that gets 6586 # output is actually 'a' instead of whatever it is stored internally 6587 # as. 6588 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED); 6589 if ($output_adjusted) { 6590 $format = $ADJUST_FORMAT; 6591 } 6592 6593 $self->_set_format($format); 6594 6595 return $self->SUPER::write( 6596 $output_adjusted, 6597 ($self->property == $block) 6598 ? 7 # block file needs more tab stops 6599 : 3, 6600 $default_map); # don't write defaulteds 6601 } 6602 6603 # Accessors for the underlying list that should fail if locked. 6604 for my $sub (qw( 6605 add_duplicate 6606 )) 6607 { 6608 no strict "refs"; 6609 *$sub = sub { 6610 use strict "refs"; 6611 my $self = shift; 6612 6613 return if $self->carp_if_locked; 6614 return $self->_range_list->$sub(@_); 6615 } 6616 } 6617} # End closure for Map_Table 6618 6619package Match_Table; 6620use base '_Base_Table'; 6621 6622# A Match table is one which is a list of all the code points that have 6623# the same property and property value, for use in \p{property=value} 6624# constructs in regular expressions. It adds very little data to the base 6625# structure, but many methods, as these lists can be combined in many ways to 6626# form new ones. 6627# There are only a few concepts added: 6628# 1) Equivalents and Relatedness. 6629# Two tables can match the identical code points, but have different names. 6630# This always happens when there is a perl single form extension 6631# \p{IsProperty} for the Unicode compound form \P{Property=True}. The two 6632# tables are set to be related, with the Perl extension being a child, and 6633# the Unicode property being the parent. 6634# 6635# It may be that two tables match the identical code points and we don't 6636# know if they are related or not. This happens most frequently when the 6637# Block and Script properties have the exact range. But note that a 6638# revision to Unicode could add new code points to the script, which would 6639# now have to be in a different block (as the block was filled, or there 6640# would have been 'Unknown' script code points in it and they wouldn't have 6641# been identical). So we can't rely on any two properties from Unicode 6642# always matching the same code points from release to release, and thus 6643# these tables are considered coincidentally equivalent--not related. When 6644# two tables are unrelated but equivalent, one is arbitrarily chosen as the 6645# 'leader', and the others are 'equivalents'. This concept is useful 6646# to minimize the number of tables written out. Only one file is used for 6647# any identical set of code points, with entries in Heavy.pl mapping all 6648# the involved tables to it. 6649# 6650# Related tables will always be identical; we set them up to be so. Thus 6651# if the Unicode one is deprecated, the Perl one will be too. Not so for 6652# unrelated tables. Relatedness makes generating the documentation easier. 6653# 6654# 2) Complement. 6655# Like equivalents, two tables may be the inverses of each other, the 6656# intersection between them is null, and the union is every Unicode code 6657# point. The two tables that occupy a binary property are necessarily like 6658# this. By specifying one table as the complement of another, we can avoid 6659# storing it on disk (using the other table and performing a fast 6660# transform), and some memory and calculations. 6661# 6662# 3) Conflicting. It may be that there will eventually be name clashes, with 6663# the same name meaning different things. For a while, there actually were 6664# conflicts, but they have so far been resolved by changing Perl's or 6665# Unicode's definitions to match the other, but when this code was written, 6666# it wasn't clear that that was what was going to happen. (Unicode changed 6667# because of protests during their beta period.) Name clashes are warned 6668# about during compilation, and the documentation. The generated tables 6669# are sane, free of name clashes, because the code suppresses the Perl 6670# version. But manual intervention to decide what the actual behavior 6671# should be may be required should this happen. The introductory comments 6672# have more to say about this. 6673 6674sub standardize { return main::standardize($_[0]); } 6675sub trace { return main::trace(@_); } 6676 6677 6678{ # Closure 6679 6680 main::setup_package(); 6681 6682 my %leader; 6683 # The leader table of this one; initially $self. 6684 main::set_access('leader', \%leader, 'r'); 6685 6686 my %equivalents; 6687 # An array of any tables that have this one as their leader 6688 main::set_access('equivalents', \%equivalents, 'readable_array'); 6689 6690 my %parent; 6691 # The parent table to this one, initially $self. This allows us to 6692 # distinguish between equivalent tables that are related (for which this 6693 # is set to), and those which may not be, but share the same output file 6694 # because they match the exact same set of code points in the current 6695 # Unicode release. 6696 main::set_access('parent', \%parent, 'r'); 6697 6698 my %children; 6699 # An array of any tables that have this one as their parent 6700 main::set_access('children', \%children, 'readable_array'); 6701 6702 my %conflicting; 6703 # Array of any tables that would have the same name as this one with 6704 # a different meaning. This is used for the generated documentation. 6705 main::set_access('conflicting', \%conflicting, 'readable_array'); 6706 6707 my %matches_all; 6708 # Set in the constructor for tables that are expected to match all code 6709 # points. 6710 main::set_access('matches_all', \%matches_all, 'r'); 6711 6712 my %complement; 6713 # Points to the complement that this table is expressed in terms of; 0 if 6714 # none. 6715 main::set_access('complement', \%complement, 'r'); 6716 6717 sub new { 6718 my $class = shift; 6719 6720 my %args = @_; 6721 6722 # The property for which this table is a listing of property values. 6723 my $property = delete $args{'_Property'}; 6724 6725 my $name = delete $args{'Name'}; 6726 my $full_name = delete $args{'Full_Name'}; 6727 $full_name = $name if ! defined $full_name; 6728 6729 # Optional 6730 my $initialize = delete $args{'Initialize'}; 6731 my $matches_all = delete $args{'Matches_All'} || 0; 6732 my $format = delete $args{'Format'}; 6733 # Rest of parameters passed on. 6734 6735 my $range_list = Range_List->new(Initialize => $initialize, 6736 Owner => $property); 6737 6738 my $complete = $full_name; 6739 $complete = '""' if $complete eq ""; # A null name shouldn't happen, 6740 # but this helps debug if it 6741 # does 6742 # The complete name for a match table includes it's property in a 6743 # compound form 'property=table', except if the property is the 6744 # pseudo-property, perl, in which case it is just the single form, 6745 # 'table' (If you change the '=' must also change the ':' in lots of 6746 # places in this program that assume an equal sign) 6747 $complete = $property->full_name . "=$complete" if $property != $perl; 6748 6749 my $self = $class->SUPER::new(%args, 6750 Name => $name, 6751 Complete_Name => $complete, 6752 Full_Name => $full_name, 6753 _Property => $property, 6754 _Range_List => $range_list, 6755 Format => $EMPTY_FORMAT, 6756 ); 6757 my $addr = do { no overloading; pack 'J', $self; }; 6758 6759 $conflicting{$addr} = [ ]; 6760 $equivalents{$addr} = [ ]; 6761 $children{$addr} = [ ]; 6762 $matches_all{$addr} = $matches_all; 6763 $leader{$addr} = $self; 6764 $parent{$addr} = $self; 6765 $complement{$addr} = 0; 6766 6767 if (defined $format && $format ne $EMPTY_FORMAT) { 6768 Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'. Using '$EMPTY_FORMAT'"); 6769 } 6770 6771 return $self; 6772 } 6773 6774 # See this program's beginning comment block about overloading these. 6775 use overload 6776 fallback => 0, 6777 qw("") => "_operator_stringify", 6778 '=' => sub { 6779 my $self = shift; 6780 6781 return if $self->carp_if_locked; 6782 return $self; 6783 }, 6784 6785 '+' => sub { 6786 my $self = shift; 6787 my $other = shift; 6788 6789 return $self->_range_list + $other; 6790 }, 6791 '&' => sub { 6792 my $self = shift; 6793 my $other = shift; 6794 6795 return $self->_range_list & $other; 6796 }, 6797 '+=' => sub { 6798 my $self = shift; 6799 my $other = shift; 6800 6801 return if $self->carp_if_locked; 6802 6803 my $addr = do { no overloading; pack 'J', $self; }; 6804 6805 if (ref $other) { 6806 6807 # Change the range list of this table to be the 6808 # union of the two. 6809 $self->_set_range_list($self->_range_list 6810 + $other); 6811 } 6812 else { # $other is just a simple value 6813 $self->add_range($other, $other); 6814 } 6815 return $self; 6816 }, 6817 '-' => sub { my $self = shift; 6818 my $other = shift; 6819 my $reversed = shift; 6820 6821 if ($reversed) { 6822 Carp::my_carp_bug("Can't cope with a " 6823 . __PACKAGE__ 6824 . " being the first parameter in a '-'. Subtraction ignored."); 6825 return; 6826 } 6827 6828 return $self->_range_list - $other; 6829 }, 6830 '~' => sub { my $self = shift; 6831 return ~ $self->_range_list; 6832 }, 6833 ; 6834 6835 sub _operator_stringify { 6836 my $self = shift; 6837 6838 my $name = $self->complete_name; 6839 return "Table '$name'"; 6840 } 6841 6842 sub _range_list { 6843 # Returns the range list associated with this table, which will be the 6844 # complement's if it has one. 6845 6846 my $self = shift; 6847 my $complement; 6848 if (($complement = $self->complement) != 0) { 6849 return ~ $complement->_range_list; 6850 } 6851 else { 6852 return $self->SUPER::_range_list; 6853 } 6854 } 6855 6856 sub add_alias { 6857 # Add a synonym for this table. See the comments in the base class 6858 6859 my $self = shift; 6860 my $name = shift; 6861 # Rest of parameters passed on. 6862 6863 $self->SUPER::add_alias($name, $self, @_); 6864 return; 6865 } 6866 6867 sub add_conflicting { 6868 # Add the name of some other object to the list of ones that name 6869 # clash with this match table. 6870 6871 my $self = shift; 6872 my $conflicting_name = shift; # The name of the conflicting object 6873 my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ? 6874 my $conflicting_object = shift; # Optional, the conflicting object 6875 # itself. This is used to 6876 # disambiguate the text if the input 6877 # name is identical to any of the 6878 # aliases $self is known by. 6879 # Sometimes the conflicting object is 6880 # merely hypothetical, so this has to 6881 # be an optional parameter. 6882 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 6883 6884 my $addr = do { no overloading; pack 'J', $self; }; 6885 6886 # Check if the conflicting name is exactly the same as any existing 6887 # alias in this table (as long as there is a real object there to 6888 # disambiguate with). 6889 if (defined $conflicting_object) { 6890 foreach my $alias ($self->aliases) { 6891 if ($alias->name eq $conflicting_name) { 6892 6893 # Here, there is an exact match. This results in 6894 # ambiguous comments, so disambiguate by changing the 6895 # conflicting name to its object's complete equivalent. 6896 $conflicting_name = $conflicting_object->complete_name; 6897 last; 6898 } 6899 } 6900 } 6901 6902 # Convert to the \p{...} final name 6903 $conflicting_name = "\\$p" . "{$conflicting_name}"; 6904 6905 # Only add once 6906 return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}}; 6907 6908 push @{$conflicting{$addr}}, $conflicting_name; 6909 6910 return; 6911 } 6912 6913 sub is_set_equivalent_to { 6914 # Return boolean of whether or not the other object is a table of this 6915 # type and has been marked equivalent to this one. 6916 6917 my $self = shift; 6918 my $other = shift; 6919 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 6920 6921 return 0 if ! defined $other; # Can happen for incomplete early 6922 # releases 6923 unless ($other->isa(__PACKAGE__)) { 6924 my $ref_other = ref $other; 6925 my $ref_self = ref $self; 6926 Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self."); 6927 return 0; 6928 } 6929 6930 # Two tables are equivalent if they have the same leader. 6931 no overloading; 6932 return $leader{pack 'J', $self} == $leader{pack 'J', $other}; 6933 return; 6934 } 6935 6936 sub set_equivalent_to { 6937 # Set $self equivalent to the parameter table. 6938 # The required Related => 'x' parameter is a boolean indicating 6939 # whether these tables are related or not. If related, $other becomes 6940 # the 'parent' of $self; if unrelated it becomes the 'leader' 6941 # 6942 # Related tables share all characteristics except names; equivalents 6943 # not quite so many. 6944 # If they are related, one must be a perl extension. This is because 6945 # we can't guarantee that Unicode won't change one or the other in a 6946 # later release even if they are identical now. 6947 6948 my $self = shift; 6949 my $other = shift; 6950 6951 my %args = @_; 6952 my $related = delete $args{'Related'}; 6953 6954 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 6955 6956 return if ! defined $other; # Keep on going; happens in some early 6957 # Unicode releases. 6958 6959 if (! defined $related) { 6960 Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other"); 6961 $related = 0; 6962 } 6963 6964 # If already are equivalent, no need to re-do it; if subroutine 6965 # returns null, it found an error, also do nothing 6966 my $are_equivalent = $self->is_set_equivalent_to($other); 6967 return if ! defined $are_equivalent || $are_equivalent; 6968 6969 my $addr = do { no overloading; pack 'J', $self; }; 6970 my $current_leader = ($related) ? $parent{$addr} : $leader{$addr}; 6971 6972 if ($related) { 6973 if ($current_leader->perl_extension) { 6974 if ($other->perl_extension) { 6975 Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent."); 6976 return; 6977 } 6978 } elsif ($self->property != $other->property # Depending on 6979 # situation, might 6980 # be better to use 6981 # add_alias() 6982 # instead for same 6983 # property 6984 && ! $other->perl_extension) 6985 { 6986 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other"); 6987 $related = 0; 6988 } 6989 } 6990 6991 if (! $self->is_empty && ! $self->matches_identically_to($other)) { 6992 Carp::my_carp_bug("$self should be empty or match identically to $other. Not setting equivalent"); 6993 return; 6994 } 6995 6996 my $leader = do { no overloading; pack 'J', $current_leader; }; 6997 my $other_addr = do { no overloading; pack 'J', $other; }; 6998 6999 # Any tables that are equivalent to or children of this table must now 7000 # instead be equivalent to or (children) to the new leader (parent), 7001 # still equivalent. The equivalency includes their matches_all info, 7002 # and for related tables, their fate and status. 7003 # All related tables are of necessity equivalent, but the converse 7004 # isn't necessarily true 7005 my $status = $other->status; 7006 my $status_info = $other->status_info; 7007 my $fate = $other->fate; 7008 my $matches_all = $matches_all{other_addr}; 7009 my $caseless_equivalent = $other->caseless_equivalent; 7010 foreach my $table ($current_leader, @{$equivalents{$leader}}) { 7011 next if $table == $other; 7012 trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace; 7013 7014 my $table_addr = do { no overloading; pack 'J', $table; }; 7015 $leader{$table_addr} = $other; 7016 $matches_all{$table_addr} = $matches_all; 7017 $self->_set_range_list($other->_range_list); 7018 push @{$equivalents{$other_addr}}, $table; 7019 if ($related) { 7020 $parent{$table_addr} = $other; 7021 push @{$children{$other_addr}}, $table; 7022 $table->set_status($status, $status_info); 7023 7024 # This reason currently doesn't get exposed outside; otherwise 7025 # would have to look up the parent's reason and use it instead. 7026 $table->set_fate($fate, "Parent's fate"); 7027 7028 $self->set_caseless_equivalent($caseless_equivalent); 7029 } 7030 } 7031 7032 # Now that we've declared these to be equivalent, any changes to one 7033 # of the tables would invalidate that equivalency. 7034 $self->lock; 7035 $other->lock; 7036 return; 7037 } 7038 7039 sub set_complement { 7040 # Set $self to be the complement of the parameter table. $self is 7041 # locked, as what it contains should all come from the other table. 7042 7043 my $self = shift; 7044 my $other = shift; 7045 7046 my %args = @_; 7047 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 7048 7049 if ($other->complement != 0) { 7050 Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement); 7051 return; 7052 } 7053 my $addr = do { no overloading; pack 'J', $self; }; 7054 $complement{$addr} = $other; 7055 $self->lock; 7056 return; 7057 } 7058 7059 sub add_range { # Add a range to the list for this table. 7060 my $self = shift; 7061 # Rest of parameters passed on 7062 7063 return if $self->carp_if_locked; 7064 return $self->_range_list->add_range(@_); 7065 } 7066 7067 sub header { 7068 my $self = shift; 7069 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7070 7071 # All match tables are to be used only by the Perl core. 7072 return $self->SUPER::header() . $INTERNAL_ONLY_HEADER; 7073 } 7074 7075 sub pre_body { # Does nothing for match tables. 7076 return 7077 } 7078 7079 sub append_to_body { # Does nothing for match tables. 7080 return 7081 } 7082 7083 sub set_fate { 7084 my $self = shift; 7085 my $fate = shift; 7086 my $reason = shift; 7087 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7088 7089 $self->SUPER::set_fate($fate, $reason); 7090 7091 # All children share this fate 7092 foreach my $child ($self->children) { 7093 $child->set_fate($fate, $reason); 7094 } 7095 return; 7096 } 7097 7098 sub write { 7099 my $self = shift; 7100 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7101 7102 return $self->SUPER::write(0, 2); # No adjustments; 2 tab stops 7103 } 7104 7105 sub set_final_comment { 7106 # This creates a comment for the file that is to hold the match table 7107 # $self. It is somewhat convoluted to make the English read nicely, 7108 # but, heh, it's just a comment. 7109 # This should be called only with the leader match table of all the 7110 # ones that share the same file. It lists all such tables, ordered so 7111 # that related ones are together. 7112 7113 return unless $debugging_build; 7114 7115 my $leader = shift; # Should only be called on the leader table of 7116 # an equivalent group 7117 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7118 7119 my $addr = do { no overloading; pack 'J', $leader; }; 7120 7121 if ($leader{$addr} != $leader) { 7122 Carp::my_carp_bug(<<END 7123set_final_comment() must be called on a leader table, which $leader is not. 7124It is equivalent to $leader{$addr}. No comment created 7125END 7126 ); 7127 return; 7128 } 7129 7130 # Get the number of code points matched by each of the tables in this 7131 # file, and add underscores for clarity. 7132 my $count = $leader->count; 7133 my $string_count = main::clarify_number($count); 7134 7135 my $loose_count = 0; # how many aliases loosely matched 7136 my $compound_name = ""; # ? Are any names compound?, and if so, an 7137 # example 7138 my $properties_with_compound_names = 0; # count of these 7139 7140 7141 my %flags; # The status flags used in the file 7142 my $total_entries = 0; # number of entries written in the comment 7143 my $matches_comment = ""; # The portion of the comment about the 7144 # \p{}'s 7145 my @global_comments; # List of all the tables' comments that are 7146 # there before this routine was called. 7147 my $has_ucd_alias = 0; # If there is an alias that is accessible via 7148 # Unicode::UCD. If not, then don't say it is 7149 # in the comment 7150 7151 # Get list of all the parent tables that are equivalent to this one 7152 # (including itself). 7153 my @parents = grep { $parent{main::objaddr $_} == $_ } 7154 main::uniques($leader, @{$equivalents{$addr}}); 7155 my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated 7156 # tables 7157 7158 for my $parent (@parents) { 7159 7160 my $property = $parent->property; 7161 7162 # Special case 'N' tables in properties with two match tables when 7163 # the other is a 'Y' one. These are likely to be binary tables, 7164 # but not necessarily. In either case, \P{} will match the 7165 # complement of \p{}, and so if something is a synonym of \p, the 7166 # complement of that something will be the synonym of \P. This 7167 # would be true of any property with just two match tables, not 7168 # just those whose values are Y and N; but that would require a 7169 # little extra work, and there are none such so far in Unicode. 7170 my $perl_p = 'p'; # which is it? \p{} or \P{} 7171 my @yes_perl_synonyms; # list of any synonyms for the 'Y' table 7172 7173 if (scalar $property->tables == 2 7174 && $parent == $property->table('N') 7175 && defined (my $yes = $property->table('Y'))) 7176 { 7177 my $yes_addr = do { no overloading; pack 'J', $yes; }; 7178 @yes_perl_synonyms 7179 = grep { $_->property == $perl } 7180 main::uniques($yes, 7181 $parent{$yes_addr}, 7182 $parent{$yes_addr}->children); 7183 7184 # But these synonyms are \P{} ,not \p{} 7185 $perl_p = 'P'; 7186 } 7187 7188 my @description; # Will hold the table description 7189 my @note; # Will hold the table notes. 7190 my @conflicting; # Will hold the table conflicts. 7191 7192 # Look at the parent, any yes synonyms, and all the children 7193 my $parent_addr = do { no overloading; pack 'J', $parent; }; 7194 for my $table ($parent, 7195 @yes_perl_synonyms, 7196 @{$children{$parent_addr}}) 7197 { 7198 my $table_addr = do { no overloading; pack 'J', $table; }; 7199 my $table_property = $table->property; 7200 7201 # Tables are separated by a blank line to create a grouping. 7202 $matches_comment .= "\n" if $matches_comment; 7203 7204 # The table is named based on the property and value 7205 # combination it is for, like script=greek. But there may be 7206 # a number of synonyms for each side, like 'sc' for 'script', 7207 # and 'grek' for 'greek'. Any combination of these is a valid 7208 # name for this table. In this case, there are three more, 7209 # 'sc=grek', 'sc=greek', and 'script='grek'. Rather than 7210 # listing all possible combinations in the comment, we make 7211 # sure that each synonym occurs at least once, and add 7212 # commentary that the other combinations are possible. 7213 # Because regular expressions don't recognize things like 7214 # \p{jsn=}, only look at non-null right-hand-sides 7215 my @property_aliases = $table_property->aliases; 7216 my @table_aliases = grep { $_->name ne "" } $table->aliases; 7217 7218 # The alias lists above are already ordered in the order we 7219 # want to output them. To ensure that each synonym is listed, 7220 # we must use the max of the two numbers. But if there are no 7221 # legal synonyms (nothing in @table_aliases), then we don't 7222 # list anything. 7223 my $listed_combos = (@table_aliases) 7224 ? main::max(scalar @table_aliases, 7225 scalar @property_aliases) 7226 : 0; 7227 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG; 7228 7229 7230 my $property_had_compound_name = 0; 7231 7232 for my $i (0 .. $listed_combos - 1) { 7233 $total_entries++; 7234 7235 # The current alias for the property is the next one on 7236 # the list, or if beyond the end, start over. Similarly 7237 # for the table (\p{prop=table}) 7238 my $property_alias = $property_aliases 7239 [$i % @property_aliases]->name; 7240 my $table_alias_object = $table_aliases 7241 [$i % @table_aliases]; 7242 my $table_alias = $table_alias_object->name; 7243 my $loose_match = $table_alias_object->loose_match; 7244 $has_ucd_alias |= $table_alias_object->ucd; 7245 7246 if ($table_alias !~ /\D/) { # Clarify large numbers. 7247 $table_alias = main::clarify_number($table_alias) 7248 } 7249 7250 # Add a comment for this alias combination 7251 my $current_match_comment; 7252 if ($table_property == $perl) { 7253 $current_match_comment = "\\$perl_p" 7254 . "{$table_alias}"; 7255 } 7256 else { 7257 $current_match_comment 7258 = "\\p{$property_alias=$table_alias}"; 7259 $property_had_compound_name = 1; 7260 } 7261 7262 # Flag any abnormal status for this table. 7263 my $flag = $property->status 7264 || $table->status 7265 || $table_alias_object->status; 7266 $flags{$flag} = $status_past_participles{$flag} if $flag; 7267 7268 $loose_count++; 7269 7270 # Pretty up the comment. Note the \b; it says don't make 7271 # this line a continuation. 7272 $matches_comment .= sprintf("\b%-1s%-s%s\n", 7273 $flag, 7274 " " x 7, 7275 $current_match_comment); 7276 } # End of generating the entries for this table. 7277 7278 # Save these for output after this group of related tables. 7279 push @description, $table->description; 7280 push @note, $table->note; 7281 push @conflicting, $table->conflicting; 7282 7283 # And this for output after all the tables. 7284 push @global_comments, $table->comment; 7285 7286 # Compute an alternate compound name using the final property 7287 # synonym and the first table synonym with a colon instead of 7288 # the equal sign used elsewhere. 7289 if ($property_had_compound_name) { 7290 $properties_with_compound_names ++; 7291 if (! $compound_name || @property_aliases > 1) { 7292 $compound_name = $property_aliases[-1]->name 7293 . ': ' 7294 . $table_aliases[0]->name; 7295 } 7296 } 7297 } # End of looping through all children of this table 7298 7299 # Here have assembled in $matches_comment all the related tables 7300 # to the current parent (preceded by the same info for all the 7301 # previous parents). Put out information that applies to all of 7302 # the current family. 7303 if (@conflicting) { 7304 7305 # But output the conflicting information now, as it applies to 7306 # just this table. 7307 my $conflicting = join ", ", @conflicting; 7308 if ($conflicting) { 7309 $matches_comment .= <<END; 7310 7311 Note that contrary to what you might expect, the above is NOT the same as 7312END 7313 $matches_comment .= "any of: " if @conflicting > 1; 7314 $matches_comment .= "$conflicting\n"; 7315 } 7316 } 7317 if (@description) { 7318 $matches_comment .= "\n Meaning: " 7319 . join('; ', @description) 7320 . "\n"; 7321 } 7322 if (@note) { 7323 $matches_comment .= "\n Note: " 7324 . join("\n ", @note) 7325 . "\n"; 7326 } 7327 } # End of looping through all tables 7328 7329 7330 my $code_points; 7331 my $match; 7332 my $any_of_these; 7333 if ($count == 1) { 7334 $match = 'matches'; 7335 $code_points = 'single code point'; 7336 } 7337 else { 7338 $match = 'match'; 7339 $code_points = "$string_count code points"; 7340 } 7341 7342 my $synonyms; 7343 my $entries; 7344 if ($total_entries == 1) { 7345 $synonyms = ""; 7346 $entries = 'entry'; 7347 $any_of_these = 'this' 7348 } 7349 else { 7350 $synonyms = " any of the following regular expression constructs"; 7351 $entries = 'entries'; 7352 $any_of_these = 'any of these' 7353 } 7354 7355 my $comment = ""; 7356 if ($has_ucd_alias) { 7357 $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n"; 7358 } 7359 if ($has_unrelated) { 7360 $comment .= <<END; 7361This file is for tables that are not necessarily related: To conserve 7362resources, every table that matches the identical set of code points in this 7363version of Unicode uses this file. Each one is listed in a separate group 7364below. It could be that the tables will match the same set of code points in 7365other Unicode releases, or it could be purely coincidence that they happen to 7366be the same in Unicode $string_version, and hence may not in other versions. 7367 7368END 7369 } 7370 7371 if (%flags) { 7372 foreach my $flag (sort keys %flags) { 7373 $comment .= <<END; 7374'$flag' below means that this form is $flags{$flag}. 7375Consult $pod_file.pod 7376END 7377 } 7378 $comment .= "\n"; 7379 } 7380 7381 if ($total_entries == 0) { 7382 Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string. Creating file anyway."); 7383 $comment .= <<END; 7384This file returns the $code_points in Unicode Version $string_version for 7385$leader, but it is inaccessible through Perl regular expressions, as 7386"\\p{prop=}" is not recognized. 7387END 7388 7389 } else { 7390 $comment .= <<END; 7391This file returns the $code_points in Unicode Version $string_version that 7392$match$synonyms: 7393 7394$matches_comment 7395$pod_file.pod should be consulted for the syntax rules for $any_of_these, 7396including if adding or subtracting white space, underscore, and hyphen 7397characters matters or doesn't matter, and other permissible syntactic 7398variants. Upper/lower case distinctions never matter. 7399END 7400 7401 } 7402 if ($compound_name) { 7403 $comment .= <<END; 7404 7405A colon can be substituted for the equals sign, and 7406END 7407 if ($properties_with_compound_names > 1) { 7408 $comment .= <<END; 7409within each group above, 7410END 7411 } 7412 $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name); 7413 7414 # Note the \b below, it says don't make that line a continuation. 7415 $comment .= <<END; 7416anything to the left of the equals (or colon) can be combined with anything to 7417the right. Thus, for example, 7418$compound_name 7419\bis also valid. 7420END 7421 } 7422 7423 # And append any comment(s) from the actual tables. They are all 7424 # gathered here, so may not read all that well. 7425 if (@global_comments) { 7426 $comment .= "\n" . join("\n\n", @global_comments) . "\n"; 7427 } 7428 7429 if ($count) { # The format differs if no code points, and needs no 7430 # explanation in that case 7431 $comment.= <<END; 7432 7433The format of the lines of this file is: 7434END 7435 $comment.= <<END; 7436START\\tSTOP\\twhere START is the starting code point of the range, in hex; 7437STOP is the ending point, or if omitted, the range has just one code point. 7438END 7439 if ($leader->output_range_counts) { 7440 $comment .= <<END; 7441Numbers in comments in [brackets] indicate how many code points are in the 7442range. 7443END 7444 } 7445 } 7446 7447 $leader->set_comment(main::join_lines($comment)); 7448 return; 7449 } 7450 7451 # Accessors for the underlying list 7452 for my $sub (qw( 7453 get_valid_code_point 7454 get_invalid_code_point 7455 )) 7456 { 7457 no strict "refs"; 7458 *$sub = sub { 7459 use strict "refs"; 7460 my $self = shift; 7461 7462 return $self->_range_list->$sub(@_); 7463 } 7464 } 7465} # End closure for Match_Table 7466 7467package Property; 7468 7469# The Property class represents a Unicode property, or the $perl 7470# pseudo-property. It contains a map table initialized empty at construction 7471# time, and for properties accessible through regular expressions, various 7472# match tables, created through the add_match_table() method, and referenced 7473# by the table('NAME') or tables() methods, the latter returning a list of all 7474# of the match tables. Otherwise table operations implicitly are for the map 7475# table. 7476# 7477# Most of the data in the property is actually about its map table, so it 7478# mostly just uses that table's accessors for most methods. The two could 7479# have been combined into one object, but for clarity because of their 7480# differing semantics, they have been kept separate. It could be argued that 7481# the 'file' and 'directory' fields should be kept with the map table. 7482# 7483# Each property has a type. This can be set in the constructor, or in the 7484# set_type accessor, but mostly it is figured out by the data. Every property 7485# starts with unknown type, overridden by a parameter to the constructor, or 7486# as match tables are added, or ranges added to the map table, the data is 7487# inspected, and the type changed. After the table is mostly or entirely 7488# filled, compute_type() should be called to finalize they analysis. 7489# 7490# There are very few operations defined. One can safely remove a range from 7491# the map table, and property_add_or_replace_non_nulls() adds the maps from another 7492# table to this one, replacing any in the intersection of the two. 7493 7494sub standardize { return main::standardize($_[0]); } 7495sub trace { return main::trace(@_) if main::DEBUG && $to_trace } 7496 7497{ # Closure 7498 7499 # This hash will contain as keys, all the aliases of all properties, and 7500 # as values, pointers to their respective property objects. This allows 7501 # quick look-up of a property from any of its names. 7502 my %alias_to_property_of; 7503 7504 sub dump_alias_to_property_of { 7505 # For debugging 7506 7507 print "\n", main::simple_dumper (\%alias_to_property_of), "\n"; 7508 return; 7509 } 7510 7511 sub property_ref { 7512 # This is a package subroutine, not called as a method. 7513 # If the single parameter is a literal '*' it returns a list of all 7514 # defined properties. 7515 # Otherwise, the single parameter is a name, and it returns a pointer 7516 # to the corresponding property object, or undef if none. 7517 # 7518 # Properties can have several different names. The 'standard' form of 7519 # each of them is stored in %alias_to_property_of as they are defined. 7520 # But it's possible that this subroutine will be called with some 7521 # variant, so if the initial lookup fails, it is repeated with the 7522 # standardized form of the input name. If found, besides returning the 7523 # result, the input name is added to the list so future calls won't 7524 # have to do the conversion again. 7525 7526 my $name = shift; 7527 7528 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7529 7530 if (! defined $name) { 7531 Carp::my_carp_bug("Undefined input property. No action taken."); 7532 return; 7533 } 7534 7535 return main::uniques(values %alias_to_property_of) if $name eq '*'; 7536 7537 # Return cached result if have it. 7538 my $result = $alias_to_property_of{$name}; 7539 return $result if defined $result; 7540 7541 # Convert the input to standard form. 7542 my $standard_name = standardize($name); 7543 7544 $result = $alias_to_property_of{$standard_name}; 7545 return unless defined $result; # Don't cache undefs 7546 7547 # Cache the result before returning it. 7548 $alias_to_property_of{$name} = $result; 7549 return $result; 7550 } 7551 7552 7553 main::setup_package(); 7554 7555 my %map; 7556 # A pointer to the map table object for this property 7557 main::set_access('map', \%map); 7558 7559 my %full_name; 7560 # The property's full name. This is a duplicate of the copy kept in the 7561 # map table, but is needed because stringify needs it during 7562 # construction of the map table, and then would have a chicken before egg 7563 # problem. 7564 main::set_access('full_name', \%full_name, 'r'); 7565 7566 my %table_ref; 7567 # This hash will contain as keys, all the aliases of any match tables 7568 # attached to this property, and as values, the pointers to their 7569 # respective tables. This allows quick look-up of a table from any of its 7570 # names. 7571 main::set_access('table_ref', \%table_ref); 7572 7573 my %type; 7574 # The type of the property, $ENUM, $BINARY, etc 7575 main::set_access('type', \%type, 'r'); 7576 7577 my %file; 7578 # The filename where the map table will go (if actually written). 7579 # Normally defaulted, but can be overridden. 7580 main::set_access('file', \%file, 'r', 's'); 7581 7582 my %directory; 7583 # The directory where the map table will go (if actually written). 7584 # Normally defaulted, but can be overridden. 7585 main::set_access('directory', \%directory, 's'); 7586 7587 my %pseudo_map_type; 7588 # This is used to affect the calculation of the map types for all the 7589 # ranges in the table. It should be set to one of the values that signify 7590 # to alter the calculation. 7591 main::set_access('pseudo_map_type', \%pseudo_map_type, 'r'); 7592 7593 my %has_only_code_point_maps; 7594 # A boolean used to help in computing the type of data in the map table. 7595 main::set_access('has_only_code_point_maps', \%has_only_code_point_maps); 7596 7597 my %unique_maps; 7598 # A list of the first few distinct mappings this property has. This is 7599 # used to disambiguate between binary and enum property types, so don't 7600 # have to keep more than three. 7601 main::set_access('unique_maps', \%unique_maps); 7602 7603 my %pre_declared_maps; 7604 # A boolean that gives whether the input data should declare all the 7605 # tables used, or not. If the former, unknown ones raise a warning. 7606 main::set_access('pre_declared_maps', 7607 \%pre_declared_maps, 'r', 's'); 7608 7609 sub new { 7610 # The only required parameter is the positionally first, name. All 7611 # other parameters are key => value pairs. See the documentation just 7612 # above for the meanings of the ones not passed directly on to the map 7613 # table constructor. 7614 7615 my $class = shift; 7616 my $name = shift || ""; 7617 7618 my $self = property_ref($name); 7619 if (defined $self) { 7620 my $options_string = join ", ", @_; 7621 $options_string = ". Ignoring options $options_string" if $options_string; 7622 Carp::my_carp("$self is already in use. Using existing one$options_string;"); 7623 return $self; 7624 } 7625 7626 my %args = @_; 7627 7628 $self = bless \do { my $anonymous_scalar }, $class; 7629 my $addr = do { no overloading; pack 'J', $self; }; 7630 7631 $directory{$addr} = delete $args{'Directory'}; 7632 $file{$addr} = delete $args{'File'}; 7633 $full_name{$addr} = delete $args{'Full_Name'} || $name; 7634 $type{$addr} = delete $args{'Type'} || $UNKNOWN; 7635 $pseudo_map_type{$addr} = delete $args{'Map_Type'}; 7636 $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'} 7637 # Starting in this release, property 7638 # values should be defined for all 7639 # properties, except those overriding this 7640 // $v_version ge v5.1.0; 7641 7642 # Rest of parameters passed on. 7643 7644 $has_only_code_point_maps{$addr} = 1; 7645 $table_ref{$addr} = { }; 7646 $unique_maps{$addr} = { }; 7647 7648 $map{$addr} = Map_Table->new($name, 7649 Full_Name => $full_name{$addr}, 7650 _Alias_Hash => \%alias_to_property_of, 7651 _Property => $self, 7652 %args); 7653 return $self; 7654 } 7655 7656 # See this program's beginning comment block about overloading the copy 7657 # constructor. Few operations are defined on properties, but a couple are 7658 # useful. It is safe to take the inverse of a property, and to remove a 7659 # single code point from it. 7660 use overload 7661 fallback => 0, 7662 qw("") => "_operator_stringify", 7663 "." => \&main::_operator_dot, 7664 '==' => \&main::_operator_equal, 7665 '!=' => \&main::_operator_not_equal, 7666 '=' => sub { return shift }, 7667 '-=' => "_minus_and_equal", 7668 ; 7669 7670 sub _operator_stringify { 7671 return "Property '" . shift->full_name . "'"; 7672 } 7673 7674 sub _minus_and_equal { 7675 # Remove a single code point from the map table of a property. 7676 7677 my $self = shift; 7678 my $other = shift; 7679 my $reversed = shift; 7680 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7681 7682 if (ref $other) { 7683 Carp::my_carp_bug("Can't cope with a " 7684 . ref($other) 7685 . " argument to '-='. Subtraction ignored."); 7686 return $self; 7687 } 7688 elsif ($reversed) { # Shouldn't happen in a -=, but just in case 7689 Carp::my_carp_bug("Can't cope with a " 7690 . __PACKAGE__ 7691 . " being the first parameter in a '-='. Subtraction ignored."); 7692 return $self; 7693 } 7694 else { 7695 no overloading; 7696 $map{pack 'J', $self}->delete_range($other, $other); 7697 } 7698 return $self; 7699 } 7700 7701 sub add_match_table { 7702 # Add a new match table for this property, with name given by the 7703 # parameter. It returns a pointer to the table. 7704 7705 my $self = shift; 7706 my $name = shift; 7707 my %args = @_; 7708 7709 my $addr = do { no overloading; pack 'J', $self; }; 7710 7711 my $table = $table_ref{$addr}{$name}; 7712 my $standard_name = main::standardize($name); 7713 if (defined $table 7714 || (defined ($table = $table_ref{$addr}{$standard_name}))) 7715 { 7716 Carp::my_carp("Table '$name' in $self is already in use. Using existing one"); 7717 $table_ref{$addr}{$name} = $table; 7718 return $table; 7719 } 7720 else { 7721 7722 # See if this is a perl extension, if not passed in. 7723 my $perl_extension = delete $args{'Perl_Extension'}; 7724 $perl_extension 7725 = $self->perl_extension if ! defined $perl_extension; 7726 7727 $table = Match_Table->new( 7728 Name => $name, 7729 Perl_Extension => $perl_extension, 7730 _Alias_Hash => $table_ref{$addr}, 7731 _Property => $self, 7732 7733 # gets property's fate and status by default 7734 Fate => $self->fate, 7735 Status => $self->status, 7736 _Status_Info => $self->status_info, 7737 %args); 7738 return unless defined $table; 7739 } 7740 7741 # Save the names for quick look up 7742 $table_ref{$addr}{$standard_name} = $table; 7743 $table_ref{$addr}{$name} = $table; 7744 7745 # Perhaps we can figure out the type of this property based on the 7746 # fact of adding this match table. First, string properties don't 7747 # have match tables; second, a binary property can't have 3 match 7748 # tables 7749 if ($type{$addr} == $UNKNOWN) { 7750 $type{$addr} = $NON_STRING; 7751 } 7752 elsif ($type{$addr} == $STRING) { 7753 Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News."); 7754 $type{$addr} = $NON_STRING; 7755 } 7756 elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) { 7757 if (scalar main::uniques(values %{$table_ref{$addr}}) > 2 7758 && $type{$addr} == $BINARY) 7759 { 7760 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News."); 7761 $type{$addr} = $ENUM; 7762 } 7763 } 7764 7765 return $table; 7766 } 7767 7768 sub delete_match_table { 7769 # Delete the table referred to by $2 from the property $1. 7770 7771 my $self = shift; 7772 my $table_to_remove = shift; 7773 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7774 7775 my $addr = do { no overloading; pack 'J', $self; }; 7776 7777 # Remove all names that refer to it. 7778 foreach my $key (keys %{$table_ref{$addr}}) { 7779 delete $table_ref{$addr}{$key} 7780 if $table_ref{$addr}{$key} == $table_to_remove; 7781 } 7782 7783 $table_to_remove->DESTROY; 7784 return; 7785 } 7786 7787 sub table { 7788 # Return a pointer to the match table (with name given by the 7789 # parameter) associated with this property; undef if none. 7790 7791 my $self = shift; 7792 my $name = shift; 7793 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7794 7795 my $addr = do { no overloading; pack 'J', $self; }; 7796 7797 return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name}; 7798 7799 # If quick look-up failed, try again using the standard form of the 7800 # input name. If that succeeds, cache the result before returning so 7801 # won't have to standardize this input name again. 7802 my $standard_name = main::standardize($name); 7803 return unless defined $table_ref{$addr}{$standard_name}; 7804 7805 $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name}; 7806 return $table_ref{$addr}{$name}; 7807 } 7808 7809 sub tables { 7810 # Return a list of pointers to all the match tables attached to this 7811 # property 7812 7813 no overloading; 7814 return main::uniques(values %{$table_ref{pack 'J', shift}}); 7815 } 7816 7817 sub directory { 7818 # Returns the directory the map table for this property should be 7819 # output in. If a specific directory has been specified, that has 7820 # priority; 'undef' is returned if the type isn't defined; 7821 # or $map_directory for everything else. 7822 7823 my $addr = do { no overloading; pack 'J', shift; }; 7824 7825 return $directory{$addr} if defined $directory{$addr}; 7826 return undef if $type{$addr} == $UNKNOWN; 7827 return $map_directory; 7828 } 7829 7830 sub swash_name { 7831 # Return the name that is used to both: 7832 # 1) Name the file that the map table is written to. 7833 # 2) The name of swash related stuff inside that file. 7834 # The reason for this is that the Perl core historically has used 7835 # certain names that aren't the same as the Unicode property names. 7836 # To continue using these, $file is hard-coded in this file for those, 7837 # but otherwise the standard name is used. This is different from the 7838 # external_name, so that the rest of the files, like in lib can use 7839 # the standard name always, without regard to historical precedent. 7840 7841 my $self = shift; 7842 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7843 7844 my $addr = do { no overloading; pack 'J', $self; }; 7845 7846 # Swash names are used only on regular map tables; otherwise there 7847 # should be no access to the property map table from other parts of 7848 # Perl. 7849 return if $map{$addr}->fate != $ORDINARY; 7850 7851 return $file{$addr} if defined $file{$addr}; 7852 return $map{$addr}->external_name; 7853 } 7854 7855 sub to_create_match_tables { 7856 # Returns a boolean as to whether or not match tables should be 7857 # created for this property. 7858 7859 my $self = shift; 7860 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7861 7862 # The whole point of this pseudo property is match tables. 7863 return 1 if $self == $perl; 7864 7865 my $addr = do { no overloading; pack 'J', $self; }; 7866 7867 # Don't generate tables of code points that match the property values 7868 # of a string property. Such a list would most likely have many 7869 # property values, each with just one or very few code points mapping 7870 # to it. 7871 return 0 if $type{$addr} == $STRING; 7872 7873 # Don't generate anything for unimplemented properties. 7874 return 0 if grep { $self->complete_name eq $_ } 7875 @unimplemented_properties; 7876 # Otherwise, do. 7877 return 1; 7878 } 7879 7880 sub property_add_or_replace_non_nulls { 7881 # This adds the mappings in the property $other to $self. Non-null 7882 # mappings from $other override those in $self. It essentially merges 7883 # the two properties, with the second having priority except for null 7884 # mappings. 7885 7886 my $self = shift; 7887 my $other = shift; 7888 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7889 7890 if (! $other->isa(__PACKAGE__)) { 7891 Carp::my_carp_bug("$other should be a " 7892 . __PACKAGE__ 7893 . ". Not a '" 7894 . ref($other) 7895 . "'. Not added;"); 7896 return; 7897 } 7898 7899 no overloading; 7900 return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other}); 7901 } 7902 7903 sub set_proxy_for { 7904 # Certain tables are not generally written out to files, but 7905 # Unicode::UCD has the intelligence to know that the file for $self 7906 # can be used to reconstruct those tables. This routine just changes 7907 # things so that UCD pod entries for those suppressed tables are 7908 # generated, so the fact that a proxy is used is invisible to the 7909 # user. 7910 7911 my $self = shift; 7912 7913 foreach my $property_name (@_) { 7914 my $ref = property_ref($property_name); 7915 next if $ref->to_output_map; 7916 $ref->set_fate($MAP_PROXIED); 7917 } 7918 } 7919 7920 sub set_type { 7921 # Set the type of the property. Mostly this is figured out by the 7922 # data in the table. But this is used to set it explicitly. The 7923 # reason it is not a standard accessor is that when setting a binary 7924 # property, we need to make sure that all the true/false aliases are 7925 # present, as they were omitted in early Unicode releases. 7926 7927 my $self = shift; 7928 my $type = shift; 7929 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7930 7931 if ($type != $ENUM 7932 && $type != $BINARY 7933 && $type != $FORCED_BINARY 7934 && $type != $STRING) 7935 { 7936 Carp::my_carp("Unrecognized type '$type'. Type not set"); 7937 return; 7938 } 7939 7940 { no overloading; $type{pack 'J', $self} = $type; } 7941 return if $type != $BINARY && $type != $FORCED_BINARY; 7942 7943 my $yes = $self->table('Y'); 7944 $yes = $self->table('Yes') if ! defined $yes; 7945 $yes = $self->add_match_table('Y', Full_Name => 'Yes') 7946 if ! defined $yes; 7947 7948 # Add aliases in order wanted, duplicates will be ignored. We use a 7949 # binary property present in all releases for its ordered lists of 7950 # true/false aliases. Note, that could run into problems in 7951 # outputting things in that we don't distinguish between the name and 7952 # full name of these. Hopefully, if the table was already created 7953 # before this code is executed, it was done with these set properly. 7954 my $bm = property_ref("Bidi_Mirrored"); 7955 foreach my $alias ($bm->table("Y")->aliases) { 7956 $yes->add_alias($alias->name); 7957 } 7958 my $no = $self->table('N'); 7959 $no = $self->table('No') if ! defined $no; 7960 $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no; 7961 foreach my $alias ($bm->table("N")->aliases) { 7962 $no->add_alias($alias->name); 7963 } 7964 7965 return; 7966 } 7967 7968 sub add_map { 7969 # Add a map to the property's map table. This also keeps 7970 # track of the maps so that the property type can be determined from 7971 # its data. 7972 7973 my $self = shift; 7974 my $start = shift; # First code point in range 7975 my $end = shift; # Final code point in range 7976 my $map = shift; # What the range maps to. 7977 # Rest of parameters passed on. 7978 7979 my $addr = do { no overloading; pack 'J', $self; }; 7980 7981 # If haven't the type of the property, gather information to figure it 7982 # out. 7983 if ($type{$addr} == $UNKNOWN) { 7984 7985 # If the map contains an interior blank or dash, or most other 7986 # nonword characters, it will be a string property. This 7987 # heuristic may actually miss some string properties. If so, they 7988 # may need to have explicit set_types called for them. This 7989 # happens in the Unihan properties. 7990 if ($map =~ / (?<= . ) [ -] (?= . ) /x 7991 || $map =~ / [^\w.\/\ -] /x) 7992 { 7993 $self->set_type($STRING); 7994 7995 # $unique_maps is used for disambiguating between ENUM and 7996 # BINARY later; since we know the property is not going to be 7997 # one of those, no point in keeping the data around 7998 undef $unique_maps{$addr}; 7999 } 8000 else { 8001 8002 # Not necessarily a string. The final decision has to be 8003 # deferred until all the data are in. We keep track of if all 8004 # the values are code points for that eventual decision. 8005 $has_only_code_point_maps{$addr} &= 8006 $map =~ / ^ $code_point_re $/x; 8007 8008 # For the purposes of disambiguating between binary and other 8009 # enumerations at the end, we keep track of the first three 8010 # distinct property values. Once we get to three, we know 8011 # it's not going to be binary, so no need to track more. 8012 if (scalar keys %{$unique_maps{$addr}} < 3) { 8013 $unique_maps{$addr}{main::standardize($map)} = 1; 8014 } 8015 } 8016 } 8017 8018 # Add the mapping by calling our map table's method 8019 return $map{$addr}->add_map($start, $end, $map, @_); 8020 } 8021 8022 sub compute_type { 8023 # Compute the type of the property: $ENUM, $STRING, or $BINARY. This 8024 # should be called after the property is mostly filled with its maps. 8025 # We have been keeping track of what the property values have been, 8026 # and now have the necessary information to figure out the type. 8027 8028 my $self = shift; 8029 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 8030 8031 my $addr = do { no overloading; pack 'J', $self; }; 8032 8033 my $type = $type{$addr}; 8034 8035 # If already have figured these out, no need to do so again, but we do 8036 # a double check on ENUMS to make sure that a string property hasn't 8037 # improperly been classified as an ENUM, so continue on with those. 8038 return if $type == $STRING 8039 || $type == $BINARY 8040 || $type == $FORCED_BINARY; 8041 8042 # If every map is to a code point, is a string property. 8043 if ($type == $UNKNOWN 8044 && ($has_only_code_point_maps{$addr} 8045 || (defined $map{$addr}->default_map 8046 && $map{$addr}->default_map eq ""))) 8047 { 8048 $self->set_type($STRING); 8049 } 8050 else { 8051 8052 # Otherwise, it is to some sort of enumeration. (The case where 8053 # it is a Unicode miscellaneous property, and treated like a 8054 # string in this program is handled in add_map()). Distinguish 8055 # between binary and some other enumeration type. Of course, if 8056 # there are more than two values, it's not binary. But more 8057 # subtle is the test that the default mapping is defined means it 8058 # isn't binary. This in fact may change in the future if Unicode 8059 # changes the way its data is structured. But so far, no binary 8060 # properties ever have @missing lines for them, so the default map 8061 # isn't defined for them. The few properties that are two-valued 8062 # and aren't considered binary have the default map defined 8063 # starting in Unicode 5.0, when the @missing lines appeared; and 8064 # this program has special code to put in a default map for them 8065 # for earlier than 5.0 releases. 8066 if ($type == $ENUM 8067 || scalar keys %{$unique_maps{$addr}} > 2 8068 || defined $self->default_map) 8069 { 8070 my $tables = $self->tables; 8071 my $count = $self->count; 8072 if ($verbosity && $count > 500 && $tables/$count > .1) { 8073 Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $count\n"); 8074 } 8075 $self->set_type($ENUM); 8076 } 8077 else { 8078 $self->set_type($BINARY); 8079 } 8080 } 8081 undef $unique_maps{$addr}; # Garbage collect 8082 return; 8083 } 8084 8085 sub set_fate { 8086 my $self = shift; 8087 my $fate = shift; 8088 my $reason = shift; # Ignored unless suppressing 8089 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 8090 8091 my $addr = do { no overloading; pack 'J', $self; }; 8092 if ($fate == $SUPPRESSED) { 8093 $why_suppressed{$self->complete_name} = $reason; 8094 } 8095 8096 # Each table shares the property's fate, except that MAP_PROXIED 8097 # doesn't affect match tables 8098 $map{$addr}->set_fate($fate, $reason); 8099 if ($fate != $MAP_PROXIED) { 8100 foreach my $table ($map{$addr}, $self->tables) { 8101 $table->set_fate($fate, $reason); 8102 } 8103 } 8104 return; 8105 } 8106 8107 8108 # Most of the accessors for a property actually apply to its map table. 8109 # Setup up accessor functions for those, referring to %map 8110 for my $sub (qw( 8111 add_alias 8112 add_anomalous_entry 8113 add_comment 8114 add_conflicting 8115 add_description 8116 add_duplicate 8117 add_note 8118 aliases 8119 comment 8120 complete_name 8121 containing_range 8122 count 8123 default_map 8124 delete_range 8125 description 8126 each_range 8127 external_name 8128 fate 8129 file_path 8130 format 8131 initialize 8132 inverse_list 8133 is_empty 8134 name 8135 note 8136 perl_extension 8137 property 8138 range_count 8139 ranges 8140 range_size_1 8141 reset_each_range 8142 set_comment 8143 set_default_map 8144 set_file_path 8145 set_final_comment 8146 _set_format 8147 set_range_size_1 8148 set_status 8149 set_to_output_map 8150 short_name 8151 status 8152 status_info 8153 to_output_map 8154 type_of 8155 value_of 8156 write 8157 )) 8158 # 'property' above is for symmetry, so that one can take 8159 # the property of a property and get itself, and so don't 8160 # have to distinguish between properties and tables in 8161 # calling code 8162 { 8163 no strict "refs"; 8164 *$sub = sub { 8165 use strict "refs"; 8166 my $self = shift; 8167 no overloading; 8168 return $map{pack 'J', $self}->$sub(@_); 8169 } 8170 } 8171 8172 8173} # End closure 8174 8175package main; 8176 8177sub join_lines($) { 8178 # Returns lines of the input joined together, so that they can be folded 8179 # properly. 8180 # This causes continuation lines to be joined together into one long line 8181 # for folding. A continuation line is any line that doesn't begin with a 8182 # space or "\b" (the latter is stripped from the output). This is so 8183 # lines can be be in a HERE document so as to fit nicely in the terminal 8184 # width, but be joined together in one long line, and then folded with 8185 # indents, '#' prefixes, etc, properly handled. 8186 # A blank separates the joined lines except if there is a break; an extra 8187 # blank is inserted after a period ending a line. 8188 8189 # Initialize the return with the first line. 8190 my ($return, @lines) = split "\n", shift; 8191 8192 # If the first line is null, it was an empty line, add the \n back in 8193 $return = "\n" if $return eq ""; 8194 8195 # Now join the remainder of the physical lines. 8196 for my $line (@lines) { 8197 8198 # An empty line means wanted a blank line, so add two \n's to get that 8199 # effect, and go to the next line. 8200 if (length $line == 0) { 8201 $return .= "\n\n"; 8202 next; 8203 } 8204 8205 # Look at the last character of what we have so far. 8206 my $previous_char = substr($return, -1, 1); 8207 8208 # And at the next char to be output. 8209 my $next_char = substr($line, 0, 1); 8210 8211 if ($previous_char ne "\n") { 8212 8213 # Here didn't end wth a nl. If the next char a blank or \b, it 8214 # means that here there is a break anyway. So add a nl to the 8215 # output. 8216 if ($next_char eq " " || $next_char eq "\b") { 8217 $previous_char = "\n"; 8218 $return .= $previous_char; 8219 } 8220 8221 # Add an extra space after periods. 8222 $return .= " " if $previous_char eq '.'; 8223 } 8224 8225 # Here $previous_char is still the latest character to be output. If 8226 # it isn't a nl, it means that the next line is to be a continuation 8227 # line, with a blank inserted between them. 8228 $return .= " " if $previous_char ne "\n"; 8229 8230 # Get rid of any \b 8231 substr($line, 0, 1) = "" if $next_char eq "\b"; 8232 8233 # And append this next line. 8234 $return .= $line; 8235 } 8236 8237 return $return; 8238} 8239 8240sub simple_fold($;$$$) { 8241 # Returns a string of the input (string or an array of strings) folded 8242 # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus 8243 # a \n 8244 # This is tailored for the kind of text written by this program, 8245 # especially the pod file, which can have very long names with 8246 # underscores in the middle, or words like AbcDefgHij.... We allow 8247 # breaking in the middle of such constructs if the line won't fit 8248 # otherwise. The break in such cases will come either just after an 8249 # underscore, or just before one of the Capital letters. 8250 8251 local $to_trace = 0 if main::DEBUG; 8252 8253 my $line = shift; 8254 my $prefix = shift; # Optional string to prepend to each output 8255 # line 8256 $prefix = "" unless defined $prefix; 8257 8258 my $hanging_indent = shift; # Optional number of spaces to indent 8259 # continuation lines 8260 $hanging_indent = 0 unless $hanging_indent; 8261 8262 my $right_margin = shift; # Optional number of spaces to narrow the 8263 # total width by. 8264 $right_margin = 0 unless defined $right_margin; 8265 8266 # Call carp with the 'nofold' option to avoid it from trying to call us 8267 # recursively 8268 Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_; 8269 8270 # The space available doesn't include what's automatically prepended 8271 # to each line, or what's reserved on the right. 8272 my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin; 8273 # XXX Instead of using the 'nofold' perhaps better to look up the stack 8274 8275 if (DEBUG && $hanging_indent >= $max) { 8276 Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold'); 8277 $hanging_indent = 0; 8278 } 8279 8280 # First, split into the current physical lines. 8281 my @line; 8282 if (ref $line) { # Better be an array, because not bothering to 8283 # test 8284 foreach my $line (@{$line}) { 8285 push @line, split /\n/, $line; 8286 } 8287 } 8288 else { 8289 @line = split /\n/, $line; 8290 } 8291 8292 #local $to_trace = 1 if main::DEBUG; 8293 trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace; 8294 8295 # Look at each current physical line. 8296 for (my $i = 0; $i < @line; $i++) { 8297 Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/; 8298 #local $to_trace = 1 if main::DEBUG; 8299 trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace; 8300 8301 # Remove prefix, because will be added back anyway, don't want 8302 # doubled prefix 8303 $line[$i] =~ s/^$prefix//; 8304 8305 # Remove trailing space 8306 $line[$i] =~ s/\s+\Z//; 8307 8308 # If the line is too long, fold it. 8309 if (length $line[$i] > $max) { 8310 my $remainder; 8311 8312 # Here needs to fold. Save the leading space in the line for 8313 # later. 8314 $line[$i] =~ /^ ( \s* )/x; 8315 my $leading_space = $1; 8316 trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace; 8317 8318 # If character at final permissible position is white space, 8319 # fold there, which will delete that white space 8320 if (substr($line[$i], $max - 1, 1) =~ /\s/) { 8321 $remainder = substr($line[$i], $max); 8322 $line[$i] = substr($line[$i], 0, $max - 1); 8323 } 8324 else { 8325 8326 # Otherwise fold at an acceptable break char closest to 8327 # the max length. Look at just the maximal initial 8328 # segment of the line 8329 my $segment = substr($line[$i], 0, $max - 1); 8330 if ($segment =~ 8331 /^ ( .{$hanging_indent} # Don't look before the 8332 # indent. 8333 \ * # Don't look in leading 8334 # blanks past the indent 8335 [^ ] .* # Find the right-most 8336 (?: # acceptable break: 8337 [ \s = ] # space or equal 8338 | - (?! [.0-9] ) # or non-unary minus. 8339 ) # $1 includes the character 8340 )/x) 8341 { 8342 # Split into the initial part that fits, and remaining 8343 # part of the input 8344 $remainder = substr($line[$i], length $1); 8345 $line[$i] = $1; 8346 trace $line[$i] if DEBUG && $to_trace; 8347 trace $remainder if DEBUG && $to_trace; 8348 } 8349 8350 # If didn't find a good breaking spot, see if there is a 8351 # not-so-good breaking spot. These are just after 8352 # underscores or where the case changes from lower to 8353 # upper. Use \a as a soft hyphen, but give up 8354 # and don't break the line if there is actually a \a 8355 # already in the input. We use an ascii character for the 8356 # soft-hyphen to avoid any attempt by miniperl to try to 8357 # access the files that this program is creating. 8358 elsif ($segment !~ /\a/ 8359 && ($segment =~ s/_/_\a/g 8360 || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg)) 8361 { 8362 # Here were able to find at least one place to insert 8363 # our substitute soft hyphen. Find the right-most one 8364 # and replace it by a real hyphen. 8365 trace $segment if DEBUG && $to_trace; 8366 substr($segment, 8367 rindex($segment, "\a"), 8368 1) = '-'; 8369 8370 # Then remove the soft hyphen substitutes. 8371 $segment =~ s/\a//g; 8372 trace $segment if DEBUG && $to_trace; 8373 8374 # And split into the initial part that fits, and 8375 # remainder of the line 8376 my $pos = rindex($segment, '-'); 8377 $remainder = substr($line[$i], $pos); 8378 trace $remainder if DEBUG && $to_trace; 8379 $line[$i] = substr($segment, 0, $pos + 1); 8380 } 8381 } 8382 8383 # Here we know if we can fold or not. If we can, $remainder 8384 # is what remains to be processed in the next iteration. 8385 if (defined $remainder) { 8386 trace "folded='$line[$i]'" if main::DEBUG && $to_trace; 8387 8388 # Insert the folded remainder of the line as a new element 8389 # of the array. (It may still be too long, but we will 8390 # deal with that next time through the loop.) Omit any 8391 # leading space in the remainder. 8392 $remainder =~ s/^\s+//; 8393 trace "remainder='$remainder'" if main::DEBUG && $to_trace; 8394 8395 # But then indent by whichever is larger of: 8396 # 1) the leading space on the input line; 8397 # 2) the hanging indent. 8398 # This preserves indentation in the original line. 8399 my $lead = ($leading_space) 8400 ? length $leading_space 8401 : $hanging_indent; 8402 $lead = max($lead, $hanging_indent); 8403 splice @line, $i+1, 0, (" " x $lead) . $remainder; 8404 } 8405 } 8406 8407 # Ready to output the line. Get rid of any trailing space 8408 # And prefix by the required $prefix passed in. 8409 $line[$i] =~ s/\s+$//; 8410 $line[$i] = "$prefix$line[$i]\n"; 8411 } # End of looping through all the lines. 8412 8413 return join "", @line; 8414} 8415 8416sub property_ref { # Returns a reference to a property object. 8417 return Property::property_ref(@_); 8418} 8419 8420sub force_unlink ($) { 8421 my $filename = shift; 8422 return unless file_exists($filename); 8423 return if CORE::unlink($filename); 8424 8425 # We might need write permission 8426 chmod 0777, $filename; 8427 CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!"); 8428 return; 8429} 8430 8431sub write ($$@) { 8432 # Given a filename and references to arrays of lines, write the lines of 8433 # each array to the file 8434 # Filename can be given as an arrayref of directory names 8435 8436 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3; 8437 8438 my $file = shift; 8439 my $use_utf8 = shift; 8440 8441 # Get into a single string if an array, and get rid of, in Unix terms, any 8442 # leading '.' 8443 $file= File::Spec->join(@$file) if ref $file eq 'ARRAY'; 8444 $file = File::Spec->canonpath($file); 8445 8446 # If has directories, make sure that they all exist 8447 (undef, my $directories, undef) = File::Spec->splitpath($file); 8448 File::Path::mkpath($directories) if $directories && ! -d $directories; 8449 8450 push @files_actually_output, $file; 8451 8452 force_unlink ($file); 8453 8454 my $OUT; 8455 if (not open $OUT, ">", $file) { 8456 Carp::my_carp("can't open $file for output. Skipping this file: $!"); 8457 return; 8458 } 8459 8460 binmode $OUT, ":utf8" if $use_utf8; 8461 8462 while (defined (my $lines_ref = shift)) { 8463 unless (@$lines_ref) { 8464 Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;"); 8465 } 8466 8467 print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!"); 8468 } 8469 close $OUT or die Carp::my_carp("close '$file' failed: $!"); 8470 8471 print "$file written.\n" if $verbosity >= $VERBOSE; 8472 8473 return; 8474} 8475 8476 8477sub Standardize($) { 8478 # This converts the input name string into a standardized equivalent to 8479 # use internally. 8480 8481 my $name = shift; 8482 unless (defined $name) { 8483 Carp::my_carp_bug("Standardize() called with undef. Returning undef."); 8484 return; 8485 } 8486 8487 # Remove any leading or trailing white space 8488 $name =~ s/^\s+//g; 8489 $name =~ s/\s+$//g; 8490 8491 # Convert interior white space and hyphens into underscores. 8492 $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg; 8493 8494 # Capitalize the letter following an underscore, and convert a sequence of 8495 # multiple underscores to a single one 8496 $name =~ s/ (?<= .) _+ (.) /_\u$1/xg; 8497 8498 # And capitalize the first letter, but not for the special cjk ones. 8499 $name = ucfirst($name) unless $name =~ /^k[A-Z]/; 8500 return $name; 8501} 8502 8503sub standardize ($) { 8504 # Returns a lower-cased standardized name, without underscores. This form 8505 # is chosen so that it can distinguish between any real versus superficial 8506 # Unicode name differences. It relies on the fact that Unicode doesn't 8507 # have interior underscores, white space, nor dashes in any 8508 # stricter-matched name. It should not be used on Unicode code point 8509 # names (the Name property), as they mostly, but not always follow these 8510 # rules. 8511 8512 my $name = Standardize(shift); 8513 return if !defined $name; 8514 8515 $name =~ s/ (?<= .) _ (?= . ) //xg; 8516 return lc $name; 8517} 8518 8519sub utf8_heavy_name ($$) { 8520 # Returns the name that utf8_heavy.pl will use to find a table. XXX 8521 # perhaps this function should be placed somewhere, like Heavy.pl so that 8522 # utf8_heavy can use it directly without duplicating code that can get 8523 # out-of sync. 8524 8525 my $table = shift; 8526 my $alias = shift; 8527 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 8528 8529 my $property = $table->property; 8530 $property = ($property == $perl) 8531 ? "" # 'perl' is never explicitly stated 8532 : standardize($property->name) . '='; 8533 if ($alias->loose_match) { 8534 return $property . standardize($alias->name); 8535 } 8536 else { 8537 return lc ($property . $alias->name); 8538 } 8539 8540 return; 8541} 8542 8543{ # Closure 8544 8545 my $indent_increment = " " x (($debugging_build) ? 2 : 0); 8546 my %already_output; 8547 8548 $main::simple_dumper_nesting = 0; 8549 8550 sub simple_dumper { 8551 # Like Simple Data::Dumper. Good enough for our needs. We can't use 8552 # the real thing as we have to run under miniperl. 8553 8554 # It is designed so that on input it is at the beginning of a line, 8555 # and the final thing output in any call is a trailing ",\n". 8556 8557 my $item = shift; 8558 my $indent = shift; 8559 $indent = "" if ! $debugging_build || ! defined $indent; 8560 8561 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 8562 8563 # nesting level is localized, so that as the call stack pops, it goes 8564 # back to the prior value. 8565 local $main::simple_dumper_nesting = $main::simple_dumper_nesting; 8566 undef %already_output if $main::simple_dumper_nesting == 0; 8567 $main::simple_dumper_nesting++; 8568 #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n"; 8569 8570 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 8571 8572 # Determine the indent for recursive calls. 8573 my $next_indent = $indent . $indent_increment; 8574 8575 my $output; 8576 if (! ref $item) { 8577 8578 # Dump of scalar: just output it in quotes if not a number. To do 8579 # so we must escape certain characters, and therefore need to 8580 # operate on a copy to avoid changing the original 8581 my $copy = $item; 8582 $copy = $UNDEF unless defined $copy; 8583 8584 # Quote non-integers (integers also have optional leading '-') 8585 if ($copy eq "" || $copy !~ /^ -? \d+ $/x) { 8586 8587 # Escape apostrophe and backslash 8588 $copy =~ s/ ( ['\\] ) /\\$1/xg; 8589 $copy = "'$copy'"; 8590 } 8591 $output = "$indent$copy,\n"; 8592 } 8593 else { 8594 8595 # Keep track of cycles in the input, and refuse to infinitely loop 8596 my $addr = do { no overloading; pack 'J', $item; }; 8597 if (defined $already_output{$addr}) { 8598 return "${indent}ALREADY OUTPUT: $item\n"; 8599 } 8600 $already_output{$addr} = $item; 8601 8602 if (ref $item eq 'ARRAY') { 8603 my $using_brackets; 8604 $output = $indent; 8605 if ($main::simple_dumper_nesting > 1) { 8606 $output .= '['; 8607 $using_brackets = 1; 8608 } 8609 else { 8610 $using_brackets = 0; 8611 } 8612 8613 # If the array is empty, put the closing bracket on the same 8614 # line. Otherwise, recursively add each array element 8615 if (@$item == 0) { 8616 $output .= " "; 8617 } 8618 else { 8619 $output .= "\n"; 8620 for (my $i = 0; $i < @$item; $i++) { 8621 8622 # Indent array elements one level 8623 $output .= &simple_dumper($item->[$i], $next_indent); 8624 next if ! $debugging_build; 8625 $output =~ s/\n$//; # Remove any trailing nl so 8626 $output .= " # [$i]\n"; # as to add a comment giving 8627 # the array index 8628 } 8629 $output .= $indent; # Indent closing ']' to orig level 8630 } 8631 $output .= ']' if $using_brackets; 8632 $output .= ",\n"; 8633 } 8634 elsif (ref $item eq 'HASH') { 8635 my $is_first_line; 8636 my $using_braces; 8637 my $body_indent; 8638 8639 # No surrounding braces at top level 8640 $output .= $indent; 8641 if ($main::simple_dumper_nesting > 1) { 8642 $output .= "{\n"; 8643 $is_first_line = 0; 8644 $body_indent = $next_indent; 8645 $next_indent .= $indent_increment; 8646 $using_braces = 1; 8647 } 8648 else { 8649 $is_first_line = 1; 8650 $body_indent = $indent; 8651 $using_braces = 0; 8652 } 8653 8654 # Output hashes sorted alphabetically instead of apparently 8655 # random. Use caseless alphabetic sort 8656 foreach my $key (sort { lc $a cmp lc $b } keys %$item) 8657 { 8658 if ($is_first_line) { 8659 $is_first_line = 0; 8660 } 8661 else { 8662 $output .= "$body_indent"; 8663 } 8664 8665 # The key must be a scalar, but this recursive call quotes 8666 # it 8667 $output .= &simple_dumper($key); 8668 8669 # And change the trailing comma and nl to the hash fat 8670 # comma for clarity, and so the value can be on the same 8671 # line 8672 $output =~ s/,\n$/ => /; 8673 8674 # Recursively call to get the value's dump. 8675 my $next = &simple_dumper($item->{$key}, $next_indent); 8676 8677 # If the value is all on one line, remove its indent, so 8678 # will follow the => immediately. If it takes more than 8679 # one line, start it on a new line. 8680 if ($next !~ /\n.*\n/) { 8681 $next =~ s/^ *//; 8682 } 8683 else { 8684 $output .= "\n"; 8685 } 8686 $output .= $next; 8687 } 8688 8689 $output .= "$indent},\n" if $using_braces; 8690 } 8691 elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') { 8692 $output = $indent . ref($item) . "\n"; 8693 # XXX see if blessed 8694 } 8695 elsif ($item->can('dump')) { 8696 8697 # By convention in this program, objects furnish a 'dump' 8698 # method. Since not doing any output at this level, just pass 8699 # on the input indent 8700 $output = $item->dump($indent); 8701 } 8702 else { 8703 Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping."); 8704 } 8705 } 8706 return $output; 8707 } 8708} 8709 8710sub dump_inside_out { 8711 # Dump inside-out hashes in an object's state by converting them to a 8712 # regular hash and then calling simple_dumper on that. 8713 8714 my $object = shift; 8715 my $fields_ref = shift; 8716 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 8717 8718 my $addr = do { no overloading; pack 'J', $object; }; 8719 8720 my %hash; 8721 foreach my $key (keys %$fields_ref) { 8722 $hash{$key} = $fields_ref->{$key}{$addr}; 8723 } 8724 8725 return simple_dumper(\%hash, @_); 8726} 8727 8728sub _operator_dot { 8729 # Overloaded '.' method that is common to all packages. It uses the 8730 # package's stringify method. 8731 8732 my $self = shift; 8733 my $other = shift; 8734 my $reversed = shift; 8735 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 8736 8737 $other = "" unless defined $other; 8738 8739 foreach my $which (\$self, \$other) { 8740 next unless ref $$which; 8741 if ($$which->can('_operator_stringify')) { 8742 $$which = $$which->_operator_stringify; 8743 } 8744 else { 8745 my $ref = ref $$which; 8746 my $addr = do { no overloading; pack 'J', $$which; }; 8747 $$which = "$ref ($addr)"; 8748 } 8749 } 8750 return ($reversed) 8751 ? "$other$self" 8752 : "$self$other"; 8753} 8754 8755sub _operator_equal { 8756 # Generic overloaded '==' routine. To be equal, they must be the exact 8757 # same object 8758 8759 my $self = shift; 8760 my $other = shift; 8761 8762 return 0 unless defined $other; 8763 return 0 unless ref $other; 8764 no overloading; 8765 return $self == $other; 8766} 8767 8768sub _operator_not_equal { 8769 my $self = shift; 8770 my $other = shift; 8771 8772 return ! _operator_equal($self, $other); 8773} 8774 8775sub process_PropertyAliases($) { 8776 # This reads in the PropertyAliases.txt file, which contains almost all 8777 # the character properties in Unicode and their equivalent aliases: 8778 # scf ; Simple_Case_Folding ; sfc 8779 # 8780 # Field 0 is the preferred short name for the property. 8781 # Field 1 is the full name. 8782 # Any succeeding ones are other accepted names. 8783 8784 my $file= shift; 8785 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 8786 8787 # This whole file was non-existent in early releases, so use our own 8788 # internal one. 8789 $file->insert_lines(get_old_property_aliases()) 8790 if ! -e 'PropertyAliases.txt'; 8791 8792 # Add any cjk properties that may have been defined. 8793 $file->insert_lines(@cjk_properties); 8794 8795 while ($file->next_line) { 8796 8797 my @data = split /\s*;\s*/; 8798 8799 my $full = $data[1]; 8800 8801 my $this = Property->new($data[0], Full_Name => $full); 8802 8803 # Start looking for more aliases after these two. 8804 for my $i (2 .. @data - 1) { 8805 $this->add_alias($data[$i]); 8806 } 8807 8808 } 8809 return; 8810} 8811 8812sub finish_property_setup { 8813 # Finishes setting up after PropertyAliases. 8814 8815 my $file = shift; 8816 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 8817 8818 # This entry was missing from this file in earlier Unicode versions 8819 if (-e 'Jamo.txt') { 8820 my $jsn = property_ref('JSN'); 8821 if (! defined $jsn) { 8822 $jsn = Property->new('JSN', Full_Name => 'Jamo_Short_Name'); 8823 } 8824 } 8825 8826 # These are used so much, that we set globals for them. 8827 $gc = property_ref('General_Category'); 8828 $block = property_ref('Block'); 8829 $script = property_ref('Script'); 8830 8831 # Perl adds this alias. 8832 $gc->add_alias('Category'); 8833 8834 # Unicode::Normalize expects this file with this name and directory. 8835 my $ccc = property_ref('Canonical_Combining_Class'); 8836 if (defined $ccc) { 8837 $ccc->set_file('CombiningClass'); 8838 $ccc->set_directory(File::Spec->curdir()); 8839 } 8840 8841 # These two properties aren't actually used in the core, but unfortunately 8842 # the names just above that are in the core interfere with these, so 8843 # choose different names. These aren't a problem unless the map tables 8844 # for these files get written out. 8845 my $lowercase = property_ref('Lowercase'); 8846 $lowercase->set_file('IsLower') if defined $lowercase; 8847 my $uppercase = property_ref('Uppercase'); 8848 $uppercase->set_file('IsUpper') if defined $uppercase; 8849 8850 # Set up the hard-coded default mappings, but only on properties defined 8851 # for this release 8852 foreach my $property (keys %default_mapping) { 8853 my $property_object = property_ref($property); 8854 next if ! defined $property_object; 8855 my $default_map = $default_mapping{$property}; 8856 $property_object->set_default_map($default_map); 8857 8858 # A map of <code point> implies the property is string. 8859 if ($property_object->type == $UNKNOWN 8860 && $default_map eq $CODE_POINT) 8861 { 8862 $property_object->set_type($STRING); 8863 } 8864 } 8865 8866 # The following use the Multi_Default class to create objects for 8867 # defaults. 8868 8869 # Bidi class has a complicated default, but the derived file takes care of 8870 # the complications, leaving just 'L'. 8871 if (file_exists("${EXTRACTED}DBidiClass.txt")) { 8872 property_ref('Bidi_Class')->set_default_map('L'); 8873 } 8874 else { 8875 my $default; 8876 8877 # The derived file was introduced in 3.1.1. The values below are 8878 # taken from table 3-8, TUS 3.0 8879 my $default_R = 8880 'my $default = Range_List->new; 8881 $default->add_range(0x0590, 0x05FF); 8882 $default->add_range(0xFB1D, 0xFB4F);' 8883 ; 8884 8885 # The defaults apply only to unassigned characters 8886 $default_R .= '$gc->table("Unassigned") & $default;'; 8887 8888 if ($v_version lt v3.0.0) { 8889 $default = Multi_Default->new(R => $default_R, 'L'); 8890 } 8891 else { 8892 8893 # AL apparently not introduced until 3.0: TUS 2.x references are 8894 # not on-line to check it out 8895 my $default_AL = 8896 'my $default = Range_List->new; 8897 $default->add_range(0x0600, 0x07BF); 8898 $default->add_range(0xFB50, 0xFDFF); 8899 $default->add_range(0xFE70, 0xFEFF);' 8900 ; 8901 8902 # Non-character code points introduced in this release; aren't AL 8903 if ($v_version ge 3.1.0) { 8904 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);'; 8905 } 8906 $default_AL .= '$gc->table("Unassigned") & $default'; 8907 $default = Multi_Default->new(AL => $default_AL, 8908 R => $default_R, 8909 'L'); 8910 } 8911 property_ref('Bidi_Class')->set_default_map($default); 8912 } 8913 8914 # Joining type has a complicated default, but the derived file takes care 8915 # of the complications, leaving just 'U' (or Non_Joining), except the file 8916 # is bad in 3.1.0 8917 if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') { 8918 if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) { 8919 property_ref('Joining_Type')->set_default_map('Non_Joining'); 8920 } 8921 else { 8922 8923 # Otherwise, there are not one, but two possibilities for the 8924 # missing defaults: T and U. 8925 # The missing defaults that evaluate to T are given by: 8926 # T = Mn + Cf - ZWNJ - ZWJ 8927 # where Mn and Cf are the general category values. In other words, 8928 # any non-spacing mark or any format control character, except 8929 # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO 8930 # WIDTH JOINER (joining type C). 8931 my $default = Multi_Default->new( 8932 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D', 8933 'Non_Joining'); 8934 property_ref('Joining_Type')->set_default_map($default); 8935 } 8936 } 8937 8938 # Line break has a complicated default in early releases. It is 'Unknown' 8939 # for non-assigned code points; 'AL' for assigned. 8940 if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') { 8941 my $lb = property_ref('Line_Break'); 8942 if ($v_version gt 3.2.0) { 8943 $lb->set_default_map('Unknown'); 8944 } 8945 else { 8946 my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")', 8947 'AL'); 8948 $lb->set_default_map($default); 8949 } 8950 8951 # If has the URS property, make sure that the standard aliases are in 8952 # it, since not in the input tables in some versions. 8953 my $urs = property_ref('Unicode_Radical_Stroke'); 8954 if (defined $urs) { 8955 $urs->add_alias('cjkRSUnicode'); 8956 $urs->add_alias('kRSUnicode'); 8957 } 8958 } 8959 8960 # For backwards compatibility with applications that may read the mapping 8961 # file directly (it was documented in 5.12 and 5.14 as being thusly 8962 # usable), keep it from being adjusted. (range_size_1 is 8963 # used to force the traditional format.) 8964 if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) { 8965 $nfkc_cf->set_to_output_map($EXTERNAL_MAP); 8966 $nfkc_cf->set_range_size_1(1); 8967 } 8968 if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) { 8969 $bmg->set_to_output_map($EXTERNAL_MAP); 8970 $bmg->set_range_size_1(1); 8971 } 8972 8973 property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED); 8974 8975 return; 8976} 8977 8978sub get_old_property_aliases() { 8979 # Returns what would be in PropertyAliases.txt if it existed in very old 8980 # versions of Unicode. It was derived from the one in 3.2, and pared 8981 # down based on the data that was actually in the older releases. 8982 # An attempt was made to use the existence of files to mean inclusion or 8983 # not of various aliases, but if this was not sufficient, using version 8984 # numbers was resorted to. 8985 8986 my @return; 8987 8988 # These are to be used in all versions (though some are constructed by 8989 # this program if missing) 8990 push @return, split /\n/, <<'END'; 8991bc ; Bidi_Class 8992Bidi_M ; Bidi_Mirrored 8993cf ; Case_Folding 8994ccc ; Canonical_Combining_Class 8995dm ; Decomposition_Mapping 8996dt ; Decomposition_Type 8997gc ; General_Category 8998isc ; ISO_Comment 8999lc ; Lowercase_Mapping 9000na ; Name 9001na1 ; Unicode_1_Name 9002nt ; Numeric_Type 9003nv ; Numeric_Value 9004sfc ; Simple_Case_Folding 9005slc ; Simple_Lowercase_Mapping 9006stc ; Simple_Titlecase_Mapping 9007suc ; Simple_Uppercase_Mapping 9008tc ; Titlecase_Mapping 9009uc ; Uppercase_Mapping 9010END 9011 9012 if (-e 'Blocks.txt') { 9013 push @return, "blk ; Block\n"; 9014 } 9015 if (-e 'ArabicShaping.txt') { 9016 push @return, split /\n/, <<'END'; 9017jg ; Joining_Group 9018jt ; Joining_Type 9019END 9020 } 9021 if (-e 'PropList.txt') { 9022 9023 # This first set is in the original old-style proplist. 9024 push @return, split /\n/, <<'END'; 9025Alpha ; Alphabetic 9026Bidi_C ; Bidi_Control 9027Dash ; Dash 9028Dia ; Diacritic 9029Ext ; Extender 9030Hex ; Hex_Digit 9031Hyphen ; Hyphen 9032IDC ; ID_Continue 9033Ideo ; Ideographic 9034Join_C ; Join_Control 9035Math ; Math 9036QMark ; Quotation_Mark 9037Term ; Terminal_Punctuation 9038WSpace ; White_Space 9039END 9040 # The next sets were added later 9041 if ($v_version ge v3.0.0) { 9042 push @return, split /\n/, <<'END'; 9043Upper ; Uppercase 9044Lower ; Lowercase 9045END 9046 } 9047 if ($v_version ge v3.0.1) { 9048 push @return, split /\n/, <<'END'; 9049NChar ; Noncharacter_Code_Point 9050END 9051 } 9052 # The next sets were added in the new-style 9053 if ($v_version ge v3.1.0) { 9054 push @return, split /\n/, <<'END'; 9055OAlpha ; Other_Alphabetic 9056OLower ; Other_Lowercase 9057OMath ; Other_Math 9058OUpper ; Other_Uppercase 9059END 9060 } 9061 if ($v_version ge v3.1.1) { 9062 push @return, "AHex ; ASCII_Hex_Digit\n"; 9063 } 9064 } 9065 if (-e 'EastAsianWidth.txt') { 9066 push @return, "ea ; East_Asian_Width\n"; 9067 } 9068 if (-e 'CompositionExclusions.txt') { 9069 push @return, "CE ; Composition_Exclusion\n"; 9070 } 9071 if (-e 'LineBreak.txt') { 9072 push @return, "lb ; Line_Break\n"; 9073 } 9074 if (-e 'BidiMirroring.txt') { 9075 push @return, "bmg ; Bidi_Mirroring_Glyph\n"; 9076 } 9077 if (-e 'Scripts.txt') { 9078 push @return, "sc ; Script\n"; 9079 } 9080 if (-e 'DNormalizationProps.txt') { 9081 push @return, split /\n/, <<'END'; 9082Comp_Ex ; Full_Composition_Exclusion 9083FC_NFKC ; FC_NFKC_Closure 9084NFC_QC ; NFC_Quick_Check 9085NFD_QC ; NFD_Quick_Check 9086NFKC_QC ; NFKC_Quick_Check 9087NFKD_QC ; NFKD_Quick_Check 9088XO_NFC ; Expands_On_NFC 9089XO_NFD ; Expands_On_NFD 9090XO_NFKC ; Expands_On_NFKC 9091XO_NFKD ; Expands_On_NFKD 9092END 9093 } 9094 if (-e 'DCoreProperties.txt') { 9095 push @return, split /\n/, <<'END'; 9096IDS ; ID_Start 9097XIDC ; XID_Continue 9098XIDS ; XID_Start 9099END 9100 # These can also appear in some versions of PropList.txt 9101 push @return, "Lower ; Lowercase\n" 9102 unless grep { $_ =~ /^Lower\b/} @return; 9103 push @return, "Upper ; Uppercase\n" 9104 unless grep { $_ =~ /^Upper\b/} @return; 9105 } 9106 9107 # This flag requires the DAge.txt file to be copied into the directory. 9108 if (DEBUG && $compare_versions) { 9109 push @return, 'age ; Age'; 9110 } 9111 9112 return @return; 9113} 9114 9115sub process_PropValueAliases { 9116 # This file contains values that properties look like: 9117 # bc ; AL ; Arabic_Letter 9118 # blk; n/a ; Greek_And_Coptic ; Greek 9119 # 9120 # Field 0 is the property. 9121 # Field 1 is the short name of a property value or 'n/a' if no 9122 # short name exists; 9123 # Field 2 is the full property value name; 9124 # Any other fields are more synonyms for the property value. 9125 # Purely numeric property values are omitted from the file; as are some 9126 # others, fewer and fewer in later releases 9127 9128 # Entries for the ccc property have an extra field before the 9129 # abbreviation: 9130 # ccc; 0; NR ; Not_Reordered 9131 # It is the numeric value that the names are synonyms for. 9132 9133 # There are comment entries for values missing from this file: 9134 # # @missing: 0000..10FFFF; ISO_Comment; <none> 9135 # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point> 9136 9137 my $file= shift; 9138 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 9139 9140 # This whole file was non-existent in early releases, so use our own 9141 # internal one if necessary. 9142 if (! -e 'PropValueAliases.txt') { 9143 $file->insert_lines(get_old_property_value_aliases()); 9144 } 9145 9146 # Add any explicit cjk values 9147 $file->insert_lines(@cjk_property_values); 9148 9149 # This line is used only for testing the code that checks for name 9150 # conflicts. There is a script Inherited, and when this line is executed 9151 # it causes there to be a name conflict with the 'Inherited' that this 9152 # program generates for this block property value 9153 #$file->insert_lines('blk; n/a; Herited'); 9154 9155 9156 # Process each line of the file ... 9157 while ($file->next_line) { 9158 9159 # Fix typo in input file 9160 s/CCC133/CCC132/g if $v_version eq v6.1.0; 9161 9162 my ($property, @data) = split /\s*;\s*/; 9163 9164 # The ccc property has an extra field at the beginning, which is the 9165 # numeric value. Move it to be after the other two, mnemonic, fields, 9166 # so that those will be used as the property value's names, and the 9167 # number will be an extra alias. (Rightmost splice removes field 1-2, 9168 # returning them in a slice; left splice inserts that before anything, 9169 # thus shifting the former field 0 to after them.) 9170 splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc'; 9171 9172 # Field 0 is a short name unless "n/a"; field 1 is the full name. If 9173 # there is no short name, use the full one in element 1 9174 if ($data[0] eq "n/a") { 9175 $data[0] = $data[1]; 9176 } 9177 elsif ($data[0] ne $data[1] 9178 && standardize($data[0]) eq standardize($data[1]) 9179 && $data[1] !~ /[[:upper:]]/) 9180 { 9181 # Also, there is a bug in the file in which "n/a" is omitted, and 9182 # the two fields are identical except for case, and the full name 9183 # is all lower case. Copy the "short" name unto the full one to 9184 # give it some upper case. 9185 9186 $data[1] = $data[0]; 9187 } 9188 9189 # Earlier releases had the pseudo property 'qc' that should expand to 9190 # the ones that replace it below. 9191 if ($property eq 'qc') { 9192 if (lc $data[0] eq 'y') { 9193 $file->insert_lines('NFC_QC; Y ; Yes', 9194 'NFD_QC; Y ; Yes', 9195 'NFKC_QC; Y ; Yes', 9196 'NFKD_QC; Y ; Yes', 9197 ); 9198 } 9199 elsif (lc $data[0] eq 'n') { 9200 $file->insert_lines('NFC_QC; N ; No', 9201 'NFD_QC; N ; No', 9202 'NFKC_QC; N ; No', 9203 'NFKD_QC; N ; No', 9204 ); 9205 } 9206 elsif (lc $data[0] eq 'm') { 9207 $file->insert_lines('NFC_QC; M ; Maybe', 9208 'NFKC_QC; M ; Maybe', 9209 ); 9210 } 9211 else { 9212 $file->carp_bad_line("qc followed by unexpected '$data[0]"); 9213 } 9214 next; 9215 } 9216 9217 # The first field is the short name, 2nd is the full one. 9218 my $property_object = property_ref($property); 9219 my $table = $property_object->add_match_table($data[0], 9220 Full_Name => $data[1]); 9221 9222 # Start looking for more aliases after these two. 9223 for my $i (2 .. @data - 1) { 9224 $table->add_alias($data[$i]); 9225 } 9226 } # End of looping through the file 9227 9228 # As noted in the comments early in the program, it generates tables for 9229 # the default values for all releases, even those for which the concept 9230 # didn't exist at the time. Here we add those if missing. 9231 my $age = property_ref('age'); 9232 if (defined $age && ! defined $age->table('Unassigned')) { 9233 $age->add_match_table('Unassigned'); 9234 } 9235 $block->add_match_table('No_Block') if -e 'Blocks.txt' 9236 && ! defined $block->table('No_Block'); 9237 9238 9239 # Now set the default mappings of the properties from the file. This is 9240 # done after the loop because a number of properties have only @missings 9241 # entries in the file, and may not show up until the end. 9242 my @defaults = $file->get_missings; 9243 foreach my $default_ref (@defaults) { 9244 my $default = $default_ref->[0]; 9245 my $property = property_ref($default_ref->[1]); 9246 $property->set_default_map($default); 9247 } 9248 return; 9249} 9250 9251sub get_old_property_value_aliases () { 9252 # Returns what would be in PropValueAliases.txt if it existed in very old 9253 # versions of Unicode. It was derived from the one in 3.2, and pared 9254 # down. An attempt was made to use the existence of files to mean 9255 # inclusion or not of various aliases, but if this was not sufficient, 9256 # using version numbers was resorted to. 9257 9258 my @return = split /\n/, <<'END'; 9259bc ; AN ; Arabic_Number 9260bc ; B ; Paragraph_Separator 9261bc ; CS ; Common_Separator 9262bc ; EN ; European_Number 9263bc ; ES ; European_Separator 9264bc ; ET ; European_Terminator 9265bc ; L ; Left_To_Right 9266bc ; ON ; Other_Neutral 9267bc ; R ; Right_To_Left 9268bc ; WS ; White_Space 9269 9270# The standard combining classes are very much different in v1, so only use 9271# ones that look right (not checked thoroughly) 9272ccc; 0; NR ; Not_Reordered 9273ccc; 1; OV ; Overlay 9274ccc; 7; NK ; Nukta 9275ccc; 8; KV ; Kana_Voicing 9276ccc; 9; VR ; Virama 9277ccc; 202; ATBL ; Attached_Below_Left 9278ccc; 216; ATAR ; Attached_Above_Right 9279ccc; 218; BL ; Below_Left 9280ccc; 220; B ; Below 9281ccc; 222; BR ; Below_Right 9282ccc; 224; L ; Left 9283ccc; 228; AL ; Above_Left 9284ccc; 230; A ; Above 9285ccc; 232; AR ; Above_Right 9286ccc; 234; DA ; Double_Above 9287 9288dt ; can ; canonical 9289dt ; enc ; circle 9290dt ; fin ; final 9291dt ; font ; font 9292dt ; fra ; fraction 9293dt ; init ; initial 9294dt ; iso ; isolated 9295dt ; med ; medial 9296dt ; n/a ; none 9297dt ; nb ; noBreak 9298dt ; sqr ; square 9299dt ; sub ; sub 9300dt ; sup ; super 9301 9302gc ; C ; Other # Cc | Cf | Cn | Co | Cs 9303gc ; Cc ; Control 9304gc ; Cn ; Unassigned 9305gc ; Co ; Private_Use 9306gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu 9307gc ; LC ; Cased_Letter # Ll | Lt | Lu 9308gc ; Ll ; Lowercase_Letter 9309gc ; Lm ; Modifier_Letter 9310gc ; Lo ; Other_Letter 9311gc ; Lu ; Uppercase_Letter 9312gc ; M ; Mark # Mc | Me | Mn 9313gc ; Mc ; Spacing_Mark 9314gc ; Mn ; Nonspacing_Mark 9315gc ; N ; Number # Nd | Nl | No 9316gc ; Nd ; Decimal_Number 9317gc ; No ; Other_Number 9318gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps 9319gc ; Pd ; Dash_Punctuation 9320gc ; Pe ; Close_Punctuation 9321gc ; Po ; Other_Punctuation 9322gc ; Ps ; Open_Punctuation 9323gc ; S ; Symbol # Sc | Sk | Sm | So 9324gc ; Sc ; Currency_Symbol 9325gc ; Sm ; Math_Symbol 9326gc ; So ; Other_Symbol 9327gc ; Z ; Separator # Zl | Zp | Zs 9328gc ; Zl ; Line_Separator 9329gc ; Zp ; Paragraph_Separator 9330gc ; Zs ; Space_Separator 9331 9332nt ; de ; Decimal 9333nt ; di ; Digit 9334nt ; n/a ; None 9335nt ; nu ; Numeric 9336END 9337 9338 if (-e 'ArabicShaping.txt') { 9339 push @return, split /\n/, <<'END'; 9340jg ; n/a ; AIN 9341jg ; n/a ; ALEF 9342jg ; n/a ; DAL 9343jg ; n/a ; GAF 9344jg ; n/a ; LAM 9345jg ; n/a ; MEEM 9346jg ; n/a ; NO_JOINING_GROUP 9347jg ; n/a ; NOON 9348jg ; n/a ; QAF 9349jg ; n/a ; SAD 9350jg ; n/a ; SEEN 9351jg ; n/a ; TAH 9352jg ; n/a ; WAW 9353 9354jt ; C ; Join_Causing 9355jt ; D ; Dual_Joining 9356jt ; L ; Left_Joining 9357jt ; R ; Right_Joining 9358jt ; U ; Non_Joining 9359jt ; T ; Transparent 9360END 9361 if ($v_version ge v3.0.0) { 9362 push @return, split /\n/, <<'END'; 9363jg ; n/a ; ALAPH 9364jg ; n/a ; BEH 9365jg ; n/a ; BETH 9366jg ; n/a ; DALATH_RISH 9367jg ; n/a ; E 9368jg ; n/a ; FEH 9369jg ; n/a ; FINAL_SEMKATH 9370jg ; n/a ; GAMAL 9371jg ; n/a ; HAH 9372jg ; n/a ; HAMZA_ON_HEH_GOAL 9373jg ; n/a ; HE 9374jg ; n/a ; HEH 9375jg ; n/a ; HEH_GOAL 9376jg ; n/a ; HETH 9377jg ; n/a ; KAF 9378jg ; n/a ; KAPH 9379jg ; n/a ; KNOTTED_HEH 9380jg ; n/a ; LAMADH 9381jg ; n/a ; MIM 9382jg ; n/a ; NUN 9383jg ; n/a ; PE 9384jg ; n/a ; QAPH 9385jg ; n/a ; REH 9386jg ; n/a ; REVERSED_PE 9387jg ; n/a ; SADHE 9388jg ; n/a ; SEMKATH 9389jg ; n/a ; SHIN 9390jg ; n/a ; SWASH_KAF 9391jg ; n/a ; TAW 9392jg ; n/a ; TEH_MARBUTA 9393jg ; n/a ; TETH 9394jg ; n/a ; YEH 9395jg ; n/a ; YEH_BARREE 9396jg ; n/a ; YEH_WITH_TAIL 9397jg ; n/a ; YUDH 9398jg ; n/a ; YUDH_HE 9399jg ; n/a ; ZAIN 9400END 9401 } 9402 } 9403 9404 9405 if (-e 'EastAsianWidth.txt') { 9406 push @return, split /\n/, <<'END'; 9407ea ; A ; Ambiguous 9408ea ; F ; Fullwidth 9409ea ; H ; Halfwidth 9410ea ; N ; Neutral 9411ea ; Na ; Narrow 9412ea ; W ; Wide 9413END 9414 } 9415 9416 if (-e 'LineBreak.txt') { 9417 push @return, split /\n/, <<'END'; 9418lb ; AI ; Ambiguous 9419lb ; AL ; Alphabetic 9420lb ; B2 ; Break_Both 9421lb ; BA ; Break_After 9422lb ; BB ; Break_Before 9423lb ; BK ; Mandatory_Break 9424lb ; CB ; Contingent_Break 9425lb ; CL ; Close_Punctuation 9426lb ; CM ; Combining_Mark 9427lb ; CR ; Carriage_Return 9428lb ; EX ; Exclamation 9429lb ; GL ; Glue 9430lb ; HY ; Hyphen 9431lb ; ID ; Ideographic 9432lb ; IN ; Inseperable 9433lb ; IS ; Infix_Numeric 9434lb ; LF ; Line_Feed 9435lb ; NS ; Nonstarter 9436lb ; NU ; Numeric 9437lb ; OP ; Open_Punctuation 9438lb ; PO ; Postfix_Numeric 9439lb ; PR ; Prefix_Numeric 9440lb ; QU ; Quotation 9441lb ; SA ; Complex_Context 9442lb ; SG ; Surrogate 9443lb ; SP ; Space 9444lb ; SY ; Break_Symbols 9445lb ; XX ; Unknown 9446lb ; ZW ; ZWSpace 9447END 9448 } 9449 9450 if (-e 'DNormalizationProps.txt') { 9451 push @return, split /\n/, <<'END'; 9452qc ; M ; Maybe 9453qc ; N ; No 9454qc ; Y ; Yes 9455END 9456 } 9457 9458 if (-e 'Scripts.txt') { 9459 push @return, split /\n/, <<'END'; 9460sc ; Arab ; Arabic 9461sc ; Armn ; Armenian 9462sc ; Beng ; Bengali 9463sc ; Bopo ; Bopomofo 9464sc ; Cans ; Canadian_Aboriginal 9465sc ; Cher ; Cherokee 9466sc ; Cyrl ; Cyrillic 9467sc ; Deva ; Devanagari 9468sc ; Dsrt ; Deseret 9469sc ; Ethi ; Ethiopic 9470sc ; Geor ; Georgian 9471sc ; Goth ; Gothic 9472sc ; Grek ; Greek 9473sc ; Gujr ; Gujarati 9474sc ; Guru ; Gurmukhi 9475sc ; Hang ; Hangul 9476sc ; Hani ; Han 9477sc ; Hebr ; Hebrew 9478sc ; Hira ; Hiragana 9479sc ; Ital ; Old_Italic 9480sc ; Kana ; Katakana 9481sc ; Khmr ; Khmer 9482sc ; Knda ; Kannada 9483sc ; Laoo ; Lao 9484sc ; Latn ; Latin 9485sc ; Mlym ; Malayalam 9486sc ; Mong ; Mongolian 9487sc ; Mymr ; Myanmar 9488sc ; Ogam ; Ogham 9489sc ; Orya ; Oriya 9490sc ; Qaai ; Inherited 9491sc ; Runr ; Runic 9492sc ; Sinh ; Sinhala 9493sc ; Syrc ; Syriac 9494sc ; Taml ; Tamil 9495sc ; Telu ; Telugu 9496sc ; Thaa ; Thaana 9497sc ; Thai ; Thai 9498sc ; Tibt ; Tibetan 9499sc ; Yiii ; Yi 9500sc ; Zyyy ; Common 9501END 9502 } 9503 9504 if ($v_version ge v2.0.0) { 9505 push @return, split /\n/, <<'END'; 9506dt ; com ; compat 9507dt ; nar ; narrow 9508dt ; sml ; small 9509dt ; vert ; vertical 9510dt ; wide ; wide 9511 9512gc ; Cf ; Format 9513gc ; Cs ; Surrogate 9514gc ; Lt ; Titlecase_Letter 9515gc ; Me ; Enclosing_Mark 9516gc ; Nl ; Letter_Number 9517gc ; Pc ; Connector_Punctuation 9518gc ; Sk ; Modifier_Symbol 9519END 9520 } 9521 if ($v_version ge v2.1.2) { 9522 push @return, "bc ; S ; Segment_Separator\n"; 9523 } 9524 if ($v_version ge v2.1.5) { 9525 push @return, split /\n/, <<'END'; 9526gc ; Pf ; Final_Punctuation 9527gc ; Pi ; Initial_Punctuation 9528END 9529 } 9530 if ($v_version ge v2.1.8) { 9531 push @return, "ccc; 240; IS ; Iota_Subscript\n"; 9532 } 9533 9534 if ($v_version ge v3.0.0) { 9535 push @return, split /\n/, <<'END'; 9536bc ; AL ; Arabic_Letter 9537bc ; BN ; Boundary_Neutral 9538bc ; LRE ; Left_To_Right_Embedding 9539bc ; LRO ; Left_To_Right_Override 9540bc ; NSM ; Nonspacing_Mark 9541bc ; PDF ; Pop_Directional_Format 9542bc ; RLE ; Right_To_Left_Embedding 9543bc ; RLO ; Right_To_Left_Override 9544 9545ccc; 233; DB ; Double_Below 9546END 9547 } 9548 9549 if ($v_version ge v3.1.0) { 9550 push @return, "ccc; 226; R ; Right\n"; 9551 } 9552 9553 return @return; 9554} 9555 9556sub output_perl_charnames_line ($$) { 9557 9558 # Output the entries in Perl_charnames specially, using 5 digits instead 9559 # of four. This makes the entries a constant length, and simplifies 9560 # charnames.pm which this table is for. Unicode can have 6 digit 9561 # ordinals, but they are all private use or noncharacters which do not 9562 # have names, so won't be in this table. 9563 9564 return sprintf "%05X\t%s\n", $_[0], $_[1]; 9565} 9566 9567{ # Closure 9568 # This is used to store the range list of all the code points usable when 9569 # the little used $compare_versions feature is enabled. 9570 my $compare_versions_range_list; 9571 9572 # These are constants to the $property_info hash in this subroutine, to 9573 # avoid using a quoted-string which might have a typo. 9574 my $TYPE = 'type'; 9575 my $DEFAULT_MAP = 'default_map'; 9576 my $DEFAULT_TABLE = 'default_table'; 9577 my $PSEUDO_MAP_TYPE = 'pseudo_map_type'; 9578 my $MISSINGS = 'missings'; 9579 9580 sub process_generic_property_file { 9581 # This processes a file containing property mappings and puts them 9582 # into internal map tables. It should be used to handle any property 9583 # files that have mappings from a code point or range thereof to 9584 # something else. This means almost all the UCD .txt files. 9585 # each_line_handlers() should be set to adjust the lines of these 9586 # files, if necessary, to what this routine understands: 9587 # 9588 # 0374 ; NFD_QC; N 9589 # 003C..003E ; Math 9590 # 9591 # the fields are: "codepoint-range ; property; map" 9592 # 9593 # meaning the codepoints in the range all have the value 'map' under 9594 # 'property'. 9595 # Beginning and trailing white space in each field are not significant. 9596 # Note there is not a trailing semi-colon in the above. A trailing 9597 # semi-colon means the map is a null-string. An omitted map, as 9598 # opposed to a null-string, is assumed to be 'Y', based on Unicode 9599 # table syntax. (This could have been hidden from this routine by 9600 # doing it in the $file object, but that would require parsing of the 9601 # line there, so would have to parse it twice, or change the interface 9602 # to pass this an array. So not done.) 9603 # 9604 # The map field may begin with a sequence of commands that apply to 9605 # this range. Each such command begins and ends with $CMD_DELIM. 9606 # These are used to indicate, for example, that the mapping for a 9607 # range has a non-default type. 9608 # 9609 # This loops through the file, calling it's next_line() method, and 9610 # then taking the map and adding it to the property's table. 9611 # Complications arise because any number of properties can be in the 9612 # file, in any order, interspersed in any way. The first time a 9613 # property is seen, it gets information about that property and 9614 # caches it for quick retrieval later. It also normalizes the maps 9615 # so that only one of many synonyms is stored. The Unicode input 9616 # files do use some multiple synonyms. 9617 9618 my $file = shift; 9619 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 9620 9621 my %property_info; # To keep track of what properties 9622 # have already had entries in the 9623 # current file, and info about each, 9624 # so don't have to recompute. 9625 my $property_name; # property currently being worked on 9626 my $property_type; # and its type 9627 my $previous_property_name = ""; # name from last time through loop 9628 my $property_object; # pointer to the current property's 9629 # object 9630 my $property_addr; # the address of that object 9631 my $default_map; # the string that code points missing 9632 # from the file map to 9633 my $default_table; # For non-string properties, a 9634 # reference to the match table that 9635 # will contain the list of code 9636 # points that map to $default_map. 9637 9638 # Get the next real non-comment line 9639 LINE: 9640 while ($file->next_line) { 9641 9642 # Default replacement type; means that if parts of the range have 9643 # already been stored in our tables, the new map overrides them if 9644 # they differ more than cosmetically 9645 my $replace = $IF_NOT_EQUIVALENT; 9646 my $map_type; # Default type for the map of this range 9647 9648 #local $to_trace = 1 if main::DEBUG; 9649 trace $_ if main::DEBUG && $to_trace; 9650 9651 # Split the line into components 9652 my ($range, $property_name, $map, @remainder) 9653 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 9654 9655 # If more or less on the line than we are expecting, warn and skip 9656 # the line 9657 if (@remainder) { 9658 $file->carp_bad_line('Extra fields'); 9659 next LINE; 9660 } 9661 elsif ( ! defined $property_name) { 9662 $file->carp_bad_line('Missing property'); 9663 next LINE; 9664 } 9665 9666 # Examine the range. 9667 if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x) 9668 { 9669 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)"); 9670 next LINE; 9671 } 9672 my $low = hex $1; 9673 my $high = (defined $2) ? hex $2 : $low; 9674 9675 # For the very specialized case of comparing two Unicode 9676 # versions... 9677 if (DEBUG && $compare_versions) { 9678 if ($property_name eq 'Age') { 9679 9680 # Only allow code points at least as old as the version 9681 # specified. 9682 my $age = pack "C*", split(/\./, $map); # v string 9683 next LINE if $age gt $compare_versions; 9684 } 9685 else { 9686 9687 # Again, we throw out code points younger than those of 9688 # the specified version. By now, the Age property is 9689 # populated. We use the intersection of each input range 9690 # with this property to find what code points in it are 9691 # valid. To do the intersection, we have to convert the 9692 # Age property map to a Range_list. We only have to do 9693 # this once. 9694 if (! defined $compare_versions_range_list) { 9695 my $age = property_ref('Age'); 9696 if (! -e 'DAge.txt') { 9697 croak "Need to have 'DAge.txt' file to do version comparison"; 9698 } 9699 elsif ($age->count == 0) { 9700 croak "The 'Age' table is empty, but its file exists"; 9701 } 9702 $compare_versions_range_list 9703 = Range_List->new(Initialize => $age); 9704 } 9705 9706 # An undefined map is always 'Y' 9707 $map = 'Y' if ! defined $map; 9708 9709 # Calculate the intersection of the input range with the 9710 # code points that are known in the specified version 9711 my @ranges = ($compare_versions_range_list 9712 & Range->new($low, $high))->ranges; 9713 9714 # If the intersection is empty, throw away this range 9715 next LINE unless @ranges; 9716 9717 # Only examine the first range this time through the loop. 9718 my $this_range = shift @ranges; 9719 9720 # Put any remaining ranges in the queue to be processed 9721 # later. Note that there is unnecessary work here, as we 9722 # will do the intersection again for each of these ranges 9723 # during some future iteration of the LINE loop, but this 9724 # code is not used in production. The later intersections 9725 # are guaranteed to not splinter, so this will not become 9726 # an infinite loop. 9727 my $line = join ';', $property_name, $map; 9728 foreach my $range (@ranges) { 9729 $file->insert_adjusted_lines(sprintf("%04X..%04X; %s", 9730 $range->start, 9731 $range->end, 9732 $line)); 9733 } 9734 9735 # And process the first range, like any other. 9736 $low = $this_range->start; 9737 $high = $this_range->end; 9738 } 9739 } # End of $compare_versions 9740 9741 # If changing to a new property, get the things constant per 9742 # property 9743 if ($previous_property_name ne $property_name) { 9744 9745 $property_object = property_ref($property_name); 9746 if (! defined $property_object) { 9747 $file->carp_bad_line("Unexpected property '$property_name'. Skipped"); 9748 next LINE; 9749 } 9750 { no overloading; $property_addr = pack 'J', $property_object; } 9751 9752 # Defer changing names until have a line that is acceptable 9753 # (the 'next' statement above means is unacceptable) 9754 $previous_property_name = $property_name; 9755 9756 # If not the first time for this property, retrieve info about 9757 # it from the cache 9758 if (defined ($property_info{$property_addr}{$TYPE})) { 9759 $property_type = $property_info{$property_addr}{$TYPE}; 9760 $default_map = $property_info{$property_addr}{$DEFAULT_MAP}; 9761 $map_type 9762 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE}; 9763 $default_table 9764 = $property_info{$property_addr}{$DEFAULT_TABLE}; 9765 } 9766 else { 9767 9768 # Here, is the first time for this property. Set up the 9769 # cache. 9770 $property_type = $property_info{$property_addr}{$TYPE} 9771 = $property_object->type; 9772 $map_type 9773 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE} 9774 = $property_object->pseudo_map_type; 9775 9776 # The Unicode files are set up so that if the map is not 9777 # defined, it is a binary property 9778 if (! defined $map && $property_type != $BINARY) { 9779 if ($property_type != $UNKNOWN 9780 && $property_type != $NON_STRING) 9781 { 9782 $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map"); 9783 } 9784 else { 9785 $property_object->set_type($BINARY); 9786 $property_type 9787 = $property_info{$property_addr}{$TYPE} 9788 = $BINARY; 9789 } 9790 } 9791 9792 # Get any @missings default for this property. This 9793 # should precede the first entry for the property in the 9794 # input file, and is located in a comment that has been 9795 # stored by the Input_file class until we access it here. 9796 # It's possible that there is more than one such line 9797 # waiting for us; collect them all, and parse 9798 my @missings_list = $file->get_missings 9799 if $file->has_missings_defaults; 9800 foreach my $default_ref (@missings_list) { 9801 my $default = $default_ref->[0]; 9802 my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); }; 9803 9804 # For string properties, the default is just what the 9805 # file says, but non-string properties should already 9806 # have set up a table for the default property value; 9807 # use the table for these, so can resolve synonyms 9808 # later to a single standard one. 9809 if ($property_type == $STRING 9810 || $property_type == $UNKNOWN) 9811 { 9812 $property_info{$addr}{$MISSINGS} = $default; 9813 } 9814 else { 9815 $property_info{$addr}{$MISSINGS} 9816 = $property_object->table($default); 9817 } 9818 } 9819 9820 # Finished storing all the @missings defaults in the input 9821 # file so far. Get the one for the current property. 9822 my $missings = $property_info{$property_addr}{$MISSINGS}; 9823 9824 # But we likely have separately stored what the default 9825 # should be. (This is to accommodate versions of the 9826 # standard where the @missings lines are absent or 9827 # incomplete.) Hopefully the two will match. But check 9828 # it out. 9829 $default_map = $property_object->default_map; 9830 9831 # If the map is a ref, it means that the default won't be 9832 # processed until later, so undef it, so next few lines 9833 # will redefine it to something that nothing will match 9834 undef $default_map if ref $default_map; 9835 9836 # Create a $default_map if don't have one; maybe a dummy 9837 # that won't match anything. 9838 if (! defined $default_map) { 9839 9840 # Use any @missings line in the file. 9841 if (defined $missings) { 9842 if (ref $missings) { 9843 $default_map = $missings->full_name; 9844 $default_table = $missings; 9845 } 9846 else { 9847 $default_map = $missings; 9848 } 9849 9850 # And store it with the property for outside use. 9851 $property_object->set_default_map($default_map); 9852 } 9853 else { 9854 9855 # Neither an @missings nor a default map. Create 9856 # a dummy one, so won't have to test definedness 9857 # in the main loop. 9858 $default_map = '_Perl This will never be in a file 9859 from Unicode'; 9860 } 9861 } 9862 9863 # Here, we have $default_map defined, possibly in terms of 9864 # $missings, but maybe not, and possibly is a dummy one. 9865 if (defined $missings) { 9866 9867 # Make sure there is no conflict between the two. 9868 # $missings has priority. 9869 if (ref $missings) { 9870 $default_table 9871 = $property_object->table($default_map); 9872 if (! defined $default_table 9873 || $default_table != $missings) 9874 { 9875 if (! defined $default_table) { 9876 $default_table = $UNDEF; 9877 } 9878 $file->carp_bad_line(<<END 9879The \@missings line for $property_name in $file says that missings default to 9880$missings, but we expect it to be $default_table. $missings used. 9881END 9882 ); 9883 $default_table = $missings; 9884 $default_map = $missings->full_name; 9885 } 9886 $property_info{$property_addr}{$DEFAULT_TABLE} 9887 = $default_table; 9888 } 9889 elsif ($default_map ne $missings) { 9890 $file->carp_bad_line(<<END 9891The \@missings line for $property_name in $file says that missings default to 9892$missings, but we expect it to be $default_map. $missings used. 9893END 9894 ); 9895 $default_map = $missings; 9896 } 9897 } 9898 9899 $property_info{$property_addr}{$DEFAULT_MAP} 9900 = $default_map; 9901 9902 # If haven't done so already, find the table corresponding 9903 # to this map for non-string properties. 9904 if (! defined $default_table 9905 && $property_type != $STRING 9906 && $property_type != $UNKNOWN) 9907 { 9908 $default_table = $property_info{$property_addr} 9909 {$DEFAULT_TABLE} 9910 = $property_object->table($default_map); 9911 } 9912 } # End of is first time for this property 9913 } # End of switching properties. 9914 9915 # Ready to process the line. 9916 # The Unicode files are set up so that if the map is not defined, 9917 # it is a binary property with value 'Y' 9918 if (! defined $map) { 9919 $map = 'Y'; 9920 } 9921 else { 9922 9923 # If the map begins with a special command to us (enclosed in 9924 # delimiters), extract the command(s). 9925 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) { 9926 my $command = $1; 9927 if ($command =~ / ^ $REPLACE_CMD= (.*) /x) { 9928 $replace = $1; 9929 } 9930 elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) { 9931 $map_type = $1; 9932 } 9933 else { 9934 $file->carp_bad_line("Unknown command line: '$1'"); 9935 next LINE; 9936 } 9937 } 9938 } 9939 9940 if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x) 9941 { 9942 9943 # Here, we have a map to a particular code point, and the 9944 # default map is to a code point itself. If the range 9945 # includes the particular code point, change that portion of 9946 # the range to the default. This makes sure that in the final 9947 # table only the non-defaults are listed. 9948 my $decimal_map = hex $map; 9949 if ($low <= $decimal_map && $decimal_map <= $high) { 9950 9951 # If the range includes stuff before or after the map 9952 # we're changing, split it and process the split-off parts 9953 # later. 9954 if ($low < $decimal_map) { 9955 $file->insert_adjusted_lines( 9956 sprintf("%04X..%04X; %s; %s", 9957 $low, 9958 $decimal_map - 1, 9959 $property_name, 9960 $map)); 9961 } 9962 if ($high > $decimal_map) { 9963 $file->insert_adjusted_lines( 9964 sprintf("%04X..%04X; %s; %s", 9965 $decimal_map + 1, 9966 $high, 9967 $property_name, 9968 $map)); 9969 } 9970 $low = $high = $decimal_map; 9971 $map = $CODE_POINT; 9972 } 9973 } 9974 9975 # If we can tell that this is a synonym for the default map, use 9976 # the default one instead. 9977 if ($property_type != $STRING 9978 && $property_type != $UNKNOWN) 9979 { 9980 my $table = $property_object->table($map); 9981 if (defined $table && $table == $default_table) { 9982 $map = $default_map; 9983 } 9984 } 9985 9986 # And figure out the map type if not known. 9987 if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) { 9988 if ($map eq "") { # Nulls are always $NULL map type 9989 $map_type = $NULL; 9990 } # Otherwise, non-strings, and those that don't allow 9991 # $MULTI_CP, and those that aren't multiple code points are 9992 # 0 9993 elsif 9994 (($property_type != $STRING && $property_type != $UNKNOWN) 9995 || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP) 9996 || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x) 9997 { 9998 $map_type = 0; 9999 } 10000 else { 10001 $map_type = $MULTI_CP; 10002 } 10003 } 10004 10005 $property_object->add_map($low, $high, 10006 $map, 10007 Type => $map_type, 10008 Replace => $replace); 10009 } # End of loop through file's lines 10010 10011 return; 10012 } 10013} 10014 10015{ # Closure for UnicodeData.txt handling 10016 10017 # This file was the first one in the UCD; its design leads to some 10018 # awkwardness in processing. Here is a sample line: 10019 # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061; 10020 # The fields in order are: 10021 my $i = 0; # The code point is in field 0, and is shifted off. 10022 my $CHARNAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A") 10023 my $CATEGORY = $i++; # category (e.g. "Lu") 10024 my $CCC = $i++; # Canonical combining class (e.g. "230") 10025 my $BIDI = $i++; # directional class (e.g. "L") 10026 my $PERL_DECOMPOSITION = $i++; # decomposition mapping 10027 my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value 10028 my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript 10029 # Dual-use in this program; see below 10030 my $NUMERIC = $i++; # numeric value 10031 my $MIRRORED = $i++; # ? mirrored 10032 my $UNICODE_1_NAME = $i++; # name in Unicode 1.0 10033 my $COMMENT = $i++; # iso comment 10034 my $UPPER = $i++; # simple uppercase mapping 10035 my $LOWER = $i++; # simple lowercase mapping 10036 my $TITLE = $i++; # simple titlecase mapping 10037 my $input_field_count = $i; 10038 10039 # This routine in addition outputs these extra fields: 10040 10041 my $DECOMP_TYPE = $i++; # Decomposition type 10042 10043 # These fields are modifications of ones above, and are usually 10044 # suppressed; they must come last, as for speed, the loop upper bound is 10045 # normally set to ignore them 10046 my $NAME = $i++; # This is the strict name field, not the one that 10047 # charnames uses. 10048 my $DECOMP_MAP = $i++; # Strict decomposition mapping; not the one used 10049 # by Unicode::Normalize 10050 my $last_field = $i - 1; 10051 10052 # All these are read into an array for each line, with the indices defined 10053 # above. The empty fields in the example line above indicate that the 10054 # value is defaulted. The handler called for each line of the input 10055 # changes these to their defaults. 10056 10057 # Here are the official names of the properties, in a parallel array: 10058 my @field_names; 10059 $field_names[$BIDI] = 'Bidi_Class'; 10060 $field_names[$CATEGORY] = 'General_Category'; 10061 $field_names[$CCC] = 'Canonical_Combining_Class'; 10062 $field_names[$CHARNAME] = 'Perl_Charnames'; 10063 $field_names[$COMMENT] = 'ISO_Comment'; 10064 $field_names[$DECOMP_MAP] = 'Decomposition_Mapping'; 10065 $field_names[$DECOMP_TYPE] = 'Decomposition_Type'; 10066 $field_names[$LOWER] = 'Lowercase_Mapping'; 10067 $field_names[$MIRRORED] = 'Bidi_Mirrored'; 10068 $field_names[$NAME] = 'Name'; 10069 $field_names[$NUMERIC] = 'Numeric_Value'; 10070 $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type'; 10071 $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit'; 10072 $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping'; 10073 $field_names[$TITLE] = 'Titlecase_Mapping'; 10074 $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name'; 10075 $field_names[$UPPER] = 'Uppercase_Mapping'; 10076 10077 # Some of these need a little more explanation: 10078 # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode 10079 # property, but is used in calculating the Numeric_Type. Perl however, 10080 # creates a file from this field, so a Perl property is created from it. 10081 # Similarly, the Other_Digit field is used only for calculating the 10082 # Numeric_Type, and so it can be safely re-used as the place to store 10083 # the value for Numeric_Type; hence it is referred to as 10084 # $NUMERIC_TYPE_OTHER_DIGIT. 10085 # The input field named $PERL_DECOMPOSITION is a combination of both the 10086 # decomposition mapping and its type. Perl creates a file containing 10087 # exactly this field, so it is used for that. The two properties are 10088 # separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE. 10089 # $DECOMP_MAP is usually suppressed (unless the lists are changed to 10090 # output it), as Perl doesn't use it directly. 10091 # The input field named here $CHARNAME is used to construct the 10092 # Perl_Charnames property, which is a combination of the Name property 10093 # (which the input field contains), and the Unicode_1_Name property, and 10094 # others from other files. Since, the strict Name property is not used 10095 # by Perl, this field is used for the table that Perl does use. The 10096 # strict Name property table is usually suppressed (unless the lists are 10097 # changed to output it), so it is accumulated in a separate field, 10098 # $NAME, which to save time is discarded unless the table is actually to 10099 # be output 10100 10101 # This file is processed like most in this program. Control is passed to 10102 # process_generic_property_file() which calls filter_UnicodeData_line() 10103 # for each input line. This filter converts the input into line(s) that 10104 # process_generic_property_file() understands. There is also a setup 10105 # routine called before any of the file is processed, and a handler for 10106 # EOF processing, all in this closure. 10107 10108 # A huge speed-up occurred at the cost of some added complexity when these 10109 # routines were altered to buffer the outputs into ranges. Almost all the 10110 # lines of the input file apply to just one code point, and for most 10111 # properties, the map for the next code point up is the same as the 10112 # current one. So instead of creating a line for each property for each 10113 # input line, filter_UnicodeData_line() remembers what the previous map 10114 # of a property was, and doesn't generate a line to pass on until it has 10115 # to, as when the map changes; and that passed-on line encompasses the 10116 # whole contiguous range of code points that have the same map for that 10117 # property. This means a slight amount of extra setup, and having to 10118 # flush these buffers on EOF, testing if the maps have changed, plus 10119 # remembering state information in the closure. But it means a lot less 10120 # real time in not having to change the data base for each property on 10121 # each line. 10122 10123 # Another complication is that there are already a few ranges designated 10124 # in the input. There are two lines for each, with the same maps except 10125 # the code point and name on each line. This was actually the hardest 10126 # thing to design around. The code points in those ranges may actually 10127 # have real maps not given by these two lines. These maps will either 10128 # be algorithmically determinable, or be in the extracted files furnished 10129 # with the UCD. In the event of conflicts between these extracted files, 10130 # and this one, Unicode says that this one prevails. But it shouldn't 10131 # prevail for conflicts that occur in these ranges. The data from the 10132 # extracted files prevails in those cases. So, this program is structured 10133 # so that those files are processed first, storing maps. Then the other 10134 # files are processed, generally overwriting what the extracted files 10135 # stored. But just the range lines in this input file are processed 10136 # without overwriting. This is accomplished by adding a special string to 10137 # the lines output to tell process_generic_property_file() to turn off the 10138 # overwriting for just this one line. 10139 # A similar mechanism is used to tell it that the map is of a non-default 10140 # type. 10141 10142 sub setup_UnicodeData { # Called before any lines of the input are read 10143 my $file = shift; 10144 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 10145 10146 # Create a new property specially located that is a combination of the 10147 # various Name properties: Name, Unicode_1_Name, Named Sequences, and 10148 # Name_Alias properties. (The final duplicates elements of the 10149 # first.) A comment for it will later be constructed based on the 10150 # actual properties present and used 10151 $perl_charname = Property->new('Perl_Charnames', 10152 Default_Map => "", 10153 Directory => File::Spec->curdir(), 10154 File => 'Name', 10155 Fate => $INTERNAL_ONLY, 10156 Perl_Extension => 1, 10157 Range_Size_1 => \&output_perl_charnames_line, 10158 Type => $STRING, 10159 ); 10160 $perl_charname->set_proxy_for('Name'); 10161 10162 my $Perl_decomp = Property->new('Perl_Decomposition_Mapping', 10163 Directory => File::Spec->curdir(), 10164 File => 'Decomposition', 10165 Format => $DECOMP_STRING_FORMAT, 10166 Fate => $INTERNAL_ONLY, 10167 Perl_Extension => 1, 10168 Default_Map => $CODE_POINT, 10169 10170 # normalize.pm can't cope with these 10171 Output_Range_Counts => 0, 10172 10173 # This is a specially formatted table 10174 # explicitly for normalize.pm, which 10175 # is expecting a particular format, 10176 # which means that mappings containing 10177 # multiple code points are in the main 10178 # body of the table 10179 Map_Type => $COMPUTE_NO_MULTI_CP, 10180 Type => $STRING, 10181 To_Output_Map => $INTERNAL_MAP, 10182 ); 10183 $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type'); 10184 $Perl_decomp->add_comment(join_lines(<<END 10185This mapping is a combination of the Unicode 'Decomposition_Type' and 10186'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is 10187identical to the official Unicode 'Decomposition_Mapping' property except for 10188two things: 10189 1) It omits the algorithmically determinable Hangul syllable decompositions, 10190which normalize.pm handles algorithmically. 10191 2) It contains the decomposition type as well. Non-canonical decompositions 10192begin with a word in angle brackets, like <super>, which denotes the 10193compatible decomposition type. If the map does not begin with the <angle 10194brackets>, the decomposition is canonical. 10195END 10196 )); 10197 10198 my $Decimal_Digit = Property->new("Perl_Decimal_Digit", 10199 Default_Map => "", 10200 Perl_Extension => 1, 10201 Directory => $map_directory, 10202 Type => $STRING, 10203 To_Output_Map => $OUTPUT_ADJUSTED, 10204 ); 10205 $Decimal_Digit->add_comment(join_lines(<<END 10206This file gives the mapping of all code points which represent a single 10207decimal digit [0-9] to their respective digits, but it has ranges of 10 code 10208points, and the mapping of each non-initial element of each range is actually 10209not to "0", but to the offset that element has from its corresponding DIGIT 0. 10210These code points are those that have Numeric_Type=Decimal; not special 10211things, like subscripts nor Roman numerals. 10212END 10213 )); 10214 10215 # These properties are not used for generating anything else, and are 10216 # usually not output. By making them last in the list, we can just 10217 # change the high end of the loop downwards to avoid the work of 10218 # generating a table(s) that is/are just going to get thrown away. 10219 if (! property_ref('Decomposition_Mapping')->to_output_map 10220 && ! property_ref('Name')->to_output_map) 10221 { 10222 $last_field = min($NAME, $DECOMP_MAP) - 1; 10223 } elsif (property_ref('Decomposition_Mapping')->to_output_map) { 10224 $last_field = $DECOMP_MAP; 10225 } elsif (property_ref('Name')->to_output_map) { 10226 $last_field = $NAME; 10227 } 10228 return; 10229 } 10230 10231 my $first_time = 1; # ? Is this the first line of the file 10232 my $in_range = 0; # ? Are we in one of the file's ranges 10233 my $previous_cp; # hex code point of previous line 10234 my $decimal_previous_cp = -1; # And its decimal equivalent 10235 my @start; # For each field, the current starting 10236 # code point in hex for the range 10237 # being accumulated. 10238 my @fields; # The input fields; 10239 my @previous_fields; # And those from the previous call 10240 10241 sub filter_UnicodeData_line { 10242 # Handle a single input line from UnicodeData.txt; see comments above 10243 # Conceptually this takes a single line from the file containing N 10244 # properties, and converts it into N lines with one property per line, 10245 # which is what the final handler expects. But there are 10246 # complications due to the quirkiness of the input file, and to save 10247 # time, it accumulates ranges where the property values don't change 10248 # and only emits lines when necessary. This is about an order of 10249 # magnitude fewer lines emitted. 10250 10251 my $file = shift; 10252 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 10253 10254 # $_ contains the input line. 10255 # -1 in split means retain trailing null fields 10256 (my $cp, @fields) = split /\s*;\s*/, $_, -1; 10257 10258 #local $to_trace = 1 if main::DEBUG; 10259 trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace; 10260 if (@fields > $input_field_count) { 10261 $file->carp_bad_line('Extra fields'); 10262 $_ = ""; 10263 return; 10264 } 10265 10266 my $decimal_cp = hex $cp; 10267 10268 # We have to output all the buffered ranges when the next code point 10269 # is not exactly one after the previous one, which means there is a 10270 # gap in the ranges. 10271 my $force_output = ($decimal_cp != $decimal_previous_cp + 1); 10272 10273 # The decomposition mapping field requires special handling. It looks 10274 # like either: 10275 # 10276 # <compat> 0032 0020 10277 # 0041 0300 10278 # 10279 # The decomposition type is enclosed in <brackets>; if missing, it 10280 # means the type is canonical. There are two decomposition mapping 10281 # tables: the one for use by Perl's normalize.pm has a special format 10282 # which is this field intact; the other, for general use is of 10283 # standard format. In either case we have to find the decomposition 10284 # type. Empty fields have None as their type, and map to the code 10285 # point itself 10286 if ($fields[$PERL_DECOMPOSITION] eq "") { 10287 $fields[$DECOMP_TYPE] = 'None'; 10288 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT; 10289 } 10290 else { 10291 ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION] 10292 =~ / < ( .+? ) > \s* ( .+ ) /x; 10293 if (! defined $fields[$DECOMP_TYPE]) { 10294 $fields[$DECOMP_TYPE] = 'Canonical'; 10295 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION]; 10296 } 10297 else { 10298 $fields[$DECOMP_MAP] = $map; 10299 } 10300 } 10301 10302 # The 3 numeric fields also require special handling. The 2 digit 10303 # fields must be either empty or match the number field. This means 10304 # that if it is empty, they must be as well, and the numeric type is 10305 # None, and the numeric value is 'Nan'. 10306 # The decimal digit field must be empty or match the other digit 10307 # field. If the decimal digit field is non-empty, the code point is 10308 # a decimal digit, and the other two fields will have the same value. 10309 # If it is empty, but the other digit field is non-empty, the code 10310 # point is an 'other digit', and the number field will have the same 10311 # value as the other digit field. If the other digit field is empty, 10312 # but the number field is non-empty, the code point is a generic 10313 # numeric type. 10314 if ($fields[$NUMERIC] eq "") { 10315 if ($fields[$PERL_DECIMAL_DIGIT] ne "" 10316 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "" 10317 ) { 10318 $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway"); 10319 } 10320 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None'; 10321 $fields[$NUMERIC] = 'NaN'; 10322 } 10323 else { 10324 $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x; 10325 if ($fields[$PERL_DECIMAL_DIGIT] ne "") { 10326 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC]; 10327 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal'; 10328 } 10329 elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") { 10330 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC]; 10331 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit'; 10332 } 10333 else { 10334 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric'; 10335 10336 # Rationals require extra effort. 10337 register_fraction($fields[$NUMERIC]) 10338 if $fields[$NUMERIC] =~ qr{/}; 10339 } 10340 } 10341 10342 # For the properties that have empty fields in the file, and which 10343 # mean something different from empty, change them to that default. 10344 # Certain fields just haven't been empty so far in any Unicode 10345 # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC, 10346 # $CATEGORY. This leaves just the two fields, and so we hard-code in 10347 # the defaults; which are very unlikely to ever change. 10348 $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq ""; 10349 $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq ""; 10350 10351 # UAX44 says that if title is empty, it is the same as whatever upper 10352 # is, 10353 $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq ""; 10354 10355 # There are a few pairs of lines like: 10356 # AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; 10357 # D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; 10358 # that define ranges. These should be processed after the fields are 10359 # adjusted above, as they may override some of them; but mostly what 10360 # is left is to possibly adjust the $CHARNAME field. The names of all the 10361 # paired lines start with a '<', but this is also true of '<control>, 10362 # which isn't one of these special ones. 10363 if ($fields[$CHARNAME] eq '<control>') { 10364 10365 # Some code points in this file have the pseudo-name 10366 # '<control>', but the official name for such ones is the null 10367 # string. 10368 $fields[$NAME] = $fields[$CHARNAME] = ""; 10369 10370 # We had better not be in between range lines. 10371 if ($in_range) { 10372 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway"); 10373 $in_range = 0; 10374 } 10375 } 10376 elsif (substr($fields[$CHARNAME], 0, 1) ne '<') { 10377 10378 # Here is a non-range line. We had better not be in between range 10379 # lines. 10380 if ($in_range) { 10381 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway"); 10382 $in_range = 0; 10383 } 10384 if ($fields[$CHARNAME] =~ s/- $cp $//x) { 10385 10386 # These are code points whose names end in their code points, 10387 # which means the names are algorithmically derivable from the 10388 # code points. To shorten the output Name file, the algorithm 10389 # for deriving these is placed in the file instead of each 10390 # code point, so they have map type $CP_IN_NAME 10391 $fields[$CHARNAME] = $CMD_DELIM 10392 . $MAP_TYPE_CMD 10393 . '=' 10394 . $CP_IN_NAME 10395 . $CMD_DELIM 10396 . $fields[$CHARNAME]; 10397 } 10398 $fields[$NAME] = $fields[$CHARNAME]; 10399 } 10400 elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) { 10401 $fields[$CHARNAME] = $fields[$NAME] = $1; 10402 10403 # Here we are at the beginning of a range pair. 10404 if ($in_range) { 10405 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'. Trying anyway"); 10406 } 10407 $in_range = 1; 10408 10409 # Because the properties in the range do not overwrite any already 10410 # in the db, we must flush the buffers of what's already there, so 10411 # they get handled in the normal scheme. 10412 $force_output = 1; 10413 10414 } 10415 elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) { 10416 $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME]. Ignoring this line."); 10417 $_ = ""; 10418 return; 10419 } 10420 else { # Here, we are at the last line of a range pair. 10421 10422 if (! $in_range) { 10423 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one. Ignoring this line."); 10424 $_ = ""; 10425 return; 10426 } 10427 $in_range = 0; 10428 10429 $fields[$NAME] = $fields[$CHARNAME]; 10430 10431 # Check that the input is valid: that the closing of the range is 10432 # the same as the beginning. 10433 foreach my $i (0 .. $last_field) { 10434 next if $fields[$i] eq $previous_fields[$i]; 10435 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway"); 10436 } 10437 10438 # The processing differs depending on the type of range, 10439 # determined by its $CHARNAME 10440 if ($fields[$CHARNAME] =~ /^Hangul Syllable/) { 10441 10442 # Check that the data looks right. 10443 if ($decimal_previous_cp != $SBase) { 10444 $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong"); 10445 } 10446 if ($decimal_cp != $SBase + $SCount - 1) { 10447 $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong"); 10448 } 10449 10450 # The Hangul syllable range has a somewhat complicated name 10451 # generation algorithm. Each code point in it has a canonical 10452 # decomposition also computable by an algorithm. The 10453 # perl decomposition map table built from these is used only 10454 # by normalize.pm, which has the algorithm built in it, so the 10455 # decomposition maps are not needed, and are large, so are 10456 # omitted from it. If the full decomposition map table is to 10457 # be output, the decompositions are generated for it, in the 10458 # EOF handling code for this input file. 10459 10460 $previous_fields[$DECOMP_TYPE] = 'Canonical'; 10461 10462 # This range is stored in our internal structure with its 10463 # own map type, different from all others. 10464 $previous_fields[$CHARNAME] = $previous_fields[$NAME] 10465 = $CMD_DELIM 10466 . $MAP_TYPE_CMD 10467 . '=' 10468 . $HANGUL_SYLLABLE 10469 . $CMD_DELIM 10470 . $fields[$CHARNAME]; 10471 } 10472 elsif ($fields[$CHARNAME] =~ /^CJK/) { 10473 10474 # The name for these contains the code point itself, and all 10475 # are defined to have the same base name, regardless of what 10476 # is in the file. They are stored in our internal structure 10477 # with a map type of $CP_IN_NAME 10478 $previous_fields[$CHARNAME] = $previous_fields[$NAME] 10479 = $CMD_DELIM 10480 . $MAP_TYPE_CMD 10481 . '=' 10482 . $CP_IN_NAME 10483 . $CMD_DELIM 10484 . 'CJK UNIFIED IDEOGRAPH'; 10485 10486 } 10487 elsif ($fields[$CATEGORY] eq 'Co' 10488 || $fields[$CATEGORY] eq 'Cs') 10489 { 10490 # The names of all the code points in these ranges are set to 10491 # null, as there are no names for the private use and 10492 # surrogate code points. 10493 10494 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = ""; 10495 } 10496 else { 10497 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY]. Attempting to process it."); 10498 } 10499 10500 # The first line of the range caused everything else to be output, 10501 # and then its values were stored as the beginning values for the 10502 # next set of ranges, which this one ends. Now, for each value, 10503 # add a command to tell the handler that these values should not 10504 # replace any existing ones in our database. 10505 foreach my $i (0 .. $last_field) { 10506 $previous_fields[$i] = $CMD_DELIM 10507 . $REPLACE_CMD 10508 . '=' 10509 . $NO 10510 . $CMD_DELIM 10511 . $previous_fields[$i]; 10512 } 10513 10514 # And change things so it looks like the entire range has been 10515 # gone through with this being the final part of it. Adding the 10516 # command above to each field will cause this range to be flushed 10517 # during the next iteration, as it guaranteed that the stored 10518 # field won't match whatever value the next one has. 10519 $previous_cp = $cp; 10520 $decimal_previous_cp = $decimal_cp; 10521 10522 # We are now set up for the next iteration; so skip the remaining 10523 # code in this subroutine that does the same thing, but doesn't 10524 # know about these ranges. 10525 $_ = ""; 10526 10527 return; 10528 } 10529 10530 # On the very first line, we fake it so the code below thinks there is 10531 # nothing to output, and initialize so that when it does get output it 10532 # uses the first line's values for the lowest part of the range. 10533 # (One could avoid this by using peek(), but then one would need to 10534 # know the adjustments done above and do the same ones in the setup 10535 # routine; not worth it) 10536 if ($first_time) { 10537 $first_time = 0; 10538 @previous_fields = @fields; 10539 @start = ($cp) x scalar @fields; 10540 $decimal_previous_cp = $decimal_cp - 1; 10541 } 10542 10543 # For each field, output the stored up ranges that this code point 10544 # doesn't fit in. Earlier we figured out if all ranges should be 10545 # terminated because of changing the replace or map type styles, or if 10546 # there is a gap between this new code point and the previous one, and 10547 # that is stored in $force_output. But even if those aren't true, we 10548 # need to output the range if this new code point's value for the 10549 # given property doesn't match the stored range's. 10550 #local $to_trace = 1 if main::DEBUG; 10551 foreach my $i (0 .. $last_field) { 10552 my $field = $fields[$i]; 10553 if ($force_output || $field ne $previous_fields[$i]) { 10554 10555 # Flush the buffer of stored values. 10556 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]"); 10557 10558 # Start a new range with this code point and its value 10559 $start[$i] = $cp; 10560 $previous_fields[$i] = $field; 10561 } 10562 } 10563 10564 # Set the values for the next time. 10565 $previous_cp = $cp; 10566 $decimal_previous_cp = $decimal_cp; 10567 10568 # The input line has generated whatever adjusted lines are needed, and 10569 # should not be looked at further. 10570 $_ = ""; 10571 return; 10572 } 10573 10574 sub EOF_UnicodeData { 10575 # Called upon EOF to flush the buffers, and create the Hangul 10576 # decomposition mappings if needed. 10577 10578 my $file = shift; 10579 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 10580 10581 # Flush the buffers. 10582 foreach my $i (1 .. $last_field) { 10583 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]"); 10584 } 10585 10586 if (-e 'Jamo.txt') { 10587 10588 # The algorithm is published by Unicode, based on values in 10589 # Jamo.txt, (which should have been processed before this 10590 # subroutine), and the results left in %Jamo 10591 unless (%Jamo) { 10592 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated."); 10593 return; 10594 } 10595 10596 # If the full decomposition map table is being output, insert 10597 # into it the Hangul syllable mappings. This is to avoid having 10598 # to publish a subroutine in it to compute them. (which would 10599 # essentially be this code.) This uses the algorithm published by 10600 # Unicode. 10601 if (property_ref('Decomposition_Mapping')->to_output_map) { 10602 for (my $S = $SBase; $S < $SBase + $SCount; $S++) { 10603 use integer; 10604 my $SIndex = $S - $SBase; 10605 my $L = $LBase + $SIndex / $NCount; 10606 my $V = $VBase + ($SIndex % $NCount) / $TCount; 10607 my $T = $TBase + $SIndex % $TCount; 10608 10609 trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace; 10610 my $decomposition = sprintf("%04X %04X", $L, $V); 10611 $decomposition .= sprintf(" %04X", $T) if $T != $TBase; 10612 $file->insert_adjusted_lines( 10613 sprintf("%04X; Decomposition_Mapping; %s", 10614 $S, 10615 $decomposition)); 10616 } 10617 } 10618 } 10619 10620 return; 10621 } 10622 10623 sub filter_v1_ucd { 10624 # Fix UCD lines in version 1. This is probably overkill, but this 10625 # fixes some glaring errors in Version 1 UnicodeData.txt. That file: 10626 # 1) had many Hangul (U+3400 - U+4DFF) code points that were later 10627 # removed. This program retains them 10628 # 2) didn't include ranges, which it should have, and which are now 10629 # added in @corrected_lines below. It was hand populated by 10630 # taking the data from Version 2, verified by analyzing 10631 # DAge.txt. 10632 # 3) There is a syntax error in the entry for U+09F8 which could 10633 # cause problems for utf8_heavy, and so is changed. It's 10634 # numeric value was simply a minus sign, without any number. 10635 # (Eventually Unicode changed the code point to non-numeric.) 10636 # 4) The decomposition types often don't match later versions 10637 # exactly, and the whole syntax of that field is different; so 10638 # the syntax is changed as well as the types to their later 10639 # terminology. Otherwise normalize.pm would be very unhappy 10640 # 5) Many ccc classes are different. These are left intact. 10641 # 6) U+FF10 - U+FF19 are missing their numeric values in all three 10642 # fields. These are unchanged because it doesn't really cause 10643 # problems for Perl. 10644 # 7) A number of code points, such as controls, don't have their 10645 # Unicode Version 1 Names in this file. These are unchanged. 10646 10647 my @corrected_lines = split /\n/, <<'END'; 106484E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;; 106499FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; 10650E000;<Private Use, First>;Co;0;L;;;;;N;;;;; 10651F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;; 10652F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;; 10653FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;; 10654END 10655 10656 my $file = shift; 10657 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 10658 10659 #local $to_trace = 1 if main::DEBUG; 10660 trace $_ if main::DEBUG && $to_trace; 10661 10662 # -1 => retain trailing null fields 10663 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 10664 10665 # At the first place that is wrong in the input, insert all the 10666 # corrections, replacing the wrong line. 10667 if ($code_point eq '4E00') { 10668 my @copy = @corrected_lines; 10669 $_ = shift @copy; 10670 ($code_point, @fields) = split /\s*;\s*/, $_, -1; 10671 10672 $file->insert_lines(@copy); 10673 } 10674 10675 10676 if ($fields[$NUMERIC] eq '-') { 10677 $fields[$NUMERIC] = '-1'; # This is what 2.0 made it. 10678 } 10679 10680 if ($fields[$PERL_DECOMPOSITION] ne "") { 10681 10682 # Several entries have this change to superscript 2 or 3 in the 10683 # middle. Convert these to the modern version, which is to use 10684 # the actual U+00B2 and U+00B3 (the superscript forms) instead. 10685 # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes 10686 # 'HHHH HHHH 00B3 HHHH'. 10687 # It turns out that all of these that don't have another 10688 # decomposition defined at the beginning of the line have the 10689 # <square> decomposition in later releases. 10690 if ($code_point ne '00B2' && $code_point ne '00B3') { 10691 if ($fields[$PERL_DECOMPOSITION] 10692 =~ s/<\+sup> 003([23]) <-sup>/00B$1/) 10693 { 10694 if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') { 10695 $fields[$PERL_DECOMPOSITION] = '<square> ' 10696 . $fields[$PERL_DECOMPOSITION]; 10697 } 10698 } 10699 } 10700 10701 # If is like '<+circled> 0052 <-circled>', convert to 10702 # '<circled> 0052' 10703 $fields[$PERL_DECOMPOSITION] =~ 10704 s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/x; 10705 10706 # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc. 10707 $fields[$PERL_DECOMPOSITION] =~ 10708 s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x 10709 or $fields[$PERL_DECOMPOSITION] =~ 10710 s/ <join> \s* (.*?) \s* <join> /<medial> $1/x 10711 or $fields[$PERL_DECOMPOSITION] =~ 10712 s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x 10713 or $fields[$PERL_DECOMPOSITION] =~ 10714 s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x; 10715 10716 # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc. 10717 $fields[$PERL_DECOMPOSITION] =~ 10718 s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x; 10719 10720 # Change names to modern form. 10721 $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g; 10722 $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g; 10723 $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g; 10724 $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g; 10725 10726 # One entry has weird braces 10727 $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g; 10728 } 10729 10730 $_ = join ';', $code_point, @fields; 10731 trace $_ if main::DEBUG && $to_trace; 10732 return; 10733 } 10734 10735 sub filter_v2_1_5_ucd { 10736 # A dozen entries in this 2.1.5 file had the mirrored and numeric 10737 # columns swapped; These all had mirrored be 'N'. So if the numeric 10738 # column appears to be N, swap it back. 10739 10740 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 10741 if ($fields[$NUMERIC] eq 'N') { 10742 $fields[$NUMERIC] = $fields[$MIRRORED]; 10743 $fields[$MIRRORED] = 'N'; 10744 $_ = join ';', $code_point, @fields; 10745 } 10746 return; 10747 } 10748 10749 sub filter_v6_ucd { 10750 10751 # Unicode 6.0 co-opted the name BELL for U+1F514, but we haven't 10752 # accepted that yet to allow for some deprecation cycles. 10753 10754 return if $_ !~ /^(?:0007|1F514|070F);/; 10755 10756 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 10757 if ($code_point eq '0007') { 10758 $fields[$CHARNAME] = ""; 10759 } 10760 elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see 10761 # http://www.unicode.org/versions/corrigendum8.html 10762 $fields[$BIDI] = "AL"; 10763 } 10764 elsif ($^V lt v5.17.0) { # For 5.18 will convert to use Unicode's name 10765 $fields[$CHARNAME] = ""; 10766 } 10767 10768 $_ = join ';', $code_point, @fields; 10769 10770 return; 10771 } 10772} # End closure for UnicodeData 10773 10774sub process_GCB_test { 10775 10776 my $file = shift; 10777 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 10778 10779 while ($file->next_line) { 10780 push @backslash_X_tests, $_; 10781 } 10782 10783 return; 10784} 10785 10786sub process_NamedSequences { 10787 # NamedSequences.txt entries are just added to an array. Because these 10788 # don't look like the other tables, they have their own handler. 10789 # An example: 10790 # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300 10791 # 10792 # This just adds the sequence to an array for later handling 10793 10794 my $file = shift; 10795 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 10796 10797 while ($file->next_line) { 10798 my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1; 10799 if (@remainder) { 10800 $file->carp_bad_line( 10801 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'"); 10802 next; 10803 } 10804 10805 # Note single \t in keeping with special output format of 10806 # Perl_charnames. But it turns out that the code points don't have to 10807 # be 5 digits long, like the rest, based on the internal workings of 10808 # charnames.pm. This could be easily changed for consistency. 10809 push @named_sequences, "$sequence\t$name"; 10810 } 10811 return; 10812} 10813 10814{ # Closure 10815 10816 my $first_range; 10817 10818 sub filter_early_ea_lb { 10819 # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a 10820 # third field be the name of the code point, which can be ignored in 10821 # most cases. But it can be meaningful if it marks a range: 10822 # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE 10823 # 3400;W;<CJK Ideograph Extension A, First> 10824 # 10825 # We need to see the First in the example above to know it's a range. 10826 # They did not use the later range syntaxes. This routine changes it 10827 # to use the modern syntax. 10828 # $1 is the Input_file object. 10829 10830 my @fields = split /\s*;\s*/; 10831 if ($fields[2] =~ /^<.*, First>/) { 10832 $first_range = $fields[0]; 10833 $_ = ""; 10834 } 10835 elsif ($fields[2] =~ /^<.*, Last>/) { 10836 $_ = $_ = "$first_range..$fields[0]; $fields[1]"; 10837 } 10838 else { 10839 undef $first_range; 10840 $_ = "$fields[0]; $fields[1]"; 10841 } 10842 10843 return; 10844 } 10845} 10846 10847sub filter_old_style_arabic_shaping { 10848 # Early versions used a different term for the later one. 10849 10850 my @fields = split /\s*;\s*/; 10851 $fields[3] =~ s/<no shaping>/No_Joining_Group/; 10852 $fields[3] =~ s/\s+/_/g; # Change spaces to underscores 10853 $_ = join ';', @fields; 10854 return; 10855} 10856 10857sub filter_arabic_shaping_line { 10858 # ArabicShaping.txt has entries that look like: 10859 # 062A; TEH; D; BEH 10860 # The field containing 'TEH' is not used. The next field is Joining_Type 10861 # and the last is Joining_Group 10862 # This generates two lines to pass on, one for each property on the input 10863 # line. 10864 10865 my $file = shift; 10866 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 10867 10868 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 10869 10870 if (@fields > 4) { 10871 $file->carp_bad_line('Extra fields'); 10872 $_ = ""; 10873 return; 10874 } 10875 10876 $file->insert_adjusted_lines("$fields[0]; Joining_Group; $fields[3]"); 10877 $_ = "$fields[0]; Joining_Type; $fields[2]"; 10878 10879 return; 10880} 10881 10882{ # Closure 10883 my $lc; # Table for lowercase mapping 10884 my $tc; 10885 my $uc; 10886 10887 sub setup_special_casing { 10888 # SpecialCasing.txt contains the non-simple case change mappings. The 10889 # simple ones are in UnicodeData.txt, which should already have been 10890 # read in to the full property data structures, so as to initialize 10891 # these with the simple ones. Then the SpecialCasing.txt entries 10892 # add or overwrite the ones which have different full mappings. 10893 10894 # This routine sees if the simple mappings are to be output, and if 10895 # so, copies what has already been put into the full mapping tables, 10896 # while they still contain only the simple mappings. 10897 10898 # The reason it is done this way is that the simple mappings are 10899 # probably not going to be output, so it saves work to initialize the 10900 # full tables with the simple mappings, and then overwrite those 10901 # relatively few entries in them that have different full mappings, 10902 # and thus skip the simple mapping tables altogether. 10903 10904 my $file= shift; 10905 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 10906 10907 $lc = property_ref('lc'); 10908 $tc = property_ref('tc'); 10909 $uc = property_ref('uc'); 10910 10911 # For each of the case change mappings... 10912 foreach my $full_table ($lc, $tc, $uc) { 10913 my $full_name = $full_table->name; 10914 unless (defined $full_table && ! $full_table->is_empty) { 10915 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated."); 10916 } 10917 10918 # Create a table in the old-style format and with the original 10919 # file name for backwards compatibility with applications that 10920 # read it directly. The new tables contain both the simple and 10921 # full maps, and the old are missing simple maps when there is a 10922 # conflicting full one. Probably it would have been ok to add 10923 # those to the legacy version, as was already done in 5.14 to the 10924 # case folding one, but this was not done, out of an abundance of 10925 # caution. The tables are set up here before we deal with the 10926 # full maps so that as we handle those, we can override the simple 10927 # maps for them in the legacy table, and merely add them in the 10928 # new-style one. 10929 my $legacy = Property->new("Legacy_" . $full_table->full_name, 10930 File => $full_table->full_name =~ 10931 s/case_Mapping//r, 10932 Range_Size_1 => 1, 10933 Format => $HEX_FORMAT, 10934 Default_Map => $CODE_POINT, 10935 UCD => 0, 10936 Initialize => $full_table, 10937 To_Output_Map => $EXTERNAL_MAP, 10938 ); 10939 10940 $full_table->add_comment(join_lines( <<END 10941This file includes both the simple and full case changing maps. The simple 10942ones are in the main body of the table below, and the full ones adding to or 10943overriding them are in the hash. 10944END 10945 )); 10946 10947 # The simple version's name in each mapping merely has an 's' in 10948 # front of the full one's 10949 my $simple_name = 's' . $full_name; 10950 my $simple = property_ref($simple_name); 10951 $simple->initialize($full_table) if $simple->to_output_map(); 10952 10953 unless ($simple->to_output_map()) { 10954 $full_table->set_proxy_for($simple_name); 10955 } 10956 } 10957 10958 return; 10959 } 10960 10961 sub filter_special_casing_line { 10962 # Change the format of $_ from SpecialCasing.txt into something that 10963 # the generic handler understands. Each input line contains three 10964 # case mappings. This will generate three lines to pass to the 10965 # generic handler for each of those. 10966 10967 # The input syntax (after stripping comments and trailing white space 10968 # is like one of the following (with the final two being entries that 10969 # we ignore): 10970 # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S 10971 # 03A3; 03C2; 03A3; 03A3; Final_Sigma; 10972 # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE 10973 # Note the trailing semi-colon, unlike many of the input files. That 10974 # means that there will be an extra null field generated by the split 10975 10976 my $file = shift; 10977 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 10978 10979 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null 10980 # fields 10981 10982 # field #4 is when this mapping is conditional. If any of these get 10983 # implemented, it would be by hard-coding in the casing functions in 10984 # the Perl core, not through tables. But if there is a new condition 10985 # we don't know about, output a warning. We know about all the 10986 # conditions through 6.0 10987 if ($fields[4] ne "") { 10988 my @conditions = split ' ', $fields[4]; 10989 if ($conditions[0] ne 'tr' # We know that these languages have 10990 # conditions, and some are multiple 10991 && $conditions[0] ne 'az' 10992 && $conditions[0] ne 'lt' 10993 10994 # And, we know about a single condition Final_Sigma, but 10995 # nothing else. 10996 && ($v_version gt v5.2.0 10997 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma'))) 10998 { 10999 $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore"); 11000 } 11001 elsif ($conditions[0] ne 'Final_Sigma') { 11002 11003 # Don't print out a message for Final_Sigma, because we 11004 # have hard-coded handling for it. (But the standard 11005 # could change what the rule should be, but it wouldn't 11006 # show up here anyway. 11007 11008 print "# SKIPPING Special Casing: $_\n" 11009 if $verbosity >= $VERBOSE; 11010 } 11011 $_ = ""; 11012 return; 11013 } 11014 elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) { 11015 $file->carp_bad_line('Extra fields'); 11016 $_ = ""; 11017 return; 11018 } 11019 11020 my $decimal_code_point = hex $fields[0]; 11021 11022 # Loop to handle each of the three mappings in the input line, in 11023 # order, with $i indicating the current field number. 11024 my $i = 0; 11025 for my $object ($lc, $tc, $uc) { 11026 $i++; # First time through, $i = 0 ... 3rd time = 3 11027 11028 my $value = $object->value_of($decimal_code_point); 11029 $value = ($value eq $CODE_POINT) 11030 ? $decimal_code_point 11031 : hex $value; 11032 11033 # If this isn't a multi-character mapping, it should already have 11034 # been read in. 11035 if ($fields[$i] !~ / /) { 11036 if ($value != hex $fields[$i]) { 11037 Carp::my_carp("Bad news. UnicodeData.txt thinks " 11038 . $object->name 11039 . "(0x$fields[0]) is $value" 11040 . " and SpecialCasing.txt thinks it is " 11041 . hex($fields[$i]) 11042 . ". Good luck. Retaining UnicodeData value, and proceeding anyway."); 11043 } 11044 } 11045 else { 11046 11047 # The mapping goes into both the legacy table, in which it 11048 # replaces the simple one... 11049 $file->insert_adjusted_lines("$fields[0]; Legacy_" 11050 . $object->full_name 11051 . "; $fields[$i]"); 11052 11053 # ... and, the The regular table, in which it is additional, 11054 # beyond the simple mapping. 11055 $file->insert_adjusted_lines("$fields[0]; " 11056 . $object->name 11057 . "; " 11058 . $CMD_DELIM 11059 . "$REPLACE_CMD=$MULTIPLE_BEFORE" 11060 . $CMD_DELIM 11061 . $fields[$i]); 11062 } 11063 } 11064 11065 # Everything has been handled by the insert_adjusted_lines() 11066 $_ = ""; 11067 11068 return; 11069 } 11070} 11071 11072sub filter_old_style_case_folding { 11073 # This transforms $_ containing the case folding style of 3.0.1, to 3.1 11074 # and later style. Different letters were used in the earlier. 11075 11076 my $file = shift; 11077 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 11078 11079 my @fields = split /\s*;\s*/; 11080 if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields 11081 $fields[1] = 'I'; 11082 } 11083 elsif ($fields[1] eq 'L') { 11084 $fields[1] = 'C'; # L => C always 11085 } 11086 elsif ($fields[1] eq 'E') { 11087 if ($fields[2] =~ / /) { # E => C if one code point; F otherwise 11088 $fields[1] = 'F' 11089 } 11090 else { 11091 $fields[1] = 'C' 11092 } 11093 } 11094 else { 11095 $file->carp_bad_line("Expecting L or E in second field"); 11096 $_ = ""; 11097 return; 11098 } 11099 $_ = join("; ", @fields) . ';'; 11100 return; 11101} 11102 11103{ # Closure for case folding 11104 11105 # Create the map for simple only if are going to output it, for otherwise 11106 # it takes no part in anything we do. 11107 my $to_output_simple; 11108 my $non_final_folds; 11109 11110 sub setup_case_folding($) { 11111 # Read in the case foldings in CaseFolding.txt. This handles both 11112 # simple and full case folding. 11113 11114 $to_output_simple 11115 = property_ref('Simple_Case_Folding')->to_output_map; 11116 11117 if (! $to_output_simple) { 11118 property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding'); 11119 } 11120 11121 $non_final_folds = $perl->add_match_table("_Perl_Non_Final_Folds", 11122 Perl_Extension => 1, 11123 Fate => $INTERNAL_ONLY, 11124 Description => "Code points that particpate in a multi-char fold and are not the final character of said fold", 11125 ); 11126 11127 # If we ever wanted to show that these tables were combined, a new 11128 # property method could be created, like set_combined_props() 11129 property_ref('Case_Folding')->add_comment(join_lines( <<END 11130This file includes both the simple and full case folding maps. The simple 11131ones are in the main body of the table below, and the full ones adding to or 11132overriding them are in the hash. 11133END 11134 )); 11135 return; 11136 } 11137 11138 sub filter_case_folding_line { 11139 # Called for each line in CaseFolding.txt 11140 # Input lines look like: 11141 # 0041; C; 0061; # LATIN CAPITAL LETTER A 11142 # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S 11143 # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S 11144 # 11145 # 'C' means that folding is the same for both simple and full 11146 # 'F' that it is only for full folding 11147 # 'S' that it is only for simple folding 11148 # 'T' is locale-dependent, and ignored 11149 # 'I' is a type of 'F' used in some early releases. 11150 # Note the trailing semi-colon, unlike many of the input files. That 11151 # means that there will be an extra null field generated by the split 11152 # below, which we ignore and hence is not an error. 11153 11154 my $file = shift; 11155 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 11156 11157 my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1; 11158 if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) { 11159 $file->carp_bad_line('Extra fields'); 11160 $_ = ""; 11161 return; 11162 } 11163 11164 if ($type eq 'T') { # Skip Turkic case folding, is locale dependent 11165 $_ = ""; 11166 return; 11167 } 11168 11169 # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase 11170 # I are all full foldings; S is single-char. For S, there is always 11171 # an F entry, so we must allow multiple values for the same code 11172 # point. Fortunately this table doesn't need further manipulation 11173 # which would preclude using multiple-values. The S is now included 11174 # so that _swash_inversion_hash() is able to construct closures 11175 # without having to worry about F mappings. 11176 if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') { 11177 $_ = "$range; Case_Folding; " 11178 . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map"; 11179 if ($type eq 'F') { 11180 my @string = split " ", $map; 11181 for my $i (0 .. @string - 1 -1) { 11182 $non_final_folds->add_range(hex $string[$i], hex $string[$i]); 11183 } 11184 } 11185 } 11186 else { 11187 $_ = ""; 11188 $file->carp_bad_line('Expecting C F I S or T in second field'); 11189 } 11190 11191 # C and S are simple foldings, but simple case folding is not needed 11192 # unless we explicitly want its map table output. 11193 if ($to_output_simple && $type eq 'C' || $type eq 'S') { 11194 $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map"); 11195 } 11196 11197 return; 11198 } 11199 11200} # End case fold closure 11201 11202sub filter_jamo_line { 11203 # Filter Jamo.txt lines. This routine mainly is used to populate hashes 11204 # from this file that is used in generating the Name property for Jamo 11205 # code points. But, it also is used to convert early versions' syntax 11206 # into the modern form. Here are two examples: 11207 # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax 11208 # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax 11209 # 11210 # The input is $_, the output is $_ filtered. 11211 11212 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 11213 11214 # Let the caller handle unexpected input. In earlier versions, there was 11215 # a third field which is supposed to be a comment, but did not have a '#' 11216 # before it. 11217 return if @fields > (($v_version gt v3.0.0) ? 2 : 3); 11218 11219 $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous 11220 # beginning. 11221 11222 # Some 2.1 versions had this wrong. Causes havoc with the algorithm. 11223 $fields[1] = 'R' if $fields[0] eq '1105'; 11224 11225 # Add to structure so can generate Names from it. 11226 my $cp = hex $fields[0]; 11227 my $short_name = $fields[1]; 11228 $Jamo{$cp} = $short_name; 11229 if ($cp <= $LBase + $LCount) { 11230 $Jamo_L{$short_name} = $cp - $LBase; 11231 } 11232 elsif ($cp <= $VBase + $VCount) { 11233 $Jamo_V{$short_name} = $cp - $VBase; 11234 } 11235 elsif ($cp <= $TBase + $TCount) { 11236 $Jamo_T{$short_name} = $cp - $TBase; 11237 } 11238 else { 11239 Carp::my_carp_bug("Unexpected Jamo code point in $_"); 11240 } 11241 11242 11243 # Reassemble using just the first two fields to look like a typical 11244 # property file line 11245 $_ = "$fields[0]; $fields[1]"; 11246 11247 return; 11248} 11249 11250sub register_fraction($) { 11251 # This registers the input rational number so that it can be passed on to 11252 # utf8_heavy.pl, both in rational and floating forms. 11253 11254 my $rational = shift; 11255 11256 my $float = eval $rational; 11257 $nv_floating_to_rational{$float} = $rational; 11258 return; 11259} 11260 11261sub filter_numeric_value_line { 11262 # DNumValues contains lines of a different syntax than the typical 11263 # property file: 11264 # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO 11265 # 11266 # This routine transforms $_ containing the anomalous syntax to the 11267 # typical, by filtering out the extra columns, and convert early version 11268 # decimal numbers to strings that look like rational numbers. 11269 11270 my $file = shift; 11271 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 11272 11273 # Starting in 5.1, there is a rational field. Just use that, omitting the 11274 # extra columns. Otherwise convert the decimal number in the second field 11275 # to a rational, and omit extraneous columns. 11276 my @fields = split /\s*;\s*/, $_, -1; 11277 my $rational; 11278 11279 if ($v_version ge v5.1.0) { 11280 if (@fields != 4) { 11281 $file->carp_bad_line('Not 4 semi-colon separated fields'); 11282 $_ = ""; 11283 return; 11284 } 11285 $rational = $fields[3]; 11286 $_ = join '; ', @fields[ 0, 3 ]; 11287 } 11288 else { 11289 11290 # Here, is an older Unicode file, which has decimal numbers instead of 11291 # rationals in it. Use the fraction to calculate the denominator and 11292 # convert to rational. 11293 11294 if (@fields != 2 && @fields != 3) { 11295 $file->carp_bad_line('Not 2 or 3 semi-colon separated fields'); 11296 $_ = ""; 11297 return; 11298 } 11299 11300 my $codepoints = $fields[0]; 11301 my $decimal = $fields[1]; 11302 if ($decimal =~ s/\.0+$//) { 11303 11304 # Anything ending with a decimal followed by nothing but 0's is an 11305 # integer 11306 $_ = "$codepoints; $decimal"; 11307 $rational = $decimal; 11308 } 11309 else { 11310 11311 my $denominator; 11312 if ($decimal =~ /\.50*$/) { 11313 $denominator = 2; 11314 } 11315 11316 # Here have the hardcoded repeating decimals in the fraction, and 11317 # the denominator they imply. There were only a few denominators 11318 # in the older Unicode versions of this file which this code 11319 # handles, so it is easy to convert them. 11320 11321 # The 4 is because of a round-off error in the Unicode 3.2 files 11322 elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) { 11323 $denominator = 3; 11324 } 11325 elsif ($decimal =~ /\.[27]50*$/) { 11326 $denominator = 4; 11327 } 11328 elsif ($decimal =~ /\.[2468]0*$/) { 11329 $denominator = 5; 11330 } 11331 elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) { 11332 $denominator = 6; 11333 } 11334 elsif ($decimal =~ /\.(12|37|62|87)50*$/) { 11335 $denominator = 8; 11336 } 11337 if ($denominator) { 11338 my $sign = ($decimal < 0) ? "-" : ""; 11339 my $numerator = int((abs($decimal) * $denominator) + .5); 11340 $rational = "$sign$numerator/$denominator"; 11341 $_ = "$codepoints; $rational"; 11342 } 11343 else { 11344 $file->carp_bad_line("Can't cope with number '$decimal'."); 11345 $_ = ""; 11346 return; 11347 } 11348 } 11349 } 11350 11351 register_fraction($rational) if $rational =~ qr{/}; 11352 return; 11353} 11354 11355{ # Closure 11356 my %unihan_properties; 11357 11358 sub setup_unihan { 11359 # Do any special setup for Unihan properties. 11360 11361 # This property gives the wrong computed type, so override. 11362 my $usource = property_ref('kIRG_USource'); 11363 $usource->set_type($STRING) if defined $usource; 11364 11365 # This property is to be considered binary (it says so in 11366 # http://www.unicode.org/reports/tr38/) 11367 my $iicore = property_ref('kIICore'); 11368 if (defined $iicore) { 11369 $iicore->set_type($FORCED_BINARY); 11370 $iicore->table("Y")->add_note("Forced to a binary property as per unicode.org UAX #38."); 11371 11372 # Unicode doesn't include the maps for this property, so don't 11373 # warn that they are missing. 11374 $iicore->set_pre_declared_maps(0); 11375 $iicore->add_comment(join_lines( <<END 11376This property contains enum values, but Unicode UAX #38 says it should be 11377interpreted as binary, so Perl creates tables for both 1) its enum values, 11378plus 2) true/false tables in which it is considered true for all code points 11379that have a non-null value 11380END 11381 )); 11382 } 11383 11384 return; 11385 } 11386 11387 sub filter_unihan_line { 11388 # Change unihan db lines to look like the others in the db. Here is 11389 # an input sample: 11390 # U+341C kCangjie IEKN 11391 11392 # Tabs are used instead of semi-colons to separate fields; therefore 11393 # they may have semi-colons embedded in them. Change these to periods 11394 # so won't screw up the rest of the code. 11395 s/;/./g; 11396 11397 # Remove lines that don't look like ones we accept. 11398 if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) { 11399 $_ = ""; 11400 return; 11401 } 11402 11403 # Extract the property, and save a reference to its object. 11404 my $property = $1; 11405 if (! exists $unihan_properties{$property}) { 11406 $unihan_properties{$property} = property_ref($property); 11407 } 11408 11409 # Don't do anything unless the property is one we're handling, which 11410 # we determine by seeing if there is an object defined for it or not 11411 if (! defined $unihan_properties{$property}) { 11412 $_ = ""; 11413 return; 11414 } 11415 11416 # Convert the tab separators to our standard semi-colons, and convert 11417 # the U+HHHH notation to the rest of the standard's HHHH 11418 s/\t/;/g; 11419 s/\b U \+ (?= $code_point_re )//xg; 11420 11421 #local $to_trace = 1 if main::DEBUG; 11422 trace $_ if main::DEBUG && $to_trace; 11423 11424 return; 11425 } 11426} 11427 11428sub filter_blocks_lines { 11429 # In the Blocks.txt file, the names of the blocks don't quite match the 11430 # names given in PropertyValueAliases.txt, so this changes them so they 11431 # do match: Blanks and hyphens are changed into underscores. Also makes 11432 # early release versions look like later ones 11433 # 11434 # $_ is transformed to the correct value. 11435 11436 my $file = shift; 11437 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 11438 11439 if ($v_version lt v3.2.0) { 11440 if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted 11441 $_ = ""; 11442 return; 11443 } 11444 11445 # Old versions used a different syntax to mark the range. 11446 $_ =~ s/;\s+/../ if $v_version lt v3.1.0; 11447 } 11448 11449 my @fields = split /\s*;\s*/, $_, -1; 11450 if (@fields != 2) { 11451 $file->carp_bad_line("Expecting exactly two fields"); 11452 $_ = ""; 11453 return; 11454 } 11455 11456 # Change hyphens and blanks in the block name field only 11457 $fields[1] =~ s/[ -]/_/g; 11458 $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g; # Capitalize first letter of word 11459 11460 $_ = join("; ", @fields); 11461 return; 11462} 11463 11464{ # Closure 11465 my $current_property; 11466 11467 sub filter_old_style_proplist { 11468 # PropList.txt has been in Unicode since version 2.0. Until 3.1, it 11469 # was in a completely different syntax. Ken Whistler of Unicode says 11470 # that it was something he used as an aid for his own purposes, but 11471 # was never an official part of the standard. However, comments in 11472 # DAge.txt indicate that non-character code points were available in 11473 # the UCD as of 3.1. It is unclear to me (khw) how they could be 11474 # there except through this file (but on the other hand, they first 11475 # appeared there in 3.0.1), so maybe it was part of the UCD, and maybe 11476 # not. But the claim is that it was published as an aid to others who 11477 # might want some more information than was given in the official UCD 11478 # of the time. Many of the properties in it were incorporated into 11479 # the later PropList.txt, but some were not. This program uses this 11480 # early file to generate property tables that are otherwise not 11481 # accessible in the early UCD's, and most were probably not really 11482 # official at that time, so one could argue that it should be ignored, 11483 # and you can easily modify things to skip this. And there are bugs 11484 # in this file in various versions. (For example, the 2.1.9 version 11485 # removes from Alphabetic the CJK range starting at 4E00, and they 11486 # weren't added back in until 3.1.0.) Many of this file's properties 11487 # were later sanctioned, so this code generates tables for those 11488 # properties that aren't otherwise in the UCD of the time but 11489 # eventually did become official, and throws away the rest. Here is a 11490 # list of all the ones that are thrown away: 11491 # Bidi=* duplicates UnicodeData.txt 11492 # Combining never made into official property; 11493 # is \P{ccc=0} 11494 # Composite never made into official property. 11495 # Currency Symbol duplicates UnicodeData.txt: gc=sc 11496 # Decimal Digit duplicates UnicodeData.txt: gc=nd 11497 # Delimiter never made into official property; 11498 # removed in 3.0.1 11499 # Format Control never made into official property; 11500 # similar to gc=cf 11501 # High Surrogate duplicates Blocks.txt 11502 # Ignorable Control never made into official property; 11503 # similar to di=y 11504 # ISO Control duplicates UnicodeData.txt: gc=cc 11505 # Left of Pair never made into official property; 11506 # Line Separator duplicates UnicodeData.txt: gc=zl 11507 # Low Surrogate duplicates Blocks.txt 11508 # Non-break was actually listed as a property 11509 # in 3.2, but without any code 11510 # points. Unicode denies that this 11511 # was ever an official property 11512 # Non-spacing duplicate UnicodeData.txt: gc=mn 11513 # Numeric duplicates UnicodeData.txt: gc=cc 11514 # Paired Punctuation never made into official property; 11515 # appears to be gc=ps + gc=pe 11516 # Paragraph Separator duplicates UnicodeData.txt: gc=cc 11517 # Private Use duplicates UnicodeData.txt: gc=co 11518 # Private Use High Surrogate duplicates Blocks.txt 11519 # Punctuation duplicates UnicodeData.txt: gc=p 11520 # Space different definition than eventual 11521 # one. 11522 # Titlecase duplicates UnicodeData.txt: gc=lt 11523 # Unassigned Code Value duplicates UnicodeData.txt: gc=cc 11524 # Zero-width never made into official property; 11525 # subset of gc=cf 11526 # Most of the properties have the same names in this file as in later 11527 # versions, but a couple do not. 11528 # 11529 # This subroutine filters $_, converting it from the old style into 11530 # the new style. Here's a sample of the old-style 11531 # 11532 # ******************************************* 11533 # 11534 # Property dump for: 0x100000A0 (Join Control) 11535 # 11536 # 200C..200D (2 chars) 11537 # 11538 # In the example, the property is "Join Control". It is kept in this 11539 # closure between calls to the subroutine. The numbers beginning with 11540 # 0x were internal to Ken's program that generated this file. 11541 11542 # If this line contains the property name, extract it. 11543 if (/^Property dump for: [^(]*\((.*)\)/) { 11544 $_ = $1; 11545 11546 # Convert white space to underscores. 11547 s/ /_/g; 11548 11549 # Convert the few properties that don't have the same name as 11550 # their modern counterparts 11551 s/Identifier_Part/ID_Continue/ 11552 or s/Not_a_Character/NChar/; 11553 11554 # If the name matches an existing property, use it. 11555 if (defined property_ref($_)) { 11556 trace "new property=", $_ if main::DEBUG && $to_trace; 11557 $current_property = $_; 11558 } 11559 else { # Otherwise discard it 11560 trace "rejected property=", $_ if main::DEBUG && $to_trace; 11561 undef $current_property; 11562 } 11563 $_ = ""; # The property is saved for the next lines of the 11564 # file, but this defining line is of no further use, 11565 # so clear it so that the caller won't process it 11566 # further. 11567 } 11568 elsif (! defined $current_property || $_ !~ /^$code_point_re/) { 11569 11570 # Here, the input line isn't a header defining a property for the 11571 # following section, and either we aren't in such a section, or 11572 # the line doesn't look like one that defines the code points in 11573 # such a section. Ignore this line. 11574 $_ = ""; 11575 } 11576 else { 11577 11578 # Here, we have a line defining the code points for the current 11579 # stashed property. Anything starting with the first blank is 11580 # extraneous. Otherwise, it should look like a normal range to 11581 # the caller. Append the property name so that it looks just like 11582 # a modern PropList entry. 11583 11584 $_ =~ s/\s.*//; 11585 $_ .= "; $current_property"; 11586 } 11587 trace $_ if main::DEBUG && $to_trace; 11588 return; 11589 } 11590} # End closure for old style proplist 11591 11592sub filter_old_style_normalization_lines { 11593 # For early releases of Unicode, the lines were like: 11594 # 74..2A76 ; NFKD_NO 11595 # For later releases this became: 11596 # 74..2A76 ; NFKD_QC; N 11597 # Filter $_ to look like those in later releases. 11598 # Similarly for MAYBEs 11599 11600 s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x; 11601 11602 # Also, the property FC_NFKC was abbreviated to FNC 11603 s/FNC/FC_NFKC/; 11604 return; 11605} 11606 11607sub setup_script_extensions { 11608 # The Script_Extensions property starts out with a clone of the Script 11609 # property. 11610 11611 my $scx = property_ref("Script_Extensions"); 11612 $scx = Property->new("scx", Full_Name => "Script_Extensions") 11613 if ! defined $scx; 11614 $scx->_set_format($STRING_WHITE_SPACE_LIST); 11615 $scx->initialize($script); 11616 $scx->set_default_map($script->default_map); 11617 $scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these 11618 $scx->add_comment(join_lines( <<END 11619The values for code points that appear in one script are just the same as for 11620the 'Script' property. Likewise the values for those that appear in many 11621scripts are either 'Common' or 'Inherited', same as with 'Script'. But the 11622values of code points that appear in a few scripts are a space separated list 11623of those scripts. 11624END 11625 )); 11626 11627 # Initialize scx's tables and the aliases for them to be the same as sc's 11628 foreach my $table ($script->tables) { 11629 my $scx_table = $scx->add_match_table($table->name, 11630 Full_Name => $table->full_name); 11631 foreach my $alias ($table->aliases) { 11632 $scx_table->add_alias($alias->name); 11633 } 11634 } 11635} 11636 11637sub filter_script_extensions_line { 11638 # The Scripts file comes with the full name for the scripts; the 11639 # ScriptExtensions, with the short name. The final mapping file is a 11640 # combination of these, and without adjustment, would have inconsistent 11641 # entries. This filters the latter file to convert to full names. 11642 # Entries look like this: 11643 # 064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW 11644 11645 my @fields = split /\s*;\s*/; 11646 11647 # This script was erroneously omitted in this Unicode version. 11648 $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/; 11649 11650 my @full_names; 11651 foreach my $short_name (split " ", $fields[1]) { 11652 push @full_names, $script->table($short_name)->full_name; 11653 } 11654 $fields[1] = join " ", @full_names; 11655 $_ = join "; ", @fields; 11656 11657 return; 11658} 11659 11660sub setup_early_name_alias { 11661 my $file= shift; 11662 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 11663 11664 my $aliases = property_ref('Name_Alias'); 11665 $aliases = Property->new('Name_Alias') if ! defined $aliases; 11666 $file->insert_lines(get_old_name_aliases()); 11667 11668 return; 11669} 11670 11671sub get_old_name_aliases () { 11672 my @return = split /\n/, <<'END'; 116730000;NULL;control 116740000;NUL;abbreviation 116750001;START OF HEADING;control 116760001;SOH;abbreviation 116770002;START OF TEXT;control 116780002;STX;abbreviation 116790003;END OF TEXT;control 116800003;ETX;abbreviation 116810004;END OF TRANSMISSION;control 116820004;EOT;abbreviation 116830005;ENQUIRY;control 116840005;ENQ;abbreviation 116850006;ACKNOWLEDGE;control 116860006;ACK;abbreviation 116870007;ALERT;control 116880007;BELL;control 116890007;BEL;abbreviation 116900008;BACKSPACE;control 116910008;BS;abbreviation 116920009;CHARACTER TABULATION;control 116930009;HORIZONTAL TABULATION;control 116940009;HT;abbreviation 116950009;TAB;abbreviation 11696000A;LINE FEED;control 11697000A;LINE FEED (LF);control 11698000A;NEW LINE;control 11699000A;END OF LINE;control 11700000A;LF;abbreviation 11701000A;NL;abbreviation 11702000A;EOL;abbreviation 11703000B;LINE TABULATION;control 11704000B;VERTICAL TABULATION;control 11705000B;VT;abbreviation 11706000C;FORM FEED;control 11707000C;FORM FEED (FF);control 11708000C;FF;abbreviation 11709000D;CARRIAGE RETURN;control 11710000D;CARRIAGE RETURN (CR);control 11711000D;CR;abbreviation 11712000E;SHIFT OUT;control 11713000E;LOCKING-SHIFT ONE;control 11714000E;SO;abbreviation 11715000F;SHIFT IN;control 11716000F;LOCKING-SHIFT ZERO;control 11717000F;SI;abbreviation 117180010;DATA LINK ESCAPE;control 117190010;DLE;abbreviation 117200011;DEVICE CONTROL ONE;control 117210011;DC1;abbreviation 117220012;DEVICE CONTROL TWO;control 117230012;DC2;abbreviation 117240013;DEVICE CONTROL THREE;control 117250013;DC3;abbreviation 117260014;DEVICE CONTROL FOUR;control 117270014;DC4;abbreviation 117280015;NEGATIVE ACKNOWLEDGE;control 117290015;NAK;abbreviation 117300016;SYNCHRONOUS IDLE;control 117310016;SYN;abbreviation 117320017;END OF TRANSMISSION BLOCK;control 117330017;ETB;abbreviation 117340018;CANCEL;control 117350018;CAN;abbreviation 117360019;END OF MEDIUM;control 117370019;EOM;abbreviation 11738001A;SUBSTITUTE;control 11739001A;SUB;abbreviation 11740001B;ESCAPE;control 11741001B;ESC;abbreviation 11742001C;INFORMATION SEPARATOR FOUR;control 11743001C;FILE SEPARATOR;control 11744001C;FS;abbreviation 11745001D;INFORMATION SEPARATOR THREE;control 11746001D;GROUP SEPARATOR;control 11747001D;GS;abbreviation 11748001E;INFORMATION SEPARATOR TWO;control 11749001E;RECORD SEPARATOR;control 11750001E;RS;abbreviation 11751001F;INFORMATION SEPARATOR ONE;control 11752001F;UNIT SEPARATOR;control 11753001F;US;abbreviation 117540020;SP;abbreviation 11755007F;DELETE;control 11756007F;DEL;abbreviation 117570080;PADDING CHARACTER;figment 117580080;PAD;abbreviation 117590081;HIGH OCTET PRESET;figment 117600081;HOP;abbreviation 117610082;BREAK PERMITTED HERE;control 117620082;BPH;abbreviation 117630083;NO BREAK HERE;control 117640083;NBH;abbreviation 117650084;INDEX;control 117660084;IND;abbreviation 117670085;NEXT LINE;control 117680085;NEXT LINE (NEL);control 117690085;NEL;abbreviation 117700086;START OF SELECTED AREA;control 117710086;SSA;abbreviation 117720087;END OF SELECTED AREA;control 117730087;ESA;abbreviation 117740088;CHARACTER TABULATION SET;control 117750088;HORIZONTAL TABULATION SET;control 117760088;HTS;abbreviation 117770089;CHARACTER TABULATION WITH JUSTIFICATION;control 117780089;HORIZONTAL TABULATION WITH JUSTIFICATION;control 117790089;HTJ;abbreviation 11780008A;LINE TABULATION SET;control 11781008A;VERTICAL TABULATION SET;control 11782008A;VTS;abbreviation 11783008B;PARTIAL LINE FORWARD;control 11784008B;PARTIAL LINE DOWN;control 11785008B;PLD;abbreviation 11786008C;PARTIAL LINE BACKWARD;control 11787008C;PARTIAL LINE UP;control 11788008C;PLU;abbreviation 11789008D;REVERSE LINE FEED;control 11790008D;REVERSE INDEX;control 11791008D;RI;abbreviation 11792008E;SINGLE SHIFT TWO;control 11793008E;SINGLE-SHIFT-2;control 11794008E;SS2;abbreviation 11795008F;SINGLE SHIFT THREE;control 11796008F;SINGLE-SHIFT-3;control 11797008F;SS3;abbreviation 117980090;DEVICE CONTROL STRING;control 117990090;DCS;abbreviation 118000091;PRIVATE USE ONE;control 118010091;PRIVATE USE-1;control 118020091;PU1;abbreviation 118030092;PRIVATE USE TWO;control 118040092;PRIVATE USE-2;control 118050092;PU2;abbreviation 118060093;SET TRANSMIT STATE;control 118070093;STS;abbreviation 118080094;CANCEL CHARACTER;control 118090094;CCH;abbreviation 118100095;MESSAGE WAITING;control 118110095;MW;abbreviation 118120096;START OF GUARDED AREA;control 118130096;START OF PROTECTED AREA;control 118140096;SPA;abbreviation 118150097;END OF GUARDED AREA;control 118160097;END OF PROTECTED AREA;control 118170097;EPA;abbreviation 118180098;START OF STRING;control 118190098;SOS;abbreviation 118200099;SINGLE GRAPHIC CHARACTER INTRODUCER;figment 118210099;SGC;abbreviation 11822009A;SINGLE CHARACTER INTRODUCER;control 11823009A;SCI;abbreviation 11824009B;CONTROL SEQUENCE INTRODUCER;control 11825009B;CSI;abbreviation 11826009C;STRING TERMINATOR;control 11827009C;ST;abbreviation 11828009D;OPERATING SYSTEM COMMAND;control 11829009D;OSC;abbreviation 11830009E;PRIVACY MESSAGE;control 11831009E;PM;abbreviation 11832009F;APPLICATION PROGRAM COMMAND;control 11833009F;APC;abbreviation 1183400A0;NBSP;abbreviation 1183500AD;SHY;abbreviation 11836200B;ZWSP;abbreviation 11837200C;ZWNJ;abbreviation 11838200D;ZWJ;abbreviation 11839200E;LRM;abbreviation 11840200F;RLM;abbreviation 11841202A;LRE;abbreviation 11842202B;RLE;abbreviation 11843202C;PDF;abbreviation 11844202D;LRO;abbreviation 11845202E;RLO;abbreviation 11846FEFF;BYTE ORDER MARK;alternate 11847FEFF;BOM;abbreviation 11848FEFF;ZWNBSP;abbreviation 11849END 11850 11851 if ($v_version ge v3.0.0) { 11852 push @return, split /\n/, <<'END'; 11853180B; FVS1; abbreviation 11854180C; FVS2; abbreviation 11855180D; FVS3; abbreviation 11856180E; MVS; abbreviation 11857202F; NNBSP; abbreviation 11858END 11859 } 11860 11861 if ($v_version ge v3.2.0) { 11862 push @return, split /\n/, <<'END'; 11863034F; CGJ; abbreviation 11864205F; MMSP; abbreviation 118652060; WJ; abbreviation 11866END 11867 # Add in VS1..VS16 11868 my $cp = 0xFE00 - 1; 11869 for my $i (1..16) { 11870 push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i); 11871 } 11872 } 11873 if ($v_version ge v4.0.0) { # Add in VS17..VS256 11874 my $cp = 0xE0100 - 17; 11875 for my $i (17..256) { 11876 push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i); 11877 } 11878 } 11879 11880 return @return; 11881} 11882 11883sub filter_later_version_name_alias_line { 11884 11885 # This file has an extra entry per line for the alias type. This is 11886 # handled by creating a compound entry: "$alias: $type"; First, split 11887 # the line into components. 11888 my ($range, $alias, $type, @remainder) 11889 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 11890 11891 # This file contains multiple entries for some components, so tell the 11892 # downstream code to allow this in our internal tables; the 11893 # $MULTIPLE_AFTER preserves the input ordering. 11894 $_ = join ";", $range, $CMD_DELIM 11895 . $REPLACE_CMD 11896 . '=' 11897 . $MULTIPLE_AFTER 11898 . $CMD_DELIM 11899 . "$alias: $type", 11900 @remainder; 11901 return; 11902} 11903 11904sub filter_early_version_name_alias_line { 11905 11906 # Early versions did not have the trailing alias type field; implicitly it 11907 # was 'correction'. But our synthetic lines we add in this program do 11908 # have it, so test for the type field. 11909 $_ .= "; correction" if $_ !~ /;.*;/; 11910 11911 filter_later_version_name_alias_line; 11912 return; 11913} 11914 11915sub finish_Unicode() { 11916 # This routine should be called after all the Unicode files have been read 11917 # in. It: 11918 # 1) Adds the mappings for code points missing from the files which have 11919 # defaults specified for them. 11920 # 2) At this this point all mappings are known, so it computes the type of 11921 # each property whose type hasn't been determined yet. 11922 # 3) Calculates all the regular expression match tables based on the 11923 # mappings. 11924 # 3) Calculates and adds the tables which are defined by Unicode, but 11925 # which aren't derived by them, and certain derived tables that Perl 11926 # uses. 11927 11928 # For each property, fill in any missing mappings, and calculate the re 11929 # match tables. If a property has more than one missing mapping, the 11930 # default is a reference to a data structure, and requires data from other 11931 # properties to resolve. The sort is used to cause these to be processed 11932 # last, after all the other properties have been calculated. 11933 # (Fortunately, the missing properties so far don't depend on each other.) 11934 foreach my $property 11935 (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 } 11936 property_ref('*')) 11937 { 11938 # $perl has been defined, but isn't one of the Unicode properties that 11939 # need to be finished up. 11940 next if $property == $perl; 11941 11942 # Nor do we need to do anything with properties that aren't going to 11943 # be output. 11944 next if $property->fate == $SUPPRESSED; 11945 11946 # Handle the properties that have more than one possible default 11947 if (ref $property->default_map) { 11948 my $default_map = $property->default_map; 11949 11950 # These properties have stored in the default_map: 11951 # One or more of: 11952 # 1) A default map which applies to all code points in a 11953 # certain class 11954 # 2) an expression which will evaluate to the list of code 11955 # points in that class 11956 # And 11957 # 3) the default map which applies to every other missing code 11958 # point. 11959 # 11960 # Go through each list. 11961 while (my ($default, $eval) = $default_map->get_next_defaults) { 11962 11963 # Get the class list, and intersect it with all the so-far 11964 # unspecified code points yielding all the code points 11965 # in the class that haven't been specified. 11966 my $list = eval $eval; 11967 if ($@) { 11968 Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'"); 11969 last; 11970 } 11971 11972 # Narrow down the list to just those code points we don't have 11973 # maps for yet. 11974 $list = $list & $property->inverse_list; 11975 11976 # Add mappings to the property for each code point in the list 11977 foreach my $range ($list->ranges) { 11978 $property->add_map($range->start, $range->end, $default, 11979 Replace => $CROAK); 11980 } 11981 } 11982 11983 # All remaining code points have the other mapping. Set that up 11984 # so the normal single-default mapping code will work on them 11985 $property->set_default_map($default_map->other_default); 11986 11987 # And fall through to do that 11988 } 11989 11990 # We should have enough data now to compute the type of the property. 11991 $property->compute_type; 11992 my $property_type = $property->type; 11993 11994 next if ! $property->to_create_match_tables; 11995 11996 # Here want to create match tables for this property 11997 11998 # The Unicode db always (so far, and they claim into the future) have 11999 # the default for missing entries in binary properties be 'N' (unless 12000 # there is a '@missing' line that specifies otherwise) 12001 if ($property_type == $BINARY && ! defined $property->default_map) { 12002 $property->set_default_map('N'); 12003 } 12004 12005 # Add any remaining code points to the mapping, using the default for 12006 # missing code points. 12007 my $default_table; 12008 if (defined (my $default_map = $property->default_map)) { 12009 12010 # Make sure there is a match table for the default 12011 if (! defined ($default_table = $property->table($default_map))) { 12012 $default_table = $property->add_match_table($default_map); 12013 } 12014 12015 # And, if the property is binary, the default table will just 12016 # be the complement of the other table. 12017 if ($property_type == $BINARY) { 12018 my $non_default_table; 12019 12020 # Find the non-default table. 12021 for my $table ($property->tables) { 12022 next if $table == $default_table; 12023 $non_default_table = $table; 12024 } 12025 $default_table->set_complement($non_default_table); 12026 } 12027 else { 12028 12029 # This fills in any missing values with the default. It's not 12030 # necessary to do this with binary properties, as the default 12031 # is defined completely in terms of the Y table. 12032 $property->add_map(0, $MAX_UNICODE_CODEPOINT, 12033 $default_map, Replace => $NO); 12034 } 12035 } 12036 12037 # Have all we need to populate the match tables. 12038 my $property_name = $property->name; 12039 my $maps_should_be_defined = $property->pre_declared_maps; 12040 foreach my $range ($property->ranges) { 12041 my $map = $range->value; 12042 my $table = $property->table($map); 12043 if (! defined $table) { 12044 12045 # Integral and rational property values are not necessarily 12046 # defined in PropValueAliases, but whether all the other ones 12047 # should be depends on the property. 12048 if ($maps_should_be_defined 12049 && $map !~ /^ -? \d+ ( \/ \d+ )? $/x) 12050 { 12051 Carp::my_carp("Table '$property_name=$map' should have been defined. Defining it now.") 12052 } 12053 $table = $property->add_match_table($map); 12054 } 12055 12056 next if $table->complement != 0; # Don't need to populate these 12057 $table->add_range($range->start, $range->end); 12058 } 12059 12060 # A forced binary property has additional true/false tables which 12061 # should have been set up when it was forced into binary. The false 12062 # table matches exactly the same set as the property's default table. 12063 # The true table matches the complement of that. The false table is 12064 # not the same as an additional set of aliases on top of the default 12065 # table, so use 'set_equivalent_to'. If it were implemented as 12066 # additional aliases, various things would have to be adjusted, but 12067 # especially, if the user wants to get a list of names for the table 12068 # using Unicode::UCD::prop_value_aliases(), s/he should get a 12069 # different set depending on whether they want the default table or 12070 # the false table. 12071 if ($property_type == $FORCED_BINARY) { 12072 $property->table('N')->set_equivalent_to($default_table, 12073 Related => 1); 12074 $property->table('Y')->set_complement($default_table); 12075 } 12076 12077 # For Perl 5.6 compatibility, all properties matchable in regexes can 12078 # have an optional 'Is_' prefix. This is now done in utf8_heavy.pl. 12079 # But warn if this creates a conflict with a (new) Unicode property 12080 # name, although it appears that Unicode has made a decision never to 12081 # begin a property name with 'Is_', so this shouldn't happen. 12082 foreach my $alias ($property->aliases) { 12083 my $Is_name = 'Is_' . $alias->name; 12084 if (defined (my $pre_existing = property_ref($Is_name))) { 12085 Carp::my_carp(<<END 12086There is already an alias named $Is_name (from " . $pre_existing . "), so 12087creating one for $property won't work. This is bad news. If it is not too 12088late, get Unicode to back off. Otherwise go back to the old scheme (findable 12089from the git blame log for this area of the code that suppressed individual 12090aliases that conflict with the new Unicode names. Proceeding anyway. 12091END 12092 ); 12093 } 12094 } # End of loop through aliases for this property 12095 } # End of loop through all Unicode properties. 12096 12097 # Fill in the mappings that Unicode doesn't completely furnish. First the 12098 # single letter major general categories. If Unicode were to start 12099 # delivering the values, this would be redundant, but better that than to 12100 # try to figure out if should skip and not get it right. Ths could happen 12101 # if a new major category were to be introduced, and the hard-coded test 12102 # wouldn't know about it. 12103 # This routine depends on the standard names for the general categories 12104 # being what it thinks they are, like 'Cn'. The major categories are the 12105 # union of all the general category tables which have the same first 12106 # letters. eg. L = Lu + Lt + Ll + Lo + Lm 12107 foreach my $minor_table ($gc->tables) { 12108 my $minor_name = $minor_table->name; 12109 next if length $minor_name == 1; 12110 if (length $minor_name != 2) { 12111 Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped."); 12112 next; 12113 } 12114 12115 my $major_name = uc(substr($minor_name, 0, 1)); 12116 my $major_table = $gc->table($major_name); 12117 $major_table += $minor_table; 12118 } 12119 12120 # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt 12121 # defines it as LC) 12122 my $LC = $gc->table('LC'); 12123 $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards... 12124 $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility. 12125 12126 12127 if ($LC->is_empty) { # Assume if not empty that Unicode has started to 12128 # deliver the correct values in it 12129 $LC->initialize($gc->table('Ll') + $gc->table('Lu')); 12130 12131 # Lt not in release 1. 12132 if (defined $gc->table('Lt')) { 12133 $LC += $gc->table('Lt'); 12134 $gc->table('Lt')->set_caseless_equivalent($LC); 12135 } 12136 } 12137 $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]'); 12138 12139 $gc->table('Ll')->set_caseless_equivalent($LC); 12140 $gc->table('Lu')->set_caseless_equivalent($LC); 12141 12142 my $Cs = $gc->table('Cs'); 12143 12144 12145 # Folding information was introduced later into Unicode data. To get 12146 # Perl's case ignore (/i) to work at all in releases that don't have 12147 # folding, use the best available alternative, which is lower casing. 12148 my $fold = property_ref('Simple_Case_Folding'); 12149 if ($fold->is_empty) { 12150 $fold->initialize(property_ref('Simple_Lowercase_Mapping')); 12151 $fold->add_note(join_lines(<<END 12152WARNING: This table uses lower case as a substitute for missing fold 12153information 12154END 12155 )); 12156 } 12157 12158 # Multiple-character mapping was introduced later into Unicode data. If 12159 # missing, use the single-characters maps as best available alternative 12160 foreach my $map (qw { Uppercase_Mapping 12161 Lowercase_Mapping 12162 Titlecase_Mapping 12163 Case_Folding 12164 } ) 12165 { 12166 my $full = property_ref($map); 12167 if ($full->is_empty) { 12168 my $simple = property_ref('Simple_' . $map); 12169 $full->initialize($simple); 12170 $full->add_comment($simple->comment) if ($simple->comment); 12171 $full->add_note(join_lines(<<END 12172WARNING: This table uses simple mapping (single-character only) as a 12173substitute for missing multiple-character information 12174END 12175 )); 12176 } 12177 } 12178 12179 # Create digit and case fold tables with the original file names for 12180 # backwards compatibility with applications that read them directly. 12181 my $Digit = Property->new("Legacy_Perl_Decimal_Digit", 12182 Default_Map => "", 12183 Perl_Extension => 1, 12184 File => 'Digit', # Trad. location 12185 Directory => $map_directory, 12186 UCD => 0, 12187 Type => $STRING, 12188 To_Output_Map => $EXTERNAL_MAP, 12189 Range_Size_1 => 1, 12190 Initialize => property_ref('Perl_Decimal_Digit'), 12191 ); 12192 $Digit->add_comment(join_lines(<<END 12193This file gives the mapping of all code points which represent a single 12194decimal digit [0-9] to their respective digits. For example, the code point 12195U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those 12196that have Numeric_Type=Decimal; not special things, like subscripts nor Roman 12197numerals. 12198END 12199 )); 12200 12201 Property->new('Legacy_Case_Folding', 12202 File => "Fold", 12203 Directory => $map_directory, 12204 Default_Map => $CODE_POINT, 12205 UCD => 0, 12206 Range_Size_1 => 1, 12207 Type => $STRING, 12208 To_Output_Map => $EXTERNAL_MAP, 12209 Format => $HEX_FORMAT, 12210 Initialize => property_ref('cf'), 12211 ); 12212 12213 # The Script_Extensions property started out as a clone of the Script 12214 # property. But processing its data file caused some elements to be 12215 # replaced with different data. (These elements were for the Common and 12216 # Inherited properties.) This data is a qw() list of all the scripts that 12217 # the code points in the given range are in. An example line is: 12218 # 060C ; Arab Syrc Thaa # Po ARABIC COMMA 12219 # 12220 # The code above has created a new match table named "Arab Syrc Thaa" 12221 # which contains 060C. (The cloned table started out with this code point 12222 # mapping to "Common".) Now we add 060C to each of the Arab, Syrc, and 12223 # Thaa match tables. Then we delete the now spurious "Arab Syrc Thaa" 12224 # match table. This is repeated for all these tables and ranges. The map 12225 # data is retained in the map table for reference, but the spurious match 12226 # tables are deleted. 12227 12228 my $scx = property_ref("Script_Extensions"); 12229 if (defined $scx) { 12230 foreach my $table ($scx->tables) { 12231 next unless $table->name =~ /\s/; # All the new and only the new 12232 # tables have a space in their 12233 # names 12234 my @scripts = split /\s+/, $table->name; 12235 foreach my $script (@scripts) { 12236 my $script_table = $scx->table($script); 12237 $script_table += $table; 12238 } 12239 $scx->delete_match_table($table); 12240 } 12241 } 12242 12243 return; 12244} 12245 12246sub compile_perl() { 12247 # Create perl-defined tables. Almost all are part of the pseudo-property 12248 # named 'perl' internally to this program. Many of these are recommended 12249 # in UTS#18 "Unicode Regular Expressions", and their derivations are based 12250 # on those found there. 12251 # Almost all of these are equivalent to some Unicode property. 12252 # A number of these properties have equivalents restricted to the ASCII 12253 # range, with their names prefaced by 'Posix', to signify that these match 12254 # what the Posix standard says they should match. A couple are 12255 # effectively this, but the name doesn't have 'Posix' in it because there 12256 # just isn't any Posix equivalent. 'XPosix' are the Posix tables extended 12257 # to the full Unicode range, by our guesses as to what is appropriate. 12258 12259 # 'Any' is all code points. As an error check, instead of just setting it 12260 # to be that, construct it to be the union of all the major categories 12261 $Any = $perl->add_match_table('Any', 12262 Description => "[\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]", 12263 Matches_All => 1); 12264 12265 foreach my $major_table ($gc->tables) { 12266 12267 # Major categories are the ones with single letter names. 12268 next if length($major_table->name) != 1; 12269 12270 $Any += $major_table; 12271 } 12272 12273 if ($Any->max != $MAX_UNICODE_CODEPOINT) { 12274 Carp::my_carp_bug("Generated highest code point (" 12275 . sprintf("%X", $Any->max) 12276 . ") doesn't match expected value $MAX_UNICODE_CODEPOINT_STRING.") 12277 } 12278 if ($Any->range_count != 1 || $Any->min != 0) { 12279 Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.") 12280 } 12281 12282 $Any->add_alias('All'); 12283 12284 # Assigned is the opposite of gc=unassigned 12285 my $Assigned = $perl->add_match_table('Assigned', 12286 Description => "All assigned code points", 12287 Initialize => ~ $gc->table('Unassigned'), 12288 ); 12289 12290 # Our internal-only property should be treated as more than just a 12291 # synonym; grandfather it in to the pod. 12292 $perl->add_match_table('_CombAbove', Re_Pod_Entry => 1, 12293 Fate => $INTERNAL_ONLY, Status => $DISCOURAGED) 12294 ->set_equivalent_to(property_ref('ccc')->table('Above'), 12295 Related => 1); 12296 12297 my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]'); 12298 if (defined $block) { # This is equivalent to the block if have it. 12299 my $Unicode_ASCII = $block->table('Basic_Latin'); 12300 if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) { 12301 $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1); 12302 } 12303 } 12304 12305 # Very early releases didn't have blocks, so initialize ASCII ourselves if 12306 # necessary 12307 if ($ASCII->is_empty) { 12308 $ASCII->initialize([ 0..127 ]); 12309 } 12310 12311 # Get the best available case definitions. Early Unicode versions didn't 12312 # have Uppercase and Lowercase defined, so use the general category 12313 # instead for them. 12314 my $Lower = $perl->add_match_table('Lower'); 12315 my $Unicode_Lower = property_ref('Lowercase'); 12316 if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) { 12317 $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1); 12318 $Unicode_Lower->table('Y')->set_caseless_equivalent(property_ref('Cased')->table('Y')); 12319 $Unicode_Lower->table('N')->set_caseless_equivalent(property_ref('Cased')->table('N')); 12320 $Lower->set_caseless_equivalent(property_ref('Cased')->table('Y')); 12321 12322 } 12323 else { 12324 $Lower->set_equivalent_to($gc->table('Lowercase_Letter'), 12325 Related => 1); 12326 } 12327 $Lower->add_alias('XPosixLower'); 12328 my $Posix_Lower = $perl->add_match_table("PosixLower", 12329 Description => "[a-z]", 12330 Initialize => $Lower & $ASCII, 12331 ); 12332 12333 my $Upper = $perl->add_match_table('Upper'); 12334 my $Unicode_Upper = property_ref('Uppercase'); 12335 if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) { 12336 $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1); 12337 $Unicode_Upper->table('Y')->set_caseless_equivalent(property_ref('Cased')->table('Y')); 12338 $Unicode_Upper->table('N')->set_caseless_equivalent(property_ref('Cased')->table('N')); 12339 $Upper->set_caseless_equivalent(property_ref('Cased')->table('Y')); 12340 } 12341 else { 12342 $Upper->set_equivalent_to($gc->table('Uppercase_Letter'), 12343 Related => 1); 12344 } 12345 $Upper->add_alias('XPosixUpper'); 12346 my $Posix_Upper = $perl->add_match_table("PosixUpper", 12347 Description => "[A-Z]", 12348 Initialize => $Upper & $ASCII, 12349 ); 12350 12351 # Earliest releases didn't have title case. Initialize it to empty if not 12352 # otherwise present 12353 my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase', 12354 Description => '(= \p{Gc=Lt})'); 12355 my $lt = $gc->table('Lt'); 12356 12357 # Earlier versions of mktables had this related to $lt since they have 12358 # identical code points, but their caseless equivalents are not the same, 12359 # one being 'Cased' and the other being 'LC', and so now must be kept as 12360 # separate entities. 12361 $Title += $lt if defined $lt; 12362 12363 # If this Unicode version doesn't have Cased, set up our own. From 12364 # Unicode 5.1: Definition D120: A character C is defined to be cased if 12365 # and only if C has the Lowercase or Uppercase property or has a 12366 # General_Category value of Titlecase_Letter. 12367 my $Unicode_Cased = property_ref('Cased'); 12368 unless (defined $Unicode_Cased) { 12369 my $cased = $perl->add_match_table('Cased', 12370 Initialize => $Lower + $Upper + $Title, 12371 Description => 'Uppercase or Lowercase or Titlecase', 12372 ); 12373 $Unicode_Cased = $cased; 12374 } 12375 $Title->set_caseless_equivalent($Unicode_Cased->table('Y')); 12376 12377 # Similarly, set up our own Case_Ignorable property if this Unicode 12378 # version doesn't have it. From Unicode 5.1: Definition D121: A character 12379 # C is defined to be case-ignorable if C has the value MidLetter or the 12380 # value MidNumLet for the Word_Break property or its General_Category is 12381 # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf), 12382 # Modifier_Letter (Lm), or Modifier_Symbol (Sk). 12383 12384 # Perl has long had an internal-only alias for this property; grandfather 12385 # it in to the pod, but discourage its use. 12386 my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable', 12387 Re_Pod_Entry => 1, 12388 Fate => $INTERNAL_ONLY, 12389 Status => $DISCOURAGED); 12390 my $case_ignorable = property_ref('Case_Ignorable'); 12391 if (defined $case_ignorable && ! $case_ignorable->is_empty) { 12392 $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'), 12393 Related => 1); 12394 } 12395 else { 12396 12397 $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm')); 12398 12399 # The following three properties are not in early releases 12400 $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me'); 12401 $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf'); 12402 $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk'); 12403 12404 # For versions 4.1 - 5.0, there is no MidNumLet property, and 12405 # correspondingly the case-ignorable definition lacks that one. For 12406 # 4.0, it appears that it was meant to be the same definition, but was 12407 # inadvertently omitted from the standard's text, so add it if the 12408 # property actually is there 12409 my $wb = property_ref('Word_Break'); 12410 if (defined $wb) { 12411 my $midlet = $wb->table('MidLetter'); 12412 $perl_case_ignorable += $midlet if defined $midlet; 12413 my $midnumlet = $wb->table('MidNumLet'); 12414 $perl_case_ignorable += $midnumlet if defined $midnumlet; 12415 } 12416 else { 12417 12418 # In earlier versions of the standard, instead of the above two 12419 # properties , just the following characters were used: 12420 $perl_case_ignorable += 0x0027 # APOSTROPHE 12421 + 0x00AD # SOFT HYPHEN (SHY) 12422 + 0x2019; # RIGHT SINGLE QUOTATION MARK 12423 } 12424 } 12425 12426 # The remaining perl defined tables are mostly based on Unicode TR 18, 12427 # "Annex C: Compatibility Properties". All of these have two versions, 12428 # one whose name generally begins with Posix that is posix-compliant, and 12429 # one that matches Unicode characters beyond the Posix, ASCII range 12430 12431 my $Alpha = $perl->add_match_table('Alpha'); 12432 12433 # Alphabetic was not present in early releases 12434 my $Alphabetic = property_ref('Alphabetic'); 12435 if (defined $Alphabetic && ! $Alphabetic->is_empty) { 12436 $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1); 12437 } 12438 else { 12439 12440 # For early releases, we don't get it exactly right. The below 12441 # includes more than it should, which in 5.2 terms is: L + Nl + 12442 # Other_Alphabetic. Other_Alphabetic contains many characters from 12443 # Mn and Mc. It's better to match more than we should, than less than 12444 # we should. 12445 $Alpha->initialize($gc->table('Letter') 12446 + $gc->table('Mn') 12447 + $gc->table('Mc')); 12448 $Alpha += $gc->table('Nl') if defined $gc->table('Nl'); 12449 $Alpha->add_description('Alphabetic'); 12450 } 12451 $Alpha->add_alias('XPosixAlpha'); 12452 my $Posix_Alpha = $perl->add_match_table("PosixAlpha", 12453 Description => "[A-Za-z]", 12454 Initialize => $Alpha & $ASCII, 12455 ); 12456 $Posix_Upper->set_caseless_equivalent($Posix_Alpha); 12457 $Posix_Lower->set_caseless_equivalent($Posix_Alpha); 12458 12459 my $Alnum = $perl->add_match_table('Alnum', 12460 Description => 'Alphabetic and (decimal) Numeric', 12461 Initialize => $Alpha + $gc->table('Decimal_Number'), 12462 ); 12463 $Alnum->add_alias('XPosixAlnum'); 12464 $perl->add_match_table("PosixAlnum", 12465 Description => "[A-Za-z0-9]", 12466 Initialize => $Alnum & $ASCII, 12467 ); 12468 12469 my $Word = $perl->add_match_table('Word', 12470 Description => '\w, including beyond ASCII;' 12471 . ' = \p{Alnum} + \pM + \p{Pc}', 12472 Initialize => $Alnum + $gc->table('Mark'), 12473 ); 12474 $Word->add_alias('XPosixWord'); 12475 my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1 12476 $Word += $Pc if defined $Pc; 12477 12478 # This is a Perl extension, so the name doesn't begin with Posix. 12479 my $PerlWord = $perl->add_match_table('PerlWord', 12480 Description => '\w, restricted to ASCII = [A-Za-z0-9_]', 12481 Initialize => $Word & $ASCII, 12482 ); 12483 $PerlWord->add_alias('PosixWord'); 12484 12485 my $Blank = $perl->add_match_table('Blank', 12486 Description => '\h, Horizontal white space', 12487 12488 # 200B is Zero Width Space which is for line 12489 # break control, and was listed as 12490 # Space_Separator in early releases 12491 Initialize => $gc->table('Space_Separator') 12492 + 0x0009 # TAB 12493 - 0x200B, # ZWSP 12494 ); 12495 $Blank->add_alias('HorizSpace'); # Another name for it. 12496 $Blank->add_alias('XPosixBlank'); 12497 $perl->add_match_table("PosixBlank", 12498 Description => "\\t and ' '", 12499 Initialize => $Blank & $ASCII, 12500 ); 12501 12502 my $VertSpace = $perl->add_match_table('VertSpace', 12503 Description => '\v', 12504 Initialize => $gc->table('Line_Separator') 12505 + $gc->table('Paragraph_Separator') 12506 + 0x000A # LINE FEED 12507 + 0x000B # VERTICAL TAB 12508 + 0x000C # FORM FEED 12509 + 0x000D # CARRIAGE RETURN 12510 + 0x0085, # NEL 12511 ); 12512 # No Posix equivalent for vertical space 12513 12514 my $Space = $perl->add_match_table('Space', 12515 Description => '\s including beyond ASCII plus vertical tab', 12516 Initialize => $Blank + $VertSpace, 12517 ); 12518 $Space->add_alias('XPosixSpace'); 12519 $perl->add_match_table("PosixSpace", 12520 Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)", 12521 Initialize => $Space & $ASCII, 12522 ); 12523 12524 # Perl's traditional space doesn't include Vertical Tab 12525 my $XPerlSpace = $perl->add_match_table('XPerlSpace', 12526 Description => '\s, including beyond ASCII', 12527 Initialize => $Space - 0x000B, 12528 ); 12529 $XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym 12530 my $PerlSpace = $perl->add_match_table('PerlSpace', 12531 Description => '\s, restricted to ASCII = [ \f\n\r\t]', 12532 Initialize => $XPerlSpace & $ASCII, 12533 ); 12534 12535 12536 my $Cntrl = $perl->add_match_table('Cntrl', 12537 Description => 'Control characters'); 12538 $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1); 12539 $Cntrl->add_alias('XPosixCntrl'); 12540 $perl->add_match_table("PosixCntrl", 12541 Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL", 12542 Initialize => $Cntrl & $ASCII, 12543 ); 12544 12545 # $controls is a temporary used to construct Graph. 12546 my $controls = Range_List->new(Initialize => $gc->table('Unassigned') 12547 + $gc->table('Control')); 12548 # Cs not in release 1 12549 $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate'); 12550 12551 # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls) 12552 my $Graph = $perl->add_match_table('Graph', 12553 Description => 'Characters that are graphical', 12554 Initialize => ~ ($Space + $controls), 12555 ); 12556 $Graph->add_alias('XPosixGraph'); 12557 $perl->add_match_table("PosixGraph", 12558 Description => 12559 '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]', 12560 Initialize => $Graph & $ASCII, 12561 ); 12562 12563 $print = $perl->add_match_table('Print', 12564 Description => 'Characters that are graphical plus space characters (but no controls)', 12565 Initialize => $Blank + $Graph - $gc->table('Control'), 12566 ); 12567 $print->add_alias('XPosixPrint'); 12568 $perl->add_match_table("PosixPrint", 12569 Description => 12570 '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]', 12571 Initialize => $print & $ASCII, 12572 ); 12573 12574 my $Punct = $perl->add_match_table('Punct'); 12575 $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1); 12576 12577 # \p{punct} doesn't include the symbols, which posix does 12578 my $XPosixPunct = $perl->add_match_table('XPosixPunct', 12579 Description => '\p{Punct} + ASCII-range \p{Symbol}', 12580 Initialize => $gc->table('Punctuation') 12581 + ($ASCII & $gc->table('Symbol')), 12582 Perl_Extension => 1 12583 ); 12584 $perl->add_match_table('PosixPunct', Perl_Extension => 1, 12585 Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]', 12586 Initialize => $ASCII & $XPosixPunct, 12587 ); 12588 12589 my $Digit = $perl->add_match_table('Digit', 12590 Description => '[0-9] + all other decimal digits'); 12591 $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1); 12592 $Digit->add_alias('XPosixDigit'); 12593 my $PosixDigit = $perl->add_match_table("PosixDigit", 12594 Description => '[0-9]', 12595 Initialize => $Digit & $ASCII, 12596 ); 12597 12598 # Hex_Digit was not present in first release 12599 my $Xdigit = $perl->add_match_table('XDigit'); 12600 $Xdigit->add_alias('XPosixXDigit'); 12601 my $Hex = property_ref('Hex_Digit'); 12602 if (defined $Hex && ! $Hex->is_empty) { 12603 $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1); 12604 } 12605 else { 12606 # (Have to use hex instead of e.g. '0', because could be running on an 12607 # non-ASCII machine, and we want the Unicode (ASCII) values) 12608 $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66, 12609 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]); 12610 $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO'); 12611 } 12612 12613 # AHex was not present in early releases 12614 my $PosixXDigit = $perl->add_match_table('PosixXDigit'); 12615 my $AHex = property_ref('ASCII_Hex_Digit'); 12616 if (defined $AHex && ! $AHex->is_empty) { 12617 $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1); 12618 } 12619 else { 12620 $PosixXDigit->initialize($Xdigit & $ASCII); 12621 } 12622 $PosixXDigit->add_description('[0-9A-Fa-f]'); 12623 12624 my $dt = property_ref('Decomposition_Type'); 12625 $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical', 12626 Initialize => ~ ($dt->table('None') + $dt->table('Canonical')), 12627 Perl_Extension => 1, 12628 Note => 'Union of all non-canonical decompositions', 12629 ); 12630 12631 # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier 12632 # than SD appeared, construct it ourselves, based on the first release SD 12633 # was in. A pod entry is grandfathered in for it 12634 my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ', Re_Pod_Entry => 1, 12635 Perl_Extension => 1, 12636 Fate => $INTERNAL_ONLY, 12637 Status => $DISCOURAGED); 12638 my $soft_dotted = property_ref('Soft_Dotted'); 12639 if (defined $soft_dotted && ! $soft_dotted->is_empty) { 12640 $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1); 12641 } 12642 else { 12643 12644 # This list came from 3.2 Soft_Dotted. 12645 $CanonDCIJ->initialize([ 0x0069, 12646 0x006A, 12647 0x012F, 12648 0x0268, 12649 0x0456, 12650 0x0458, 12651 0x1E2D, 12652 0x1ECB, 12653 ]); 12654 $CanonDCIJ = $CanonDCIJ & $Assigned; 12655 } 12656 12657 # These are used in Unicode's definition of \X 12658 my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1, 12659 Fate => $INTERNAL_ONLY); 12660 my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1, 12661 Fate => $INTERNAL_ONLY); 12662 12663 # For backward compatibility, Perl has its own definition for IDStart 12664 # First, we include the underscore, and then the regular XID_Start also 12665 # have to be Words 12666 $perl->add_match_table('_Perl_IDStart', 12667 Perl_Extension => 1, 12668 Fate => $INTERNAL_ONLY, 12669 Initialize => 12670 ord('_') 12671 + (property_ref('XID_Start')->table('Y') & $Word) 12672 ); 12673 12674 my $gcb = property_ref('Grapheme_Cluster_Break'); 12675 12676 # The 'extended' grapheme cluster came in 5.1. The non-extended 12677 # definition differs too much from the traditional Perl one to use. 12678 if (defined $gcb && defined $gcb->table('SpacingMark')) { 12679 12680 # Note that assumes HST is defined; it came in an earlier release than 12681 # GCB. In the line below, two negatives means: yes hangul 12682 $begin += ~ property_ref('Hangul_Syllable_Type') 12683 ->table('Not_Applicable') 12684 + ~ ($gcb->table('Control') 12685 + $gcb->table('CR') 12686 + $gcb->table('LF')); 12687 $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control'); 12688 12689 $extend += $gcb->table('Extend') + $gcb->table('SpacingMark'); 12690 $extend->add_comment('For use in \X; matches: Extend | SpacingMark'); 12691 } 12692 else { # Old definition, used on early releases. 12693 $extend += $gc->table('Mark') 12694 + 0x200C # ZWNJ 12695 + 0x200D; # ZWJ 12696 $begin += ~ $extend; 12697 12698 # Here we may have a release that has the regular grapheme cluster 12699 # defined, or a release that doesn't have anything defined. 12700 # We set things up so the Perl core degrades gracefully, possibly with 12701 # placeholders that match nothing. 12702 12703 if (! defined $gcb) { 12704 $gcb = Property->new('GCB', Status => $PLACEHOLDER); 12705 } 12706 my $hst = property_ref('HST'); 12707 if (!defined $hst) { 12708 $hst = Property->new('HST', Status => $PLACEHOLDER); 12709 $hst->add_match_table('Not_Applicable', 12710 Initialize => $Any, 12711 Matches_All => 1); 12712 } 12713 12714 # On some releases, here we may not have the needed tables for the 12715 # perl core, in some releases we may. 12716 foreach my $name (qw{ L LV LVT T V prepend }) { 12717 my $table = $gcb->table($name); 12718 if (! defined $table) { 12719 $table = $gcb->add_match_table($name); 12720 push @tables_that_may_be_empty, $table->complete_name; 12721 } 12722 12723 # The HST property predates the GCB one, and has identical tables 12724 # for some of them, so use it if we can. 12725 if ($table->is_empty 12726 && defined $hst 12727 && defined $hst->table($name)) 12728 { 12729 $table += $hst->table($name); 12730 } 12731 } 12732 } 12733 12734 # More GCB. If we found some hangul syllables, populate a combined 12735 # table. 12736 my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V', 12737 Perl_Extension => 1, 12738 Fate => $INTERNAL_ONLY); 12739 my $LV = $gcb->table('LV'); 12740 if ($LV->is_empty) { 12741 push @tables_that_may_be_empty, $lv_lvt_v->complete_name; 12742 } else { 12743 $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V'); 12744 $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V'); 12745 } 12746 12747 # Was previously constructed to contain both Name and Unicode_1_Name 12748 my @composition = ('Name', 'Unicode_1_Name'); 12749 12750 if (@named_sequences) { 12751 push @composition, 'Named_Sequence'; 12752 foreach my $sequence (@named_sequences) { 12753 $perl_charname->add_anomalous_entry($sequence); 12754 } 12755 } 12756 12757 my $alias_sentence = ""; 12758 my %abbreviations; 12759 my $alias = property_ref('Name_Alias'); 12760 if (defined $alias) { 12761 push @composition, 'Name_Alias'; 12762 $perl_charname->set_proxy_for('Name_Alias'); 12763 12764 # Add each entry in Name_Alias to Perl_Charnames. Where these go with 12765 # respect to any existing entry depends on the entry type. 12766 # Corrections go before said entry, as they should be returned in 12767 # preference over the existing entry. (A correction to a correction 12768 # should be later in the Name_Alias table, so it will correctly 12769 # precede the erroneous correction in Perl_Charnames.) 12770 # 12771 # Abbreviations go after everything else, so they are saved 12772 # temporarily in a hash for later. 12773 # 12774 # Controls are currently added afterwards. This is because Perl has 12775 # previously used the Unicode1 name, and so should still use that. 12776 # (Most of them will be the same anyway, in which case we don't add a 12777 # duplicate) 12778 12779 $alias->reset_each_range; 12780 while (my ($range) = $alias->each_range) { 12781 next if $range->value eq ""; 12782 my $code_point = $range->start; 12783 if ($code_point != $range->end) { 12784 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;"); 12785 } 12786 my ($value, $type) = split ': ', $range->value; 12787 my $replace_type; 12788 if ($type eq 'correction') { 12789 $replace_type = $MULTIPLE_BEFORE; 12790 } 12791 elsif ($type eq 'abbreviation') { 12792 12793 # Save for later 12794 $abbreviations{$value} = $code_point; 12795 next; 12796 } 12797 elsif ($type eq 'control') { 12798 $replace_type = $MULTIPLE_AFTER; 12799 } 12800 else { 12801 $replace_type = $MULTIPLE_AFTER; 12802 } 12803 12804 # Actually add; before or after current entry(ies) as determined 12805 # above. 12806 12807 $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type); 12808 } 12809 } 12810 12811 # Now add the Unicode_1 names for the controls. The Unicode_1 names had 12812 # precedence before 6.1, so should be first in the file; the other names 12813 # have precedence starting in 6.1, 12814 my $before_or_after = ($v_version lt v6.1.0) 12815 ? $MULTIPLE_BEFORE 12816 : $MULTIPLE_AFTER; 12817 12818 foreach my $range (property_ref('Unicode_1_Name')->ranges) { 12819 my $code_point = $range->start; 12820 my $unicode_1_value = $range->value; 12821 next if $unicode_1_value eq ""; # Skip if name doesn't exist. 12822 12823 if ($code_point != $range->end) { 12824 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;"); 12825 } 12826 12827 # To handle EBCDIC, we don't hard code in the code points of the 12828 # controls; instead realizing that all of them are below 256. 12829 last if $code_point > 255; 12830 12831 # We only add in the controls. 12832 next if $gc->value_of($code_point) ne 'Cc'; 12833 12834 # This won't add an exact duplicate. 12835 $perl_charname->add_duplicate($code_point, $unicode_1_value, 12836 Replace => $before_or_after); 12837 } 12838 12839 # Now that have everything added, add in abbreviations after 12840 # everything else. 12841 foreach my $value (keys %abbreviations) { 12842 $perl_charname->add_duplicate($abbreviations{$value}, $value, 12843 Replace => $MULTIPLE_AFTER); 12844 $alias_sentence = <<END; 12845The Name_Alias property adds duplicate code point entries that are 12846alternatives to the original name. If an addition is a corrected 12847name, it will be physically first in the table. The original (less correct, 12848but still valid) name will be next; then any alternatives, in no particular 12849order; and finally any abbreviations, again in no particular order. 12850END 12851 } 12852 12853 my $comment; 12854 if (@composition <= 2) { # Always at least 2 12855 $comment = join " and ", @composition; 12856 } 12857 else { 12858 $comment = join ", ", @composition[0 .. scalar @composition - 2]; 12859 $comment .= ", and $composition[-1]"; 12860 } 12861 12862 $perl_charname->add_comment(join_lines( <<END 12863This file is for charnames.pm. It is the union of the $comment properties. 12864Unicode_1_Name entries are used only for nameless code points in the Name 12865property. 12866$alias_sentence 12867This file doesn't include the algorithmically determinable names. For those, 12868use 'unicore/Name.pm' 12869END 12870 )); 12871 property_ref('Name')->add_comment(join_lines( <<END 12872This file doesn't include the algorithmically determinable names. For those, 12873use 'unicore/Name.pm' 12874END 12875 )); 12876 12877 # Construct the Present_In property from the Age property. 12878 if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) { 12879 my $default_map = $age->default_map; 12880 my $in = Property->new('In', 12881 Default_Map => $default_map, 12882 Full_Name => "Present_In", 12883 Perl_Extension => 1, 12884 Type => $ENUM, 12885 Initialize => $age, 12886 ); 12887 $in->add_comment(join_lines(<<END 12888THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE. The values in this file are the 12889same as for $age, and not for what $in really means. This is because anything 12890defined in a given release should have multiple values: that release and all 12891higher ones. But only one value per code point can be represented in a table 12892like this. 12893END 12894 )); 12895 12896 # The Age tables are named like 1.5, 2.0, 2.1, .... Sort so that the 12897 # lowest numbered (earliest) come first, with the non-numeric one 12898 # last. 12899 my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/) 12900 ? 1 12901 : ($b->name !~ /^[\d.]*$/) 12902 ? -1 12903 : $a->name <=> $b->name 12904 } $age->tables; 12905 12906 # The Present_In property is the cumulative age properties. The first 12907 # one hence is identical to the first age one. 12908 my $previous_in = $in->add_match_table($first_age->name); 12909 $previous_in->set_equivalent_to($first_age, Related => 1); 12910 12911 my $description_start = "Code point's usage introduced in version "; 12912 $first_age->add_description($description_start . $first_age->name); 12913 12914 # To construct the accumulated values, for each of the age tables 12915 # starting with the 2nd earliest, merge the earliest with it, to get 12916 # all those code points existing in the 2nd earliest. Repeat merging 12917 # the new 2nd earliest with the 3rd earliest to get all those existing 12918 # in the 3rd earliest, and so on. 12919 foreach my $current_age (@rest_ages) { 12920 next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric 12921 12922 my $current_in = $in->add_match_table( 12923 $current_age->name, 12924 Initialize => $current_age + $previous_in, 12925 Description => $description_start 12926 . $current_age->name 12927 . ' or earlier', 12928 ); 12929 $previous_in = $current_in; 12930 12931 # Add clarifying material for the corresponding age file. This is 12932 # in part because of the confusing and contradictory information 12933 # given in the Standard's documentation itself, as of 5.2. 12934 $current_age->add_description( 12935 "Code point's usage was introduced in version " 12936 . $current_age->name); 12937 $current_age->add_note("See also $in"); 12938 12939 } 12940 12941 # And finally the code points whose usages have yet to be decided are 12942 # the same in both properties. Note that permanently unassigned code 12943 # points actually have their usage assigned (as being permanently 12944 # unassigned), so that these tables are not the same as gc=cn. 12945 my $unassigned = $in->add_match_table($default_map); 12946 my $age_default = $age->table($default_map); 12947 $age_default->add_description(<<END 12948Code point's usage has not been assigned in any Unicode release thus far. 12949END 12950 ); 12951 $unassigned->set_equivalent_to($age_default, Related => 1); 12952 } 12953 12954 # See L<perlfunc/quotemeta> 12955 my $quotemeta = $perl->add_match_table('_Perl_Quotemeta', 12956 Perl_Extension => 1, 12957 Fate => $INTERNAL_ONLY, 12958 12959 # Initialize to what's common in 12960 # all Unicode releases. 12961 Initialize => 12962 $Space 12963 + $gc->table('Control') 12964 ); 12965 12966 # In early releases without the proper Unicode properties, just set to \W. 12967 if (! defined (my $patsyn = property_ref('Pattern_Syntax')) 12968 || ! defined (my $patws = property_ref('Pattern_White_Space')) 12969 || ! defined (my $di = property_ref('Default_Ignorable_Code_Point'))) 12970 { 12971 $quotemeta += ~ $Word; 12972 } 12973 else { 12974 $quotemeta += $patsyn->table('Y') 12975 + $patws->table('Y') 12976 + $di->table('Y') 12977 + ((~ $Word) & $ASCII); 12978 } 12979 12980 # Finished creating all the perl properties. All non-internal non-string 12981 # ones have a synonym of 'Is_' prefixed. (Internal properties begin with 12982 # an underscore.) These do not get a separate entry in the pod file 12983 foreach my $table ($perl->tables) { 12984 foreach my $alias ($table->aliases) { 12985 next if $alias->name =~ /^_/; 12986 $table->add_alias('Is_' . $alias->name, 12987 Re_Pod_Entry => 0, 12988 UCD => 0, 12989 Status => $alias->status, 12990 OK_as_Filename => 0); 12991 } 12992 } 12993 12994 # Here done with all the basic stuff. Ready to populate the information 12995 # about each character if annotating them. 12996 if ($annotate) { 12997 12998 # See comments at its declaration 12999 $annotate_ranges = Range_Map->new; 13000 13001 # This separates out the non-characters from the other unassigneds, so 13002 # can give different annotations for each. 13003 $unassigned_sans_noncharacters = Range_List->new( 13004 Initialize => $gc->table('Unassigned') 13005 & property_ref('Noncharacter_Code_Point')->table('N')); 13006 13007 for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT; $i++ ) { 13008 $i = populate_char_info($i); # Note sets $i so may cause skips 13009 } 13010 } 13011 13012 return; 13013} 13014 13015sub add_perl_synonyms() { 13016 # A number of Unicode tables have Perl synonyms that are expressed in 13017 # the single-form, \p{name}. These are: 13018 # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and 13019 # \p{Is_Name} as synonyms 13020 # \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms 13021 # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms 13022 # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no 13023 # conflict, \p{Value} and \p{Is_Value} as well 13024 # 13025 # This routine generates these synonyms, warning of any unexpected 13026 # conflicts. 13027 13028 # Construct the list of tables to get synonyms for. Start with all the 13029 # binary and the General_Category ones. 13030 my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY } 13031 property_ref('*'); 13032 push @tables, $gc->tables; 13033 13034 # If the version of Unicode includes the Script property, add its tables 13035 push @tables, $script->tables if defined $script; 13036 13037 # The Block tables are kept separate because they are treated differently. 13038 # And the earliest versions of Unicode didn't include them, so add only if 13039 # there are some. 13040 my @blocks; 13041 push @blocks, $block->tables if defined $block; 13042 13043 # Here, have the lists of tables constructed. Process blocks last so that 13044 # if there are name collisions with them, blocks have lowest priority. 13045 # Should there ever be other collisions, manual intervention would be 13046 # required. See the comments at the beginning of the program for a 13047 # possible way to handle those semi-automatically. 13048 foreach my $table (@tables, @blocks) { 13049 13050 # For non-binary properties, the synonym is just the name of the 13051 # table, like Greek, but for binary properties the synonym is the name 13052 # of the property, and means the code points in its 'Y' table. 13053 my $nominal = $table; 13054 my $nominal_property = $nominal->property; 13055 my $actual; 13056 if (! $nominal->isa('Property')) { 13057 $actual = $table; 13058 } 13059 else { 13060 13061 # Here is a binary property. Use the 'Y' table. Verify that is 13062 # there 13063 my $yes = $nominal->table('Y'); 13064 unless (defined $yes) { # Must be defined, but is permissible to 13065 # be empty. 13066 Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping."); 13067 next; 13068 } 13069 $actual = $yes; 13070 } 13071 13072 foreach my $alias ($nominal->aliases) { 13073 13074 # Attempt to create a table in the perl directory for the 13075 # candidate table, using whatever aliases in it that don't 13076 # conflict. Also add non-conflicting aliases for all these 13077 # prefixed by 'Is_' (and/or 'In_' for Block property tables) 13078 PREFIX: 13079 foreach my $prefix ("", 'Is_', 'In_') { 13080 13081 # Only Block properties can have added 'In_' aliases. 13082 next if $prefix eq 'In_' and $nominal_property != $block; 13083 13084 my $proposed_name = $prefix . $alias->name; 13085 13086 # No Is_Is, In_In, nor combinations thereof 13087 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x; 13088 next if $proposed_name =~ /^ I [ns] _I [ns] _/x; 13089 13090 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace; 13091 13092 # Get a reference to any existing table in the perl 13093 # directory with the desired name. 13094 my $pre_existing = $perl->table($proposed_name); 13095 13096 if (! defined $pre_existing) { 13097 13098 # No name collision, so ok to add the perl synonym. 13099 13100 my $make_re_pod_entry; 13101 my $ok_as_filename; 13102 my $status = $alias->status; 13103 if ($nominal_property == $block) { 13104 13105 # For block properties, the 'In' form is preferred for 13106 # external use; the pod file contains wild cards for 13107 # this and the 'Is' form so no entries for those; and 13108 # we don't want people using the name without the 13109 # 'In', so discourage that. 13110 if ($prefix eq "") { 13111 $make_re_pod_entry = 1; 13112 $status = $status || $DISCOURAGED; 13113 $ok_as_filename = 0; 13114 } 13115 elsif ($prefix eq 'In_') { 13116 $make_re_pod_entry = 0; 13117 $status = $status || $NORMAL; 13118 $ok_as_filename = 1; 13119 } 13120 else { 13121 $make_re_pod_entry = 0; 13122 $status = $status || $DISCOURAGED; 13123 $ok_as_filename = 0; 13124 } 13125 } 13126 elsif ($prefix ne "") { 13127 13128 # The 'Is' prefix is handled in the pod by a wild 13129 # card, and we won't use it for an external name 13130 $make_re_pod_entry = 0; 13131 $status = $status || $NORMAL; 13132 $ok_as_filename = 0; 13133 } 13134 else { 13135 13136 # Here, is an empty prefix, non block. This gets its 13137 # own pod entry and can be used for an external name. 13138 $make_re_pod_entry = 1; 13139 $status = $status || $NORMAL; 13140 $ok_as_filename = 1; 13141 } 13142 13143 # Here, there isn't a perl pre-existing table with the 13144 # name. Look through the list of equivalents of this 13145 # table to see if one is a perl table. 13146 foreach my $equivalent ($actual->leader->equivalents) { 13147 next if $equivalent->property != $perl; 13148 13149 # Here, have found a table for $perl. Add this alias 13150 # to it, and are done with this prefix. 13151 $equivalent->add_alias($proposed_name, 13152 Re_Pod_Entry => $make_re_pod_entry, 13153 13154 # Currently don't output these in the 13155 # ucd pod, as are strongly discouraged 13156 # from being used 13157 UCD => 0, 13158 13159 Status => $status, 13160 OK_as_Filename => $ok_as_filename); 13161 trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace; 13162 next PREFIX; 13163 } 13164 13165 # Here, $perl doesn't already have a table that is a 13166 # synonym for this property, add one. 13167 my $added_table = $perl->add_match_table($proposed_name, 13168 Re_Pod_Entry => $make_re_pod_entry, 13169 13170 # See UCD comment just above 13171 UCD => 0, 13172 13173 Status => $status, 13174 OK_as_Filename => $ok_as_filename); 13175 # And it will be related to the actual table, since it is 13176 # based on it. 13177 $added_table->set_equivalent_to($actual, Related => 1); 13178 trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace; 13179 next; 13180 } # End of no pre-existing. 13181 13182 # Here, there is a pre-existing table that has the proposed 13183 # name. We could be in trouble, but not if this is just a 13184 # synonym for another table that we have already made a child 13185 # of the pre-existing one. 13186 if ($pre_existing->is_set_equivalent_to($actual)) { 13187 trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace; 13188 $pre_existing->add_alias($proposed_name); 13189 next; 13190 } 13191 13192 # Here, there is a name collision, but it still could be ok if 13193 # the tables match the identical set of code points, in which 13194 # case, we can combine the names. Compare each table's code 13195 # point list to see if they are identical. 13196 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace; 13197 if ($pre_existing->matches_identically_to($actual)) { 13198 13199 # Here, they do match identically. Not a real conflict. 13200 # Make the perl version a child of the Unicode one, except 13201 # in the non-obvious case of where the perl name is 13202 # already a synonym of another Unicode property. (This is 13203 # excluded by the test for it being its own parent.) The 13204 # reason for this exclusion is that then the two Unicode 13205 # properties become related; and we don't really know if 13206 # they are or not. We generate documentation based on 13207 # relatedness, and this would be misleading. Code 13208 # later executed in the process will cause the tables to 13209 # be represented by a single file anyway, without making 13210 # it look in the pod like they are necessarily related. 13211 if ($pre_existing->parent == $pre_existing 13212 && ($pre_existing->property == $perl 13213 || $actual->property == $perl)) 13214 { 13215 trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace; 13216 $pre_existing->set_equivalent_to($actual, Related => 1); 13217 } 13218 elsif (main::DEBUG && $to_trace) { 13219 trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases"; 13220 trace $pre_existing->parent; 13221 } 13222 next PREFIX; 13223 } 13224 13225 # Here they didn't match identically, there is a real conflict 13226 # between our new name and a pre-existing property. 13227 $actual->add_conflicting($proposed_name, 'p', $pre_existing); 13228 $pre_existing->add_conflicting($nominal->full_name, 13229 'p', 13230 $actual); 13231 13232 # Don't output a warning for aliases for the block 13233 # properties (unless they start with 'In_') as it is 13234 # expected that there will be conflicts and the block 13235 # form loses. 13236 if ($verbosity >= $NORMAL_VERBOSITY 13237 && ($actual->property != $block || $prefix eq 'In_')) 13238 { 13239 print simple_fold(join_lines(<<END 13240There is already an alias named $proposed_name (from " . $pre_existing . "), 13241so not creating this alias for " . $actual 13242END 13243 ), "", 4); 13244 } 13245 13246 # Keep track for documentation purposes. 13247 $has_In_conflicts++ if $prefix eq 'In_'; 13248 $has_Is_conflicts++ if $prefix eq 'Is_'; 13249 } 13250 } 13251 } 13252 13253 # There are some properties which have No and Yes (and N and Y) as 13254 # property values, but aren't binary, and could possibly be confused with 13255 # binary ones. So create caveats for them. There are tables that are 13256 # named 'No', and tables that are named 'N', but confusion is not likely 13257 # unless they are the same table. For example, N meaning Number or 13258 # Neutral is not likely to cause confusion, so don't add caveats to things 13259 # like them. 13260 foreach my $property (grep { $_->type != $BINARY 13261 && $_->type != $FORCED_BINARY } 13262 property_ref('*')) 13263 { 13264 my $yes = $property->table('Yes'); 13265 if (defined $yes) { 13266 my $y = $property->table('Y'); 13267 if (defined $y && $yes == $y) { 13268 foreach my $alias ($property->aliases) { 13269 $yes->add_conflicting($alias->name); 13270 } 13271 } 13272 } 13273 my $no = $property->table('No'); 13274 if (defined $no) { 13275 my $n = $property->table('N'); 13276 if (defined $n && $no == $n) { 13277 foreach my $alias ($property->aliases) { 13278 $no->add_conflicting($alias->name, 'P'); 13279 } 13280 } 13281 } 13282 } 13283 13284 return; 13285} 13286 13287sub register_file_for_name($$$) { 13288 # Given info about a table and a datafile that it should be associated 13289 # with, register that association 13290 13291 my $table = shift; 13292 my $directory_ref = shift; # Array of the directory path for the file 13293 my $file = shift; # The file name in the final directory. 13294 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 13295 13296 trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace; 13297 13298 if ($table->isa('Property')) { 13299 $table->set_file_path(@$directory_ref, $file); 13300 push @map_properties, $table; 13301 13302 # No swash means don't do the rest of this. 13303 return if $table->fate != $ORDINARY; 13304 13305 # Get the path to the file 13306 my @path = $table->file_path; 13307 13308 # Use just the file name if no subdirectory. 13309 shift @path if $path[0] eq File::Spec->curdir(); 13310 13311 my $file = join '/', @path; 13312 13313 # Create a hash entry for utf8_heavy to get the file that stores this 13314 # property's map table 13315 foreach my $alias ($table->aliases) { 13316 my $name = $alias->name; 13317 $loose_property_to_file_of{standardize($name)} = $file; 13318 } 13319 13320 # And a way for utf8_heavy to find the proper key in the SwashInfo 13321 # hash for this property. 13322 $file_to_swash_name{$file} = "To" . $table->swash_name; 13323 return; 13324 } 13325 13326 # Do all of the work for all equivalent tables when called with the leader 13327 # table, so skip if isn't the leader. 13328 return if $table->leader != $table; 13329 13330 # If this is a complement of another file, use that other file instead, 13331 # with a ! prepended to it. 13332 my $complement; 13333 if (($complement = $table->complement) != 0) { 13334 my @directories = $complement->file_path; 13335 13336 # This assumes that the 0th element is something like 'lib', 13337 # the 1th element the property name (in its own directory), like 13338 # 'AHex', and the 2th element the file like 'Y' which will have a .pl 13339 # appended to it later. 13340 $directories[1] =~ s/^/!/; 13341 $file = pop @directories; 13342 $directory_ref =\@directories; 13343 } 13344 13345 # Join all the file path components together, using slashes. 13346 my $full_filename = join('/', @$directory_ref, $file); 13347 13348 # All go in the same subdirectory of unicore 13349 if ($directory_ref->[0] ne $matches_directory) { 13350 Carp::my_carp("Unexpected directory in " 13351 . join('/', @{$directory_ref}, $file)); 13352 } 13353 13354 # For this table and all its equivalents ... 13355 foreach my $table ($table, $table->equivalents) { 13356 13357 # Associate it with its file internally. Don't include the 13358 # $matches_directory first component 13359 $table->set_file_path(@$directory_ref, $file); 13360 13361 # No swash means don't do the rest of this. 13362 next if $table->isa('Map_Table') && $table->fate != $ORDINARY; 13363 13364 my $sub_filename = join('/', $directory_ref->[1, -1], $file); 13365 13366 my $property = $table->property; 13367 my $property_name = ($property == $perl) 13368 ? "" # 'perl' is never explicitly stated 13369 : standardize($property->name) . '='; 13370 13371 my $is_default = 0; # Is this table the default one for the property? 13372 13373 # To calculate $is_default, we find if this table is the same as the 13374 # default one for the property. But this is complicated by the 13375 # possibility that there is a master table for this one, and the 13376 # information is stored there instead of here. 13377 my $parent = $table->parent; 13378 my $leader_prop = $parent->property; 13379 my $default_map = $leader_prop->default_map; 13380 if (defined $default_map) { 13381 my $default_table = $leader_prop->table($default_map); 13382 $is_default = 1 if defined $default_table && $parent == $default_table; 13383 } 13384 13385 # Calculate the loose name for this table. Mostly it's just its name, 13386 # standardized. But in the case of Perl tables that are single-form 13387 # equivalents to Unicode properties, it is the latter's name. 13388 my $loose_table_name = 13389 ($property != $perl || $leader_prop == $perl) 13390 ? standardize($table->name) 13391 : standardize($parent->name); 13392 13393 my $deprecated = ($table->status eq $DEPRECATED) 13394 ? $table->status_info 13395 : ""; 13396 my $caseless_equivalent = $table->caseless_equivalent; 13397 13398 # And for each of the table's aliases... This inner loop eventually 13399 # goes through all aliases in the UCD that we generate regex match 13400 # files for 13401 foreach my $alias ($table->aliases) { 13402 my $standard = utf8_heavy_name($table, $alias); 13403 13404 # Generate an entry in either the loose or strict hashes, which 13405 # will translate the property and alias names combination into the 13406 # file where the table for them is stored. 13407 if ($alias->loose_match) { 13408 if (exists $loose_to_file_of{$standard}) { 13409 Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'."); 13410 } 13411 else { 13412 $loose_to_file_of{$standard} = $sub_filename; 13413 } 13414 } 13415 else { 13416 if (exists $stricter_to_file_of{$standard}) { 13417 Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'."); 13418 } 13419 else { 13420 $stricter_to_file_of{$standard} = $sub_filename; 13421 13422 # Tightly coupled with how utf8_heavy.pl works, for a 13423 # floating point number that is a whole number, get rid of 13424 # the trailing decimal point and 0's, so that utf8_heavy 13425 # will work. Also note that this assumes that such a 13426 # number is matched strictly; so if that were to change, 13427 # this would be wrong. 13428 if ((my $integer_name = $alias->name) 13429 =~ s/^ ( -? \d+ ) \.0+ $ /$1/x) 13430 { 13431 $stricter_to_file_of{$property_name . $integer_name} 13432 = $sub_filename; 13433 } 13434 } 13435 } 13436 13437 # For Unicode::UCD, create a mapping of the prop=value to the 13438 # canonical =value for that property. 13439 if ($standard =~ /=/) { 13440 13441 # This could happen if a strict name mapped into an existing 13442 # loose name. In that event, the strict names would have to 13443 # be moved to a new hash. 13444 if (exists($loose_to_standard_value{$standard})) { 13445 Carp::my_carp_bug("'$standard' conflicts with a pre-existing use. Bad News. Continuing anyway"); 13446 } 13447 $loose_to_standard_value{$standard} = $loose_table_name; 13448 } 13449 13450 # Keep a list of the deprecated properties and their filenames 13451 if ($deprecated && $complement == 0) { 13452 $utf8::why_deprecated{$sub_filename} = $deprecated; 13453 } 13454 13455 # And a substitute table, if any, for case-insensitive matching 13456 if ($caseless_equivalent != 0) { 13457 $caseless_equivalent_to{$standard} = $caseless_equivalent; 13458 } 13459 13460 # Add to defaults list if the table this alias belongs to is the 13461 # default one 13462 $loose_defaults{$standard} = 1 if $is_default; 13463 } 13464 } 13465 13466 return; 13467} 13468 13469{ # Closure 13470 my %base_names; # Names already used for avoiding DOS 8.3 filesystem 13471 # conflicts 13472 my %full_dir_name_of; # Full length names of directories used. 13473 13474 sub construct_filename($$$) { 13475 # Return a file name for a table, based on the table name, but perhaps 13476 # changed to get rid of non-portable characters in it, and to make 13477 # sure that it is unique on a file system that allows the names before 13478 # any period to be at most 8 characters (DOS). While we're at it 13479 # check and complain if there are any directory conflicts. 13480 13481 my $name = shift; # The name to start with 13482 my $mutable = shift; # Boolean: can it be changed? If no, but 13483 # yet it must be to work properly, a warning 13484 # is given 13485 my $directories_ref = shift; # A reference to an array containing the 13486 # path to the file, with each element one path 13487 # component. This is used because the same 13488 # name can be used in different directories. 13489 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 13490 13491 my $warn = ! defined wantarray; # If true, then if the name is 13492 # changed, a warning is issued as well. 13493 13494 if (! defined $name) { 13495 Carp::my_carp("Undefined name in directory " 13496 . File::Spec->join(@$directories_ref) 13497 . ". '_' used"); 13498 return '_'; 13499 } 13500 13501 # Make sure that no directory names conflict with each other. Look at 13502 # each directory in the input file's path. If it is already in use, 13503 # assume it is correct, and is merely being re-used, but if we 13504 # truncate it to 8 characters, and find that there are two directories 13505 # that are the same for the first 8 characters, but differ after that, 13506 # then that is a problem. 13507 foreach my $directory (@$directories_ref) { 13508 my $short_dir = substr($directory, 0, 8); 13509 if (defined $full_dir_name_of{$short_dir}) { 13510 next if $full_dir_name_of{$short_dir} eq $directory; 13511 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}. Bad News. Continuing anyway"); 13512 } 13513 else { 13514 $full_dir_name_of{$short_dir} = $directory; 13515 } 13516 } 13517 13518 my $path = join '/', @$directories_ref; 13519 $path .= '/' if $path; 13520 13521 # Remove interior underscores. 13522 (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg; 13523 13524 # Change any non-word character into an underscore, and truncate to 8. 13525 $filename =~ s/\W+/_/g; # eg., "L&" -> "L_" 13526 substr($filename, 8) = "" if length($filename) > 8; 13527 13528 # Make sure the basename doesn't conflict with something we 13529 # might have already written. If we have, say, 13530 # InGreekExtended1 13531 # InGreekExtended2 13532 # they become 13533 # InGreekE 13534 # InGreek2 13535 my $warned = 0; 13536 while (my $num = $base_names{$path}{lc $filename}++) { 13537 $num++; # so basenames with numbers start with '2', which 13538 # just looks more natural. 13539 13540 # Want to append $num, but if it'll make the basename longer 13541 # than 8 characters, pre-truncate $filename so that the result 13542 # is acceptable. 13543 my $delta = length($filename) + length($num) - 8; 13544 if ($delta > 0) { 13545 substr($filename, -$delta) = $num; 13546 } 13547 else { 13548 $filename .= $num; 13549 } 13550 if ($warn && ! $warned) { 13551 $warned = 1; 13552 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway."); 13553 } 13554 } 13555 13556 return $filename if $mutable; 13557 13558 # If not changeable, must return the input name, but warn if needed to 13559 # change it beyond shortening it. 13560 if ($name ne $filename 13561 && substr($name, 0, length($filename)) ne $filename) { 13562 Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway."); 13563 } 13564 return $name; 13565 } 13566} 13567 13568# The pod file contains a very large table. Many of the lines in that table 13569# would exceed a typical output window's size, and so need to be wrapped with 13570# a hanging indent to make them look good. The pod language is really 13571# insufficient here. There is no general construct to do that in pod, so it 13572# is done here by beginning each such line with a space to cause the result to 13573# be output without formatting, and doing all the formatting here. This leads 13574# to the result that if the eventual display window is too narrow it won't 13575# look good, and if the window is too wide, no advantage is taken of that 13576# extra width. A further complication is that the output may be indented by 13577# the formatter so that there is less space than expected. What I (khw) have 13578# done is to assume that that indent is a particular number of spaces based on 13579# what it is in my Linux system; people can always resize their windows if 13580# necessary, but this is obviously less than desirable, but the best that can 13581# be expected. 13582my $automatic_pod_indent = 8; 13583 13584# Try to format so that uses fewest lines, but few long left column entries 13585# slide into the right column. An experiment on 5.1 data yielded the 13586# following percentages that didn't cut into the other side along with the 13587# associated first-column widths 13588# 69% = 24 13589# 80% not too bad except for a few blocks 13590# 90% = 33; # , cuts 353/3053 lines from 37 = 12% 13591# 95% = 37; 13592my $indent_info_column = 27; # 75% of lines didn't have overlap 13593 13594my $FILLER = 3; # Length of initial boiler-plate columns in a pod line 13595 # The 3 is because of: 13596 # 1 for the leading space to tell the pod formatter to 13597 # output as-is 13598 # 1 for the flag 13599 # 1 for the space between the flag and the main data 13600 13601sub format_pod_line ($$$;$$) { 13602 # Take a pod line and return it, formatted properly 13603 13604 my $first_column_width = shift; 13605 my $entry = shift; # Contents of left column 13606 my $info = shift; # Contents of right column 13607 13608 my $status = shift || ""; # Any flag 13609 13610 my $loose_match = shift; # Boolean. 13611 $loose_match = 1 unless defined $loose_match; 13612 13613 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 13614 13615 my $flags = ""; 13616 $flags .= $STRICTER if ! $loose_match; 13617 13618 $flags .= $status if $status; 13619 13620 # There is a blank in the left column to cause the pod formatter to 13621 # output the line as-is. 13622 return sprintf " %-*s%-*s %s\n", 13623 # The first * in the format is replaced by this, the -1 is 13624 # to account for the leading blank. There isn't a 13625 # hard-coded blank after this to separate the flags from 13626 # the rest of the line, so that in the unlikely event that 13627 # multiple flags are shown on the same line, they both 13628 # will get displayed at the expense of that separation, 13629 # but since they are left justified, a blank will be 13630 # inserted in the normal case. 13631 $FILLER - 1, 13632 $flags, 13633 13634 # The other * in the format is replaced by this number to 13635 # cause the first main column to right fill with blanks. 13636 # The -1 is for the guaranteed blank following it. 13637 $first_column_width - $FILLER - 1, 13638 $entry, 13639 $info; 13640} 13641 13642my @zero_match_tables; # List of tables that have no matches in this release 13643 13644sub make_re_pod_entries($) { 13645 # This generates the entries for the pod file for a given table. 13646 # Also done at this time are any children tables. The output looks like: 13647 # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178) 13648 13649 my $input_table = shift; # Table the entry is for 13650 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 13651 13652 # Generate parent and all its children at the same time. 13653 return if $input_table->parent != $input_table; 13654 13655 my $property = $input_table->property; 13656 my $type = $property->type; 13657 my $full_name = $property->full_name; 13658 13659 my $count = $input_table->count; 13660 my $string_count = clarify_number($count); 13661 my $status = $input_table->status; 13662 my $status_info = $input_table->status_info; 13663 my $caseless_equivalent = $input_table->caseless_equivalent; 13664 13665 my $entry_for_first_table; # The entry for the first table output. 13666 # Almost certainly, it is the parent. 13667 13668 # For each related table (including itself), we will generate a pod entry 13669 # for each name each table goes by 13670 foreach my $table ($input_table, $input_table->children) { 13671 13672 # utf8_heavy.pl cannot deal with null string property values, so skip 13673 # any tables that have no non-null names. 13674 next if ! grep { $_->name ne "" } $table->aliases; 13675 13676 # First, gather all the info that applies to this table as a whole. 13677 13678 push @zero_match_tables, $table if $count == 0; 13679 13680 my $table_property = $table->property; 13681 13682 # The short name has all the underscores removed, while the full name 13683 # retains them. Later, we decide whether to output a short synonym 13684 # for the full one, we need to compare apples to apples, so we use the 13685 # short name's length including underscores. 13686 my $table_property_short_name_length; 13687 my $table_property_short_name 13688 = $table_property->short_name(\$table_property_short_name_length); 13689 my $table_property_full_name = $table_property->full_name; 13690 13691 # Get how much savings there is in the short name over the full one 13692 # (delta will always be <= 0) 13693 my $table_property_short_delta = $table_property_short_name_length 13694 - length($table_property_full_name); 13695 my @table_description = $table->description; 13696 my @table_note = $table->note; 13697 13698 # Generate an entry for each alias in this table. 13699 my $entry_for_first_alias; # saves the first one encountered. 13700 foreach my $alias ($table->aliases) { 13701 13702 # Skip if not to go in pod. 13703 next unless $alias->make_re_pod_entry; 13704 13705 # Start gathering all the components for the entry 13706 my $name = $alias->name; 13707 13708 # Skip if name is empty, as can't be accessed by regexes. 13709 next if $name eq ""; 13710 13711 my $entry; # Holds the left column, may include extras 13712 my $entry_ref; # To refer to the left column's contents from 13713 # another entry; has no extras 13714 13715 # First the left column of the pod entry. Tables for the $perl 13716 # property always use the single form. 13717 if ($table_property == $perl) { 13718 $entry = "\\p{$name}"; 13719 $entry_ref = "\\p{$name}"; 13720 } 13721 else { # Compound form. 13722 13723 # Only generate one entry for all the aliases that mean true 13724 # or false in binary properties. Append a '*' to indicate 13725 # some are missing. (The heading comment notes this.) 13726 my $rhs; 13727 if ($type == $BINARY) { 13728 next if $name ne 'N' && $name ne 'Y'; 13729 $rhs = "$name*"; 13730 } 13731 elsif ($type != $FORCED_BINARY) { 13732 $rhs = $name; 13733 } 13734 else { 13735 13736 # Forced binary properties require special handling. It 13737 # has two sets of tables, one set is true/false; and the 13738 # other set is everything else. Entries are generated for 13739 # each set. Use the Bidi_Mirrored property (which appears 13740 # in all Unicode versions) to get a list of the aliases 13741 # for the true/false tables. Of these, only output the N 13742 # and Y ones, the same as, a regular binary property. And 13743 # output all the rest, same as a non-binary property. 13744 my $bm = property_ref("Bidi_Mirrored"); 13745 if ($name eq 'N' || $name eq 'Y') { 13746 $rhs = "$name*"; 13747 } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases, 13748 $bm->table("N")->aliases) 13749 { 13750 next; 13751 } 13752 else { 13753 $rhs = $name; 13754 } 13755 } 13756 13757 # Colon-space is used to give a little more space to be easier 13758 # to read; 13759 $entry = "\\p{" 13760 . $table_property_full_name 13761 . ": $rhs}"; 13762 13763 # But for the reference to this entry, which will go in the 13764 # right column, where space is at a premium, use equals 13765 # without a space 13766 $entry_ref = "\\p{" . $table_property_full_name . "=$name}"; 13767 } 13768 13769 # Then the right (info) column. This is stored as components of 13770 # an array for the moment, then joined into a string later. For 13771 # non-internal only properties, begin the info with the entry for 13772 # the first table we encountered (if any), as things are ordered 13773 # so that that one is the most descriptive. This leads to the 13774 # info column of an entry being a more descriptive version of the 13775 # name column 13776 my @info; 13777 if ($name =~ /^_/) { 13778 push @info, 13779 '(For internal use by Perl, not necessarily stable)'; 13780 } 13781 elsif ($entry_for_first_alias) { 13782 push @info, $entry_for_first_alias; 13783 } 13784 13785 # If this entry is equivalent to another, add that to the info, 13786 # using the first such table we encountered 13787 if ($entry_for_first_table) { 13788 if (@info) { 13789 push @info, "(= $entry_for_first_table)"; 13790 } 13791 else { 13792 push @info, $entry_for_first_table; 13793 } 13794 } 13795 13796 # If the name is a large integer, add an equivalent with an 13797 # exponent for better readability 13798 if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) { 13799 push @info, sprintf "(= %.1e)", $name 13800 } 13801 13802 my $parenthesized = ""; 13803 if (! $entry_for_first_alias) { 13804 13805 # This is the first alias for the current table. The alias 13806 # array is ordered so that this is the fullest, most 13807 # descriptive alias, so it gets the fullest info. The other 13808 # aliases are mostly merely pointers to this one, using the 13809 # information already added above. 13810 13811 # Display any status message, but only on the parent table 13812 if ($status && ! $entry_for_first_table) { 13813 push @info, $status_info; 13814 } 13815 13816 # Put out any descriptive info 13817 if (@table_description || @table_note) { 13818 push @info, join "; ", @table_description, @table_note; 13819 } 13820 13821 # Look to see if there is a shorter name we can point people 13822 # at 13823 my $standard_name = standardize($name); 13824 my $short_name; 13825 my $proposed_short = $table->short_name; 13826 if (defined $proposed_short) { 13827 my $standard_short = standardize($proposed_short); 13828 13829 # If the short name is shorter than the standard one, or 13830 # even it it's not, but the combination of it and its 13831 # short property name (as in \p{prop=short} ($perl doesn't 13832 # have this form)) saves at least two characters, then, 13833 # cause it to be listed as a shorter synonym. 13834 if (length $standard_short < length $standard_name 13835 || ($table_property != $perl 13836 && (length($standard_short) 13837 - length($standard_name) 13838 + $table_property_short_delta) # (<= 0) 13839 < -2)) 13840 { 13841 $short_name = $proposed_short; 13842 if ($table_property != $perl) { 13843 $short_name = $table_property_short_name 13844 . "=$short_name"; 13845 } 13846 $short_name = "\\p{$short_name}"; 13847 } 13848 } 13849 13850 # And if this is a compound form name, see if there is a 13851 # single form equivalent 13852 my $single_form; 13853 if ($table_property != $perl) { 13854 13855 # Special case the binary N tables, so that will print 13856 # \P{single}, but use the Y table values to populate 13857 # 'single', as we haven't likewise populated the N table. 13858 # For forced binary tables, we can't just look at the N 13859 # table, but must see if this table is equivalent to the N 13860 # one, as there are two equivalent beasts in these 13861 # properties. 13862 my $test_table; 13863 my $p; 13864 if ( ($type == $BINARY 13865 && $input_table == $property->table('No')) 13866 || ($type == $FORCED_BINARY 13867 && $property->table('No')-> 13868 is_set_equivalent_to($input_table))) 13869 { 13870 $test_table = $property->table('Yes'); 13871 $p = 'P'; 13872 } 13873 else { 13874 $test_table = $input_table; 13875 $p = 'p'; 13876 } 13877 13878 # Look for a single form amongst all the children. 13879 foreach my $table ($test_table->children) { 13880 next if $table->property != $perl; 13881 my $proposed_name = $table->short_name; 13882 next if ! defined $proposed_name; 13883 13884 # Don't mention internal-only properties as a possible 13885 # single form synonym 13886 next if substr($proposed_name, 0, 1) eq '_'; 13887 13888 $proposed_name = "\\$p\{$proposed_name}"; 13889 if (! defined $single_form 13890 || length($proposed_name) < length $single_form) 13891 { 13892 $single_form = $proposed_name; 13893 13894 # The goal here is to find a single form; not the 13895 # shortest possible one. We've already found a 13896 # short name. So, stop at the first single form 13897 # found, which is likely to be closer to the 13898 # original. 13899 last; 13900 } 13901 } 13902 } 13903 13904 # Ouput both short and single in the same parenthesized 13905 # expression, but with only one of 'Single', 'Short' if there 13906 # are both items. 13907 if ($short_name || $single_form || $table->conflicting) { 13908 $parenthesized .= "Short: $short_name" if $short_name; 13909 if ($short_name && $single_form) { 13910 $parenthesized .= ', '; 13911 } 13912 elsif ($single_form) { 13913 $parenthesized .= 'Single: '; 13914 } 13915 $parenthesized .= $single_form if $single_form; 13916 } 13917 } 13918 13919 if ($caseless_equivalent != 0) { 13920 $parenthesized .= '; ' if $parenthesized ne ""; 13921 $parenthesized .= "/i= " . $caseless_equivalent->complete_name; 13922 } 13923 13924 13925 # Warn if this property isn't the same as one that a 13926 # semi-casual user might expect. The other components of this 13927 # parenthesized structure are calculated only for the first entry 13928 # for this table, but the conflicting is deemed important enough 13929 # to go on every entry. 13930 my $conflicting = join " NOR ", $table->conflicting; 13931 if ($conflicting) { 13932 $parenthesized .= '; ' if $parenthesized ne ""; 13933 $parenthesized .= "NOT $conflicting"; 13934 } 13935 13936 push @info, "($parenthesized)" if $parenthesized; 13937 13938 if ($name =~ /_$/ && $alias->loose_match) { 13939 push @info, "Note the trailing '_' matters in spite of loose matching rules."; 13940 } 13941 13942 if ($table_property != $perl && $table->perl_extension) { 13943 push @info, '(Perl extension)'; 13944 } 13945 push @info, "($string_count)"; 13946 13947 # Now, we have both the entry and info so add them to the 13948 # list of all the properties. 13949 push @match_properties, 13950 format_pod_line($indent_info_column, 13951 $entry, 13952 join( " ", @info), 13953 $alias->status, 13954 $alias->loose_match); 13955 13956 $entry_for_first_alias = $entry_ref unless $entry_for_first_alias; 13957 } # End of looping through the aliases for this table. 13958 13959 if (! $entry_for_first_table) { 13960 $entry_for_first_table = $entry_for_first_alias; 13961 } 13962 } # End of looping through all the related tables 13963 return; 13964} 13965 13966sub make_ucd_table_pod_entries { 13967 my $table = shift; 13968 13969 # Generate the entries for the UCD section of the pod for $table. This 13970 # also calculates if names are ambiguous, so has to be called even if the 13971 # pod is not being output 13972 13973 my $short_name = $table->name; 13974 my $standard_short_name = standardize($short_name); 13975 my $full_name = $table->full_name; 13976 my $standard_full_name = standardize($full_name); 13977 13978 my $full_info = ""; # Text of info column for full-name entries 13979 my $other_info = ""; # Text of info column for short-name entries 13980 my $short_info = ""; # Text of info column for other entries 13981 my $meaning = ""; # Synonym of this table 13982 13983 my $property = ($table->isa('Property')) 13984 ? $table 13985 : $table->parent->property; 13986 13987 my $perl_extension = $table->perl_extension; 13988 13989 # Get the more official name for for perl extensions that aren't 13990 # stand-alone properties 13991 if ($perl_extension && $property != $table) { 13992 if ($property == $perl ||$property->type == $BINARY) { 13993 $meaning = $table->complete_name; 13994 } 13995 else { 13996 $meaning = $property->full_name . "=$full_name"; 13997 } 13998 } 13999 14000 # There are three types of info column. One for the short name, one for 14001 # the full name, and one for everything else. They mostly are the same, 14002 # so initialize in the same loop. 14003 foreach my $info_ref (\$full_info, \$short_info, \$other_info) { 14004 if ($perl_extension && $property != $table) { 14005 14006 # Add the synonymous name for the non-full name entries; and to 14007 # the full-name entry if it adds extra information 14008 if ($info_ref == \$other_info 14009 || ($info_ref == \$short_info 14010 && $standard_short_name ne $standard_full_name) 14011 || standardize($meaning) ne $standard_full_name 14012 ) { 14013 $$info_ref .= "$meaning."; 14014 } 14015 } 14016 elsif ($info_ref != \$full_info) { 14017 14018 # Otherwise, the non-full name columns include the full name 14019 $$info_ref .= $full_name; 14020 } 14021 14022 # And the full-name entry includes the short name, if different 14023 if ($info_ref == \$full_info 14024 && $standard_short_name ne $standard_full_name) 14025 { 14026 $full_info =~ s/\.\Z//; 14027 $full_info .= " " if $full_info; 14028 $full_info .= "(Short: $short_name)"; 14029 } 14030 14031 if ($table->perl_extension) { 14032 $$info_ref =~ s/\.\Z//; 14033 $$info_ref .= ". " if $$info_ref; 14034 $$info_ref .= "(Perl extension)"; 14035 } 14036 } 14037 14038 # Add any extra annotations to the full name entry 14039 foreach my $more_info ($table->description, 14040 $table->note, 14041 $table->status_info) 14042 { 14043 next unless $more_info; 14044 $full_info =~ s/\.\Z//; 14045 $full_info .= ". " if $full_info; 14046 $full_info .= $more_info; 14047 } 14048 14049 # These keep track if have created full and short name pod entries for the 14050 # property 14051 my $done_full = 0; 14052 my $done_short = 0; 14053 14054 # Every possible name is kept track of, even those that aren't going to be 14055 # output. This way we can be sure to find the ambiguities. 14056 foreach my $alias ($table->aliases) { 14057 my $name = $alias->name; 14058 my $standard = standardize($name); 14059 my $info; 14060 my $output_this = $alias->ucd; 14061 14062 # If the full and short names are the same, we want to output the full 14063 # one's entry, so it has priority. 14064 if ($standard eq $standard_full_name) { 14065 next if $done_full; 14066 $done_full = 1; 14067 $info = $full_info; 14068 } 14069 elsif ($standard eq $standard_short_name) { 14070 next if $done_short; 14071 $done_short = 1; 14072 next if $standard_short_name eq $standard_full_name; 14073 $info = $short_info; 14074 } 14075 else { 14076 $info = $other_info; 14077 } 14078 14079 # Here, we have set up the two columns for this entry. But if an 14080 # entry already exists for this name, we have to decide which one 14081 # we're going to later output. 14082 if (exists $ucd_pod{$standard}) { 14083 14084 # If the two entries refer to the same property, it's not going to 14085 # be ambiguous. (Likely it's because the names when standardized 14086 # are the same.) But that means if they are different properties, 14087 # there is ambiguity. 14088 if ($ucd_pod{$standard}->{'property'} != $property) { 14089 14090 # Here, we have an ambiguity. This code assumes that one is 14091 # scheduled to be output and one not and that one is a perl 14092 # extension (which is not to be output) and the other isn't. 14093 # If those assumptions are wrong, things have to be rethought. 14094 if ($ucd_pod{$standard}{'output_this'} == $output_this 14095 || $ucd_pod{$standard}{'perl_extension'} == $perl_extension 14096 || $output_this == $perl_extension) 14097 { 14098 Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations. Proceeding anyway."); 14099 } 14100 14101 # We modifiy the info column of the one being output to 14102 # indicate the ambiguity. Set $which to point to that one's 14103 # info. 14104 my $which; 14105 if ($ucd_pod{$standard}{'output_this'}) { 14106 $which = \$ucd_pod{$standard}->{'info'}; 14107 } 14108 else { 14109 $which = \$info; 14110 $meaning = $ucd_pod{$standard}{'meaning'}; 14111 } 14112 14113 chomp $$which; 14114 $$which =~ s/\.\Z//; 14115 $$which .= "; NOT '$standard' meaning '$meaning'"; 14116 14117 $ambiguous_names{$standard} = 1; 14118 } 14119 14120 # Use the non-perl-extension variant 14121 next unless $ucd_pod{$standard}{'perl_extension'}; 14122 } 14123 14124 # Store enough information about this entry that we can later look for 14125 # ambiguities, and output it properly. 14126 $ucd_pod{$standard} = { 'name' => $name, 14127 'info' => $info, 14128 'meaning' => $meaning, 14129 'output_this' => $output_this, 14130 'perl_extension' => $perl_extension, 14131 'property' => $property, 14132 'status' => $alias->status, 14133 }; 14134 } # End of looping through all this table's aliases 14135 14136 return; 14137} 14138 14139sub pod_alphanumeric_sort { 14140 # Sort pod entries alphanumerically. 14141 14142 # The first few character columns are filler, plus the '\p{'; and get rid 14143 # of all the trailing stuff, starting with the trailing '}', so as to sort 14144 # on just 'Name=Value' 14145 (my $a = lc $a) =~ s/^ .*? { //x; 14146 $a =~ s/}.*//; 14147 (my $b = lc $b) =~ s/^ .*? { //x; 14148 $b =~ s/}.*//; 14149 14150 # Determine if the two operands are both internal only or both not. 14151 # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3 14152 # should be the underscore that begins internal only 14153 my $a_is_internal = (substr($a, 0, 1) eq '_'); 14154 my $b_is_internal = (substr($b, 0, 1) eq '_'); 14155 14156 # Sort so the internals come last in the table instead of first (which the 14157 # leading underscore would otherwise indicate). 14158 if ($a_is_internal != $b_is_internal) { 14159 return 1 if $a_is_internal; 14160 return -1 14161 } 14162 14163 # Determine if the two operands are numeric property values or not. 14164 # A numeric property will look like xyz: 3. But the number 14165 # can begin with an optional minus sign, and may have a 14166 # fraction or rational component, like xyz: 3/2. If either 14167 # isn't numeric, use alphabetic sort. 14168 my ($a_initial, $a_number) = 14169 ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix); 14170 return $a cmp $b unless defined $a_number; 14171 my ($b_initial, $b_number) = 14172 ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix); 14173 return $a cmp $b unless defined $b_number; 14174 14175 # Here they are both numeric, but use alphabetic sort if the 14176 # initial parts don't match 14177 return $a cmp $b if $a_initial ne $b_initial; 14178 14179 # Convert rationals to floating for the comparison. 14180 $a_number = eval $a_number if $a_number =~ qr{/}; 14181 $b_number = eval $b_number if $b_number =~ qr{/}; 14182 14183 return $a_number <=> $b_number; 14184} 14185 14186sub make_pod () { 14187 # Create the .pod file. This generates the various subsections and then 14188 # combines them in one big HERE document. 14189 14190 my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)"; 14191 14192 return unless defined $pod_directory; 14193 print "Making pod file\n" if $verbosity >= $PROGRESS; 14194 14195 my $exception_message = 14196 '(Any exceptions are individually noted beginning with the word NOT.)'; 14197 my @block_warning; 14198 if (-e 'Blocks.txt') { 14199 14200 # Add the line: '\p{In_*} \p{Block: *}', with the warning message 14201 # if the global $has_In_conflicts indicates we have them. 14202 push @match_properties, format_pod_line($indent_info_column, 14203 '\p{In_*}', 14204 '\p{Block: *}' 14205 . (($has_In_conflicts) 14206 ? " $exception_message" 14207 : "")); 14208 @block_warning = << "END"; 14209 14210Matches in the Block property have shortcuts that begin with "In_". For 14211example, C<\\p{Block=Latin1}> can be written as C<\\p{In_Latin1}>. For 14212backward compatibility, if there is no conflict with another shortcut, these 14213may also be written as C<\\p{Latin1}> or C<\\p{Is_Latin1}>. But, N.B., there 14214are numerous such conflicting shortcuts. Use of these forms for Block is 14215discouraged, and are flagged as such, not only because of the potential 14216confusion as to what is meant, but also because a later release of Unicode may 14217preempt the shortcut, and your program would no longer be correct. Use the 14218"In_" form instead to avoid this, or even more clearly, use the compound form, 14219e.g., C<\\p{blk:latin1}>. See L<perlunicode/"Blocks"> for more information 14220about this. 14221END 14222 } 14223 my $text = $Is_flags_text; 14224 $text = "$exception_message $text" if $has_Is_conflicts; 14225 14226 # And the 'Is_ line'; 14227 push @match_properties, format_pod_line($indent_info_column, 14228 '\p{Is_*}', 14229 "\\p{*} $text"); 14230 14231 # Sort the properties array for output. It is sorted alphabetically 14232 # except numerically for numeric properties, and only output unique lines. 14233 @match_properties = sort pod_alphanumeric_sort uniques @match_properties; 14234 14235 my $formatted_properties = simple_fold(\@match_properties, 14236 "", 14237 # indent succeeding lines by two extra 14238 # which looks better 14239 $indent_info_column + 2, 14240 14241 # shorten the line length by how much 14242 # the formatter indents, so the folded 14243 # line will fit in the space 14244 # presumably available 14245 $automatic_pod_indent); 14246 # Add column headings, indented to be a little more centered, but not 14247 # exactly 14248 $formatted_properties = format_pod_line($indent_info_column, 14249 ' NAME', 14250 ' INFO') 14251 . "\n" 14252 . $formatted_properties; 14253 14254 # Generate pod documentation lines for the tables that match nothing 14255 my $zero_matches = ""; 14256 if (@zero_match_tables) { 14257 @zero_match_tables = uniques(@zero_match_tables); 14258 $zero_matches = join "\n\n", 14259 map { $_ = '=item \p{' . $_->complete_name . "}" } 14260 sort { $a->complete_name cmp $b->complete_name } 14261 @zero_match_tables; 14262 14263 $zero_matches = <<END; 14264 14265=head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters 14266 14267Unicode has some property-value pairs that currently don't match anything. 14268This happens generally either because they are obsolete, or they exist for 14269symmetry with other forms, but no language has yet been encoded that uses 14270them. In this version of Unicode, the following match zero code points: 14271 14272=over 4 14273 14274$zero_matches 14275 14276=back 14277 14278END 14279 } 14280 14281 # Generate list of properties that we don't accept, grouped by the reasons 14282 # why. This is so only put out the 'why' once, and then list all the 14283 # properties that have that reason under it. 14284 14285 my %why_list; # The keys are the reasons; the values are lists of 14286 # properties that have the key as their reason 14287 14288 # For each property, add it to the list that are suppressed for its reason 14289 # The sort will cause the alphabetically first properties to be added to 14290 # each list first, so each list will be sorted. 14291 foreach my $property (sort keys %why_suppressed) { 14292 push @{$why_list{$why_suppressed{$property}}}, $property; 14293 } 14294 14295 # For each reason (sorted by the first property that has that reason)... 14296 my @bad_re_properties; 14297 foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] } 14298 keys %why_list) 14299 { 14300 # Add to the output, all the properties that have that reason. 14301 my $has_item = 0; # Flag if actually output anything. 14302 foreach my $name (@{$why_list{$why}}) { 14303 14304 # Split compound names into $property and $table components 14305 my $property = $name; 14306 my $table; 14307 if ($property =~ / (.*) = (.*) /x) { 14308 $property = $1; 14309 $table = $2; 14310 } 14311 14312 # This release of Unicode may not have a property that is 14313 # suppressed, so don't reference a non-existent one. 14314 $property = property_ref($property); 14315 next if ! defined $property; 14316 14317 # And since this list is only for match tables, don't list the 14318 # ones that don't have match tables. 14319 next if ! $property->to_create_match_tables; 14320 14321 # Find any abbreviation, and turn it into a compound name if this 14322 # is a property=value pair. 14323 my $short_name = $property->name; 14324 $short_name .= '=' . $property->table($table)->name if $table; 14325 14326 # Start with an empty line. 14327 push @bad_re_properties, "\n\n" unless $has_item; 14328 14329 # And add the property as an item for the reason. 14330 push @bad_re_properties, "\n=item I<$name> ($short_name)\n"; 14331 $has_item = 1; 14332 } 14333 14334 # And add the reason under the list of properties, if such a list 14335 # actually got generated. Note that the header got added 14336 # unconditionally before. But pod ignores extra blank lines, so no 14337 # harm. 14338 push @bad_re_properties, "\n$why\n" if $has_item; 14339 14340 } # End of looping through each reason. 14341 14342 if (! @bad_re_properties) { 14343 push @bad_re_properties, 14344 "*** This installation accepts ALL non-Unihan properties ***"; 14345 } 14346 else { 14347 # Add =over only if non-empty to avoid an empty =over/=back section, 14348 # which is considered bad form. 14349 unshift @bad_re_properties, "\n=over 4\n"; 14350 push @bad_re_properties, "\n=back\n"; 14351 } 14352 14353 # Similiarly, generate a list of files that we don't use, grouped by the 14354 # reasons why. First, create a hash whose keys are the reasons, and whose 14355 # values are anonymous arrays of all the files that share that reason. 14356 my %grouped_by_reason; 14357 foreach my $file (keys %ignored_files) { 14358 push @{$grouped_by_reason{$ignored_files{$file}}}, $file; 14359 } 14360 foreach my $file (keys %skipped_files) { 14361 push @{$grouped_by_reason{$skipped_files{$file}}}, $file; 14362 } 14363 14364 # Then, sort each group. 14365 foreach my $group (keys %grouped_by_reason) { 14366 @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b } 14367 @{$grouped_by_reason{$group}} ; 14368 } 14369 14370 # Finally, create the output text. For each reason (sorted by the 14371 # alphabetically first file that has that reason)... 14372 my @unused_files; 14373 foreach my $reason (sort { lc $grouped_by_reason{$a}->[0] 14374 cmp lc $grouped_by_reason{$b}->[0] 14375 } 14376 keys %grouped_by_reason) 14377 { 14378 # Add all the files that have that reason to the output. Start 14379 # with an empty line. 14380 push @unused_files, "\n\n"; 14381 push @unused_files, map { "\n=item F<$_> \n" } 14382 @{$grouped_by_reason{$reason}}; 14383 # And add the reason under the list of files 14384 push @unused_files, "\n$reason\n"; 14385 } 14386 14387 # Similarly, create the output text for the UCD section of the pod 14388 my @ucd_pod; 14389 foreach my $key (keys %ucd_pod) { 14390 next unless $ucd_pod{$key}->{'output_this'}; 14391 push @ucd_pod, format_pod_line($indent_info_column, 14392 $ucd_pod{$key}->{'name'}, 14393 $ucd_pod{$key}->{'info'}, 14394 $ucd_pod{$key}->{'status'}, 14395 ); 14396 } 14397 14398 # Sort alphabetically, and fold for output 14399 @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod; 14400 my $ucd_pod = simple_fold(\@ucd_pod, 14401 ' ', 14402 $indent_info_column, 14403 $automatic_pod_indent); 14404 $ucd_pod = format_pod_line($indent_info_column, 'NAME', ' INFO') 14405 . "\n" 14406 . $ucd_pod; 14407 local $" = ""; 14408 14409 # Everything is ready to assemble. 14410 my @OUT = << "END"; 14411=begin comment 14412 14413$HEADER 14414 14415To change this file, edit $0 instead. 14416 14417=end comment 14418 14419=head1 NAME 14420 14421$pod_file - Index of Unicode Version $string_version character properties in Perl 14422 14423=head1 DESCRIPTION 14424 14425This document provides information about the portion of the Unicode database 14426that deals with character properties, that is the portion that is defined on 14427single code points. (L</Other information in the Unicode data base> 14428below briefly mentions other data that Unicode provides.) 14429 14430Perl can provide access to all non-provisional Unicode character properties, 14431though not all are enabled by default. The omitted ones are the Unihan 14432properties (accessible via the CPAN module L<Unicode::Unihan>) and certain 14433deprecated or Unicode-internal properties. (An installation may choose to 14434recompile Perl's tables to change this. See L<Unicode character 14435properties that are NOT accepted by Perl>.) 14436 14437For most purposes, access to Unicode properties from the Perl core is through 14438regular expression matches, as described in the next section. 14439For some special purposes, and to access the properties that are not suitable 14440for regular expression matching, all the Unicode character properties that 14441Perl handles are accessible via the standard L<Unicode::UCD> module, as 14442described in the section L</Properties accessible through Unicode::UCD>. 14443 14444Perl also provides some additional extensions and short-cut synonyms 14445for Unicode properties. 14446 14447This document merely lists all available properties and does not attempt to 14448explain what each property really means. There is a brief description of each 14449Perl extension; see L<perlunicode/Other Properties> for more information on 14450these. There is some detail about Blocks, Scripts, General_Category, 14451and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the 14452official Unicode properties, refer to the Unicode standard. A good starting 14453place is L<$unicode_reference_url>. 14454 14455Note that you can define your own properties; see 14456L<perlunicode/"User-Defined Character Properties">. 14457 14458=head1 Properties accessible through C<\\p{}> and C<\\P{}> 14459 14460The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to 14461most of the Unicode character properties. The table below shows all these 14462constructs, both single and compound forms. 14463 14464B<Compound forms> consist of two components, separated by an equals sign or a 14465colon. The first component is the property name, and the second component is 14466the particular value of the property to match against, for example, 14467C<\\p{Script: Greek}> and C<\\p{Script=Greek}> both mean to match characters 14468whose Script property is Greek. 14469 14470B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for 14471their equivalent compound forms. The table shows these equivalences. (In our 14472example, C<\\p{Greek}> is a just a shortcut for C<\\p{Script=Greek}>.) 14473There are also a few Perl-defined single forms that are not shortcuts for a 14474compound form. One such is C<\\p{Word}>. These are also listed in the table. 14475 14476In parsing these constructs, Perl always ignores Upper/lower case differences 14477everywhere within the {braces}. Thus C<\\p{Greek}> means the same thing as 14478C<\\p{greek}>. But note that changing the case of the C<"p"> or C<"P"> before 14479the left brace completely changes the meaning of the construct, from "match" 14480(for C<\\p{}>) to "doesn't match" (for C<\\P{}>). Casing in this document is 14481for improved legibility. 14482 14483Also, white space, hyphens, and underscores are also normally ignored 14484everywhere between the {braces}, and hence can be freely added or removed 14485even if the C</x> modifier hasn't been specified on the regular expression. 14486But $a_bold_stricter at the beginning of an entry in the table below 14487means that tighter (stricter) rules are used for that entry: 14488 14489=over 4 14490 14491=item Single form (C<\\p{name}>) tighter rules: 14492 14493White space, hyphens, and underscores ARE significant 14494except for: 14495 14496=over 4 14497 14498=item * white space adjacent to a non-word character 14499 14500=item * underscores separating digits in numbers 14501 14502=back 14503 14504That means, for example, that you can freely add or remove white space 14505adjacent to (but within) the braces without affecting the meaning. 14506 14507=item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules: 14508 14509The tighter rules given above for the single form apply to everything to the 14510right of the colon or equals; the looser rules still apply to everything to 14511the left. 14512 14513That means, for example, that you can freely add or remove white space 14514adjacent to (but within) the braces and the colon or equal sign. 14515 14516=back 14517 14518Some properties are considered obsolete by Unicode, but still available. 14519There are several varieties of obsolescence: 14520 14521=over 4 14522 14523=item Stabilized 14524 14525A property may be stabilized. Such a determination does not indicate 14526that the property should or should not be used; instead it is a declaration 14527that the property will not be maintained nor extended for newly encoded 14528characters. Such properties are marked with $a_bold_stabilized in the 14529table. 14530 14531=item Deprecated 14532 14533A property may be deprecated, perhaps because its original intent 14534has been replaced by another property, or because its specification was 14535somehow defective. This means that its use is strongly 14536discouraged, so much so that a warning will be issued if used, unless the 14537regular expression is in the scope of a C<S<no warnings 'deprecated'>> 14538statement. $A_bold_deprecated flags each such entry in the table, and 14539the entry there for the longest, most descriptive version of the property will 14540give the reason it is deprecated, and perhaps advice. Perl may issue such a 14541warning, even for properties that aren't officially deprecated by Unicode, 14542when there used to be characters or code points that were matched by them, but 14543no longer. This is to warn you that your program may not work like it did on 14544earlier Unicode releases. 14545 14546A deprecated property may be made unavailable in a future Perl version, so it 14547is best to move away from them. 14548 14549A deprecated property may also be stabilized, but this fact is not shown. 14550 14551=item Obsolete 14552 14553Properties marked with $a_bold_obsolete in the table are considered (plain) 14554obsolete. Generally this designation is given to properties that Unicode once 14555used for internal purposes (but not any longer). 14556 14557=back 14558 14559Some Perl extensions are present for backwards compatibility and are 14560discouraged from being used, but are not obsolete. $A_bold_discouraged 14561flags each such entry in the table. Future Unicode versions may force 14562some of these extensions to be removed without warning, replaced by another 14563property with the same name that means something different. Use the 14564equivalent shown instead. 14565 14566@block_warning 14567 14568The table below has two columns. The left column contains the C<\\p{}> 14569constructs to look up, possibly preceded by the flags mentioned above; and 14570the right column contains information about them, like a description, or 14571synonyms. It shows both the single and compound forms for each property that 14572has them. If the left column is a short name for a property, the right column 14573will give its longer, more descriptive name; and if the left column is the 14574longest name, the right column will show any equivalent shortest name, in both 14575single and compound forms if applicable. 14576 14577The right column will also caution you if a property means something different 14578than what might normally be expected. 14579 14580All single forms are Perl extensions; a few compound forms are as well, and 14581are noted as such. 14582 14583Numbers in (parentheses) indicate the total number of code points matched by 14584the property. For emphasis, those properties that match no code points at all 14585are listed as well in a separate section following the table. 14586 14587Most properties match the same code points regardless of whether C<"/i"> 14588case-insensitive matching is specified or not. But a few properties are 14589affected. These are shown with the notation 14590 14591 (/i= other_property) 14592 14593in the second column. Under case-insensitive matching they match the 14594same code pode points as the property "other_property". 14595 14596There is no description given for most non-Perl defined properties (See 14597L<$unicode_reference_url> for that). 14598 14599For compactness, 'B<*>' is used as a wildcard instead of showing all possible 14600combinations. For example, entries like: 14601 14602 \\p{Gc: *} \\p{General_Category: *} 14603 14604mean that 'Gc' is a synonym for 'General_Category', and anything that is valid 14605for the latter is also valid for the former. Similarly, 14606 14607 \\p{Is_*} \\p{*} 14608 14609means that if and only if, for example, C<\\p{Foo}> exists, then 14610C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing. 14611And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and 14612C<\\p{IsFoo=Bar}>. "*" here is restricted to something not beginning with an 14613underscore. 14614 14615Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'. 14616And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and 14617'N*' to indicate this, and doesn't have separate entries for the other 14618possibilities. Note that not all properties which have values 'Yes' and 'No' 14619are binary, and they have all their values spelled out without using this wild 14620card, and a C<NOT> clause in their description that highlights their not being 14621binary. These also require the compound form to match them, whereas true 14622binary properties have both single and compound forms available. 14623 14624Note that all non-essential underscores are removed in the display of the 14625short names below. 14626 14627B<Legend summary:> 14628 14629=over 4 14630 14631=item Z<>B<*> is a wild-card 14632 14633=item B<(\\d+)> in the info column gives the number of code points matched by 14634this property. 14635 14636=item B<$DEPRECATED> means this is deprecated. 14637 14638=item B<$OBSOLETE> means this is obsolete. 14639 14640=item B<$STABILIZED> means this is stabilized. 14641 14642=item B<$STRICTER> means tighter (stricter) name matching applies. 14643 14644=item B<$DISCOURAGED> means use of this form is discouraged, and may not be 14645stable. 14646 14647=back 14648 14649$formatted_properties 14650 14651$zero_matches 14652 14653=head1 Properties accessible through Unicode::UCD 14654 14655All the Unicode character properties mentioned above (except for those marked 14656as for internal use by Perl) are also accessible by 14657L<Unicode::UCD/prop_invlist()>. 14658 14659Due to their nature, not all Unicode character properties are suitable for 14660regular expression matches, nor C<prop_invlist()>. The remaining 14661non-provisional, non-internal ones are accessible via 14662L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation 14663hasn't included; see L<below for which those are|/Unicode character properties 14664that are NOT accepted by Perl>). 14665 14666For compatibility with other parts of Perl, all the single forms given in the 14667table in the L<section above|/Properties accessible through \\p{} and \\P{}> 14668are recognized. BUT, there are some ambiguities between some Perl extensions 14669and the Unicode properties, all of which are silently resolved in favor of the 14670official Unicode property. To avoid surprises, you should only use 14671C<prop_invmap()> for forms listed in the table below, which omits the 14672non-recommended ones. The affected forms are the Perl single form equivalents 14673of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of 14674C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property, 14675whose short name is C<sc>. The table indicates the current ambiguities in the 14676INFO column, beginning with the word C<"NOT">. 14677 14678The standard Unicode properties listed below are documented in 14679L<$unicode_reference_url>; Perl_Decimal_Digit is documented in 14680L<Unicode::UCD/prop_invmap()>. The other Perl extensions are in 14681L<perlunicode/Other Properties>; 14682 14683The first column in the table is a name for the property; the second column is 14684an alternative name, if any, plus possibly some annotations. The alternative 14685name is the property's full name, unless that would simply repeat the first 14686column, in which case the second column indicates the property's short name 14687(if different). The annotations are given only in the entry for the full 14688name. If a property is obsolete, etc, the entry will be flagged with the same 14689characters used in the table in the L<section above|/Properties accessible 14690through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>. 14691 14692$ucd_pod 14693 14694=head1 Properties accessible through other means 14695 14696Certain properties are accessible also via core function calls. These are: 14697 14698 Lowercase_Mapping lc() and lcfirst() 14699 Titlecase_Mapping ucfirst() 14700 Uppercase_Mapping uc() 14701 14702Also, Case_Folding is accessible through the C</i> modifier in regular 14703expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>> 14704operator. 14705 14706And, the Name and Name_Aliases properties are accessible through the C<\\N{}> 14707interpolation in double-quoted strings and regular expressions; and functions 14708C<charnames::viacode()>, C<charnames::vianame()>, and 14709C<charnames::string_vianame()> (which require a C<use charnames ();> to be 14710specified. 14711 14712Finally, most properties related to decomposition are accessible via 14713L<Unicode::Normalize>. 14714 14715=head1 Unicode character properties that are NOT accepted by Perl 14716 14717Perl will generate an error for a few character properties in Unicode when 14718used in a regular expression. The non-Unihan ones are listed below, with the 14719reasons they are not accepted, perhaps with work-arounds. The short names for 14720the properties are listed enclosed in (parentheses). 14721As described after the list, an installation can change the defaults and choose 14722to accept any of these. The list is machine generated based on the 14723choices made for the installation that generated this document. 14724 14725@bad_re_properties 14726 14727An installation can choose to allow any of these to be matched by downloading 14728the Unicode database from L<http://www.unicode.org/Public/> to 14729C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the 14730controlling lists contained in the program 14731C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing. 14732(C<\%Config> is available from the Config module). 14733 14734=head1 Other information in the Unicode data base 14735 14736The Unicode data base is delivered in two different formats. The XML version 14737is valid for more modern Unicode releases. The other version is a collection 14738of files. The two are intended to give equivalent information. Perl uses the 14739older form; this allows you to recompile Perl to use early Unicode releases. 14740 14741The only non-character property that Perl currently supports is Named 14742Sequences, in which a sequence of code points 14743is given a name and generally treated as a single entity. (Perl supports 14744these via the C<\\N{...}> double-quotish construct, 14745L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>. 14746 14747Below is a list of the files in the Unicode data base that Perl doesn't 14748currently use, along with very brief descriptions of their purposes. 14749Some of the names of the files have been shortened from those that Unicode 14750uses, in order to allow them to be distinguishable from similarly named files 14751on file systems for which only the first 8 characters of a name are 14752significant. 14753 14754=over 4 14755 14756@unused_files 14757 14758=back 14759 14760=head1 SEE ALSO 14761 14762L<$unicode_reference_url> 14763 14764L<perlrecharclass> 14765 14766L<perlunicode> 14767 14768END 14769 14770 # And write it. The 0 means no utf8. 14771 main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT); 14772 return; 14773} 14774 14775sub make_Heavy () { 14776 # Create and write Heavy.pl, which passes info about the tables to 14777 # utf8_heavy.pl 14778 14779 # Stringify structures for output 14780 my $loose_property_name_of 14781 = simple_dumper(\%loose_property_name_of, ' ' x 4); 14782 chomp $loose_property_name_of; 14783 14784 my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4); 14785 chomp $stricter_to_file_of; 14786 14787 my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4); 14788 chomp $loose_to_file_of; 14789 14790 my $nv_floating_to_rational 14791 = simple_dumper(\%nv_floating_to_rational, ' ' x 4); 14792 chomp $nv_floating_to_rational; 14793 14794 my $why_deprecated = simple_dumper(\%utf8::why_deprecated, ' ' x 4); 14795 chomp $why_deprecated; 14796 14797 # We set the key to the file when we associated files with tables, but we 14798 # couldn't do the same for the value then, as we might not have the file 14799 # for the alternate table figured out at that time. 14800 foreach my $cased (keys %caseless_equivalent_to) { 14801 my @path = $caseless_equivalent_to{$cased}->file_path; 14802 my $path = join '/', @path[1, -1]; 14803 $caseless_equivalent_to{$cased} = $path; 14804 } 14805 my $caseless_equivalent_to 14806 = simple_dumper(\%caseless_equivalent_to, ' ' x 4); 14807 chomp $caseless_equivalent_to; 14808 14809 my $loose_property_to_file_of 14810 = simple_dumper(\%loose_property_to_file_of, ' ' x 4); 14811 chomp $loose_property_to_file_of; 14812 14813 my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4); 14814 chomp $file_to_swash_name; 14815 14816 my @heavy = <<END; 14817$HEADER 14818$INTERNAL_ONLY_HEADER 14819 14820# This file is for the use of utf8_heavy.pl and Unicode::UCD 14821 14822# Maps Unicode (not Perl single-form extensions) property names in loose 14823# standard form to their corresponding standard names 14824\%utf8::loose_property_name_of = ( 14825$loose_property_name_of 14826); 14827 14828# Maps property, table to file for those using stricter matching 14829\%utf8::stricter_to_file_of = ( 14830$stricter_to_file_of 14831); 14832 14833# Maps property, table to file for those using loose matching 14834\%utf8::loose_to_file_of = ( 14835$loose_to_file_of 14836); 14837 14838# Maps floating point to fractional form 14839\%utf8::nv_floating_to_rational = ( 14840$nv_floating_to_rational 14841); 14842 14843# If a floating point number doesn't have enough digits in it to get this 14844# close to a fraction, it isn't considered to be that fraction even if all the 14845# digits it does have match. 14846\$utf8::max_floating_slop = $MAX_FLOATING_SLOP; 14847 14848# Deprecated tables to generate a warning for. The key is the file containing 14849# the table, so as to avoid duplication, as many property names can map to the 14850# file, but we only need one entry for all of them. 14851\%utf8::why_deprecated = ( 14852$why_deprecated 14853); 14854 14855# A few properties have different behavior under /i matching. This maps 14856# those to substitute files to use under /i. 14857\%utf8::caseless_equivalent = ( 14858$caseless_equivalent_to 14859); 14860 14861# Property names to mapping files 14862\%utf8::loose_property_to_file_of = ( 14863$loose_property_to_file_of 14864); 14865 14866# Files to the swash names within them. 14867\%utf8::file_to_swash_name = ( 14868$file_to_swash_name 14869); 14870 148711; 14872END 14873 14874 main::write("Heavy.pl", 0, \@heavy); # The 0 means no utf8. 14875 return; 14876} 14877 14878sub make_Name_pm () { 14879 # Create and write Name.pm, which contains subroutines and data to use in 14880 # conjunction with Name.pl 14881 14882 # Maybe there's nothing to do. 14883 return unless $has_hangul_syllables || @code_points_ending_in_code_point; 14884 14885 my @name = <<END; 14886$HEADER 14887$INTERNAL_ONLY_HEADER 14888END 14889 14890 # Convert these structures to output format. 14891 my $code_points_ending_in_code_point = 14892 main::simple_dumper(\@code_points_ending_in_code_point, 14893 ' ' x 8); 14894 my $names = main::simple_dumper(\%names_ending_in_code_point, 14895 ' ' x 8); 14896 my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point, 14897 ' ' x 8); 14898 14899 # Do the same with the Hangul names, 14900 my $jamo; 14901 my $jamo_l; 14902 my $jamo_v; 14903 my $jamo_t; 14904 my $jamo_re; 14905 if ($has_hangul_syllables) { 14906 14907 # Construct a regular expression of all the possible 14908 # combinations of the Hangul syllables. 14909 my @L_re; # Leading consonants 14910 for my $i ($LBase .. $LBase + $LCount - 1) { 14911 push @L_re, $Jamo{$i} 14912 } 14913 my @V_re; # Middle vowels 14914 for my $i ($VBase .. $VBase + $VCount - 1) { 14915 push @V_re, $Jamo{$i} 14916 } 14917 my @T_re; # Trailing consonants 14918 for my $i ($TBase + 1 .. $TBase + $TCount - 1) { 14919 push @T_re, $Jamo{$i} 14920 } 14921 14922 # The whole re is made up of the L V T combination. 14923 $jamo_re = '(' 14924 . join ('|', sort @L_re) 14925 . ')(' 14926 . join ('|', sort @V_re) 14927 . ')(' 14928 . join ('|', sort @T_re) 14929 . ')?'; 14930 14931 # These hashes needed by the algorithm were generated 14932 # during reading of the Jamo.txt file 14933 $jamo = main::simple_dumper(\%Jamo, ' ' x 8); 14934 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8); 14935 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8); 14936 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8); 14937 } 14938 14939 push @name, <<END; 14940 14941package charnames; 14942 14943# This module contains machine-generated tables and code for the 14944# algorithmically-determinable Unicode character names. The following 14945# routines can be used to translate between name and code point and vice versa 14946 14947{ # Closure 14948 14949 # Matches legal code point. 4-6 hex numbers, If there are 6, the first 14950 # two must be 10; if there are 5, the first must not be a 0. Written this 14951 # way to decrease backtracking. The first regex allows the code point to 14952 # be at the end of a word, but to work properly, the word shouldn't end 14953 # with a valid hex character. The second one won't match a code point at 14954 # the end of a word, and doesn't have the run-on issue 14955 my \$run_on_code_point_re = qr/$run_on_code_point_re/; 14956 my \$code_point_re = qr/$code_point_re/; 14957 14958 # In the following hash, the keys are the bases of names which includes 14959 # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The values 14960 # of each key is another hash which is used to get the low and high ends 14961 # for each range of code points that apply to the name. 14962 my %names_ending_in_code_point = ( 14963$names 14964 ); 14965 14966 # The following hash is a copy of the previous one, except is for loose 14967 # matching, so each name has blanks and dashes squeezed out 14968 my %loose_names_ending_in_code_point = ( 14969$loose_names 14970 ); 14971 14972 # And the following array gives the inverse mapping from code points to 14973 # names. Lowest code points are first 14974 my \@code_points_ending_in_code_point = ( 14975$code_points_ending_in_code_point 14976 ); 14977END 14978 # Earlier releases didn't have Jamos. No sense outputting 14979 # them unless will be used. 14980 if ($has_hangul_syllables) { 14981 push @name, <<END; 14982 14983 # Convert from code point to Jamo short name for use in composing Hangul 14984 # syllable names 14985 my %Jamo = ( 14986$jamo 14987 ); 14988 14989 # Leading consonant (can be null) 14990 my %Jamo_L = ( 14991$jamo_l 14992 ); 14993 14994 # Vowel 14995 my %Jamo_V = ( 14996$jamo_v 14997 ); 14998 14999 # Optional trailing consonant 15000 my %Jamo_T = ( 15001$jamo_t 15002 ); 15003 15004 # Computed re that splits up a Hangul name into LVT or LV syllables 15005 my \$syllable_re = qr/$jamo_re/; 15006 15007 my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE "; 15008 my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE"; 15009 15010 # These constants names and values were taken from the Unicode standard, 15011 # version 5.1, section 3.12. They are used in conjunction with Hangul 15012 # syllables 15013 my \$SBase = $SBase_string; 15014 my \$LBase = $LBase_string; 15015 my \$VBase = $VBase_string; 15016 my \$TBase = $TBase_string; 15017 my \$SCount = $SCount; 15018 my \$LCount = $LCount; 15019 my \$VCount = $VCount; 15020 my \$TCount = $TCount; 15021 my \$NCount = \$VCount * \$TCount; 15022END 15023 } # End of has Jamos 15024 15025 push @name, << 'END'; 15026 15027 sub name_to_code_point_special { 15028 my ($name, $loose) = @_; 15029 15030 # Returns undef if not one of the specially handled names; otherwise 15031 # returns the code point equivalent to the input name 15032 # $loose is non-zero if to use loose matching, 'name' in that case 15033 # must be input as upper case with all blanks and dashes squeezed out. 15034END 15035 if ($has_hangul_syllables) { 15036 push @name, << 'END'; 15037 15038 if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//) 15039 || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//)) 15040 { 15041 return if $name !~ qr/^$syllable_re$/; 15042 my $L = $Jamo_L{$1}; 15043 my $V = $Jamo_V{$2}; 15044 my $T = (defined $3) ? $Jamo_T{$3} : 0; 15045 return ($L * $VCount + $V) * $TCount + $T + $SBase; 15046 } 15047END 15048 } 15049 push @name, << 'END'; 15050 15051 # Name must end in 'code_point' for this to handle. 15052 return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x) 15053 || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x)); 15054 15055 my $base = $1; 15056 my $code_point = CORE::hex $2; 15057 my $names_ref; 15058 15059 if ($loose) { 15060 $names_ref = \%loose_names_ending_in_code_point; 15061 } 15062 else { 15063 return if $base !~ s/-$//; 15064 $names_ref = \%names_ending_in_code_point; 15065 } 15066 15067 # Name must be one of the ones which has the code point in it. 15068 return if ! $names_ref->{$base}; 15069 15070 # Look through the list of ranges that apply to this name to see if 15071 # the code point is in one of them. 15072 for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) { 15073 return if $names_ref->{$base}{'low'}->[$i] > $code_point; 15074 next if $names_ref->{$base}{'high'}->[$i] < $code_point; 15075 15076 # Here, the code point is in the range. 15077 return $code_point; 15078 } 15079 15080 # Here, looked like the name had a code point number in it, but 15081 # did not match one of the valid ones. 15082 return; 15083 } 15084 15085 sub code_point_to_name_special { 15086 my $code_point = shift; 15087 15088 # Returns the name of a code point if algorithmically determinable; 15089 # undef if not 15090END 15091 if ($has_hangul_syllables) { 15092 push @name, << 'END'; 15093 15094 # If in the Hangul range, calculate the name based on Unicode's 15095 # algorithm 15096 if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) { 15097 use integer; 15098 my $SIndex = $code_point - $SBase; 15099 my $L = $LBase + $SIndex / $NCount; 15100 my $V = $VBase + ($SIndex % $NCount) / $TCount; 15101 my $T = $TBase + $SIndex % $TCount; 15102 $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}"; 15103 $name .= $Jamo{$T} if $T != $TBase; 15104 return $name; 15105 } 15106END 15107 } 15108 push @name, << 'END'; 15109 15110 # Look through list of these code points for one in range. 15111 foreach my $hash (@code_points_ending_in_code_point) { 15112 return if $code_point < $hash->{'low'}; 15113 if ($code_point <= $hash->{'high'}) { 15114 return sprintf("%s-%04X", $hash->{'name'}, $code_point); 15115 } 15116 } 15117 return; # None found 15118 } 15119} # End closure 15120 151211; 15122END 15123 15124 main::write("Name.pm", 0, \@name); # The 0 means no utf8. 15125 return; 15126} 15127 15128sub make_UCD () { 15129 # Create and write UCD.pl, which passes info about the tables to 15130 # Unicode::UCD 15131 15132 # Create a mapping from each alias of Perl single-form extensions to all 15133 # its equivalent aliases, for quick look-up. 15134 my %perlprop_to_aliases; 15135 foreach my $table ($perl->tables) { 15136 15137 # First create the list of the aliases of each extension 15138 my @aliases_list; # List of legal aliases for this extension 15139 15140 my $table_name = $table->name; 15141 my $standard_table_name = standardize($table_name); 15142 my $table_full_name = $table->full_name; 15143 my $standard_table_full_name = standardize($table_full_name); 15144 15145 # Make sure that the list has both the short and full names 15146 push @aliases_list, $table_name, $table_full_name; 15147 15148 my $found_ucd = 0; # ? Did we actually get an alias that should be 15149 # output for this table 15150 15151 # Go through all the aliases (including the two just added), and add 15152 # any new unique ones to the list 15153 foreach my $alias ($table->aliases) { 15154 15155 # Skip non-legal names 15156 next unless $alias->ok_as_filename; 15157 next unless $alias->ucd; 15158 15159 $found_ucd = 1; # have at least one legal name 15160 15161 my $name = $alias->name; 15162 my $standard = standardize($name); 15163 15164 # Don't repeat a name that is equivalent to one already on the 15165 # list 15166 next if $standard eq $standard_table_name; 15167 next if $standard eq $standard_table_full_name; 15168 15169 push @aliases_list, $name; 15170 } 15171 15172 # If there were no legal names, don't output anything. 15173 next unless $found_ucd; 15174 15175 # To conserve memory in the program reading these in, omit full names 15176 # that are identical to the short name, when those are the only two 15177 # aliases for the property. 15178 if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) { 15179 pop @aliases_list; 15180 } 15181 15182 # Here, @aliases_list is the list of all the aliases that this 15183 # extension legally has. Now can create a map to it from each legal 15184 # standardized alias 15185 foreach my $alias ($table->aliases) { 15186 next unless $alias->ucd; 15187 next unless $alias->ok_as_filename; 15188 push @{$perlprop_to_aliases{standardize($alias->name)}}, 15189 @aliases_list; 15190 } 15191 } 15192 15193 # Make a list of all combinations of properties/values that are suppressed. 15194 my @suppressed; 15195 foreach my $property_name (keys %why_suppressed) { 15196 15197 # Just the value 15198 my $value_name = $1 if $property_name =~ s/ = ( .* ) //x; 15199 15200 # The hash may contain properties not in this release of Unicode 15201 next unless defined (my $property = property_ref($property_name)); 15202 15203 # Find all combinations 15204 foreach my $prop_alias ($property->aliases) { 15205 my $prop_alias_name = standardize($prop_alias->name); 15206 15207 # If no =value, there's just one combination possibe for this 15208 if (! $value_name) { 15209 15210 # The property may be suppressed, but there may be a proxy for 15211 # it, so it shouldn't be listed as suppressed 15212 next if $prop_alias->ucd; 15213 push @suppressed, $prop_alias_name; 15214 } 15215 else { # Otherwise 15216 foreach my $value_alias ($property->table($value_name)->aliases) 15217 { 15218 next if $value_alias->ucd; 15219 15220 push @suppressed, "$prop_alias_name=" 15221 . standardize($value_alias->name); 15222 } 15223 } 15224 } 15225 } 15226 15227 # Convert the structure below (designed for Name.pm) to a form that UCD 15228 # wants, so it doesn't have to modify it at all; i.e. so that it includes 15229 # an element for the Hangul syllables in the appropriate place, and 15230 # otherwise changes the name to include the "-<code point>" suffix. 15231 my @algorithm_names; 15232 my $done_hangul = 0; 15233 15234 # Copy it linearly. 15235 for my $i (0 .. @code_points_ending_in_code_point - 1) { 15236 15237 # Insert the hanguls in the correct place. 15238 if (! $done_hangul 15239 && $code_points_ending_in_code_point[$i]->{'low'} > $SBase) 15240 { 15241 $done_hangul = 1; 15242 push @algorithm_names, { low => $SBase, 15243 high => $SBase + $SCount - 1, 15244 name => '<hangul syllable>', 15245 }; 15246 } 15247 15248 # Copy the current entry, modified. 15249 push @algorithm_names, { 15250 low => $code_points_ending_in_code_point[$i]->{'low'}, 15251 high => $code_points_ending_in_code_point[$i]->{'high'}, 15252 name => 15253 "$code_points_ending_in_code_point[$i]->{'name'}-<code point>", 15254 }; 15255 } 15256 15257 # Serialize these structures for output. 15258 my $loose_to_standard_value 15259 = simple_dumper(\%loose_to_standard_value, ' ' x 4); 15260 chomp $loose_to_standard_value; 15261 15262 my $string_property_loose_to_name 15263 = simple_dumper(\%string_property_loose_to_name, ' ' x 4); 15264 chomp $string_property_loose_to_name; 15265 15266 my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4); 15267 chomp $perlprop_to_aliases; 15268 15269 my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4); 15270 chomp $prop_aliases; 15271 15272 my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4); 15273 chomp $prop_value_aliases; 15274 15275 my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : ""; 15276 chomp $suppressed; 15277 15278 my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4); 15279 chomp $algorithm_names; 15280 15281 my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4); 15282 chomp $ambiguous_names; 15283 15284 my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4); 15285 chomp $loose_defaults; 15286 15287 my @ucd = <<END; 15288$HEADER 15289$INTERNAL_ONLY_HEADER 15290 15291# This file is for the use of Unicode::UCD 15292 15293# Highest legal Unicode code point 15294\$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING; 15295 15296# Hangul syllables 15297\$Unicode::UCD::HANGUL_BEGIN = $SBase_string; 15298\$Unicode::UCD::HANGUL_COUNT = $SCount; 15299 15300# Keys are all the possible "prop=value" combinations, in loose form; values 15301# are the standard loose name for the 'value' part of the key 15302\%Unicode::UCD::loose_to_standard_value = ( 15303$loose_to_standard_value 15304); 15305 15306# String property loose names to standard loose name 15307\%Unicode::UCD::string_property_loose_to_name = ( 15308$string_property_loose_to_name 15309); 15310 15311# Keys are Perl extensions in loose form; values are each one's list of 15312# aliases 15313\%Unicode::UCD::loose_perlprop_to_name = ( 15314$perlprop_to_aliases 15315); 15316 15317# Keys are standard property name; values are each one's aliases 15318\%Unicode::UCD::prop_aliases = ( 15319$prop_aliases 15320); 15321 15322# Keys of top level are standard property name; values are keys to another 15323# hash, Each one is one of the property's values, in standard form. The 15324# values are that prop-val's aliases. If only one specified, the short and 15325# long alias are identical. 15326\%Unicode::UCD::prop_value_aliases = ( 15327$prop_value_aliases 15328); 15329 15330# Ordered (by code point ordinal) list of the ranges of code points whose 15331# names are algorithmically determined. Each range entry is an anonymous hash 15332# of the start and end points and a template for the names within it. 15333\@Unicode::UCD::algorithmic_named_code_points = ( 15334$algorithm_names 15335); 15336 15337# The properties that as-is have two meanings, and which must be disambiguated 15338\%Unicode::UCD::ambiguous_names = ( 15339$ambiguous_names 15340); 15341 15342# Keys are the prop-val combinations which are the default values for the 15343# given property, expressed in standard loose form 15344\%Unicode::UCD::loose_defaults = ( 15345$loose_defaults 15346); 15347 15348# All combinations of names that are suppressed. 15349# This is actually for UCD.t, so it knows which properties shouldn't have 15350# entries. If it got any bigger, would probably want to put it in its own 15351# file to use memory only when it was needed, in testing. 15352\@Unicode::UCD::suppressed_properties = ( 15353$suppressed 15354); 15355 153561; 15357END 15358 15359 main::write("UCD.pl", 0, \@ucd); # The 0 means no utf8. 15360 return; 15361} 15362 15363sub write_all_tables() { 15364 # Write out all the tables generated by this program to files, as well as 15365 # the supporting data structures, pod file, and .t file. 15366 15367 my @writables; # List of tables that actually get written 15368 my %match_tables_to_write; # Used to collapse identical match tables 15369 # into one file. Each key is a hash function 15370 # result to partition tables into buckets. 15371 # Each value is an array of the tables that 15372 # fit in the bucket. 15373 15374 # For each property ... 15375 # (sort so that if there is an immutable file name, it has precedence, so 15376 # some other property can't come in and take over its file name. If b's 15377 # file name is defined, will return 1, meaning to take it first; don't 15378 # care if both defined, as they had better be different anyway. And the 15379 # property named 'Perl' needs to be first (it doesn't have any immutable 15380 # file name) because empty properties are defined in terms of it's table 15381 # named 'Any'.) 15382 PROPERTY: 15383 foreach my $property (sort { return -1 if $a == $perl; 15384 return 1 if $b == $perl; 15385 return defined $b->file 15386 } property_ref('*')) 15387 { 15388 my $type = $property->type; 15389 15390 # And for each table for that property, starting with the mapping 15391 # table for it ... 15392 TABLE: 15393 foreach my $table($property, 15394 15395 # and all the match tables for it (if any), sorted so 15396 # the ones with the shortest associated file name come 15397 # first. The length sorting prevents problems of a 15398 # longer file taking a name that might have to be used 15399 # by a shorter one. The alphabetic sorting prevents 15400 # differences between releases 15401 sort { my $ext_a = $a->external_name; 15402 return 1 if ! defined $ext_a; 15403 my $ext_b = $b->external_name; 15404 return -1 if ! defined $ext_b; 15405 15406 # But return the non-complement table before 15407 # the complement one, as the latter is defined 15408 # in terms of the former, and needs to have 15409 # the information for the former available. 15410 return 1 if $a->complement != 0; 15411 return -1 if $b->complement != 0; 15412 15413 # Similarly, return a subservient table after 15414 # a leader 15415 return 1 if $a->leader != $a; 15416 return -1 if $b->leader != $b; 15417 15418 my $cmp = length $ext_a <=> length $ext_b; 15419 15420 # Return result if lengths not equal 15421 return $cmp if $cmp; 15422 15423 # Alphabetic if lengths equal 15424 return $ext_a cmp $ext_b 15425 } $property->tables 15426 ) 15427 { 15428 15429 # Here we have a table associated with a property. It could be 15430 # the map table (done first for each property), or one of the 15431 # other tables. Determine which type. 15432 my $is_property = $table->isa('Property'); 15433 15434 my $name = $table->name; 15435 my $complete_name = $table->complete_name; 15436 15437 # See if should suppress the table if is empty, but warn if it 15438 # contains something. 15439 my $suppress_if_empty_warn_if_not 15440 = $why_suppress_if_empty_warn_if_not{$complete_name} || 0; 15441 15442 # Calculate if this table should have any code points associated 15443 # with it or not. 15444 my $expected_empty = 15445 15446 # $perl should be empty, as well as properties that we just 15447 # don't do anything with 15448 ($is_property 15449 && ($table == $perl 15450 || grep { $complete_name eq $_ } 15451 @unimplemented_properties 15452 ) 15453 ) 15454 15455 # Match tables in properties we skipped populating should be 15456 # empty 15457 || (! $is_property && ! $property->to_create_match_tables) 15458 15459 # Tables and properties that are expected to have no code 15460 # points should be empty 15461 || $suppress_if_empty_warn_if_not 15462 ; 15463 15464 # Set a boolean if this table is the complement of an empty binary 15465 # table 15466 my $is_complement_of_empty_binary = 15467 $type == $BINARY && 15468 (($table == $property->table('Y') 15469 && $property->table('N')->is_empty) 15470 || ($table == $property->table('N') 15471 && $property->table('Y')->is_empty)); 15472 15473 if ($table->is_empty) { 15474 15475 if ($suppress_if_empty_warn_if_not) { 15476 $table->set_fate($SUPPRESSED, 15477 $suppress_if_empty_warn_if_not); 15478 } 15479 15480 # Suppress (by skipping them) expected empty tables. 15481 next TABLE if $expected_empty; 15482 15483 # And setup to later output a warning for those that aren't 15484 # known to be allowed to be empty. Don't do the warning if 15485 # this table is a child of another one to avoid duplicating 15486 # the warning that should come from the parent one. 15487 if (($table == $property || $table->parent == $table) 15488 && $table->fate != $SUPPRESSED 15489 && $table->fate != $MAP_PROXIED 15490 && ! grep { $complete_name =~ /^$_$/ } 15491 @tables_that_may_be_empty) 15492 { 15493 push @unhandled_properties, "$table"; 15494 } 15495 15496 # An empty table is just the complement of everything. 15497 $table->set_complement($Any) if $table != $property; 15498 } 15499 elsif ($expected_empty) { 15500 my $because = ""; 15501 if ($suppress_if_empty_warn_if_not) { 15502 $because = " because $suppress_if_empty_warn_if_not"; 15503 } 15504 15505 Carp::my_carp("Not expecting property $table$because. Generating file for it anyway."); 15506 } 15507 15508 # Some tables should match everything 15509 my $expected_full = 15510 ($table->fate == $SUPPRESSED) 15511 ? 0 15512 : ($is_property) 15513 ? # All these types of map tables will be full because 15514 # they will have been populated with defaults 15515 ($type == $ENUM || $type == $FORCED_BINARY) 15516 15517 : # A match table should match everything if its method 15518 # shows it should 15519 ($table->matches_all 15520 15521 # The complement of an empty binary table will match 15522 # everything 15523 || $is_complement_of_empty_binary 15524 ) 15525 ; 15526 15527 my $count = $table->count; 15528 if ($expected_full) { 15529 if ($count != $MAX_UNICODE_CODEPOINTS) { 15530 Carp::my_carp("$table matches only " 15531 . clarify_number($count) 15532 . " Unicode code points but should match " 15533 . clarify_number($MAX_UNICODE_CODEPOINTS) 15534 . " (off by " 15535 . clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count)) 15536 . "). Proceeding anyway."); 15537 } 15538 15539 # Here is expected to be full. If it is because it is the 15540 # complement of an (empty) binary table that is to be 15541 # suppressed, then suppress this one as well. 15542 if ($is_complement_of_empty_binary) { 15543 my $opposing_name = ($name eq 'Y') ? 'N' : 'Y'; 15544 my $opposing = $property->table($opposing_name); 15545 my $opposing_status = $opposing->status; 15546 if ($opposing_status) { 15547 $table->set_status($opposing_status, 15548 $opposing->status_info); 15549 } 15550 } 15551 } 15552 elsif ($count == $MAX_UNICODE_CODEPOINTS) { 15553 if ($table == $property || $table->leader == $table) { 15554 Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway."); 15555 } 15556 } 15557 15558 if ($table->fate == $SUPPRESSED) { 15559 if (! $is_property) { 15560 my @children = $table->children; 15561 foreach my $child (@children) { 15562 if ($child->fate != $SUPPRESSED) { 15563 Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't"); 15564 } 15565 } 15566 } 15567 next TABLE; 15568 15569 } 15570 15571 if (! $is_property) { 15572 15573 make_ucd_table_pod_entries($table) if $table->property == $perl; 15574 15575 # Several things need to be done just once for each related 15576 # group of match tables. Do them on the parent. 15577 if ($table->parent == $table) { 15578 15579 # Add an entry in the pod file for the table; it also does 15580 # the children. 15581 make_re_pod_entries($table) if defined $pod_directory; 15582 15583 # See if the the table matches identical code points with 15584 # something that has already been output. In that case, 15585 # no need to have two files with the same code points in 15586 # them. We use the table's hash() method to store these 15587 # in buckets, so that it is quite likely that if two 15588 # tables are in the same bucket they will be identical, so 15589 # don't have to compare tables frequently. The tables 15590 # have to have the same status to share a file, so add 15591 # this to the bucket hash. (The reason for this latter is 15592 # that Heavy.pl associates a status with a file.) 15593 # We don't check tables that are inverses of others, as it 15594 # would lead to some coding complications, and checking 15595 # all the regular ones should find everything. 15596 if ($table->complement == 0) { 15597 my $hash = $table->hash . ';' . $table->status; 15598 15599 # Look at each table that is in the same bucket as 15600 # this one would be. 15601 foreach my $comparison 15602 (@{$match_tables_to_write{$hash}}) 15603 { 15604 if ($table->matches_identically_to($comparison)) { 15605 $table->set_equivalent_to($comparison, 15606 Related => 0); 15607 next TABLE; 15608 } 15609 } 15610 15611 # Here, not equivalent, add this table to the bucket. 15612 push @{$match_tables_to_write{$hash}}, $table; 15613 } 15614 } 15615 } 15616 else { 15617 15618 # Here is the property itself. 15619 # Don't write out or make references to the $perl property 15620 next if $table == $perl; 15621 15622 make_ucd_table_pod_entries($table); 15623 15624 # There is a mapping stored of the various synonyms to the 15625 # standardized name of the property for utf8_heavy.pl. 15626 # Also, the pod file contains entries of the form: 15627 # \p{alias: *} \p{full: *} 15628 # rather than show every possible combination of things. 15629 15630 my @property_aliases = $property->aliases; 15631 15632 my $full_property_name = $property->full_name; 15633 my $property_name = $property->name; 15634 my $standard_property_name = standardize($property_name); 15635 my $standard_property_full_name 15636 = standardize($full_property_name); 15637 15638 # We also create for Unicode::UCD a list of aliases for 15639 # the property. The list starts with the property name; 15640 # then its full name. 15641 my @property_list; 15642 my @standard_list; 15643 if ( $property->fate <= $MAP_PROXIED) { 15644 @property_list = ($property_name, $full_property_name); 15645 @standard_list = ($standard_property_name, 15646 $standard_property_full_name); 15647 } 15648 15649 # For each synonym ... 15650 for my $i (0 .. @property_aliases - 1) { 15651 my $alias = $property_aliases[$i]; 15652 my $alias_name = $alias->name; 15653 my $alias_standard = standardize($alias_name); 15654 15655 15656 # Add other aliases to the list of property aliases 15657 if ($property->fate <= $MAP_PROXIED 15658 && ! grep { $alias_standard eq $_ } @standard_list) 15659 { 15660 push @property_list, $alias_name; 15661 push @standard_list, $alias_standard; 15662 } 15663 15664 # For utf8_heavy, set the mapping of the alias to the 15665 # property 15666 if ($type == $STRING) { 15667 if ($property->fate <= $MAP_PROXIED) { 15668 $string_property_loose_to_name{$alias_standard} 15669 = $standard_property_name; 15670 } 15671 } 15672 else { 15673 if (exists ($loose_property_name_of{$alias_standard})) 15674 { 15675 Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}. Old name is retained"); 15676 } 15677 else { 15678 $loose_property_name_of{$alias_standard} 15679 = $standard_property_name; 15680 } 15681 15682 # Now for the re pod entry for this alias. Skip if not 15683 # outputting a pod; skip the first one, which is the 15684 # full name so won't have an entry like: '\p{full: *} 15685 # \p{full: *}', and skip if don't want an entry for 15686 # this one. 15687 next if $i == 0 15688 || ! defined $pod_directory 15689 || ! $alias->make_re_pod_entry; 15690 15691 my $rhs = "\\p{$full_property_name: *}"; 15692 if ($property != $perl && $table->perl_extension) { 15693 $rhs .= ' (Perl extension)'; 15694 } 15695 push @match_properties, 15696 format_pod_line($indent_info_column, 15697 '\p{' . $alias->name . ': *}', 15698 $rhs, 15699 $alias->status); 15700 } 15701 } 15702 15703 # The list of all possible names is attached to each alias, so 15704 # lookup is easy 15705 if (@property_list) { 15706 push @{$prop_aliases{$standard_list[0]}}, @property_list; 15707 } 15708 15709 if ($property->fate <= $MAP_PROXIED) { 15710 15711 # Similarly, we create for Unicode::UCD a list of 15712 # property-value aliases. 15713 15714 my $property_full_name = $property->full_name; 15715 15716 # Look at each table in the property... 15717 foreach my $table ($property->tables) { 15718 my @values_list; 15719 my $table_full_name = $table->full_name; 15720 my $standard_table_full_name 15721 = standardize($table_full_name); 15722 my $table_name = $table->name; 15723 my $standard_table_name = standardize($table_name); 15724 15725 # The list starts with the table name and its full 15726 # name. 15727 push @values_list, $table_name, $table_full_name; 15728 15729 # We add to the table each unique alias that isn't 15730 # discouraged from use. 15731 foreach my $alias ($table->aliases) { 15732 next if $alias->status 15733 && $alias->status eq $DISCOURAGED; 15734 my $name = $alias->name; 15735 my $standard = standardize($name); 15736 next if $standard eq $standard_table_name; 15737 next if $standard eq $standard_table_full_name; 15738 push @values_list, $name; 15739 } 15740 15741 # Here @values_list is a list of all the aliases for 15742 # the table. That is, all the property-values given 15743 # by this table. By agreement with Unicode::UCD, 15744 # if the name and full name are identical, and there 15745 # are no other names, drop the duplcate entry to save 15746 # memory. 15747 if (@values_list == 2 15748 && $values_list[0] eq $values_list[1]) 15749 { 15750 pop @values_list 15751 } 15752 15753 # To save memory, unlike the similar list for property 15754 # aliases above, only the standard forms hve the list. 15755 # This forces an extra step of converting from input 15756 # name to standard name, but the savings are 15757 # considerable. (There is only marginal savings if we 15758 # did this with the property aliases.) 15759 push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list; 15760 } 15761 } 15762 15763 # Don't write out a mapping file if not desired. 15764 next if ! $property->to_output_map; 15765 } 15766 15767 # Here, we know we want to write out the table, but don't do it 15768 # yet because there may be other tables that come along and will 15769 # want to share the file, and the file's comments will change to 15770 # mention them. So save for later. 15771 push @writables, $table; 15772 15773 } # End of looping through the property and all its tables. 15774 } # End of looping through all properties. 15775 15776 # Now have all the tables that will have files written for them. Do it. 15777 foreach my $table (@writables) { 15778 my @directory; 15779 my $filename; 15780 my $property = $table->property; 15781 my $is_property = ($table == $property); 15782 if (! $is_property) { 15783 15784 # Match tables for the property go in lib/$subdirectory, which is 15785 # the property's name. Don't use the standard file name for this, 15786 # as may get an unfamiliar alias 15787 @directory = ($matches_directory, $property->external_name); 15788 } 15789 else { 15790 15791 @directory = $table->directory; 15792 $filename = $table->file; 15793 } 15794 15795 # Use specified filename if available, or default to property's 15796 # shortest name. We need an 8.3 safe filename (which means "an 8 15797 # safe" filename, since after the dot is only 'pl', which is < 3) 15798 # The 2nd parameter is if the filename shouldn't be changed, and 15799 # it shouldn't iff there is a hard-coded name for this table. 15800 $filename = construct_filename( 15801 $filename || $table->external_name, 15802 ! $filename, # mutable if no filename 15803 \@directory); 15804 15805 register_file_for_name($table, \@directory, $filename); 15806 15807 # Only need to write one file when shared by more than one 15808 # property 15809 next if ! $is_property 15810 && ($table->leader != $table || $table->complement != 0); 15811 15812 # Construct a nice comment to add to the file 15813 $table->set_final_comment; 15814 15815 $table->write; 15816 } 15817 15818 15819 # Write out the pod file 15820 make_pod; 15821 15822 # And Heavy.pl, Name.pm, UCD.pl 15823 make_Heavy; 15824 make_Name_pm; 15825 make_UCD; 15826 15827 make_property_test_script() if $make_test_script; 15828 return; 15829} 15830 15831my @white_space_separators = ( # This used only for making the test script. 15832 "", 15833 ' ', 15834 "\t", 15835 ' ' 15836 ); 15837 15838sub generate_separator($) { 15839 # This used only for making the test script. It generates the colon or 15840 # equal separator between the property and property value, with random 15841 # white space surrounding the separator 15842 15843 my $lhs = shift; 15844 15845 return "" if $lhs eq ""; # No separator if there's only one (the r) side 15846 15847 # Choose space before and after randomly 15848 my $spaces_before =$white_space_separators[rand(@white_space_separators)]; 15849 my $spaces_after = $white_space_separators[rand(@white_space_separators)]; 15850 15851 # And return the whole complex, half the time using a colon, half the 15852 # equals 15853 return $spaces_before 15854 . (rand() < 0.5) ? '=' : ':' 15855 . $spaces_after; 15856} 15857 15858sub generate_tests($$$$$) { 15859 # This used only for making the test script. It generates test cases that 15860 # are expected to compile successfully in perl. Note that the lhs and 15861 # rhs are assumed to already be as randomized as the caller wants. 15862 15863 my $lhs = shift; # The property: what's to the left of the colon 15864 # or equals separator 15865 my $rhs = shift; # The property value; what's to the right 15866 my $valid_code = shift; # A code point that's known to be in the 15867 # table given by lhs=rhs; undef if table is 15868 # empty 15869 my $invalid_code = shift; # A code point known to not be in the table; 15870 # undef if the table is all code points 15871 my $warning = shift; 15872 15873 # Get the colon or equal 15874 my $separator = generate_separator($lhs); 15875 15876 # The whole 'property=value' 15877 my $name = "$lhs$separator$rhs"; 15878 15879 my @output; 15880 # Create a complete set of tests, with complements. 15881 if (defined $valid_code) { 15882 push @output, <<"EOC" 15883Expect(1, $valid_code, '\\p{$name}', $warning); 15884Expect(0, $valid_code, '\\p{^$name}', $warning); 15885Expect(0, $valid_code, '\\P{$name}', $warning); 15886Expect(1, $valid_code, '\\P{^$name}', $warning); 15887EOC 15888 } 15889 if (defined $invalid_code) { 15890 push @output, <<"EOC" 15891Expect(0, $invalid_code, '\\p{$name}', $warning); 15892Expect(1, $invalid_code, '\\p{^$name}', $warning); 15893Expect(1, $invalid_code, '\\P{$name}', $warning); 15894Expect(0, $invalid_code, '\\P{^$name}', $warning); 15895EOC 15896 } 15897 return @output; 15898} 15899 15900sub generate_error($$$) { 15901 # This used only for making the test script. It generates test cases that 15902 # are expected to not only not match, but to be syntax or similar errors 15903 15904 my $lhs = shift; # The property: what's to the left of the 15905 # colon or equals separator 15906 my $rhs = shift; # The property value; what's to the right 15907 my $already_in_error = shift; # Boolean; if true it's known that the 15908 # unmodified lhs and rhs will cause an error. 15909 # This routine should not force another one 15910 # Get the colon or equal 15911 my $separator = generate_separator($lhs); 15912 15913 # Since this is an error only, don't bother to randomly decide whether to 15914 # put the error on the left or right side; and assume that the rhs is 15915 # loosely matched, again for convenience rather than rigor. 15916 $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error; 15917 15918 my $property = $lhs . $separator . $rhs; 15919 15920 return <<"EOC"; 15921Error('\\p{$property}'); 15922Error('\\P{$property}'); 15923EOC 15924} 15925 15926# These are used only for making the test script 15927# XXX Maybe should also have a bad strict seps, which includes underscore. 15928 15929my @good_loose_seps = ( 15930 " ", 15931 "-", 15932 "\t", 15933 "", 15934 "_", 15935 ); 15936my @bad_loose_seps = ( 15937 "/a/", 15938 ':=', 15939 ); 15940 15941sub randomize_stricter_name { 15942 # This used only for making the test script. Take the input name and 15943 # return a randomized, but valid version of it under the stricter matching 15944 # rules. 15945 15946 my $name = shift; 15947 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 15948 15949 # If the name looks like a number (integer, floating, or rational), do 15950 # some extra work 15951 if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) { 15952 my $sign = $1; 15953 my $number = $2; 15954 my $separator = $3; 15955 15956 # If there isn't a sign, part of the time add a plus 15957 # Note: Not testing having any denominator having a minus sign 15958 if (! $sign) { 15959 $sign = '+' if rand() <= .3; 15960 } 15961 15962 # And add 0 or more leading zeros. 15963 $name = $sign . ('0' x int rand(10)) . $number; 15964 15965 if (defined $separator) { 15966 my $extra_zeros = '0' x int rand(10); 15967 15968 if ($separator eq '.') { 15969 15970 # Similarly, add 0 or more trailing zeros after a decimal 15971 # point 15972 $name .= $extra_zeros; 15973 } 15974 else { 15975 15976 # Or, leading zeros before the denominator 15977 $name =~ s,/,/$extra_zeros,; 15978 } 15979 } 15980 } 15981 15982 # For legibility of the test, only change the case of whole sections at a 15983 # time. To do this, first split into sections. The split returns the 15984 # delimiters 15985 my @sections; 15986 for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) { 15987 trace $section if main::DEBUG && $to_trace; 15988 15989 if (length $section > 1 && $section !~ /\D/) { 15990 15991 # If the section is a sequence of digits, about half the time 15992 # randomly add underscores between some of them. 15993 if (rand() > .5) { 15994 15995 # Figure out how many underscores to add. max is 1 less than 15996 # the number of digits. (But add 1 at the end to make sure 15997 # result isn't 0, and compensate earlier by subtracting 2 15998 # instead of 1) 15999 my $num_underscores = int rand(length($section) - 2) + 1; 16000 16001 # And add them evenly throughout, for convenience, not rigor 16002 use integer; 16003 my $spacing = (length($section) - 1)/ $num_underscores; 16004 my $temp = $section; 16005 $section = ""; 16006 for my $i (1 .. $num_underscores) { 16007 $section .= substr($temp, 0, $spacing, "") . '_'; 16008 } 16009 $section .= $temp; 16010 } 16011 push @sections, $section; 16012 } 16013 else { 16014 16015 # Here not a sequence of digits. Change the case of the section 16016 # randomly 16017 my $switch = int rand(4); 16018 if ($switch == 0) { 16019 push @sections, uc $section; 16020 } 16021 elsif ($switch == 1) { 16022 push @sections, lc $section; 16023 } 16024 elsif ($switch == 2) { 16025 push @sections, ucfirst $section; 16026 } 16027 else { 16028 push @sections, $section; 16029 } 16030 } 16031 } 16032 trace "returning", join "", @sections if main::DEBUG && $to_trace; 16033 return join "", @sections; 16034} 16035 16036sub randomize_loose_name($;$) { 16037 # This used only for making the test script 16038 16039 my $name = shift; 16040 my $want_error = shift; # if true, make an error 16041 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 16042 16043 $name = randomize_stricter_name($name); 16044 16045 my @parts; 16046 push @parts, $good_loose_seps[rand(@good_loose_seps)]; 16047 16048 # Preserve trailing ones for the sake of not stripping the underscore from 16049 # 'L_' 16050 for my $part (split /[-\s_]+ (?= . )/, $name) { 16051 if (@parts) { 16052 if ($want_error and rand() < 0.3) { 16053 push @parts, $bad_loose_seps[rand(@bad_loose_seps)]; 16054 $want_error = 0; 16055 } 16056 else { 16057 push @parts, $good_loose_seps[rand(@good_loose_seps)]; 16058 } 16059 } 16060 push @parts, $part; 16061 } 16062 my $new = join("", @parts); 16063 trace "$name => $new" if main::DEBUG && $to_trace; 16064 16065 if ($want_error) { 16066 if (rand() >= 0.5) { 16067 $new .= $bad_loose_seps[rand(@bad_loose_seps)]; 16068 } 16069 else { 16070 $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new; 16071 } 16072 } 16073 return $new; 16074} 16075 16076# Used to make sure don't generate duplicate test cases. 16077my %test_generated; 16078 16079sub make_property_test_script() { 16080 # This used only for making the test script 16081 # this written directly -- it's huge. 16082 16083 print "Making test script\n" if $verbosity >= $PROGRESS; 16084 16085 # This uses randomness to test different possibilities without testing all 16086 # possibilities. To ensure repeatability, set the seed to 0. But if 16087 # tests are added, it will perturb all later ones in the .t file 16088 srand 0; 16089 16090 $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name 16091 16092 # Keep going down an order of magnitude 16093 # until find that adding this quantity to 16094 # 1 remains 1; but put an upper limit on 16095 # this so in case this algorithm doesn't 16096 # work properly on some platform, that we 16097 # won't loop forever. 16098 my $digits = 0; 16099 my $min_floating_slop = 1; 16100 while (1+ $min_floating_slop != 1 16101 && $digits++ < 50) 16102 { 16103 my $next = $min_floating_slop / 10; 16104 last if $next == 0; # If underflows, 16105 # use previous one 16106 $min_floating_slop = $next; 16107 } 16108 16109 # It doesn't matter whether the elements of this array contain single lines 16110 # or multiple lines. main::write doesn't count the lines. 16111 my @output; 16112 16113 foreach my $property (property_ref('*')) { 16114 foreach my $table ($property->tables) { 16115 16116 # Find code points that match, and don't match this table. 16117 my $valid = $table->get_valid_code_point; 16118 my $invalid = $table->get_invalid_code_point; 16119 my $warning = ($table->status eq $DEPRECATED) 16120 ? "'deprecated'" 16121 : '""'; 16122 16123 # Test each possible combination of the property's aliases with 16124 # the table's. If this gets to be too many, could do what is done 16125 # in the set_final_comment() for Tables 16126 my @table_aliases = $table->aliases; 16127 my @property_aliases = $table->property->aliases; 16128 16129 # Every property can be optionally be prefixed by 'Is_', so test 16130 # that those work, by creating such a new alias for each 16131 # pre-existing one. 16132 push @property_aliases, map { Alias->new("Is_" . $_->name, 16133 $_->loose_match, 16134 $_->make_re_pod_entry, 16135 $_->ok_as_filename, 16136 $_->status, 16137 $_->ucd, 16138 ) 16139 } @property_aliases; 16140 my $max = max(scalar @table_aliases, scalar @property_aliases); 16141 for my $j (0 .. $max - 1) { 16142 16143 # The current alias for property is the next one on the list, 16144 # or if beyond the end, start over. Similarly for table 16145 my $property_name 16146 = $property_aliases[$j % @property_aliases]->name; 16147 16148 $property_name = "" if $table->property == $perl; 16149 my $table_alias = $table_aliases[$j % @table_aliases]; 16150 my $table_name = $table_alias->name; 16151 my $loose_match = $table_alias->loose_match; 16152 16153 # If the table doesn't have a file, any test for it is 16154 # already guaranteed to be in error 16155 my $already_error = ! $table->file_path; 16156 16157 # Generate error cases for this alias. 16158 push @output, generate_error($property_name, 16159 $table_name, 16160 $already_error); 16161 16162 # If the table is guaranteed to always generate an error, 16163 # quit now without generating success cases. 16164 next if $already_error; 16165 16166 # Now for the success cases. 16167 my $random; 16168 if ($loose_match) { 16169 16170 # For loose matching, create an extra test case for the 16171 # standard name. 16172 my $standard = standardize($table_name); 16173 16174 # $test_name should be a unique combination for each test 16175 # case; used just to avoid duplicate tests 16176 my $test_name = "$property_name=$standard"; 16177 16178 # Don't output duplicate test cases. 16179 if (! exists $test_generated{$test_name}) { 16180 $test_generated{$test_name} = 1; 16181 push @output, generate_tests($property_name, 16182 $standard, 16183 $valid, 16184 $invalid, 16185 $warning, 16186 ); 16187 } 16188 $random = randomize_loose_name($table_name) 16189 } 16190 else { # Stricter match 16191 $random = randomize_stricter_name($table_name); 16192 } 16193 16194 # Now for the main test case for this alias. 16195 my $test_name = "$property_name=$random"; 16196 if (! exists $test_generated{$test_name}) { 16197 $test_generated{$test_name} = 1; 16198 push @output, generate_tests($property_name, 16199 $random, 16200 $valid, 16201 $invalid, 16202 $warning, 16203 ); 16204 16205 # If the name is a rational number, add tests for the 16206 # floating point equivalent. 16207 if ($table_name =~ qr{/}) { 16208 16209 # Calculate the float, and find just the fraction. 16210 my $float = eval $table_name; 16211 my ($whole, $fraction) 16212 = $float =~ / (.*) \. (.*) /x; 16213 16214 # Starting with one digit after the decimal point, 16215 # create a test for each possible precision (number of 16216 # digits past the decimal point) until well beyond the 16217 # native number found on this machine. (If we started 16218 # with 0 digits, it would be an integer, which could 16219 # well match an unrelated table) 16220 PLACE: 16221 for my $i (1 .. $min_floating_slop + 3) { 16222 my $table_name = sprintf("%.*f", $i, $float); 16223 if ($i < $MIN_FRACTION_LENGTH) { 16224 16225 # If the test case has fewer digits than the 16226 # minimum acceptable precision, it shouldn't 16227 # succeed, so we expect an error for it. 16228 # E.g., 2/3 = .7 at one decimal point, and we 16229 # shouldn't say it matches .7. We should make 16230 # it be .667 at least before agreeing that the 16231 # intent was to match 2/3. But at the 16232 # less-than- acceptable level of precision, it 16233 # might actually match an unrelated number. 16234 # So don't generate a test case if this 16235 # conflating is possible. In our example, we 16236 # don't want 2/3 matching 7/10, if there is 16237 # a 7/10 code point. 16238 for my $existing 16239 (keys %nv_floating_to_rational) 16240 { 16241 next PLACE 16242 if abs($table_name - $existing) 16243 < $MAX_FLOATING_SLOP; 16244 } 16245 push @output, generate_error($property_name, 16246 $table_name, 16247 1 # 1 => already an error 16248 ); 16249 } 16250 else { 16251 16252 # Here the number of digits exceeds the 16253 # minimum we think is needed. So generate a 16254 # success test case for it. 16255 push @output, generate_tests($property_name, 16256 $table_name, 16257 $valid, 16258 $invalid, 16259 $warning, 16260 ); 16261 } 16262 } 16263 } 16264 } 16265 } 16266 } 16267 } 16268 16269 &write($t_path, 16270 0, # Not utf8; 16271 [<DATA>, 16272 @output, 16273 (map {"Test_X('$_');\n"} @backslash_X_tests), 16274 "Finished();\n"]); 16275 return; 16276} 16277 16278# This is a list of the input files and how to handle them. The files are 16279# processed in their order in this list. Some reordering is possible if 16280# desired, but the v0 files should be first, and the extracted before the 16281# others except DAge.txt (as data in an extracted file can be over-ridden by 16282# the non-extracted. Some other files depend on data derived from an earlier 16283# file, like UnicodeData requires data from Jamo, and the case changing and 16284# folding requires data from Unicode. Mostly, it is safest to order by first 16285# version releases in (except the Jamo). DAge.txt is read before the 16286# extracted ones because of the rarely used feature $compare_versions. In the 16287# unlikely event that there were ever an extracted file that contained the Age 16288# property information, it would have to go in front of DAge. 16289# 16290# The version strings allow the program to know whether to expect a file or 16291# not, but if a file exists in the directory, it will be processed, even if it 16292# is in a version earlier than expected, so you can copy files from a later 16293# release into an earlier release's directory. 16294my @input_file_objects = ( 16295 Input_file->new('PropertyAliases.txt', v0, 16296 Handler => \&process_PropertyAliases, 16297 ), 16298 Input_file->new(undef, v0, # No file associated with this 16299 Progress_Message => 'Finishing property setup', 16300 Handler => \&finish_property_setup, 16301 ), 16302 Input_file->new('PropValueAliases.txt', v0, 16303 Handler => \&process_PropValueAliases, 16304 Has_Missings_Defaults => $NOT_IGNORED, 16305 ), 16306 Input_file->new('DAge.txt', v3.2.0, 16307 Has_Missings_Defaults => $NOT_IGNORED, 16308 Property => 'Age' 16309 ), 16310 Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0, 16311 Property => 'General_Category', 16312 ), 16313 Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0, 16314 Property => 'Canonical_Combining_Class', 16315 Has_Missings_Defaults => $NOT_IGNORED, 16316 ), 16317 Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0, 16318 Property => 'Numeric_Type', 16319 Has_Missings_Defaults => $NOT_IGNORED, 16320 ), 16321 Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0, 16322 Property => 'East_Asian_Width', 16323 Has_Missings_Defaults => $NOT_IGNORED, 16324 ), 16325 Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0, 16326 Property => 'Line_Break', 16327 Has_Missings_Defaults => $NOT_IGNORED, 16328 ), 16329 Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1, 16330 Property => 'Bidi_Class', 16331 Has_Missings_Defaults => $NOT_IGNORED, 16332 ), 16333 Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0, 16334 Property => 'Decomposition_Type', 16335 Has_Missings_Defaults => $NOT_IGNORED, 16336 ), 16337 Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0), 16338 Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0, 16339 Property => 'Numeric_Value', 16340 Each_Line_Handler => \&filter_numeric_value_line, 16341 Has_Missings_Defaults => $NOT_IGNORED, 16342 ), 16343 Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0, 16344 Property => 'Joining_Group', 16345 Has_Missings_Defaults => $NOT_IGNORED, 16346 ), 16347 16348 Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0, 16349 Property => 'Joining_Type', 16350 Has_Missings_Defaults => $NOT_IGNORED, 16351 ), 16352 Input_file->new('Jamo.txt', v2.0.0, 16353 Property => 'Jamo_Short_Name', 16354 Each_Line_Handler => \&filter_jamo_line, 16355 ), 16356 Input_file->new('UnicodeData.txt', v1.1.5, 16357 Pre_Handler => \&setup_UnicodeData, 16358 16359 # We clean up this file for some early versions. 16360 Each_Line_Handler => [ (($v_version lt v2.0.0 ) 16361 ? \&filter_v1_ucd 16362 : ($v_version eq v2.1.5) 16363 ? \&filter_v2_1_5_ucd 16364 16365 # And for 5.14 Perls with 6.0, 16366 # have to also make changes 16367 : ($v_version ge v6.0.0) 16368 ? \&filter_v6_ucd 16369 : undef), 16370 16371 # And the main filter 16372 \&filter_UnicodeData_line, 16373 ], 16374 EOF_Handler => \&EOF_UnicodeData, 16375 ), 16376 Input_file->new('ArabicShaping.txt', v2.0.0, 16377 Each_Line_Handler => 16378 [ ($v_version lt 4.1.0) 16379 ? \&filter_old_style_arabic_shaping 16380 : undef, 16381 \&filter_arabic_shaping_line, 16382 ], 16383 Has_Missings_Defaults => $NOT_IGNORED, 16384 ), 16385 Input_file->new('Blocks.txt', v2.0.0, 16386 Property => 'Block', 16387 Has_Missings_Defaults => $NOT_IGNORED, 16388 Each_Line_Handler => \&filter_blocks_lines 16389 ), 16390 Input_file->new('PropList.txt', v2.0.0, 16391 Each_Line_Handler => (($v_version lt v3.1.0) 16392 ? \&filter_old_style_proplist 16393 : undef), 16394 ), 16395 Input_file->new('Unihan.txt', v2.0.0, 16396 Pre_Handler => \&setup_unihan, 16397 Optional => 1, 16398 Each_Line_Handler => \&filter_unihan_line, 16399 ), 16400 Input_file->new('SpecialCasing.txt', v2.1.8, 16401 Each_Line_Handler => \&filter_special_casing_line, 16402 Pre_Handler => \&setup_special_casing, 16403 Has_Missings_Defaults => $IGNORED, 16404 ), 16405 Input_file->new( 16406 'LineBreak.txt', v3.0.0, 16407 Has_Missings_Defaults => $NOT_IGNORED, 16408 Property => 'Line_Break', 16409 # Early versions had problematic syntax 16410 Each_Line_Handler => (($v_version lt v3.1.0) 16411 ? \&filter_early_ea_lb 16412 : undef), 16413 ), 16414 Input_file->new('EastAsianWidth.txt', v3.0.0, 16415 Property => 'East_Asian_Width', 16416 Has_Missings_Defaults => $NOT_IGNORED, 16417 # Early versions had problematic syntax 16418 Each_Line_Handler => (($v_version lt v3.1.0) 16419 ? \&filter_early_ea_lb 16420 : undef), 16421 ), 16422 Input_file->new('CompositionExclusions.txt', v3.0.0, 16423 Property => 'Composition_Exclusion', 16424 ), 16425 Input_file->new('BidiMirroring.txt', v3.0.1, 16426 Property => 'Bidi_Mirroring_Glyph', 16427 ), 16428 Input_file->new("NormalizationTest.txt", v3.0.1, 16429 Skip => 'Validation Tests', 16430 ), 16431 Input_file->new('CaseFolding.txt', v3.0.1, 16432 Pre_Handler => \&setup_case_folding, 16433 Each_Line_Handler => 16434 [ ($v_version lt v3.1.0) 16435 ? \&filter_old_style_case_folding 16436 : undef, 16437 \&filter_case_folding_line 16438 ], 16439 Has_Missings_Defaults => $IGNORED, 16440 ), 16441 Input_file->new('DCoreProperties.txt', v3.1.0, 16442 # 5.2 changed this file 16443 Has_Missings_Defaults => (($v_version ge v5.2.0) 16444 ? $NOT_IGNORED 16445 : $NO_DEFAULTS), 16446 ), 16447 Input_file->new('Scripts.txt', v3.1.0, 16448 Property => 'Script', 16449 Has_Missings_Defaults => $NOT_IGNORED, 16450 ), 16451 Input_file->new('DNormalizationProps.txt', v3.1.0, 16452 Has_Missings_Defaults => $NOT_IGNORED, 16453 Each_Line_Handler => (($v_version lt v4.0.1) 16454 ? \&filter_old_style_normalization_lines 16455 : undef), 16456 ), 16457 Input_file->new('HangulSyllableType.txt', v4.0.0, 16458 Has_Missings_Defaults => $NOT_IGNORED, 16459 Property => 'Hangul_Syllable_Type'), 16460 Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0, 16461 Property => 'Word_Break', 16462 Has_Missings_Defaults => $NOT_IGNORED, 16463 ), 16464 Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1.0, 16465 Property => 'Grapheme_Cluster_Break', 16466 Has_Missings_Defaults => $NOT_IGNORED, 16467 ), 16468 Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0, 16469 Handler => \&process_GCB_test, 16470 ), 16471 Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0, 16472 Skip => 'Validation Tests', 16473 ), 16474 Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0, 16475 Skip => 'Validation Tests', 16476 ), 16477 Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0, 16478 Skip => 'Validation Tests', 16479 ), 16480 Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0, 16481 Property => 'Sentence_Break', 16482 Has_Missings_Defaults => $NOT_IGNORED, 16483 ), 16484 Input_file->new('NamedSequences.txt', v4.1.0, 16485 Handler => \&process_NamedSequences 16486 ), 16487 Input_file->new('NameAliases.txt', v5.0.0, 16488 Property => 'Name_Alias', 16489 Pre_Handler => ($v_version le v6.0.0) 16490 ? \&setup_early_name_alias 16491 : undef, 16492 Each_Line_Handler => ($v_version le v6.0.0) 16493 ? \&filter_early_version_name_alias_line 16494 : \&filter_later_version_name_alias_line, 16495 ), 16496 Input_file->new("BidiTest.txt", v5.2.0, 16497 Skip => 'Validation Tests', 16498 ), 16499 Input_file->new('UnihanIndicesDictionary.txt', v5.2.0, 16500 Optional => 1, 16501 Each_Line_Handler => \&filter_unihan_line, 16502 ), 16503 Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0, 16504 Optional => 1, 16505 Each_Line_Handler => \&filter_unihan_line, 16506 ), 16507 Input_file->new('UnihanIRGSources.txt', v5.2.0, 16508 Optional => 1, 16509 Pre_Handler => \&setup_unihan, 16510 Each_Line_Handler => \&filter_unihan_line, 16511 ), 16512 Input_file->new('UnihanNumericValues.txt', v5.2.0, 16513 Optional => 1, 16514 Each_Line_Handler => \&filter_unihan_line, 16515 ), 16516 Input_file->new('UnihanOtherMappings.txt', v5.2.0, 16517 Optional => 1, 16518 Each_Line_Handler => \&filter_unihan_line, 16519 ), 16520 Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0, 16521 Optional => 1, 16522 Each_Line_Handler => \&filter_unihan_line, 16523 ), 16524 Input_file->new('UnihanReadings.txt', v5.2.0, 16525 Optional => 1, 16526 Each_Line_Handler => \&filter_unihan_line, 16527 ), 16528 Input_file->new('UnihanVariants.txt', v5.2.0, 16529 Optional => 1, 16530 Each_Line_Handler => \&filter_unihan_line, 16531 ), 16532 Input_file->new('ScriptExtensions.txt', v6.0.0, 16533 Property => 'Script_Extensions', 16534 Pre_Handler => \&setup_script_extensions, 16535 Each_Line_Handler => \&filter_script_extensions_line, 16536 Has_Missings_Defaults => (($v_version le v6.0.0) 16537 ? $NO_DEFAULTS 16538 : $IGNORED), 16539 ), 16540 # The two Indic files are actually available starting in v6.0.0, but their 16541 # property values are missing from PropValueAliases.txt in that release, 16542 # so that further work would have to be done to get them to work properly 16543 # for that release. 16544 Input_file->new('IndicMatraCategory.txt', v6.1.0, 16545 Property => 'Indic_Matra_Category', 16546 Has_Missings_Defaults => $NOT_IGNORED, 16547 Skip => "Provisional; for the analysis and processing of Indic scripts", 16548 ), 16549 Input_file->new('IndicSyllabicCategory.txt', v6.1.0, 16550 Property => 'Indic_Syllabic_Category', 16551 Has_Missings_Defaults => $NOT_IGNORED, 16552 Skip => "Provisional; for the analysis and processing of Indic scripts", 16553 ), 16554); 16555 16556# End of all the preliminaries. 16557# Do it... 16558 16559if ($compare_versions) { 16560 Carp::my_carp(<<END 16561Warning. \$compare_versions is set. Output is not suitable for production 16562END 16563 ); 16564} 16565 16566# Put into %potential_files a list of all the files in the directory structure 16567# that could be inputs to this program, excluding those that we should ignore. 16568# Use absolute file names because it makes it easier across machine types. 16569my @ignored_files_full_names = map { File::Spec->rel2abs( 16570 internal_file_to_platform($_)) 16571 } keys %ignored_files; 16572File::Find::find({ 16573 wanted=>sub { 16574 return unless /\.txt$/i; # Some platforms change the name's case 16575 my $full = lc(File::Spec->rel2abs($_)); 16576 $potential_files{$full} = 1 16577 if ! grep { $full eq lc($_) } @ignored_files_full_names; 16578 return; 16579 } 16580}, File::Spec->curdir()); 16581 16582my @mktables_list_output_files; 16583my $old_start_time = 0; 16584 16585if (! -e $file_list) { 16586 print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE; 16587 $write_unchanged_files = 1; 16588} elsif ($write_unchanged_files) { 16589 print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE; 16590} 16591else { 16592 print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE; 16593 my $file_handle; 16594 if (! open $file_handle, "<", $file_list) { 16595 Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!"); 16596 $glob_list = 1; 16597 } 16598 else { 16599 my @input; 16600 16601 # Read and parse mktables.lst, placing the results from the first part 16602 # into @input, and the second part into @mktables_list_output_files 16603 for my $list ( \@input, \@mktables_list_output_files ) { 16604 while (<$file_handle>) { 16605 s/^ \s+ | \s+ $//xg; 16606 if (/^ \s* \# .* Autogenerated\ starting\ on\ (\d+)/x) { 16607 $old_start_time = $1; 16608 } 16609 next if /^ \s* (?: \# .* )? $/x; 16610 last if /^ =+ $/x; 16611 my ( $file ) = split /\t/; 16612 push @$list, $file; 16613 } 16614 @$list = uniques(@$list); 16615 next; 16616 } 16617 16618 # Look through all the input files 16619 foreach my $input (@input) { 16620 next if $input eq 'version'; # Already have checked this. 16621 16622 # Ignore if doesn't exist. The checking about whether we care or 16623 # not is done via the Input_file object. 16624 next if ! file_exists($input); 16625 16626 # The paths are stored with relative names, and with '/' as the 16627 # delimiter; convert to absolute on this machine 16628 my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input))); 16629 $potential_files{lc $full} = 1 16630 if ! grep { lc($full) eq lc($_) } @ignored_files_full_names; 16631 } 16632 } 16633 16634 close $file_handle; 16635} 16636 16637if ($glob_list) { 16638 16639 # Here wants to process all .txt files in the directory structure. 16640 # Convert them to full path names. They are stored in the platform's 16641 # relative style 16642 my @known_files; 16643 foreach my $object (@input_file_objects) { 16644 my $file = $object->file; 16645 next unless defined $file; 16646 push @known_files, File::Spec->rel2abs($file); 16647 } 16648 16649 my @unknown_input_files; 16650 foreach my $file (keys %potential_files) { # The keys are stored in lc 16651 next if grep { $file eq lc($_) } @known_files; 16652 16653 # Here, the file is unknown to us. Get relative path name 16654 $file = File::Spec->abs2rel($file); 16655 push @unknown_input_files, $file; 16656 16657 # What will happen is we create a data structure for it, and add it to 16658 # the list of input files to process. First get the subdirectories 16659 # into an array 16660 my (undef, $directories, undef) = File::Spec->splitpath($file); 16661 $directories =~ s;/$;;; # Can have extraneous trailing '/' 16662 my @directories = File::Spec->splitdir($directories); 16663 16664 # If the file isn't extracted (meaning none of the directories is the 16665 # extracted one), just add it to the end of the list of inputs. 16666 if (! grep { $EXTRACTED_DIR eq $_ } @directories) { 16667 push @input_file_objects, Input_file->new($file, v0); 16668 } 16669 else { 16670 16671 # Here, the file is extracted. It needs to go ahead of most other 16672 # processing. Search for the first input file that isn't a 16673 # special required property (that is, find one whose first_release 16674 # is non-0), and isn't extracted. Also, the Age property file is 16675 # processed before the extracted ones, just in case 16676 # $compare_versions is set. 16677 for (my $i = 0; $i < @input_file_objects; $i++) { 16678 if ($input_file_objects[$i]->first_released ne v0 16679 && lc($input_file_objects[$i]->file) ne 'dage.txt' 16680 && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i) 16681 { 16682 splice @input_file_objects, $i, 0, 16683 Input_file->new($file, v0); 16684 last; 16685 } 16686 } 16687 16688 } 16689 } 16690 if (@unknown_input_files) { 16691 print STDERR simple_fold(join_lines(<<END 16692 16693The following files are unknown as to how to handle. Assuming they are 16694typical property files. You'll know by later error messages if it worked or 16695not: 16696END 16697 ) . " " . join(", ", @unknown_input_files) . "\n\n"); 16698 } 16699} # End of looking through directory structure for more .txt files. 16700 16701# Create the list of input files from the objects we have defined, plus 16702# version 16703my @input_files = 'version'; 16704foreach my $object (@input_file_objects) { 16705 my $file = $object->file; 16706 next if ! defined $file; # Not all objects have files 16707 next if $object->optional && ! -e $file; 16708 push @input_files, $file; 16709} 16710 16711if ( $verbosity >= $VERBOSE ) { 16712 print "Expecting ".scalar( @input_files )." input files. ", 16713 "Checking ".scalar( @mktables_list_output_files )." output files.\n"; 16714} 16715 16716# We set $most_recent to be the most recently changed input file, including 16717# this program itself (done much earlier in this file) 16718foreach my $in (@input_files) { 16719 next unless -e $in; # Keep going even if missing a file 16720 my $mod_time = (stat $in)[9]; 16721 $most_recent = $mod_time if $mod_time > $most_recent; 16722 16723 # See that the input files have distinct names, to warn someone if they 16724 # are adding a new one 16725 if ($make_list) { 16726 my ($volume, $directories, $file ) = File::Spec->splitpath($in); 16727 $directories =~ s;/$;;; # Can have extraneous trailing '/' 16728 my @directories = File::Spec->splitdir($directories); 16729 my $base = $file =~ s/\.txt$//; 16730 construct_filename($file, 'mutable', \@directories); 16731 } 16732} 16733 16734my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild 16735 || ! scalar @mktables_list_output_files # or if no outputs known 16736 || $old_start_time < $most_recent; # or out-of-date 16737 16738# Now we check to see if any output files are older than youngest, if 16739# they are, we need to continue on, otherwise we can presumably bail. 16740if (! $rebuild) { 16741 foreach my $out (@mktables_list_output_files) { 16742 if ( ! file_exists($out)) { 16743 print "'$out' is missing.\n" if $verbosity >= $VERBOSE; 16744 $rebuild = 1; 16745 last; 16746 } 16747 #local $to_trace = 1 if main::DEBUG; 16748 trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace; 16749 if ( (stat $out)[9] <= $most_recent ) { 16750 #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace; 16751 print "'$out' is too old.\n" if $verbosity >= $VERBOSE; 16752 $rebuild = 1; 16753 last; 16754 } 16755 } 16756} 16757if (! $rebuild) { 16758 print "Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n"; 16759 exit(0); 16760} 16761print "Must rebuild tables.\n" if $verbosity >= $VERBOSE; 16762 16763# Ready to do the major processing. First create the perl pseudo-property. 16764$perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1); 16765 16766# Process each input file 16767foreach my $file (@input_file_objects) { 16768 $file->run; 16769} 16770 16771# Finish the table generation. 16772 16773print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS; 16774finish_Unicode(); 16775 16776print "Compiling Perl properties\n" if $verbosity >= $PROGRESS; 16777compile_perl(); 16778 16779print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS; 16780add_perl_synonyms(); 16781 16782print "Writing tables\n" if $verbosity >= $PROGRESS; 16783write_all_tables(); 16784 16785# Write mktables.lst 16786if ( $file_list and $make_list ) { 16787 16788 print "Updating '$file_list'\n" if $verbosity >= $PROGRESS; 16789 foreach my $file (@input_files, @files_actually_output) { 16790 my (undef, $directories, $file) = File::Spec->splitpath($file); 16791 my @directories = File::Spec->splitdir($directories); 16792 $file = join '/', @directories, $file; 16793 } 16794 16795 my $ofh; 16796 if (! open $ofh,">",$file_list) { 16797 Carp::my_carp("Can't write to '$file_list'. Skipping: $!"); 16798 return 16799 } 16800 else { 16801 my $localtime = localtime $start_time; 16802 print $ofh <<"END"; 16803# 16804# $file_list -- File list for $0. 16805# 16806# Autogenerated starting on $start_time ($localtime) 16807# 16808# - First section is input files 16809# ($0 itself is not listed but is automatically considered an input) 16810# - Section separator is /^=+\$/ 16811# - Second section is a list of output files. 16812# - Lines matching /^\\s*#/ are treated as comments 16813# which along with blank lines are ignored. 16814# 16815 16816# Input files: 16817 16818END 16819 print $ofh "$_\n" for sort(@input_files); 16820 print $ofh "\n=================================\n# Output files:\n\n"; 16821 print $ofh "$_\n" for sort @files_actually_output; 16822 print $ofh "\n# ",scalar(@input_files)," input files\n", 16823 "# ",scalar(@files_actually_output)+1," output files\n\n", 16824 "# End list\n"; 16825 close $ofh 16826 or Carp::my_carp("Failed to close $ofh: $!"); 16827 16828 print "Filelist has ",scalar(@input_files)," input files and ", 16829 scalar(@files_actually_output)+1," output files\n" 16830 if $verbosity >= $VERBOSE; 16831 } 16832} 16833 16834# Output these warnings unless -q explicitly specified. 16835if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) { 16836 if (@unhandled_properties) { 16837 print "\nProperties and tables that unexpectedly have no code points\n"; 16838 foreach my $property (sort @unhandled_properties) { 16839 print $property, "\n"; 16840 } 16841 } 16842 16843 if (%potential_files) { 16844 print "\nInput files that are not considered:\n"; 16845 foreach my $file (sort keys %potential_files) { 16846 print File::Spec->abs2rel($file), "\n"; 16847 } 16848 } 16849 print "\nAll done\n" if $verbosity >= $VERBOSE; 16850} 16851exit(0); 16852 16853# TRAILING CODE IS USED BY make_property_test_script() 16854__DATA__ 16855 16856use strict; 16857use warnings; 16858 16859# If run outside the normal test suite on an ASCII platform, you can 16860# just create a latin1_to_native() function that just returns its 16861# inputs, because that's the only function used from test.pl 16862require "test.pl"; 16863 16864# Test qr/\X/ and the \p{} regular expression constructs. This file is 16865# constructed by mktables from the tables it generates, so if mktables is 16866# buggy, this won't necessarily catch those bugs. Tests are generated for all 16867# feasible properties; a few aren't currently feasible; see 16868# is_code_point_usable() in mktables for details. 16869 16870# Standard test packages are not used because this manipulates SIG_WARN. It 16871# exits 0 if every non-skipped test succeeded; -1 if any failed. 16872 16873my $Tests = 0; 16874my $Fails = 0; 16875 16876sub Expect($$$$) { 16877 my $expected = shift; 16878 my $ord = shift; 16879 my $regex = shift; 16880 my $warning_type = shift; # Type of warning message, like 'deprecated' 16881 # or empty if none 16882 my $line = (caller)[2]; 16883 $ord = ord(latin1_to_native(chr($ord))); 16884 16885 # Convert the code point to hex form 16886 my $string = sprintf "\"\\x{%04X}\"", $ord; 16887 16888 my @tests = ""; 16889 16890 # The first time through, use all warnings. If the input should generate 16891 # a warning, add another time through with them turned off 16892 push @tests, "no warnings '$warning_type';" if $warning_type; 16893 16894 foreach my $no_warnings (@tests) { 16895 16896 # Store any warning messages instead of outputting them 16897 local $SIG{__WARN__} = $SIG{__WARN__}; 16898 my $warning_message; 16899 $SIG{__WARN__} = sub { $warning_message = $_[0] }; 16900 16901 $Tests++; 16902 16903 # A string eval is needed because of the 'no warnings'. 16904 # Assumes no parens in the regular expression 16905 my $result = eval "$no_warnings 16906 my \$RegObj = qr($regex); 16907 $string =~ \$RegObj ? 1 : 0"; 16908 if (not defined $result) { 16909 print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n"; 16910 $Fails++; 16911 } 16912 elsif ($result ^ $expected) { 16913 print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n"; 16914 $Fails++; 16915 } 16916 elsif ($warning_message) { 16917 if (! $warning_type || ($warning_type && $no_warnings)) { 16918 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n"; 16919 $Fails++; 16920 } 16921 else { 16922 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n"; 16923 } 16924 } 16925 elsif ($warning_type && ! $no_warnings) { 16926 print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n"; 16927 $Fails++; 16928 } 16929 else { 16930 print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n"; 16931 } 16932 } 16933 return; 16934} 16935 16936sub Error($) { 16937 my $regex = shift; 16938 $Tests++; 16939 if (eval { 'x' =~ qr/$regex/; 1 }) { 16940 $Fails++; 16941 my $line = (caller)[2]; 16942 print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n"; 16943 } 16944 else { 16945 my $line = (caller)[2]; 16946 print "ok $Tests - got and expected error for qr/$regex/; line $line\n"; 16947 } 16948 return; 16949} 16950 16951# GCBTest.txt character that separates grapheme clusters 16952my $breakable_utf8 = my $breakable = chr(0xF7); 16953utf8::upgrade($breakable_utf8); 16954 16955# GCBTest.txt character that indicates that the adjoining code points are part 16956# of the same grapheme cluster 16957my $nobreak_utf8 = my $nobreak = chr(0xD7); 16958utf8::upgrade($nobreak_utf8); 16959 16960sub Test_X($) { 16961 # Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt 16962 # Each such line is a sequence of code points given by their hex numbers, 16963 # separated by the two characters defined just before this subroutine that 16964 # indicate that either there can or cannot be a break between the adjacent 16965 # code points. If there isn't a break, that means the sequence forms an 16966 # extended grapheme cluster, which means that \X should match the whole 16967 # thing. If there is a break, \X should stop there. This is all 16968 # converted by this routine into a match: 16969 # $string =~ /(\X)/, 16970 # Each \X should match the next cluster; and that is what is checked. 16971 16972 my $template = shift; 16973 16974 my $line = (caller)[2]; 16975 16976 # The line contains characters above the ASCII range, but in Latin1. It 16977 # may or may not be in utf8, and if it is, it may or may not know it. So, 16978 # convert these characters to 8 bits. If knows is in utf8, simply 16979 # downgrade. 16980 if (utf8::is_utf8($template)) { 16981 utf8::downgrade($template); 16982 } else { 16983 16984 # Otherwise, if it is in utf8, but doesn't know it, the next lines 16985 # convert the two problematic characters to their 8-bit equivalents. 16986 # If it isn't in utf8, they don't harm anything. 16987 use bytes; 16988 $template =~ s/$nobreak_utf8/$nobreak/g; 16989 $template =~ s/$breakable_utf8/$breakable/g; 16990 } 16991 16992 # Get rid of the leading and trailing breakables 16993 $template =~ s/^ \s* $breakable \s* //x; 16994 $template =~ s/ \s* $breakable \s* $ //x; 16995 16996 # And no-breaks become just a space. 16997 $template =~ s/ \s* $nobreak \s* / /xg; 16998 16999 # Split the input into segments that are breakable between them. 17000 my @segments = split /\s*$breakable\s*/, $template; 17001 17002 my $string = ""; 17003 my $display_string = ""; 17004 my @should_match; 17005 my @should_display; 17006 17007 # Convert the code point sequence in each segment into a Perl string of 17008 # characters 17009 foreach my $segment (@segments) { 17010 my @code_points = split /\s+/, $segment; 17011 my $this_string = ""; 17012 my $this_display = ""; 17013 foreach my $code_point (@code_points) { 17014 $this_string .= latin1_to_native(chr(hex $code_point)); 17015 $this_display .= "\\x{$code_point}"; 17016 } 17017 17018 # The next cluster should match the string in this segment. 17019 push @should_match, $this_string; 17020 push @should_display, $this_display; 17021 $string .= $this_string; 17022 $display_string .= $this_display; 17023 } 17024 17025 # If a string can be represented in both non-ut8 and utf8, test both cases 17026 UPGRADE: 17027 for my $to_upgrade (0 .. 1) { 17028 17029 if ($to_upgrade) { 17030 17031 # If already in utf8, would just be a repeat 17032 next UPGRADE if utf8::is_utf8($string); 17033 17034 utf8::upgrade($string); 17035 } 17036 17037 # Finally, do the \X match. 17038 my @matches = $string =~ /(\X)/g; 17039 17040 # Look through each matched cluster to verify that it matches what we 17041 # expect. 17042 my $min = (@matches < @should_match) ? @matches : @should_match; 17043 for my $i (0 .. $min - 1) { 17044 $Tests++; 17045 if ($matches[$i] eq $should_match[$i]) { 17046 print "ok $Tests - "; 17047 if ($i == 0) { 17048 print "In \"$display_string\" =~ /(\\X)/g, \\X #1"; 17049 } else { 17050 print "And \\X #", $i + 1, 17051 } 17052 print " correctly matched $should_display[$i]; line $line\n"; 17053 } else { 17054 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ } 17055 unpack("U*", $matches[$i])); 17056 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #", 17057 $i + 1, 17058 " should have matched $should_display[$i]", 17059 " but instead matched $matches[$i]", 17060 ". Abandoning rest of line $line\n"; 17061 next UPGRADE; 17062 } 17063 } 17064 17065 # And the number of matches should equal the number of expected matches. 17066 $Tests++; 17067 if (@matches == @should_match) { 17068 print "ok $Tests - Nothing was left over; line $line\n"; 17069 } else { 17070 print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n"; 17071 } 17072 } 17073 17074 return; 17075} 17076 17077sub Finished() { 17078 print "1..$Tests\n"; 17079 exit($Fails ? -1 : 0); 17080} 17081 17082Error('\p{Script=InGreek}'); # Bug #69018 17083Test_X("1100 $nobreak 1161"); # Bug #70940 17084Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722 17085Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722 17086Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726 17087