1#!/usr/bin/perl -w 2 3# !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!! 4# Any files created or read by this program should be listed in 'mktables.lst' 5# Use -makelist to regenerate it. 6 7# There was an attempt when this was first rewritten to make it 5.8 8# compatible, but that has now been abandoned, and newer constructs are used 9# as convenient. 10 11# NOTE: this script can run quite slowly in older/slower systems. 12# It can also consume a lot of memory (128 MB or more), you may need 13# to raise your process resource limits (e.g. in bash, "ulimit -a" 14# to inspect, and "ulimit -d ..." or "ulimit -m ..." to set) 15 16my $start_time; 17BEGIN { # Get the time the script started running; do it at compilation to 18 # get it as close as possible 19 $start_time= time; 20} 21 22require 5.010_001; 23use strict; 24use warnings; 25use builtin qw(refaddr); 26use Carp; 27use Config; 28use File::Find; 29use File::Path; 30use File::Spec; 31use Text::Tabs; 32use re "/aa"; 33 34use feature 'state'; 35use feature 'signatures'; 36no warnings qw( experimental::builtin ); 37 38sub DEBUG () { 0 } # Set to 0 for production; 1 for development 39$| = 1 if DEBUG; 40my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/; 41 42sub NON_ASCII_PLATFORM { ord("A") != 65 } 43 44# When a new version of Unicode is published, unfortunately the algorithms for 45# dealing with various bounds, like \b{gcb}, \b{lb} may have to be updated 46# manually. The changes may or may not be backward compatible with older 47# releases. The code is in regen/mk_invlist.pl and regexec.c. Make the 48# changes, then come back here and set the variable below to what version the 49# code is expecting. If a newer version of Unicode is being compiled than 50# expected, a warning will be generated. If an older version is being 51# compiled, any bounds tests that fail in the generated test file (-maketest 52# option) will be marked as TODO. 53my $version_of_mk_invlist_bounds = v15.0.0; 54 55########################################################################## 56# 57# mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl), 58# from the Unicode database files (lib/unicore/.../*.txt), It also generates 59# a pod file and .t files, depending on option parameters. 60# 61# The structure of this file is: 62# First these introductory comments; then 63# code needed for everywhere, such as debugging stuff; then 64# code to handle input parameters; then 65# data structures likely to be of external interest (some of which depend on 66# the input parameters, so follows them; then 67# more data structures and subroutine and package (class) definitions; then 68# the small actual loop to process the input files and finish up; then 69# a __DATA__ section, for the .t tests 70# 71# This program works on all releases of Unicode so far. The outputs have been 72# scrutinized most intently for release 5.1. The others have been checked for 73# somewhat more than just sanity. It can handle all non-provisional Unicode 74# character properties in those releases. 75# 76# This program is mostly about Unicode character (or code point) properties. 77# A property describes some attribute or quality of a code point, like if it 78# is lowercase or not, its name, what version of Unicode it was first defined 79# in, or what its uppercase equivalent is. Unicode deals with these disparate 80# possibilities by making all properties into mappings from each code point 81# into some corresponding value. In the case of it being lowercase or not, 82# the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each 83# property maps each Unicode code point to a single value, called a "property 84# value". (Some more recently defined properties, map a code point to a set 85# of values.) 86# 87# When using a property in a regular expression, what is desired isn't the 88# mapping of the code point to its property's value, but the reverse (or the 89# mathematical "inverse relation"): starting with the property value, "Does a 90# code point map to it?" These are written in a "compound" form: 91# \p{property=value}, e.g., \p{category=punctuation}. This program generates 92# files containing the lists of code points that map to each such regular 93# expression property value, one file per list 94# 95# There is also a single form shortcut that Perl adds for many of the commonly 96# used properties. This happens for all binary properties, plus script, 97# general_category, and block properties. 98# 99# Thus the outputs of this program are files. There are map files, mostly in 100# the 'To' directory; and there are list files for use in regular expression 101# matching, all in subdirectories of the 'lib' directory, with each 102# subdirectory being named for the property that the lists in it are for. 103# Bookkeeping, test, and documentation files are also generated. 104 105my $matches_directory = 'lib'; # Where match (\p{}) files go. 106my $map_directory = 'To'; # Where map files go. 107 108# DATA STRUCTURES 109# 110# The major data structures of this program are Property, of course, but also 111# Table. There are two kinds of tables, very similar to each other. 112# "Match_Table" is the data structure giving the list of code points that have 113# a particular property value, mentioned above. There is also a "Map_Table" 114# data structure which gives the property's mapping from code point to value. 115# There are two structures because the match tables need to be combined in 116# various ways, such as constructing unions, intersections, complements, etc., 117# and the map ones don't. And there would be problems, perhaps subtle, if 118# a map table were inadvertently operated on in some of those ways. 119# The use of separate classes with operations defined on one but not the other 120# prevents accidentally confusing the two. 121# 122# At the heart of each table's data structure is a "Range_List", which is just 123# an ordered list of "Ranges", plus ancillary information, and methods to 124# operate on them. A Range is a compact way to store property information. 125# Each range has a starting code point, an ending code point, and a value that 126# is meant to apply to all the code points between the two end points, 127# inclusive. For a map table, this value is the property value for those 128# code points. Two such ranges could be written like this: 129# 0x41 .. 0x5A, 'Upper', 130# 0x61 .. 0x7A, 'Lower' 131# 132# Each range also has a type used as a convenience to classify the values. 133# Most ranges in this program will be Type 0, or normal, but there are some 134# ranges that have a non-zero type. These are used only in map tables, and 135# are for mappings that don't fit into the normal scheme of things. Mappings 136# that require a hash entry to communicate with utf8.c are one example; 137# another example is mappings for charnames.pm to use which indicate a name 138# that is algorithmically determinable from its code point (and the reverse). 139# These are used to significantly compact these tables, instead of listing 140# each one of the tens of thousands individually. 141# 142# In a match table, the value of a range is irrelevant (and hence the type as 143# well, which will always be 0), and arbitrarily set to the empty string. 144# Using the example above, there would be two match tables for those two 145# entries, one named Upper would contain the 0x41..0x5A range, and the other 146# named Lower would contain 0x61..0x7A. 147# 148# Actually, there are two types of range lists, "Range_Map" is the one 149# associated with map tables, and "Range_List" with match tables. 150# Again, this is so that methods can be defined on one and not the others so 151# as to prevent operating on them in incorrect ways. 152# 153# Eventually, most tables are written out to files to be read by Unicode::UCD. 154# All tables could in theory be written, but some are suppressed because there 155# is no current practical use for them. It is easy to change which get 156# written by changing various lists that are near the top of the actual code 157# in this file. The table data structures contain enough ancillary 158# information to allow them to be treated as separate entities for writing, 159# such as the path to each one's file. There is a heading in each map table 160# that gives the format of its entries, and what the map is for all the code 161# points missing from it. (This allows tables to be more compact.) 162# 163# The Property data structure contains one or more tables. All properties 164# contain a map table (except the $perl property which is a 165# pseudo-property containing only match tables), and any properties that 166# are usable in regular expression matches also contain various matching 167# tables, one for each value the property can have. A binary property can 168# have two values, True and False (or Y and N, which are preferred by Unicode 169# terminology). Thus each of these properties will have a map table that 170# takes every code point and maps it to Y or N (but having ranges cuts the 171# number of entries in that table way down), and two match tables, one 172# which has a list of all the code points that map to Y, and one for all the 173# code points that map to N. (For each binary property, a third table is also 174# generated for the pseudo Perl property. It contains the identical code 175# points as the Y table, but can be written in regular expressions, not in the 176# compound form, but in a "single" form like \p{IsUppercase}.) Many 177# properties are binary, but some properties have several possible values, 178# some have many, and properties like Name have a different value for every 179# named code point. Those will not, unless the controlling lists are changed, 180# have their match tables written out. But all the ones which can be used in 181# regular expression \p{} and \P{} constructs will. Prior to 5.14, generally 182# a property would have either its map table or its match tables written but 183# not both. Again, what gets written is controlled by lists which can easily 184# be changed. Starting in 5.14, advantage was taken of this, and all the map 185# tables needed to reconstruct the Unicode db are now written out, while 186# suppressing the Unicode .txt files that contain the data. Our tables are 187# much more compact than the .txt files, so a significant space savings was 188# achieved. Also, tables are not written out that are trivially derivable 189# from tables that do get written. So, there typically is no file containing 190# the code points not matched by a binary property (the table for \P{} versus 191# lowercase \p{}), since you just need to invert the True table to get the 192# False table. 193 194# Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on 195# how many match tables there are and the content of the maps. This 'Type' is 196# different than a range 'Type', so don't get confused by the two concepts 197# having the same name. 198# 199# For information about the Unicode properties, see Unicode's UAX44 document: 200 201my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/'; 202 203# As stated earlier, this program will work on any release of Unicode so far. 204# Most obvious problems in earlier data have NOT been corrected except when 205# necessary to make Perl or this program work reasonably, and to keep out 206# potential security issues. For example, no folding information was given in 207# early releases, so this program substitutes lower case instead, just so that 208# a regular expression with the /i option will do something that actually 209# gives the right results in many cases. There are also a couple other 210# corrections for version 1.1.5, commented at the point they are made. As an 211# example of corrections that weren't made (but could be) is this statement 212# from DerivedAge.txt: "The supplementary private use code points and the 213# non-character code points were assigned in version 2.0, but not specifically 214# listed in the UCD until versions 3.0 and 3.1 respectively." (To be precise 215# it was 3.0.1 not 3.0.0) More information on Unicode version glitches is 216# further down in these introductory comments. 217# 218# This program works on all non-provisional properties as of the current 219# Unicode release, though the files for some are suppressed for various 220# reasons. You can change which are output by changing lists in this program. 221# 222# The old version of mktables emphasized the term "Fuzzy" to mean Unicode's 223# loose matchings rules (from Unicode TR18): 224# 225# The recommended names for UCD properties and property values are in 226# PropertyAliases.txt [Prop] and PropertyValueAliases.txt 227# [PropValue]. There are both abbreviated names and longer, more 228# descriptive names. It is strongly recommended that both names be 229# recognized, and that loose matching of property names be used, 230# whereby the case distinctions, whitespace, hyphens, and underbar 231# are ignored. 232# 233# The program still allows Fuzzy to override its determination of if loose 234# matching should be used, but it isn't currently used, as it is no longer 235# needed; the calculations it makes are good enough. 236# 237# SUMMARY OF HOW IT WORKS: 238# 239# Process arguments 240# 241# A list is constructed containing each input file that is to be processed 242# 243# Each file on the list is processed in a loop, using the associated handler 244# code for each: 245# The PropertyAliases.txt and PropValueAliases.txt files are processed 246# first. These files name the properties and property values. 247# Objects are created of all the property and property value names 248# that the rest of the input should expect, including all synonyms. 249# The other input files give mappings from properties to property 250# values. That is, they list code points and say what the mapping 251# is under the given property. Some files give the mappings for 252# just one property; and some for many. This program goes through 253# each file and populates the properties and their map tables from 254# them. Some properties are listed in more than one file, and 255# Unicode has set up a precedence as to which has priority if there 256# is a conflict. Thus the order of processing matters, and this 257# program handles the conflict possibility by processing the 258# overriding input files last, so that if necessary they replace 259# earlier values. 260# After this is all done, the program creates the property mappings not 261# furnished by Unicode, but derivable from what it does give. 262# The tables of code points that match each property value in each 263# property that is accessible by regular expressions are created. 264# The Perl-defined properties are created and populated. Many of these 265# require data determined from the earlier steps 266# Any Perl-defined synonyms are created, and name clashes between Perl 267# and Unicode are reconciled and warned about. 268# All the properties are written to files 269# Any other files are written, and final warnings issued. 270# 271# For clarity, a number of operators have been overloaded to work on tables: 272# ~ means invert (take all characters not in the set). The more 273# conventional '!' is not used because of the possibility of confusing 274# it with the actual boolean operation. 275# + means union 276# - means subtraction 277# & means intersection 278# The precedence of these is the order listed. Parentheses should be 279# copiously used. These are not a general scheme. The operations aren't 280# defined for a number of things, deliberately, to avoid getting into trouble. 281# Operations are done on references and affect the underlying structures, so 282# that the copy constructors for them have been overloaded to not return a new 283# clone, but the input object itself. 284# 285# The bool operator is deliberately not overloaded to avoid confusion with 286# "should it mean if the object merely exists, or also is non-empty?". 287# 288# WHY CERTAIN DESIGN DECISIONS WERE MADE 289# 290# This program needs to be able to run under miniperl. Therefore, it uses a 291# minimum of other modules, and hence implements some things itself that could 292# be gotten from CPAN 293# 294# This program uses inputs published by the Unicode Consortium. These can 295# change incompatibly between releases without the Perl maintainers realizing 296# it. Therefore this program is now designed to try to flag these. It looks 297# at the directories where the inputs are, and flags any unrecognized files. 298# It keeps track of all the properties in the files it handles, and flags any 299# that it doesn't know how to handle. It also flags any input lines that 300# don't match the expected syntax, among other checks. 301# 302# It is also designed so if a new input file matches one of the known 303# templates, one hopefully just needs to add it to a list to have it 304# processed. 305# 306# As mentioned earlier, some properties are given in more than one file. In 307# particular, the files in the extracted directory are supposedly just 308# reformattings of the others. But they contain information not easily 309# derivable from the other files, including results for Unihan (which isn't 310# usually available to this program) and for unassigned code points. They 311# also have historically had errors or been incomplete. In an attempt to 312# create the best possible data, this program thus processes them first to 313# glean information missing from the other files; then processes those other 314# files to override any errors in the extracted ones. Much of the design was 315# driven by this need to store things and then possibly override them. 316# 317# It tries to keep fatal errors to a minimum, to generate something usable for 318# testing purposes. It always looks for files that could be inputs, and will 319# warn about any that it doesn't know how to handle (the -q option suppresses 320# the warning). 321# 322# Why is there more than one type of range? 323# This simplified things. There are some very specialized code points that 324# have to be handled specially for output, such as Hangul syllable names. 325# By creating a range type (done late in the development process), it 326# allowed this to be stored with the range, and overridden by other input. 327# Originally these were stored in another data structure, and it became a 328# mess trying to decide if a second file that was for the same property was 329# overriding the earlier one or not. 330# 331# Why are there two kinds of tables, match and map? 332# (And there is a base class shared by the two as well.) As stated above, 333# they actually are for different things. Development proceeded much more 334# smoothly when I (khw) realized the distinction. Map tables are used to 335# give the property value for every code point (actually every code point 336# that doesn't map to a default value). Match tables are used for regular 337# expression matches, and are essentially the inverse mapping. Separating 338# the two allows more specialized methods, and error checks so that one 339# can't just take the intersection of two map tables, for example, as that 340# is nonsensical. 341# 342# What about 'fate' and 'status'. The concept of a table's fate was created 343# late when it became clear that something more was needed. The difference 344# between this and 'status' is unclean, and could be improved if someone 345# wanted to spend the effort. 346# 347# DEBUGGING 348# 349# This program is written so it will run under miniperl. Occasionally changes 350# will cause an error where the backtrace doesn't work well under miniperl. 351# To diagnose the problem, you can instead run it under regular perl, if you 352# have one compiled. 353# 354# There is a good trace facility. To enable it, first sub DEBUG must be set 355# to return true. Then a line like 356# 357# local $to_trace = 1 if main::DEBUG; 358# 359# can be added to enable tracing in its lexical scope (plus dynamic) or until 360# you insert another line: 361# 362# local $to_trace = 0 if main::DEBUG; 363# 364# To actually trace, use a line like "trace $a, @b, %c, ...; 365# 366# Some of the more complex subroutines already have trace statements in them. 367# Permanent trace statements should be like: 368# 369# trace ... if main::DEBUG && $to_trace; 370# 371# main::stack_trace() will display what its name implies 372# 373# If there is just one or a few files that you're debugging, you can easily 374# cause most everything else to be skipped. Change the line 375# 376# my $debug_skip = 0; 377# 378# to 1, and every file whose object is in @input_file_objects and doesn't have 379# a, 'non_skip => 1,' in its constructor will be skipped. However, skipping 380# Jamo.txt or UnicodeData.txt will likely cause fatal errors. 381# 382# To compare the output tables, it may be useful to specify the -annotate 383# flag. (As of this writing, this can't be done on a clean workspace, due to 384# requirements in Text::Tabs used in this option; so first run mktables 385# without this option.) This option adds comment lines to each table, one for 386# each non-algorithmically named character giving, currently its code point, 387# name, and graphic representation if printable (and you have a font that 388# knows about it). This makes it easier to see what the particular code 389# points are in each output table. Non-named code points are annotated with a 390# description of their status, and contiguous ones with the same description 391# will be output as a range rather than individually. Algorithmically named 392# characters are also output as ranges, except when there are just a few 393# contiguous ones. 394# 395# FUTURE ISSUES 396# 397# The program would break if Unicode were to change its names so that 398# interior white space, underscores, or dashes differences were significant 399# within property and property value names. 400# 401# It might be easier to use the xml versions of the UCD if this program ever 402# would need heavy revision, and the ability to handle old versions was not 403# required. Also, it turns out to be risky to rely on this, as in early 2024, 404# Unicode decided to drop the xml version. It was news to many that this was 405# not considered to be an official product that needs to be maintained going 406# forward. Someone acceptable to the Unicode management volunteered to take 407# over from the retiring volunteer, and so it continues, but beware. 408# 409# There is the potential for name collisions, in that Perl has chosen names 410# that Unicode could decide it also likes. There have been such collisions in 411# the past, with mostly Perl deciding to adopt the Unicode definition of the 412# name. However in the 5.2 Unicode beta testing, there were a number of such 413# collisions, which were withdrawn before the final release, because of Perl's 414# and other's protests. These all involved new properties which began with 415# 'Is'. Based on the protests, Unicode is unlikely to try that again. Also, 416# many of the Perl-defined synonyms, like Any, Word, etc, are listed in a 417# Unicode document, so they are unlikely to be used by Unicode for another 418# purpose. However, they might try something beginning with 'In', or use any 419# of the other Perl-defined properties. This program will warn you of name 420# collisions, and refuse to generate tables with them, but manual intervention 421# will be required in this event. One scheme that could be implemented, if 422# necessary, would be to have this program generate another file, or add a 423# field to mktables.lst that gives the date of first definition of a property. 424# Each new release of Unicode would use that file as a basis for the next 425# iteration. And the Perl synonym addition code could sort based on the age 426# of the property, so older properties get priority, and newer ones that clash 427# would be refused; hence existing code would not be impacted, and some other 428# synonym would have to be used for the new property. This is ugly, and 429# manual intervention would certainly be easier to do in the short run; lets 430# hope it never comes to this. 431# 432# A NOTE ON UNIHAN 433# 434# This program can generate tables from the Unihan database. But that DB 435# isn't normally available, so it is marked as optional. Prior to version 436# 5.2, this database was in a single file, Unihan.txt. In 5.2 the database 437# was split into 8 different files, all beginning with the letters 'Unihan'. 438# If you plunk those files down into the directory mktables ($0) is in, this 439# program will read them and automatically create tables for the properties 440# from it that are listed in PropertyAliases.txt and PropValueAliases.txt, 441# plus any you add to the @cjk_properties array and the @cjk_property_values 442# array, being sure to add necessary '# @missings' lines to the latter. For 443# Unicode versions earlier than 5.2, most of the Unihan properties are not 444# listed at all in PropertyAliases nor PropValueAliases. This program assumes 445# for these early releases that you want the properties that are specified in 446# the 5.2 release. 447# 448# You may need to adjust the entries to suit your purposes. setup_unihan(), 449# and filter_unihan_line() are the functions where this is done. This program 450# already does some adjusting to make the lines look more like the rest of the 451# Unicode DB; You can see what that is in filter_unihan_line() 452# 453# There is a bug in the 3.2 data file in which some values for the 454# kPrimaryNumeric property have commas and an unexpected comment. A filter 455# could be added to correct these; or for a particular installation, the 456# Unihan.txt file could be edited to fix them. 457# 458# HOW TO ADD A FILE TO BE PROCESSED 459# 460# A new file from Unicode needs to have an object constructed for it in 461# @input_file_objects, probably at the end or at the end of the extracted 462# ones. The program should warn you if its name will clash with others on 463# restrictive file systems, like DOS. If so, figure out a better name, and 464# add lines to the README.perl file giving that. If the file is a character 465# property, it should be in the format that Unicode has implicitly 466# standardized for such files for the more recently introduced ones. 467# If so, the Input_file constructor for @input_file_objects can just be the 468# file name and release it first appeared in. If not, then it should be 469# possible to construct an each_line_handler() to massage the line into the 470# standardized form. 471# 472# For non-character properties, more code will be needed. You can look at 473# the existing entries for clues. 474# 475# UNICODE VERSIONS NOTES 476# 477# The Unicode UCD has had a number of errors in it over the versions. And 478# these remain, by policy, in the standard for that version. Therefore it is 479# risky to correct them, because code may be expecting the error. So this 480# program doesn't generally make changes, unless the error breaks the Perl 481# core. As an example, some versions of 2.1.x Jamo.txt have the wrong value 482# for U+1105, which causes real problems for the algorithms for Jamo 483# calculations, so it is changed here. 484# 485# But it isn't so clear cut as to what to do about concepts that are 486# introduced in a later release; should they extend back to earlier releases 487# where the concept just didn't exist? It was easier to do this than to not, 488# so that's what was done. For example, the default value for code points not 489# in the files for various properties was probably undefined until changed by 490# some version. No_Block for blocks is such an example. This program will 491# assign No_Block even in Unicode versions that didn't have it. This has the 492# benefit that code being written doesn't have to special case earlier 493# versions; and the detriment that it doesn't match the Standard precisely for 494# the affected versions. 495# 496# Here are some observations about some of the issues in early versions: 497# 498# Prior to version 3.0, there were 3 character decompositions. These are not 499# handled by Unicode::Normalize, nor will it compile when presented a version 500# that has them. However, you can trivially get it to compile by simply 501# ignoring those decompositions, by changing the croak to a carp. At the time 502# of this writing, the line (in dist/Unicode-Normalize/Normalize.pm or 503# dist/Unicode-Normalize/mkheader) reads 504# 505# croak("Weird Canonical Decomposition of U+$h"); 506# 507# Simply comment it out. It will compile, but will not know about any three 508# character decompositions. 509 510# The number of code points in \p{alpha=True} halved in 2.1.9. It turns out 511# that the reason is that the CJK block starting at 4E00 was removed from 512# PropList, and was not put back in until 3.1.0. The Perl extension (the 513# single property name \p{alpha}) has the correct values. But the compound 514# form is simply not generated until 3.1, as it can be argued that prior to 515# this release, this was not an official property. The comments for 516# filter_old_style_proplist() give more details. 517# 518# Unicode introduced the synonym Space for White_Space in 4.1. Perl has 519# always had a \p{Space}. In release 3.2 only, they are not synonymous. The 520# reason is that 3.2 introduced U+205F=medium math space, which was not 521# classed as white space, but Perl figured out that it should have been. 4.0 522# reclassified it correctly. 523# 524# Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2 525# this was erroneously a synonym for 202 (it should be 200). In 4.0, ATB 526# became 202, and ATBL was left with no code points, as all the ones that 527# mapped to 202 stayed mapped to 202. Thus if your program used the numeric 528# name for the class, it would not have been affected, but if it used the 529# mnemonic, it would have been. 530# 531# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that, code 532# points which eventually came to have this script property value, instead 533# mapped to "Unknown". But in the next release all these code points were 534# moved to \p{sc=common} instead. 535 536# The tests furnished by Unicode for testing WordBreak and SentenceBreak 537# generate errors in 5.0 and earlier. 538# 539# The default for missing code points for BidiClass is complicated. Starting 540# in 3.1.1, the derived file DBidiClass.txt handles this, but this program 541# tries to do the best it can for earlier releases. It is done in 542# process_PropertyAliases() 543# 544# In version 2.1.2, the entry in UnicodeData.txt: 545# 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F; 546# should instead be 547# 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F 548# Without this change, there are casing problems for this character. 549# 550# Search for $string_compare_versions to see how to compare changes to 551# properties between Unicode versions 552# 553############################################################################## 554 555my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing 556 # and errors 557my $MAX_LINE_WIDTH = 78; 558 559# Debugging aid to skip most files so as to not be distracted by them when 560# concentrating on the ones being debugged. Add 561# non_skip => 1, 562# to the constructor for those files you want processed when you set this. 563# Files with a first version number of 0 are special: they are always 564# processed regardless of the state of this flag. Generally, Jamo.txt and 565# UnicodeData.txt must not be skipped if you want this program to not die 566# before normal completion. 567my $debug_skip = 0; 568 569 570# Normally these are suppressed. 571my $write_Unicode_deprecated_tables = 0; 572 573# Set to 1 to enable tracing. 574our $to_trace = 0; 575 576{ # Closure for trace: debugging aid 577 my $print_caller = 1; # ? Include calling subroutine name 578 my $main_with_colon = 'main::'; 579 my $main_colon_length = length($main_with_colon); 580 581 sub trace { 582 return unless $to_trace; # Do nothing if global flag not set 583 584 my @input = @_; 585 586 local $DB::trace = 0; 587 $DB::trace = 0; # Quiet 'used only once' message 588 589 my $line_number; 590 591 # Loop looking up the stack to get the first non-trace caller 592 my $caller_line; 593 my $caller_name; 594 my $i = 0; 595 do { 596 $line_number = $caller_line; 597 (my $pkg, my $file, $caller_line, my $caller) = caller $i++; 598 $caller = $main_with_colon unless defined $caller; 599 600 $caller_name = $caller; 601 602 # get rid of pkg 603 $caller_name =~ s/.*:://; 604 if (substr($caller_name, 0, $main_colon_length) 605 eq $main_with_colon) 606 { 607 $caller_name = substr($caller_name, $main_colon_length); 608 } 609 610 } until ($caller_name ne 'trace'); 611 612 # If the stack was empty, we were called from the top level 613 $caller_name = 'main' if ($caller_name eq "" 614 || $caller_name eq 'trace'); 615 616 my $output = ""; 617 #print STDERR __LINE__, ": ", join ", ", @input, "\n"; 618 foreach my $string (@input) { 619 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') { 620 $output .= simple_dumper($string); 621 } 622 else { 623 $string = "$string" if ref $string; 624 $string = $UNDEF unless defined $string; 625 chomp $string; 626 $string = '""' if $string eq ""; 627 $output .= " " if $output ne "" 628 && $string ne "" 629 && substr($output, -1, 1) ne " " 630 && substr($string, 0, 1) ne " "; 631 $output .= $string; 632 } 633 } 634 635 print STDERR sprintf "%4d: ", $line_number if defined $line_number; 636 print STDERR "$caller_name: " if $print_caller; 637 print STDERR $output, "\n"; 638 return; 639 } 640} 641 642sub stack_trace() { 643 local $to_trace = 1 if main::DEBUG; 644 my $line = (caller(0))[2]; 645 my $i = 1; 646 647 # Accumulate the stack trace 648 while (1) { 649 my ($pkg, $file, $caller_line, $caller) = caller $i++; 650 651 last unless defined $caller; 652 653 trace "called from $caller() at line $line"; 654 $line = $caller_line; 655 } 656} 657 658# This is for a rarely used development feature that allows you to compare two 659# versions of the Unicode standard without having to deal with changes caused 660# by the code points introduced in the later version. You probably also want 661# to use the -annotate option when using this. Run this program on a unicore 662# containing the starting release you want to compare. Save that output 663# structure. Then, switching to a unicore with the ending release, change the 664# "" in the $string_compare_versions definition just below to a string 665# containing a SINGLE dotted Unicode release number (e.g. "2.1") corresponding 666# to the starting release. This program will then compile, but throw away all 667# code points introduced after the starting release. Finally use a diff tool 668# to compare the two directory structures. They include only the code points 669# common to both releases, and you can see the changes caused just by the 670# underlying release semantic changes. For versions earlier than 3.2, you 671# must copy a version of DAge.txt into the directory. 672my $string_compare_versions = DEBUG && ""; 673my $compare_versions = DEBUG 674 && $string_compare_versions 675 && pack "C*", split /\./, $string_compare_versions; 676 677sub uniques { 678 # Returns non-duplicated input values. From "Perl Best Practices: 679 # Encapsulated Cleverness". p. 455 in first edition. 680 681 my %seen; 682 # Arguably this breaks encapsulation, if the goal is to permit multiple 683 # distinct objects to stringify to the same value, and be interchangeable. 684 # However, for this program, no two objects stringify identically, and all 685 # lists passed to this function are either objects or strings. So this 686 # doesn't affect correctness, but it does give a couple of percent speedup. 687 no overloading; 688 return grep { ! $seen{$_}++ } @_; 689} 690 691$0 = File::Spec->canonpath($0); 692 693my $make_test_script = 0; # ? Should we output a test script 694my $make_norm_test_script = 0; # ? Should we output a normalization test script 695my $write_unchanged_files = 0; # ? Should we update the output files even if 696 # we don't think they have changed 697my $use_directory = ""; # ? Should we chdir somewhere. 698my $pod_directory; # input directory to store the pod file. 699my $pod_file = 'perluniprops'; 700my $t_path; # Path to the .t test file 701my $file_list = 'mktables.lst'; # File to store input and output file names. 702 # This is used to speed up the build, by not 703 # executing the main body of the program if 704 # nothing on the list has changed since the 705 # previous build 706my $make_list = 1; # ? Should we write $file_list. Set to always 707 # make a list so that when the release manager 708 # is preparing a release, they won't have to do 709 # special things 710my $glob_list = 0; # ? Should we try to include unknown .txt files 711 # in the input. 712my $output_range_counts = $debugging_build; # ? Should we include the number 713 # of code points in ranges in 714 # the output 715my $annotate = 0; # ? Should character names be in the output 716 717# Verbosity levels; 0 is quiet 718my $NORMAL_VERBOSITY = 1; 719my $PROGRESS = 2; 720my $VERBOSE = 3; 721 722my $verbosity = $NORMAL_VERBOSITY; 723 724# Stored in mktables.lst so that if this program is called with different 725# options, will regenerate even if the files otherwise look like they're 726# up-to-date. 727my $command_line_arguments = join " ", @ARGV; 728 729# Process arguments 730while (@ARGV) { 731 my $arg = shift @ARGV; 732 if ($arg eq '-v') { 733 $verbosity = $VERBOSE; 734 } 735 elsif ($arg eq '-p') { 736 $verbosity = $PROGRESS; 737 $| = 1; # Flush buffers as we go. 738 } 739 elsif ($arg eq '-q') { 740 $verbosity = 0; 741 } 742 elsif ($arg eq '-w') { 743 # update the files even if they haven't changed 744 $write_unchanged_files = 1; 745 } 746 elsif ($arg eq '-check') { 747 my $this = shift @ARGV; 748 my $ok = shift @ARGV; 749 if ($this ne $ok) { 750 print "Skipping as check params are not the same.\n"; 751 exit(0); 752 } 753 } 754 elsif ($arg eq '-P' && defined ($pod_directory = shift)) { 755 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist"; 756 } 757 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift))) 758 { 759 $make_test_script = 1; 760 } 761 elsif ($arg eq '-makenormtest') 762 { 763 $make_norm_test_script = 1; 764 } 765 elsif ($arg eq '-makelist') { 766 $make_list = 1; 767 } 768 elsif ($arg eq '-C' && defined ($use_directory = shift)) { 769 -d $use_directory or croak "Unknown directory '$use_directory'"; 770 } 771 elsif ($arg eq '-L') { 772 773 # Existence not tested until have chdir'd 774 $file_list = shift; 775 } 776 elsif ($arg eq '-globlist') { 777 $glob_list = 1; 778 } 779 elsif ($arg eq '-c') { 780 $output_range_counts = ! $output_range_counts 781 } 782 elsif ($arg eq '-annotate') { 783 $annotate = 1; 784 $debugging_build = 1; 785 $output_range_counts = 1; 786 } 787 else { 788 my $with_c = 'with'; 789 $with_c .= 'out' if $output_range_counts; # Complements the state 790 croak <<END; 791usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ] 792 [ -T test_file_path ] [-globlist] [-makelist] [-maketest] 793 [-check A B ] 794 -c : Output comments $with_c number of code points in ranges 795 -q : Quiet Mode: Only output serious warnings. 796 -p : Set verbosity level to normal plus show progress. 797 -v : Set Verbosity level high: Show progress and non-serious 798 warnings 799 -w : Write files regardless 800 -C dir : Change to this directory before proceeding. All relative paths 801 except those specified by the -P and -T options will be done 802 with respect to this directory. 803 -P dir : Output $pod_file file to directory 'dir'. 804 -T path : Create a test script as 'path'; overrides -maketest 805 -L filelist : Use alternate 'filelist' instead of standard one 806 -globlist : Take as input all non-Test *.txt files in current and sub 807 directories 808 -maketest : Make test script 'TestProp.pl' in current (or -C directory), 809 overrides -T 810 -makelist : Rewrite the file list $file_list based on current setup 811 -annotate : Output an annotation for each character in the table files; 812 useful for debugging mktables, looking at diffs; but is slow 813 and memory intensive 814 -check A B : Executes $0 only if A and B are the same 815END 816 } 817} 818 819# Stores the most-recently changed file. If none have changed, can skip the 820# build 821my $most_recent = (stat $0)[9]; # Do this before the chdir! 822 823# Change directories now, because need to read 'version' early. 824if ($use_directory) { 825 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) { 826 $pod_directory = File::Spec->rel2abs($pod_directory); 827 } 828 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) { 829 $t_path = File::Spec->rel2abs($t_path); 830 } 831 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!"; 832 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) { 833 $pod_directory = File::Spec->abs2rel($pod_directory); 834 } 835 if ($t_path && File::Spec->file_name_is_absolute($t_path)) { 836 $t_path = File::Spec->abs2rel($t_path); 837 } 838} 839 840# Get Unicode version into regular and v-string. This is done now because 841# various tables below get populated based on it. These tables are populated 842# here to be near the top of the file, and so easily seeable by those needing 843# to modify things. 844open my $VERSION, "<", "version" 845 or croak "$0: can't open required file 'version': $!\n"; 846my $string_version = <$VERSION>; 847close $VERSION; 848chomp $string_version; 849my $v_version = pack "C*", split /\./, $string_version; # v string 850 851my $unicode_version = ($compare_versions) 852 ? ( "$string_compare_versions (using " 853 . "$string_version rules)") 854 : $string_version; 855 856# The following are the complete names of properties with property values that 857# are known to not match any code points in some versions of Unicode, but that 858# may change in the future so they should be matchable, hence an empty file is 859# generated for them. 860my @tables_that_may_be_empty; 861push @tables_that_may_be_empty, 'Joining_Type=Left_Joining' 862 if $v_version lt v6.3.0; 863push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1; 864push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0; 865push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana' 866 if $v_version ge v4.1.0; 867push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana' 868 if $v_version ge v6.0.0; 869push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend' 870 if $v_version ge v6.1.0; 871push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133' 872 if $v_version ge v6.2.0; 873 874# The lists below are hashes, so the key is the item in the list, and the 875# value is the reason why it is in the list. This makes generation of 876# documentation easier. 877 878my %why_suppressed; # No file generated for these. 879 880# Files aren't generated for empty extraneous properties. This is arguable. 881# Extraneous properties generally come about because a property is no longer 882# used in a newer version of Unicode. If we generated a file without code 883# points, programs that used to work on that property will still execute 884# without errors. It just won't ever match (or will always match, with \P{}). 885# This means that the logic is now likely wrong. I (khw) think its better to 886# find this out by getting an error message. Just move them to the table 887# above to change this behavior 888my %why_suppress_if_empty_warn_if_not = ( 889 890 # It is the only property that has ever officially been removed from the 891 # Standard. The database never contained any code points for it. 892 'Special_Case_Condition' => 'Obsolete', 893 894 # Apparently never official, but there were code points in some versions of 895 # old-style PropList.txt 896 'Non_Break' => 'Obsolete', 897); 898 899# These would normally go in the warn table just above, but they were changed 900# a long time before this program was written, so warnings about them are 901# moot. 902if ($v_version gt v3.2.0) { 903 push @tables_that_may_be_empty, 904 'Canonical_Combining_Class=Attached_Below_Left' 905} 906 907# Obsoleted 908if ($v_version ge v11.0.0) { 909 push @tables_that_may_be_empty, qw( 910 Grapheme_Cluster_Break=E_Base 911 Grapheme_Cluster_Break=E_Base_GAZ 912 Grapheme_Cluster_Break=E_Modifier 913 Grapheme_Cluster_Break=Glue_After_Zwj 914 Word_Break=E_Base 915 Word_Break=E_Base_GAZ 916 Word_Break=E_Modifier 917 Word_Break=Glue_After_Zwj); 918} 919 920# Enum values for to_output_map() method in the Map_Table package. (0 is don't 921# output) 922my $EXTERNAL_MAP = 1; 923my $INTERNAL_MAP = 2; 924my $OUTPUT_ADJUSTED = 3; 925 926# To override computed values for writing the map tables for these properties. 927# The default for enum map tables is to write them out, so that the Unicode 928# .txt files can be removed, but all the data to compute any property value 929# for any code point is available in a more compact form. 930my %global_to_output_map = ( 931 # Needed by UCD.pm, but don't want to publicize that it exists, so won't 932 # get stuck supporting it if things change. Since it is a STRING 933 # property, it normally would be listed in the pod, but INTERNAL_MAP 934 # suppresses that. 935 Unicode_1_Name => $INTERNAL_MAP, 936 937 Present_In => 0, # Suppress, as easily computed from Age 938 Block => (NON_ASCII_PLATFORM) ? 1 : 0, # Suppress, as Blocks.txt is 939 # retained, but needed for 940 # non-ASCII 941 942 # Suppress, as mapping can be found instead from the 943 # Perl_Decomposition_Mapping file 944 Decomposition_Type => 0, 945); 946 947# There are several types of obsolete properties defined by Unicode. These 948# must be hand-edited for every new Unicode release. 949my %why_deprecated; # Generates a deprecated warning message if used. 950my %why_stabilized; # Documentation only 951my %why_obsolete; # Documentation only 952 953{ # Closure 954 my $simple = 'Perl uses the more complete version'; 955 my $unihan = 'Unihan properties are by default not enabled in the Perl core.'; 956 957 my $other_properties = 'other properties'; 958 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone"; 959 my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character."; 960 961 %why_deprecated = ( 962 'Grapheme_Link' => 'Duplicates ccc=vr (Canonical_Combining_Class=Virama)', 963 'Jamo_Short_Name' => $contributory, 964 'Line_Break=Surrogate' => 'Surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking', 965 'Other_Alphabetic' => $contributory, 966 'Other_Default_Ignorable_Code_Point' => $contributory, 967 'Other_Grapheme_Extend' => $contributory, 968 'Other_ID_Continue' => $contributory, 969 'Other_ID_Start' => $contributory, 970 'Other_Lowercase' => $contributory, 971 'Other_Math' => $contributory, 972 'Other_Uppercase' => $contributory, 973 'Expands_On_NFC' => $why_no_expand, 974 'Expands_On_NFD' => $why_no_expand, 975 'Expands_On_NFKC' => $why_no_expand, 976 'Expands_On_NFKD' => $why_no_expand, 977 ); 978 979 %why_suppressed = ( 980 # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which 981 # contains the same information, but without the algorithmically 982 # determinable Hangul syllables'. This file is not published, so it's 983 # existence is not noted in the comment. 984 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::', 985 986 # Don't suppress ISO_Comment, as otherwise special handling is needed 987 # to differentiate between it and gc=c, which can be written as 'isc', 988 # which is the same characters as ISO_Comment's short name. 989 990 'Name' => "Accessible via \\N{...} or 'use charnames;' or charprop() or prop_invmap() in Unicode::UCD::", 991 992 'Simple_Case_Folding' => "$simple. Can access this through casefold(), charprop(), or prop_invmap() in Unicode::UCD", 993 'Simple_Lowercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD", 994 'Simple_Titlecase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD", 995 'Simple_Uppercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD", 996 997 FC_NFKC_Closure => 'Deprecated by Unicode, and supplanted in usage by NFKC_Casefold; otherwise not useful', 998 ); 999 1000 foreach my $property ( 1001 1002 # The following are suppressed because they were made contributory 1003 # or deprecated by Unicode before Perl ever thought about 1004 # supporting them. 1005 'Jamo_Short_Name', 1006 'Grapheme_Link', 1007 'Expands_On_NFC', 1008 'Expands_On_NFD', 1009 'Expands_On_NFKC', 1010 'Expands_On_NFKD', 1011 1012 # The following are suppressed because they have been marked 1013 # as deprecated for a sufficient amount of time 1014 'Other_Alphabetic', 1015 'Other_Default_Ignorable_Code_Point', 1016 'Other_Grapheme_Extend', 1017 'Other_ID_Continue', 1018 'Other_ID_Start', 1019 'Other_Lowercase', 1020 'Other_Math', 1021 'Other_Uppercase', 1022 ) { 1023 $why_suppressed{$property} = $why_deprecated{$property}; 1024 } 1025 1026 # Customize the message for all the 'Other_' properties 1027 foreach my $property (keys %why_deprecated) { 1028 next if (my $main_property = $property) !~ s/^Other_//; 1029 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/; 1030 } 1031} 1032 1033if ($write_Unicode_deprecated_tables) { 1034 foreach my $property (keys %why_suppressed) { 1035 delete $why_suppressed{$property} if $property =~ 1036 / ^ Other | Grapheme /x; 1037 } 1038} 1039 1040if ($v_version ge 4.0.0) { 1041 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14'; 1042 if ($v_version ge 6.0.0) { 1043 $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14'; 1044 } 1045} 1046if ($v_version ge 5.2.0 && $v_version lt 6.0.0) { 1047 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed'; 1048 if ($v_version ge 6.0.0) { 1049 $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed'; 1050 } 1051} 1052 1053# Probably obsolete forever 1054if ($v_version ge v4.1.0) { 1055 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common".'; 1056} 1057if ($v_version ge v6.0.0) { 1058 $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana" (or both)'; 1059 $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"'; 1060} 1061 1062# This program can create files for enumerated-like properties, such as 1063# 'Numeric_Type'. This file would be the same format as for a string 1064# property, with a mapping from code point to its value, so you could look up, 1065# for example, the script a code point is in. But no one so far wants this 1066# mapping, or they have found another way to get it since this is a new 1067# feature. So no file is generated except if it is in this list. 1068my @output_mapped_properties = split "\n", <<END; 1069END 1070 1071# If you want more Unihan properties than the default, you need to add them to 1072# these arrays. Depending on the property type, @missing lines might have to 1073# be added to the second array. A sample entry would be (including the '#'): 1074# @missing: 0000..10FFFF; cjkAccountingNumeric; NaN 1075my @cjk_properties = split "\n", <<'END'; 1076END 1077my @cjk_property_values = split "\n", <<'END'; 1078END 1079 1080# The input files don't list every code point. Those not listed are to be 1081# defaulted to some value. Below are hard-coded what those values are for 1082# non-binary properties as of 5.1. Starting in 5.0, there are 1083# machine-parsable comment lines in the files that give the defaults; so this 1084# list shouldn't have to be extended. The claim is that all missing entries 1085# for binary properties will default to 'N'. Unicode tried to change that in 1086# 5.2, but the beta period produced enough protest that they backed off. 1087# 1088# The defaults for the fields that appear in UnicodeData.txt in this hash must 1089# be in the form that it expects. The others may be synonyms. 1090my $CODE_POINT = '<code point>'; 1091my %default_mapping = ( 1092 Age => "Unassigned", 1093 # Bidi_Class => Complicated; set in code 1094 Bidi_Mirroring_Glyph => "", 1095 Block => 'No_Block', 1096 Canonical_Combining_Class => 0, 1097 Case_Folding => $CODE_POINT, 1098 Decomposition_Mapping => $CODE_POINT, 1099 Decomposition_Type => 'None', 1100 East_Asian_Width => "Neutral", 1101 FC_NFKC_Closure => $CODE_POINT, 1102 General_Category => ($v_version le 6.3.0) ? 'Cn' : 'Unassigned', 1103 Grapheme_Cluster_Break => 'Other', 1104 Hangul_Syllable_Type => 'NA', 1105 ISO_Comment => "", 1106 Jamo_Short_Name => "", 1107 Joining_Group => "No_Joining_Group", 1108 # Joining_Type => Complicated; set in code 1109 kIICore => 'N', # Is converted to binary 1110 #Line_Break => Complicated; set in code 1111 Lowercase_Mapping => $CODE_POINT, 1112 Name => "", 1113 Name_Alias => "", 1114 NFC_QC => 'Yes', 1115 NFD_QC => 'Yes', 1116 NFKC_QC => 'Yes', 1117 NFKD_QC => 'Yes', 1118 Numeric_Type => 'None', 1119 Numeric_Value => 'NaN', 1120 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown', 1121 Sentence_Break => 'Other', 1122 Simple_Case_Folding => $CODE_POINT, 1123 Simple_Lowercase_Mapping => $CODE_POINT, 1124 Simple_Titlecase_Mapping => $CODE_POINT, 1125 Simple_Uppercase_Mapping => $CODE_POINT, 1126 Titlecase_Mapping => $CODE_POINT, 1127 Unicode_1_Name => "", 1128 Unicode_Radical_Stroke => "", 1129 Uppercase_Mapping => $CODE_POINT, 1130 Word_Break => 'Other', 1131); 1132 1133### End of externally interesting definitions, except for @input_file_objects 1134 1135my $HEADER=<<"EOF"; 1136# !!!!!!! DO NOT EDIT THIS FILE !!!!!!! 1137# This file is machine-generated by $0 from the Unicode 1138# database, Version $unicode_version. Any changes made here will be lost! 1139EOF 1140 1141my $INTERNAL_ONLY_HEADER = <<"EOF"; 1142 1143# !!!!!!! INTERNAL PERL USE ONLY !!!!!!! 1144# This file is for internal use by core Perl only. The format and even the 1145# name or existence of this file are subject to change without notice. Don't 1146# use it directly. Use Unicode::UCD to access the Unicode character data 1147# base. 1148EOF 1149 1150my $DEVELOPMENT_ONLY=<<"EOF"; 1151# !!!!!!! DEVELOPMENT USE ONLY !!!!!!! 1152# This file contains information artificially constrained to code points 1153# present in Unicode release $string_compare_versions. 1154# IT CANNOT BE RELIED ON. It is for use during development only and should 1155# not be used for production. 1156 1157EOF 1158 1159my $MAX_UNICODE_CODEPOINT_STRING = ($v_version ge v2.0.0) 1160 ? "10FFFF" 1161 : "FFFF"; 1162my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING; 1163my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1; 1164 1165# We work with above-Unicode code points, up to IV_MAX, but we may want to use 1166# sentinels above that number. Therefore for internal use, we use a much 1167# smaller number, translating it to IV_MAX only for output. The exact number 1168# is immaterial (all above-Unicode code points are treated exactly the same), 1169# but the algorithm requires it to be at least 1170# 2 * $MAX_UNICODE_CODEPOINTS + 1 1171my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8; 1172my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1; 1173my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT); 1174 1175my $MAX_PLATFORM_CODEPOINT = ~0 >> 1; 1176 1177# Matches legal code point. 4-6 hex numbers, If there are 6, the first 1178# two must be 10; if there are 5, the first must not be a 0. Written this way 1179# to decrease backtracking. The first regex allows the code point to be at 1180# the end of a word, but to work properly, the word shouldn't end with a valid 1181# hex character. The second one won't match a code point at the end of a 1182# word, and doesn't have the run-on issue 1183my $run_on_code_point_re = 1184 qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x; 1185my $code_point_re = qr/\b$run_on_code_point_re/; 1186 1187# This matches the beginning of the line in the Unicode DB files that give the 1188# defaults for code points not listed (i.e., missing) in the file. The code 1189# depends on this ending with a semi-colon, so it can assume it is a valid 1190# field when the line is split() by semi-colons 1191my $missing_defaults_prefix = qr/ ^ \# \s+ \@missing: \s+ 1192 ($code_point_re) 1193 \.\. 1194 ($code_point_re) 1195 \s* ; 1196 /x; 1197 1198# Property types. Unicode has more types, but these are sufficient for our 1199# purposes. 1200my $UNKNOWN = -1; # initialized to illegal value 1201my $NON_STRING = 1; # Either binary or enum 1202my $BINARY = 2; 1203my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal 1204 # tables, additional true and false tables are 1205 # generated so that false is anything matching the 1206 # default value, and true is everything else. 1207my $ENUM = 4; # Include catalog 1208my $STRING = 5; # Anything else: string or misc 1209 1210# Some input files have lines that give default values for code points not 1211# contained in the file. Sometimes these should be ignored. 1212my $NO_DEFAULTS = 0; # Must evaluate to false 1213my $NOT_IGNORED = 1; 1214my $IGNORED = 2; 1215 1216# Range types. Each range has a type. Most ranges are type 0, for normal, 1217# and will appear in the main body of the tables in the output files, but 1218# there are other types of ranges as well, listed below, that are specially 1219# handled. There are pseudo-types as well that will never be stored as a 1220# type, but will affect the calculation of the type. 1221 1222# 0 is for normal, non-specials 1223my $MULTI_CP = 1; # Sequence of more than code point 1224my $HANGUL_SYLLABLE = 2; 1225my $CP_IN_NAME = 3; # The NAME contains the code point appended to it. 1226my $NULL = 4; # The map is to the null string; utf8.c can't 1227 # handle these, nor is there an accepted syntax 1228 # for them in \p{} constructs 1229my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would 1230 # otherwise be $MULTI_CP type are instead type 0 1231 1232# process_generic_property_file() can accept certain overrides in its input. 1233# Each of these must begin AND end with $CMD_DELIM. 1234my $CMD_DELIM = "\a"; 1235my $REPLACE_CMD = 'replace'; # Override the Replace 1236my $MAP_TYPE_CMD = 'map_type'; # Override the Type 1237 1238my $NO = 0; 1239my $YES = 1; 1240 1241# Values for the Replace argument to add_range. 1242# $NO # Don't replace; add only the code points not 1243 # already present. 1244my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in 1245 # the comments at the subroutine definition. 1246my $UNCONDITIONALLY = 2; # Replace without conditions. 1247my $MULTIPLE_BEFORE = 4; # Don't replace, but add a duplicate record if 1248 # already there 1249my $MULTIPLE_AFTER = 5; # Don't replace, but add a duplicate record if 1250 # already there 1251my $CROAK = 6; # Die with an error if is already there 1252 1253# Flags to give property statuses. The phrases are to remind maintainers that 1254# if the flag is changed, the indefinite article referring to it in the 1255# documentation may need to be as well. 1256my $NORMAL = ""; 1257my $DEPRECATED = 'D'; 1258my $a_bold_deprecated = "a 'B<$DEPRECATED>'"; 1259my $A_bold_deprecated = "A 'B<$DEPRECATED>'"; 1260my $DISCOURAGED = 'X'; 1261my $a_bold_discouraged = "an 'B<$DISCOURAGED>'"; 1262my $A_bold_discouraged = "An 'B<$DISCOURAGED>'"; 1263my $STRICTER = 'T'; 1264my $a_bold_stricter = "a 'B<$STRICTER>'"; 1265my $A_bold_stricter = "A 'B<$STRICTER>'"; 1266my $STABILIZED = 'S'; 1267my $a_bold_stabilized = "an 'B<$STABILIZED>'"; 1268my $A_bold_stabilized = "An 'B<$STABILIZED>'"; 1269my $OBSOLETE = 'O'; 1270my $a_bold_obsolete = "an 'B<$OBSOLETE>'"; 1271my $A_bold_obsolete = "An 'B<$OBSOLETE>'"; 1272 1273# Aliases can also have an extra status: 1274my $INTERNAL_ALIAS = 'P'; 1275 1276my %status_past_participles = ( 1277 $DISCOURAGED => 'discouraged', 1278 $STABILIZED => 'stabilized', 1279 $OBSOLETE => 'obsolete', 1280 $DEPRECATED => 'deprecated', 1281 $INTERNAL_ALIAS => 'reserved for Perl core internal use only', 1282); 1283 1284# Table fates. These are somewhat ordered, so that fates < $MAP_PROXIED should be 1285# externally documented. 1286my $ORDINARY = 0; # The normal fate. 1287my $MAP_PROXIED = 1; # The map table for the property isn't written out, 1288 # but there is a file written that can be used to 1289 # reconstruct this table 1290my $INTERNAL_ONLY = 2; # The file for this table is written out, but it is 1291 # for Perl's internal use only 1292my $SUPPRESSED = 3; # The file for this table is not written out, and as a 1293 # result, we don't bother to do many computations on 1294 # it. 1295my $PLACEHOLDER = 4; # Like $SUPPRESSED, but we go through all the 1296 # computations anyway, as the values are needed for 1297 # things to work. This happens when we have Perl 1298 # extensions that depend on Unicode tables that 1299 # wouldn't normally be in a given Unicode version. 1300 1301# The format of the values of the tables: 1302my $EMPTY_FORMAT = ""; 1303my $BINARY_FORMAT = 'b'; 1304my $DECIMAL_FORMAT = 'd'; 1305my $FLOAT_FORMAT = 'f'; 1306my $INTEGER_FORMAT = 'i'; 1307my $HEX_FORMAT = 'x'; 1308my $RATIONAL_FORMAT = 'r'; 1309my $STRING_FORMAT = 's'; 1310my $ADJUST_FORMAT = 'a'; 1311my $HEX_ADJUST_FORMAT = 'ax'; 1312my $DECOMP_STRING_FORMAT = 'c'; 1313my $STRING_WHITE_SPACE_LIST = 'sw'; 1314 1315my %map_table_formats = ( 1316 $BINARY_FORMAT => 'binary', 1317 $DECIMAL_FORMAT => 'single decimal digit', 1318 $FLOAT_FORMAT => 'floating point number', 1319 $INTEGER_FORMAT => 'integer', 1320 $HEX_FORMAT => 'non-negative hex whole number; a code point', 1321 $RATIONAL_FORMAT => 'rational: an integer or a fraction', 1322 $STRING_FORMAT => 'string', 1323 $ADJUST_FORMAT => 'some entries need adjustment', 1324 $HEX_ADJUST_FORMAT => 'mapped value in hex; some entries need adjustment', 1325 $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping', 1326 $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators' 1327); 1328 1329# Unicode didn't put such derived files in a separate directory at first. 1330my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : ""; 1331my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : ""; 1332my $AUXILIARY = 'auxiliary'; 1333my $EMOJI = 'emoji'; 1334 1335# Hashes and arrays that will eventually go into UCD.pl for the use of UCD.pm 1336my %loose_to_file_of; # loosely maps table names to their respective 1337 # files 1338my %stricter_to_file_of; # same; but for stricter mapping. 1339my %loose_property_to_file_of; # Maps a loose property name to its map file 1340my %strict_property_to_file_of; # Same, but strict 1341my @inline_definitions = "V0"; # Each element gives a definition of a unique 1342 # inversion list. When a definition is inlined, 1343 # its value in the hash it's in (one of the two 1344 # defined just above) will include an index into 1345 # this array. The 0th element is initialized to 1346 # the definition for a zero length inversion list 1347my %file_to_swash_name; # Maps the file name to its corresponding key name 1348 # in the hash %Unicode::UCD::SwashInfo 1349my %nv_floating_to_rational; # maps numeric values floating point numbers to 1350 # their rational equivalent 1351my %loose_property_name_of; # Loosely maps (non_string) property names to 1352 # standard form 1353my %strict_property_name_of; # Strictly maps (non_string) property names to 1354 # standard form 1355my %string_property_loose_to_name; # Same, for string properties. 1356my %loose_defaults; # keys are of form "prop=value", where 'prop' is 1357 # the property name in standard loose form, and 1358 # 'value' is the default value for that property, 1359 # also in standard loose form. 1360my %loose_to_standard_value; # loosely maps table names to the canonical 1361 # alias for them 1362my %ambiguous_names; # keys are alias names (in standard form) that 1363 # have more than one possible meaning. 1364my %combination_property; # keys are alias names (in standard form) that 1365 # have both a map table, and a binary one that 1366 # yields true for all non-null maps. 1367my %prop_aliases; # Keys are standard property name; values are each 1368 # one's aliases 1369my %prop_value_aliases; # Keys of top level are standard property name; 1370 # values are keys to another hash, Each one is 1371 # one of the property's values, in standard form. 1372 # The values are that prop-val's aliases. 1373my %skipped_files; # List of files that we skip 1374my %ucd_pod; # Holds entries that will go into the UCD section of the pod 1375 1376# Most properties are immune to caseless matching, otherwise you would get 1377# nonsensical results, as properties are a function of a code point, not 1378# everything that is caselessly equivalent to that code point. For example, 1379# Changes_When_Case_Folded('s') should be false, whereas caselessly it would 1380# be true because 's' and 'S' are equivalent caselessly. However, 1381# traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we 1382# extend that concept to those very few properties that are like this. Each 1383# such property will match the full range caselessly. They are hard-coded in 1384# the program; it's not worth trying to make it general as it's extremely 1385# unlikely that they will ever change. 1386my %caseless_equivalent_to; 1387 1388# This is the range of characters that were in Release 1 of Unicode, and 1389# removed in Release 2 (replaced with the current Hangul syllables starting at 1390# U+AC00). The range was reused starting in Release 3 for other purposes. 1391my $FIRST_REMOVED_HANGUL_SYLLABLE = 0x3400; 1392my $FINAL_REMOVED_HANGUL_SYLLABLE = 0x4DFF; 1393 1394# These constants names and values were taken from the Unicode standard, 1395# version 5.1, section 3.12. They are used in conjunction with Hangul 1396# syllables. The '_string' versions are so generated tables can retain the 1397# hex format, which is the more familiar value 1398my $SBase_string = "0xAC00"; 1399my $SBase = CORE::hex $SBase_string; 1400my $LBase_string = "0x1100"; 1401my $LBase = CORE::hex $LBase_string; 1402my $VBase_string = "0x1161"; 1403my $VBase = CORE::hex $VBase_string; 1404my $TBase_string = "0x11A7"; 1405my $TBase = CORE::hex $TBase_string; 1406my $SCount = 11172; 1407my $LCount = 19; 1408my $VCount = 21; 1409my $TCount = 28; 1410my $NCount = $VCount * $TCount; 1411 1412# For Hangul syllables; These store the numbers from Jamo.txt in conjunction 1413# with the above published constants. 1414my %Jamo; 1415my %Jamo_L; # Leading consonants 1416my %Jamo_V; # Vowels 1417my %Jamo_T; # Trailing consonants 1418 1419# For code points whose name contains its ordinal as a '-ABCD' suffix. 1420# The key is the base name of the code point, and the value is an 1421# array giving all the ranges that use this base name. Each range 1422# is actually a hash giving the 'low' and 'high' values of it. 1423my %names_ending_in_code_point; 1424my %loose_names_ending_in_code_point; # Same as above, but has blanks, dashes 1425 # removed from the names 1426# Inverse mapping. The list of ranges that have these kinds of 1427# names. Each element contains the low, high, and base names in an 1428# anonymous hash. 1429my @code_points_ending_in_code_point; 1430 1431# To hold Unicode's normalization test suite 1432my @normalization_tests; 1433 1434# Boolean: does this Unicode version have the hangul syllables, and are we 1435# writing out a table for them? 1436my $has_hangul_syllables = 0; 1437 1438# Does this Unicode version have code points whose names end in their 1439# respective code points, and are we writing out a table for them? 0 for no; 1440# otherwise points to first property that a table is needed for them, so that 1441# if multiple tables are needed, we don't create duplicates 1442my $needing_code_points_ending_in_code_point = 0; 1443 1444my @backslash_X_tests; # List of tests read in for testing \X 1445my @LB_tests; # List of tests read in for testing \b{lb} 1446my @SB_tests; # List of tests read in for testing \b{sb} 1447my @WB_tests; # List of tests read in for testing \b{wb} 1448my @unhandled_properties; # Will contain a list of properties found in 1449 # the input that we didn't process. 1450my @match_properties; # Properties that have match tables, to be 1451 # listed in the pod 1452my @map_properties; # Properties that get map files written 1453my @named_sequences; # NamedSequences.txt contents. 1454my %potential_files; # Generated list of all .txt files in the directory 1455 # structure so we can warn if something is being 1456 # ignored. 1457my @missing_early_files; # Generated list of absent files that we need to 1458 # proceed in compiling this early Unicode version 1459my @files_actually_output; # List of files we generated. 1460my @more_Names; # Some code point names are compound; this is used 1461 # to store the extra components of them. 1462my $E_FLOAT_PRECISION = 3; # The minimum number of digits after the decimal 1463 # point of a normalized floating point number 1464 # needed to match before we consider it equivalent 1465 # to a candidate rational 1466 1467# These store references to certain commonly used property objects 1468my $age; 1469my $ccc; 1470my $gc; 1471my $perl; 1472my $block; 1473my $perl_charname; 1474my $print; 1475my $All; 1476my $Assigned; # All assigned characters in this Unicode release 1477my $DI; # Default_Ignorable_Code_Point property 1478my $NChar; # Noncharacter_Code_Point property 1479my $script; 1480my $scx; # Script_Extensions property 1481my $idt; # Identifier_Type property 1482 1483# Are there conflicting names because of beginning with 'In_', or 'Is_' 1484my $has_In_conflicts = 0; 1485my $has_Is_conflicts = 0; 1486 1487sub internal_file_to_platform ($file=undef) { 1488 # Convert our file paths which have '/' separators to those of the 1489 # platform. 1490 1491 return undef unless defined $file; 1492 1493 return File::Spec->join(split '/', $file); 1494} 1495 1496sub file_exists ($file=undef) { # platform independent '-e'. This program internally 1497 # uses slash as a path separator. 1498 return 0 unless defined $file; 1499 return -e internal_file_to_platform($file); 1500} 1501 1502sub objaddr($addr) { 1503 # Returns the address of the blessed input object. 1504 # It doesn't check for blessedness because that would do a string eval 1505 # every call, and the program is structured so that this is never called 1506 # for a non-blessed object. 1507 1508 return pack 'J', refaddr $addr; 1509} 1510 1511# These are used only if $annotate is true. 1512# The entire range of Unicode characters is examined to populate these 1513# after all the input has been processed. But most can be skipped, as they 1514# have the same descriptive phrases, such as being unassigned 1515my @viacode; # Contains the 1 million character names 1516my @age; # And their ages ("" if none) 1517my @printable; # boolean: And are those characters printable? 1518my @annotate_char_type; # Contains a type of those characters, specifically 1519 # for the purposes of annotation. 1520my $annotate_ranges; # A map of ranges of code points that have the same 1521 # name for the purposes of annotation. They map to the 1522 # upper edge of the range, so that the end point can 1523 # be immediately found. This is used to skip ahead to 1524 # the end of a range, and avoid processing each 1525 # individual code point in it. 1526my $unassigned_sans_noncharacters; # A Range_List of the unassigned 1527 # characters, but excluding those which are 1528 # also noncharacter code points 1529 1530# The annotation types are an extension of the regular range types, though 1531# some of the latter are folded into one. Make the new types negative to 1532# avoid conflicting with the regular types 1533my $SURROGATE_TYPE = -1; 1534my $UNASSIGNED_TYPE = -2; 1535my $PRIVATE_USE_TYPE = -3; 1536my $NONCHARACTER_TYPE = -4; 1537my $CONTROL_TYPE = -5; 1538my $ABOVE_UNICODE_TYPE = -6; 1539my $UNKNOWN_TYPE = -7; # Used only if there is a bug in this program 1540 1541sub populate_char_info ($i) { 1542 # Used only with the $annotate option. Populates the arrays with the 1543 # input code point's info that are needed for outputting more detailed 1544 # comments. If calling context wants a return, it is the end point of 1545 # any contiguous range of characters that share essentially the same info 1546 1547 $viacode[$i] = $perl_charname->value_of($i) || ""; 1548 $age[$i] = (defined $age) 1549 ? (($age->value_of($i) =~ / ^ \d+ \. \d+ $ /x) 1550 ? $age->value_of($i) 1551 : "") 1552 : ""; 1553 1554 # A character is generally printable if Unicode says it is, 1555 # but below we make sure that most Unicode general category 'C' types 1556 # aren't. 1557 $printable[$i] = $print->contains($i); 1558 1559 # But the characters in this range were removed in v2.0 and replaced by 1560 # different ones later. Modern fonts will be for the replacement 1561 # characters, so suppress printing them. 1562 if (($v_version lt v2.0 1563 || ($compare_versions && $compare_versions lt v2.0)) 1564 && ( $i >= $FIRST_REMOVED_HANGUL_SYLLABLE 1565 && $i <= $FINAL_REMOVED_HANGUL_SYLLABLE)) 1566 { 1567 $printable[$i] = 0; 1568 } 1569 1570 $annotate_char_type[$i] = $perl_charname->type_of($i) || 0; 1571 1572 # Only these two regular types are treated specially for annotations 1573 # purposes 1574 $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME 1575 && $annotate_char_type[$i] != $HANGUL_SYLLABLE; 1576 1577 # Give a generic name to all code points that don't have a real name. 1578 # We output ranges, if applicable, for these. Also calculate the end 1579 # point of the range. 1580 my $end; 1581 if (! $viacode[$i]) { 1582 if ($i > $MAX_UNICODE_CODEPOINT) { 1583 $viacode[$i] = 'Above-Unicode'; 1584 $annotate_char_type[$i] = $ABOVE_UNICODE_TYPE; 1585 $printable[$i] = 0; 1586 $end = $MAX_WORKING_CODEPOINT; 1587 } 1588 elsif ($gc-> table('Private_use')->contains($i)) { 1589 $viacode[$i] = 'Private Use'; 1590 $annotate_char_type[$i] = $PRIVATE_USE_TYPE; 1591 $printable[$i] = 0; 1592 $end = $gc->table('Private_Use')->containing_range($i)->end; 1593 } 1594 elsif ($NChar->contains($i)) { 1595 $viacode[$i] = 'Noncharacter'; 1596 $annotate_char_type[$i] = $NONCHARACTER_TYPE; 1597 $printable[$i] = 0; 1598 $end = $NChar->containing_range($i)->end; 1599 } 1600 elsif ($gc-> table('Control')->contains($i)) { 1601 my $name_ref = property_ref('Name_Alias'); 1602 $name_ref = property_ref('Unicode_1_Name') if ! defined $name_ref; 1603 $viacode[$i] = (defined $name_ref) 1604 ? $name_ref->value_of($i) 1605 : 'Control'; 1606 $annotate_char_type[$i] = $CONTROL_TYPE; 1607 $printable[$i] = 0; 1608 } 1609 elsif ($gc-> table('Unassigned')->contains($i)) { 1610 $annotate_char_type[$i] = $UNASSIGNED_TYPE; 1611 $printable[$i] = 0; 1612 $viacode[$i] = 'Unassigned'; 1613 1614 if (defined $block) { # No blocks in earliest releases 1615 $viacode[$i] .= ', block=' . $block-> value_of($i); 1616 $end = $gc-> table('Unassigned')->containing_range($i)->end; 1617 1618 # Because we name the unassigned by the blocks they are in, it 1619 # can't go past the end of that block, and it also can't go 1620 # past the unassigned range it is in. The special table makes 1621 # sure that the non-characters, which are unassigned, are 1622 # separated out. 1623 $end = min($block->containing_range($i)->end, 1624 $unassigned_sans_noncharacters-> 1625 containing_range($i)->end); 1626 } 1627 else { 1628 $end = $i + 1; 1629 while ($unassigned_sans_noncharacters->contains($end)) { 1630 $end++; 1631 } 1632 $end--; 1633 } 1634 } 1635 elsif ($perl->table('_Perl_Surrogate')->contains($i)) { 1636 $viacode[$i] = 'Surrogate'; 1637 $annotate_char_type[$i] = $SURROGATE_TYPE; 1638 $printable[$i] = 0; 1639 $end = $gc->table('Surrogate')->containing_range($i)->end; 1640 } 1641 else { 1642 Carp::my_carp_bug("Can't figure out how to annotate " 1643 . sprintf("U+%04X", $i) 1644 . ". Proceeding anyway."); 1645 $viacode[$i] = 'UNKNOWN'; 1646 $annotate_char_type[$i] = $UNKNOWN_TYPE; 1647 $printable[$i] = 0; 1648 } 1649 } 1650 1651 # Here, has a name, but if it's one in which the code point number is 1652 # appended to the name, do that. 1653 elsif ($annotate_char_type[$i] == $CP_IN_NAME) { 1654 $viacode[$i] .= sprintf("-%04X", $i); 1655 1656 my $limit = $perl_charname->containing_range($i)->end; 1657 if (defined $age) { 1658 # Do all these as groups of the same age, instead of individually, 1659 # because their names are so meaningless, and there are typically 1660 # large quantities of them. 1661 $end = $i + 1; 1662 while ($end <= $limit && $age->value_of($end) == $age[$i]) { 1663 $end++; 1664 } 1665 $end--; 1666 } 1667 else { 1668 $end = $limit; 1669 } 1670 } 1671 1672 # And here, has a name, but if it's a hangul syllable one, replace it with 1673 # the correct name from the Unicode algorithm 1674 elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) { 1675 use integer; 1676 my $SIndex = $i - $SBase; 1677 my $L = $LBase + $SIndex / $NCount; 1678 my $V = $VBase + ($SIndex % $NCount) / $TCount; 1679 my $T = $TBase + $SIndex % $TCount; 1680 $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}"; 1681 $viacode[$i] .= $Jamo{$T} if $T != $TBase; 1682 $end = $perl_charname->containing_range($i)->end; 1683 } 1684 1685 return if ! defined wantarray; 1686 return $i if ! defined $end; # If not a range, return the input 1687 1688 # Save this whole range so can find the end point quickly 1689 $annotate_ranges->add_map($i, $end, $end); 1690 1691 return $end; 1692} 1693 1694sub max($a, $b) { 1695 return $a >= $b ? $a : $b; 1696} 1697 1698sub min($a, $b) { 1699 return $a <= $b ? $a : $b; 1700} 1701 1702sub clarify_number ($number) { 1703 # This returns the input number with underscores inserted every 3 digits 1704 # in large (5 digits or more) numbers. Input must be entirely digits, not 1705 # checked. 1706 1707 my $pos = length($number) - 3; 1708 return $number if $pos <= 1; 1709 while ($pos > 0) { 1710 substr($number, $pos, 0) = '_'; 1711 $pos -= 3; 1712 } 1713 return $number; 1714} 1715 1716sub clarify_code_point_count ($number) { 1717 # This is like clarify_number(), but the input is assumed to be a count of 1718 # code points, rather than a generic number. 1719 1720 my $append = ""; 1721 1722 if ($number > $MAX_UNICODE_CODEPOINTS) { 1723 $number -= ($MAX_WORKING_CODEPOINTS - $MAX_UNICODE_CODEPOINTS); 1724 return "All above-Unicode code points" if $number == 0; 1725 $append = " + all above-Unicode code points"; 1726 } 1727 return clarify_number($number) . $append; 1728} 1729 1730package Carp; 1731 1732# These routines give a uniform treatment of messages in this program. They 1733# are placed in the Carp package to cause the stack trace to not include them, 1734# although an alternative would be to use another package and set @CARP_NOT 1735# for it. 1736 1737our $Verbose = 1 if main::DEBUG; # Useful info when debugging 1738 1739# This is a work-around suggested by Nicholas Clark to fix a problem with Carp 1740# and overload trying to load Scalar:Util under miniperl. See 1741# http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html 1742undef $overload::VERSION; 1743 1744sub my_carp($message="", $nofold=0) { 1745 1746 if ($message) { 1747 $message = main::join_lines($message); 1748 $message =~ s/^$0: *//; # Remove initial program name 1749 $message =~ s/[.;,]+$//; # Remove certain ending punctuation 1750 $message = "\n$0: $message;"; 1751 1752 # Fold the message with program name, semi-colon end punctuation 1753 # (which looks good with the message that carp appends to it), and a 1754 # hanging indent for continuation lines. 1755 $message = main::simple_fold($message, "", 4) unless $nofold; 1756 $message =~ s/\n$//; # Remove the trailing nl so what carp 1757 # appends is to the same line 1758 } 1759 1760 return $message if defined wantarray; # If a caller just wants the msg 1761 1762 carp $message; 1763 return; 1764} 1765 1766sub my_carp_bug($message="") { 1767 # This is called when it is clear that the problem is caused by a bug in 1768 # this program. 1769 $message =~ s/^$0: *//; 1770 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message"); 1771 carp $message; 1772 return; 1773} 1774 1775sub carp_too_few_args($args_ref, $count) { 1776 my_carp_bug("Need at least $count arguments to " 1777 . (caller 1)[3] 1778 . ". Instead got: '" 1779 . join ', ', @$args_ref 1780 . "'. No action taken."); 1781 return; 1782} 1783 1784sub carp_extra_args($args_ref) { 1785 unless (ref $args_ref) { 1786 my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments."); 1787 return; 1788 } 1789 my ($package, $file, $line) = caller; 1790 my $subroutine = (caller 1)[3]; 1791 1792 my $list; 1793 if (ref $args_ref eq 'HASH') { 1794 foreach my $key (keys %$args_ref) { 1795 $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key}; 1796 } 1797 $list = join ', ', each %{$args_ref}; 1798 } 1799 elsif (ref $args_ref eq 'ARRAY') { 1800 foreach my $arg (@$args_ref) { 1801 $arg = $UNDEF unless defined $arg; 1802 } 1803 $list = join ', ', @$args_ref; 1804 } 1805 else { 1806 my_carp_bug("Can't cope with ref " 1807 . ref($args_ref) 1808 . " . argument to 'carp_extra_args'. Not checking arguments."); 1809 return; 1810 } 1811 1812 my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped."); 1813 return; 1814} 1815 1816package main; 1817 1818{ # Closure 1819 1820 # This program uses the inside-out method for objects, as recommended in 1821 # "Perl Best Practices". (This is the best solution still, since this has 1822 # to run under miniperl.) This closure aids in generating those. There 1823 # are two routines. setup_package() is called once per package to set 1824 # things up, and then set_access() is called for each hash representing a 1825 # field in the object. These routines arrange for the object to be 1826 # properly destroyed when no longer used, and for standard accessor 1827 # functions to be generated. If you need more complex accessors, just 1828 # write your own and leave those accesses out of the call to set_access(). 1829 # More details below. 1830 1831 my %constructor_fields; # fields that are to be used in constructors; see 1832 # below 1833 1834 # The values of this hash will be the package names as keys to other 1835 # hashes containing the name of each field in the package as keys, and 1836 # references to their respective hashes as values. 1837 my %package_fields; 1838 1839 sub setup_package { 1840 # Sets up the package, creating standard DESTROY and dump methods 1841 # (unless already defined). The dump method is used in debugging by 1842 # simple_dumper(). 1843 # The optional parameters are: 1844 # a) a reference to a hash, that gets populated by later 1845 # set_access() calls with one of the accesses being 1846 # 'constructor'. The caller can then refer to this, but it is 1847 # not otherwise used by these two routines. 1848 # b) a reference to a callback routine to call during destruction 1849 # of the object, before any fields are actually destroyed 1850 1851 my %args = @_; 1852 my $constructor_ref = delete $args{'Constructor_Fields'}; 1853 my $destroy_callback = delete $args{'Destroy_Callback'}; 1854 Carp::carp_extra_args(\@_) if main::DEBUG && %args; 1855 1856 my %fields; 1857 my $package = (caller)[0]; 1858 1859 $package_fields{$package} = \%fields; 1860 $constructor_fields{$package} = $constructor_ref; 1861 1862 unless ($package->can('DESTROY')) { 1863 my $destroy_name = "${package}::DESTROY"; 1864 no strict "refs"; 1865 1866 # Use typeglob to give the anonymous subroutine the name we want 1867 *$destroy_name = sub ($self) { 1868 my $addr = pack 'J', refaddr $self; 1869 1870 $self->$destroy_callback if $destroy_callback; 1871 foreach my $field (keys %{$package_fields{$package}}) { 1872 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n"; 1873 delete $package_fields{$package}{$field}{$addr}; 1874 } 1875 return; 1876 } 1877 } 1878 1879 unless ($package->can('dump')) { 1880 my $dump_name = "${package}::dump"; 1881 no strict "refs"; 1882 *$dump_name = sub ($self, @_args) { 1883 return dump_inside_out($self, $package_fields{$package}, @_args); 1884 } 1885 } 1886 return; 1887 } 1888 1889 sub set_access($name, $field, @accessors) { 1890 # Arrange for the input field to be garbage collected when no longer 1891 # needed. Also, creates standard accessor functions for the field 1892 # based on the optional parameters-- none if none of these parameters: 1893 # 'addable' creates an 'add_NAME()' accessor function. 1894 # 'readable' or 'readable_array' creates a 'NAME()' accessor 1895 # function. 1896 # 'settable' creates a 'set_NAME()' accessor function. 1897 # 'constructor' doesn't create an accessor function, but adds the 1898 # field to the hash that was previously passed to 1899 # setup_package(); 1900 # Any of the accesses can be abbreviated down, so that 'a', 'ad', 1901 # 'add' etc. all mean 'addable'. 1902 # The read accessor function will work on both array and scalar 1903 # values. If another accessor in the parameter list is 'a', the read 1904 # access assumes an array. You can also force it to be array access 1905 # by specifying 'readable_array' instead of 'readable' 1906 # 1907 # A sort-of 'protected' access can be set-up by preceding the addable, 1908 # readable or settable with some initial portion of 'protected_' (but, 1909 # the underscore is required), like 'p_a', 'pro_set', etc. The 1910 # "protection" is only by convention. All that happens is that the 1911 # accessor functions' names begin with an underscore. So instead of 1912 # calling set_foo, the call is _set_foo. (Real protection could be 1913 # accomplished by having a new subroutine, end_package, called at the 1914 # end of each package, and then storing the __LINE__ ranges and 1915 # checking them on every accessor. But that is way overkill.) 1916 1917 # We create anonymous subroutines as the accessors and then use 1918 # typeglobs to assign them to the proper package and name 1919 1920 # $name Name of the field 1921 # $field Reference to the inside-out hash containing the 1922 # field 1923 1924 my $package = (caller)[0]; 1925 1926 if (! exists $package_fields{$package}) { 1927 croak "$0: Must call 'setup_package' before 'set_access'"; 1928 } 1929 1930 # Stash the field so DESTROY can get it. 1931 $package_fields{$package}{$name} = $field; 1932 1933 # Remaining arguments are the accessors. For each... 1934 foreach my $access (@accessors) { 1935 my $access = lc $access; 1936 1937 my $protected = ""; 1938 1939 # Match the input as far as it goes. 1940 if ($access =~ /^(p[^_]*)_/) { 1941 $protected = $1; 1942 if (substr('protected_', 0, length $protected) 1943 eq $protected) 1944 { 1945 1946 # Add 1 for the underscore not included in $protected 1947 $access = substr($access, length($protected) + 1); 1948 $protected = '_'; 1949 } 1950 else { 1951 $protected = ""; 1952 } 1953 } 1954 1955 if (substr('addable', 0, length $access) eq $access) { 1956 my $subname = "${package}::${protected}add_$name"; 1957 no strict "refs"; 1958 1959 # add_ accessor. Don't add if already there, which we 1960 # determine using 'eq' for scalars and '==' otherwise. 1961 *$subname = sub ($self, $value) { 1962 use strict "refs"; 1963 my $addr = pack 'J', refaddr $self; 1964 if (ref $value) { 1965 return if grep { $value == $_ } @{$field->{$addr}}; 1966 } 1967 else { 1968 return if grep { $value eq $_ } @{$field->{$addr}}; 1969 } 1970 push @{$field->{$addr}}, $value; 1971 return; 1972 } 1973 } 1974 elsif (substr('constructor', 0, length $access) eq $access) { 1975 if ($protected) { 1976 Carp::my_carp_bug("Can't set-up 'protected' constructors") 1977 } 1978 else { 1979 $constructor_fields{$package}{$name} = $field; 1980 } 1981 } 1982 elsif (substr('readable_array', 0, length $access) eq $access) { 1983 1984 # Here has read access. If one of the other parameters for 1985 # access is array, or this one specifies array (by being more 1986 # than just 'readable_'), then create a subroutine that 1987 # assumes the data is an array. Otherwise just a scalar 1988 my $subname = "${package}::${protected}$name"; 1989 if (grep { /^a/i } (@accessors) 1990 or length($access) > length('readable_')) 1991 { 1992 no strict "refs"; 1993 *$subname = sub ($_addr) { 1994 use strict "refs"; 1995 my $addr = pack 'J', refaddr $_addr; 1996 if (ref $field->{$addr} ne 'ARRAY') { 1997 my $type = ref $field->{$addr}; 1998 $type = 'scalar' unless $type; 1999 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems."); 2000 return; 2001 } 2002 return scalar @{$field->{$addr}} unless wantarray; 2003 2004 # Make a copy; had problems with caller modifying the 2005 # original otherwise 2006 my @return = @{$field->{$addr}}; 2007 return @return; 2008 } 2009 } 2010 else { 2011 2012 # Here not an array value, a simpler function. 2013 no strict "refs"; 2014 *$subname = sub ($addr) { 2015 use strict "refs"; 2016 return $field->{pack 'J', refaddr $addr}; 2017 } 2018 } 2019 } 2020 elsif (substr('settable', 0, length $access) eq $access) { 2021 my $subname = "${package}::${protected}set_$name"; 2022 no strict "refs"; 2023 *$subname = sub ($self, $value) { 2024 use strict "refs"; 2025 # $self is $_[0]; $value is $_[1] 2026 $field->{pack 'J', refaddr $self} = $value; 2027 return; 2028 } 2029 } 2030 else { 2031 Carp::my_carp_bug("Unknown accessor type $access. No accessor set."); 2032 } 2033 } 2034 return; 2035 } 2036} 2037 2038package Input_file; 2039 2040# All input files use this object, which stores various attributes about them, 2041# and provides for convenient, uniform handling. The run method wraps the 2042# processing. It handles all the bookkeeping of opening, reading, and closing 2043# the file, returning only significant input lines. 2044# 2045# Each object gets a handler which processes the body of the file, and is 2046# called by run(). All character property files must use the generic, 2047# default handler, which has code scrubbed to handle things you might not 2048# expect, including automatic EBCDIC handling. For files that don't deal with 2049# mapping code points to a property value, such as test files, 2050# PropertyAliases, PropValueAliases, and named sequences, you can override the 2051# handler to be a custom one. Such a handler should basically be a 2052# while(next_line()) {...} loop. 2053# 2054# You can also set up handlers to 2055# 0) call during object construction time, after everything else is done 2056# 1) call before the first line is read, for pre processing 2057# 2) call to adjust each line of the input before the main handler gets 2058# them. This can be automatically generated, if appropriately simple 2059# enough, by specifying a Properties parameter in the constructor. 2060# 3) call upon EOF before the main handler exits its loop 2061# 4) call at the end, for post processing 2062# 2063# $_ is used to store the input line, and is to be filtered by the 2064# each_line_handler()s. So, if the format of the line is not in the desired 2065# format for the main handler, these are used to do that adjusting. They can 2066# be stacked (by enclosing them in an [ anonymous array ] in the constructor, 2067# so the $_ output of one is used as the input to the next. The EOF handler 2068# is also stackable, but none of the others are, but could easily be changed 2069# to be so. 2070# 2071# Some properties are used by the Perl core but aren't defined until later 2072# Unicode releases. The perl interpreter would have problems working when 2073# compiled with an earlier Unicode version that doesn't have them, so we need 2074# to define them somehow for those releases. The 'Early' constructor 2075# parameter can be used to automatically handle this. It is essentially 2076# ignored if the Unicode version being compiled has a data file for this 2077# property. Either code to execute or a file to read can be specified. 2078# Details are at the %early definition. 2079# 2080# Most of the handlers can call insert_lines() or insert_adjusted_lines() 2081# which insert the parameters as lines to be processed before the next input 2082# file line is read. This allows the EOF handler(s) to flush buffers, for 2083# example. The difference between the two routines is that the lines inserted 2084# by insert_lines() are subjected to the each_line_handler()s. (So if you 2085# called it from such a handler, you would get infinite recursion without some 2086# mechanism to prevent that.) Lines inserted by insert_adjusted_lines() go 2087# directly to the main handler without any adjustments. If the 2088# post-processing handler calls any of these, there will be no effect. Some 2089# error checking for these conditions could be added, but it hasn't been done. 2090# 2091# carp_bad_line() should be called to warn of bad input lines, which clears $_ 2092# to prevent further processing of the line. This routine will output the 2093# message as a warning once, and then keep a count of the lines that have the 2094# same message, and output that count at the end of the file's processing. 2095# This keeps the number of messages down to a manageable amount. 2096# 2097# get_missings() should be called to retrieve any @missing input lines. 2098# Messages will be raised if this isn't done if the options aren't to ignore 2099# missings. 2100 2101sub trace { return main::trace(@_); } 2102 2103{ # Closure 2104 # Keep track of fields that are to be put into the constructor. 2105 my %constructor_fields; 2106 2107 main::setup_package(Constructor_Fields => \%constructor_fields); 2108 2109 my %file; # Input file name, required 2110 main::set_access('file', \%file, qw{ c r }); 2111 2112 my %first_released; # Unicode version file was first released in, required 2113 main::set_access('first_released', \%first_released, qw{ c r }); 2114 2115 my %handler; # Subroutine to process the input file, defaults to 2116 # 'process_generic_property_file' 2117 main::set_access('handler', \%handler, qw{ c }); 2118 2119 my %property; 2120 # name of property this file is for. defaults to none, meaning not 2121 # applicable, or is otherwise determinable, for example, from each line. 2122 main::set_access('property', \%property, qw{ c r }); 2123 2124 my %optional; 2125 # This is either an unsigned number, or a list of property names. In the 2126 # former case, if it is non-zero, it means the file is optional, so if the 2127 # file is absent, no warning about that is output. In the latter case, it 2128 # is a list of properties that the file (exclusively) defines. If the 2129 # file is present, tables for those properties will be produced; if 2130 # absent, none will, even if they are listed elsewhere (namely 2131 # PropertyAliases.txt and PropValueAliases.txt) as being in this release, 2132 # and no warnings will be raised about them not being available. (And no 2133 # warning about the file itself will be raised.) 2134 main::set_access('optional', \%optional, qw{ c readable_array } ); 2135 2136 my %non_skip; 2137 # This is used for debugging, to skip processing of all but a few input 2138 # files. Add 'non_skip => 1' to the constructor for those files you want 2139 # processed when you set the $debug_skip global. 2140 main::set_access('non_skip', \%non_skip, 'c'); 2141 2142 my %skip; 2143 # This is used to skip processing of this input file (semi-) permanently. 2144 # The value should be the reason the file is being skipped. It is used 2145 # for files that we aren't planning to process anytime soon, but want to 2146 # allow to be in the directory and be checked for their names not 2147 # conflicting with any other files on a DOS 8.3 name filesystem, but to 2148 # not otherwise be processed, and to not raise a warning about not being 2149 # handled. In the constructor call, any value that evaluates to a numeric 2150 # 0 or undef means don't skip. Any other value is a string giving the 2151 # reason it is being skipped, and this will appear in generated pod. 2152 # However, an empty string reason will suppress the pod entry. 2153 # Internally, calls that evaluate to numeric 0 are changed into undef to 2154 # distinguish them from an empty string call. 2155 main::set_access('skip', \%skip, 'c', 'r'); 2156 2157 my %each_line_handler; 2158 # list of subroutines to look at and filter each non-comment line in the 2159 # file. defaults to none. The subroutines are called in order, each is 2160 # to adjust $_ for the next one, and the final one adjusts it for 2161 # 'handler' 2162 main::set_access('each_line_handler', \%each_line_handler, 'c'); 2163 2164 my %retain_trailing_comments; 2165 # This is used to not discard the comments that end data lines. This 2166 # would be used only for files with non-typical syntax, and most code here 2167 # assumes that comments have been stripped, so special handlers would have 2168 # to be written. It is assumed that the code will use these in 2169 # single-quoted contexts, and so any "'" marks in the comment will be 2170 # prefixed by a backslash. 2171 main::set_access('retain_trailing_comments', \%retain_trailing_comments, 'c'); 2172 2173 my %properties; # Optional ordered list of the properties that occur in each 2174 # meaningful line of the input file. If present, an appropriate 2175 # each_line_handler() is automatically generated and pushed onto the stack 2176 # of such handlers. This is useful when a file contains multiple 2177 # properties per line, but no other special considerations are necessary. 2178 # The special value "<ignored>" means to discard the corresponding input 2179 # field. 2180 # Any @missing lines in the file should also match this syntax; no such 2181 # files exist as of 6.3. But if it happens in a future release, the code 2182 # could be expanded to properly parse them. 2183 main::set_access('properties', \%properties, qw{ c r }); 2184 2185 my %has_missings_defaults; 2186 # ? Are there lines in the file giving default values for code points 2187 # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is 2188 # the norm, but IGNORED means it has such lines, but the handler doesn't 2189 # use them. Having these three states allows us to catch changes to the 2190 # UCD that this program should track. XXX This could be expanded to 2191 # specify the syntax for such lines, like %properties above. 2192 main::set_access('has_missings_defaults', 2193 \%has_missings_defaults, qw{ c r }); 2194 2195 my %construction_time_handler; 2196 # Subroutine to call at the end of the new method. If undef, no such 2197 # handler is called. 2198 main::set_access('construction_time_handler', 2199 \%construction_time_handler, qw{ c }); 2200 2201 my %pre_handler; 2202 # Subroutine to call before doing anything else in the file. If undef, no 2203 # such handler is called. 2204 main::set_access('pre_handler', \%pre_handler, qw{ c }); 2205 2206 my %eof_handler; 2207 # Subroutines to call upon getting an EOF on the input file, but before 2208 # that is returned to the main handler. This is to allow buffers to be 2209 # flushed. The handler is expected to call insert_lines() or 2210 # insert_adjusted() with the buffered material 2211 main::set_access('eof_handler', \%eof_handler, qw{ c }); 2212 2213 my %post_handler; 2214 # Subroutine to call after all the lines of the file are read in and 2215 # processed. If undef, no such handler is called. Note that this cannot 2216 # add lines to be processed; instead use eof_handler 2217 main::set_access('post_handler', \%post_handler, qw{ c }); 2218 2219 my %progress_message; 2220 # Message to print to display progress in lieu of the standard one 2221 main::set_access('progress_message', \%progress_message, qw{ c }); 2222 2223 my %handle; 2224 # cache open file handle, internal. Is undef if file hasn't been 2225 # processed at all, empty if has; 2226 main::set_access('handle', \%handle); 2227 2228 my %added_lines; 2229 # cache of lines added virtually to the file, internal 2230 main::set_access('added_lines', \%added_lines); 2231 2232 my %remapped_lines; 2233 # cache of lines added virtually to the file, internal 2234 main::set_access('remapped_lines', \%remapped_lines); 2235 2236 my %errors; 2237 # cache of errors found, internal 2238 main::set_access('errors', \%errors); 2239 2240 my %missings; 2241 # storage of '@missing' defaults lines 2242 main::set_access('missings', \%missings); 2243 2244 my %early; 2245 # Used for properties that must be defined (for Perl's purposes) on 2246 # versions of Unicode earlier than Unicode itself defines them. The 2247 # parameter is an array (it would be better to be a hash, but not worth 2248 # bothering about due to its rare use). 2249 # 2250 # The first element is either a code reference to call when in a release 2251 # earlier than the Unicode file is available in, or it is an alternate 2252 # file to use instead of the non-existent one. This file must have been 2253 # plunked down in the same directory as mktables. Should you be compiling 2254 # on a release that needs such a file, mktables will abort the 2255 # compilation, and tell you where to get the necessary file(s), and what 2256 # name(s) to use to store them as. 2257 # In the case of specifying an alternate file, the array must contain two 2258 # further elements: 2259 # 2260 # [1] is the name of the property that will be generated by this file. 2261 # The class automatically takes the input file and excludes any code 2262 # points in it that were not assigned in the Unicode version being 2263 # compiled. It then uses this result to define the property in the given 2264 # version. Since the property doesn't actually exist in the Unicode 2265 # version being compiled, this should be a name accessible only by core 2266 # perl. If it is the same name as the regular property, the constructor 2267 # will mark the output table as a $PLACEHOLDER so that it doesn't actually 2268 # get output, and so will be unusable by non-core code. Otherwise it gets 2269 # marked as $INTERNAL_ONLY. 2270 # 2271 # [2] is a property value to assign (only when compiling Unicode 1.1.5) to 2272 # the Hangul syllables in that release (which were ripped out in version 2273 # 2) for the given property . (Hence it is ignored except when compiling 2274 # version 1. You only get one value that applies to all of them, which 2275 # may not be the actual reality, but probably nobody cares anyway for 2276 # these obsolete characters.) 2277 # 2278 # [3] if present is the default value for the property to assign for code 2279 # points not given in the input. If not present, the default from the 2280 # normal property is used 2281 # 2282 # [-1] If there is an extra final element that is the string 'ONLY_EARLY'. 2283 # it means to not add the name in [1] as an alias to the property name 2284 # used for these. Normally, when compiling Unicode versions that don't 2285 # invoke the early handling, the name is added as a synonym. 2286 # 2287 # Not all files can be handled in the above way, and so the code ref 2288 # alternative is available. It can do whatever it needs to. The other 2289 # array elements are optional in this case, and the code is free to use or 2290 # ignore them if they are present. 2291 # 2292 # Internally, the constructor unshifts a 0 or 1 onto this array to 2293 # indicate if an early alternative is actually being used or not. This 2294 # makes for easier testing later on. 2295 main::set_access('early', \%early, 'c'); 2296 2297 my %only_early; 2298 main::set_access('only_early', \%only_early, 'c'); 2299 2300 my %required_even_in_debug_skip; 2301 # debug_skip is used to speed up compilation during debugging by skipping 2302 # processing files that are not needed for the task at hand. However, 2303 # some files pretty much can never be skipped, and this is used to specify 2304 # that this is one of them. In order to skip this file, the call to the 2305 # constructor must be edited to comment out this parameter. 2306 main::set_access('required_even_in_debug_skip', 2307 \%required_even_in_debug_skip, 'c'); 2308 2309 my %withdrawn; 2310 # Some files get removed from the Unicode DB. This is a version object 2311 # giving the first release without this file. 2312 main::set_access('withdrawn', \%withdrawn, 'c'); 2313 2314 my %ucd; 2315 # Some files are not actually part of the Unicode Character Database. 2316 # These typically have a different way of indicating their version 2317 main::set_access('ucd', \%ucd, 'c'); 2318 2319 my %in_this_release; 2320 # Calculated value from %first_released and %withdrawn. Are we compiling 2321 # a Unicode release which includes this file? 2322 main::set_access('in_this_release', \%in_this_release); 2323 2324 sub _next_line; 2325 sub _next_line_with_remapped_range; 2326 2327 sub new { 2328 my $class = shift; 2329 2330 my $self = bless \do{ my $anonymous_scalar }, $class; 2331 my $addr = pack 'J', refaddr $self; 2332 2333 # Set defaults 2334 $handler{$addr} = \&main::process_generic_property_file; 2335 $retain_trailing_comments{$addr} = 0; 2336 $non_skip{$addr} = 0; 2337 $skip{$addr} = undef; 2338 $has_missings_defaults{$addr} = $NO_DEFAULTS; 2339 $handle{$addr} = undef; 2340 $added_lines{$addr} = [ ]; 2341 $remapped_lines{$addr} = [ ]; 2342 $each_line_handler{$addr} = [ ]; 2343 $eof_handler{$addr} = [ ]; 2344 $errors{$addr} = { }; 2345 $missings{$addr} = [ ]; 2346 $early{$addr} = [ ]; 2347 $optional{$addr} = [ ]; 2348 $ucd{$addr} = 1; 2349 2350 # Two positional parameters. 2351 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2; 2352 $file{$addr} = main::internal_file_to_platform(shift); 2353 $first_released{$addr} = shift; 2354 2355 # The rest of the arguments are key => value pairs 2356 # %constructor_fields has been set up earlier to list all possible 2357 # ones. Either set or push, depending on how the default has been set 2358 # up just above. 2359 my %args = @_; 2360 foreach my $key (keys %args) { 2361 my $argument = $args{$key}; 2362 2363 # Note that the fields are the lower case of the constructor keys 2364 my $hash = $constructor_fields{lc $key}; 2365 if (! defined $hash) { 2366 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped"); 2367 next; 2368 } 2369 if (ref $hash->{$addr} eq 'ARRAY') { 2370 if (ref $argument eq 'ARRAY') { 2371 foreach my $argument (@{$argument}) { 2372 next if ! defined $argument; 2373 push @{$hash->{$addr}}, $argument; 2374 } 2375 } 2376 else { 2377 push @{$hash->{$addr}}, $argument if defined $argument; 2378 } 2379 } 2380 else { 2381 $hash->{$addr} = $argument; 2382 } 2383 delete $args{$key}; 2384 }; 2385 2386 $non_skip{$addr} = 1 if $required_even_in_debug_skip{$addr}; 2387 2388 # Convert 0 (meaning don't skip) to undef 2389 undef $skip{$addr} unless $skip{$addr}; 2390 2391 # Handle the case where this file is optional 2392 my $pod_message_for_non_existent_optional = ""; 2393 if ($optional{$addr}->@*) { 2394 2395 # First element is the pod message 2396 $pod_message_for_non_existent_optional 2397 = shift $optional{$addr}->@*; 2398 # Convert a 0 'Optional' argument to an empty list to make later 2399 # code more concise. 2400 if ( $optional{$addr}->@* 2401 && $optional{$addr}->@* == 1 2402 && $optional{$addr}[0] ne "" 2403 && $optional{$addr}[0] !~ /\D/ 2404 && $optional{$addr}[0] == 0) 2405 { 2406 $optional{$addr} = [ ]; 2407 } 2408 else { # But if the only element doesn't evaluate to 0, make sure 2409 # that this file is indeed considered optional below. 2410 unshift $optional{$addr}->@*, 1; 2411 } 2412 } 2413 2414 my $progress; 2415 my $function_instead_of_file = 0; 2416 2417 if ($early{$addr}->@* && $early{$addr}[-1] eq 'ONLY_EARLY') { 2418 $only_early{$addr} = 1; 2419 pop $early{$addr}->@*; 2420 } 2421 2422 # If we are compiling a Unicode release earlier than the file became 2423 # available, the constructor may have supplied a substitute 2424 if ($first_released{$addr} gt $v_version && $early{$addr}->@*) { 2425 2426 # Yes, we have a substitute, that we will use; mark it so 2427 unshift $early{$addr}->@*, 1; 2428 2429 # See the definition of %early for what the array elements mean. 2430 # Note that we have just unshifted onto the array, so the numbers 2431 # below are +1 of those in the %early description. 2432 # If we have a property this defines, create a table and default 2433 # map for it now (at essentially compile time), so that it will be 2434 # available for the whole of run time. (We will want to add this 2435 # name as an alias when we are using the official property name; 2436 # but this must be deferred until run(), because at construction 2437 # time the official names have yet to be defined.) 2438 if ($early{$addr}[2]) { 2439 my $fate = ($property{$addr} 2440 && $property{$addr} eq $early{$addr}[2]) 2441 ? $PLACEHOLDER 2442 : $INTERNAL_ONLY; 2443 my $prop_object = Property->new($early{$addr}[2], 2444 Fate => $fate, 2445 Perl_Extension => 1, 2446 ); 2447 2448 # If not specified by the constructor, use the default mapping 2449 # for the regular property for this substitute one. 2450 if ($early{$addr}[4]) { 2451 $prop_object->set_default_map($early{$addr}[4]); 2452 } 2453 elsif ( defined $property{$addr} 2454 && defined $default_mapping{$property{$addr}}) 2455 { 2456 $prop_object 2457 ->set_default_map($default_mapping{$property{$addr}}); 2458 } 2459 } 2460 2461 if (ref $early{$addr}[1] eq 'CODE') { 2462 $function_instead_of_file = 1; 2463 2464 # If the first element of the array is a code ref, the others 2465 # are optional. 2466 $handler{$addr} = $early{$addr}[1]; 2467 $property{$addr} = $early{$addr}[2] 2468 if defined $early{$addr}[2]; 2469 $progress = "substitute $file{$addr}"; 2470 2471 undef $file{$addr}; 2472 } 2473 else { # Specifying a substitute file 2474 2475 if (! main::file_exists($early{$addr}[1])) { 2476 2477 # If we don't see the substitute file, generate an error 2478 # message giving the needed things, and add it to the list 2479 # of such to output before actual processing happens 2480 # (hence the user finds out all of them in one run). 2481 # Instead of creating a general method for NameAliases, 2482 # hard-code it here, as there is unlikely to ever be a 2483 # second one which needs special handling. 2484 my $string_version = ($file{$addr} eq "NameAliases.txt") 2485 ? 'at least 6.1 (the later, the better)' 2486 : sprintf "%vd", $first_released{$addr}; 2487 push @missing_early_files, <<END; 2488'$file{$addr}' version $string_version should be copied to '$early{$addr}[1]'. 2489END 2490 ; 2491 return; 2492 } 2493 $progress = $early{$addr}[1]; 2494 $progress .= ", substituting for $file{$addr}" if $file{$addr}; 2495 $file{$addr} = $early{$addr}[1]; 2496 $property{$addr} = $early{$addr}[2]; 2497 2498 # Ignore code points not in the version being compiled 2499 push $each_line_handler{$addr}->@*, \&_exclude_unassigned; 2500 2501 if ( $v_version lt v2.0 # Hanguls in this release ... 2502 && defined $early{$addr}[3]) # ... need special treatment 2503 { 2504 push $eof_handler{$addr}->@*, \&_fixup_obsolete_hanguls; 2505 } 2506 } 2507 2508 # And this substitute is valid for all releases. 2509 $first_released{$addr} = v0; 2510 } 2511 else { # Normal behavior 2512 $progress = $file{$addr}; 2513 unshift $early{$addr}->@*, 0; # No substitute 2514 } 2515 2516 my $file = $file{$addr}; 2517 $progress_message{$addr} = "Processing $progress" 2518 unless $progress_message{$addr}; 2519 2520 # A file should be there if it is within the window of versions for 2521 # which Unicode supplies it 2522 if ($withdrawn{$addr} && $withdrawn{$addr} le $v_version) { 2523 $in_this_release{$addr} = 0; 2524 $skip{$addr} = ""; 2525 } 2526 else { 2527 $in_this_release{$addr} = $first_released{$addr} le $v_version; 2528 2529 # Check that the file for this object (possibly using a substitute 2530 # for early releases) exists or we have a function alternative 2531 if ( ! $function_instead_of_file 2532 && ! main::file_exists($file)) 2533 { 2534 # Here there is nothing available for this release. This is 2535 # fine if we aren't expecting anything in this release. 2536 if (! $in_this_release{$addr}) { 2537 $skip{$addr} = ""; # Don't remark since we expected 2538 # nothing and got nothing 2539 } 2540 elsif ($optional{$addr}->@*) { 2541 2542 # Here the file is optional in this release; Use the 2543 # passed in text to document this case in the pod. 2544 $skip{$addr} = $pod_message_for_non_existent_optional; 2545 } 2546 elsif ( $in_this_release{$addr} 2547 && ! defined $skip{$addr} 2548 && defined $file) 2549 { # Doesn't exist but should. 2550 $skip{$addr} = "'$file' not found. Possibly Big problems"; 2551 Carp::my_carp($skip{$addr}); 2552 } 2553 } 2554 elsif ($debug_skip && ! defined $skip{$addr} && ! $non_skip{$addr}) 2555 { 2556 2557 # The file exists; if not skipped for another reason, and we are 2558 # skipping most everything during debugging builds, use that as 2559 # the skip reason. 2560 $skip{$addr} = '$debug_skip is on' 2561 } 2562 } 2563 2564 if ( ! $debug_skip 2565 && $non_skip{$addr} 2566 && ! $required_even_in_debug_skip{$addr} 2567 && $verbosity) 2568 { 2569 print "Warning: " . __PACKAGE__ . " constructor for $file has useless 'non_skip' in it\n"; 2570 } 2571 2572 # Here, we have figured out if we will be skipping this file or not. 2573 # If so, we add any single property it defines to any passed in 2574 # optional property list. These will be dealt with at run time. 2575 if (defined $skip{$addr}) { 2576 if ($property{$addr}) { 2577 push $optional{$addr}->@*, $property{$addr}; 2578 } 2579 } # Otherwise, are going to process the file. 2580 elsif ($property{$addr}) { 2581 2582 # If the file has a property defined in the constructor for it, it 2583 # means that the property is not listed in the file's entries. So 2584 # add a handler (to the list of line handlers) to insert the 2585 # property name into the lines, to provide a uniform interface to 2586 # the final processing subroutine. 2587 push @{$each_line_handler{$addr}}, \&_insert_property_into_line; 2588 } 2589 elsif ($properties{$addr}) { 2590 2591 # Similarly, there may be more than one property represented on 2592 # each line, with no clue but the constructor input what those 2593 # might be. Add a handler for each line in the input so that it 2594 # creates a separate input line for each property in those input 2595 # lines, thus making them suitable to handle generically. 2596 2597 push @{$each_line_handler{$addr}}, 2598 sub { 2599 my $file = shift; 2600 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 2601 my @fields = split /\s*;\s*/, $_, -1; 2602 2603 if (@fields - 1 > @{$properties{$addr}}) { 2604 $file->carp_bad_line('Extra fields'); 2605 $_ = ""; 2606 return; 2607 } 2608 my $range = shift @fields; # 0th element is always the 2609 # range 2610 2611 # The next fields in the input line correspond 2612 # respectively to the stored properties. 2613 for my $i (0 .. @{$properties{$addr}} - 1) { 2614 my $property_name = $properties{$addr}[$i]; 2615 next if $property_name eq '<ignored>'; 2616 $file->insert_adjusted_lines( 2617 "$range; $property_name; $fields[$i]"); 2618 } 2619 $_ = ""; 2620 2621 return; 2622 }; 2623 } 2624 2625 { # On non-ascii platforms, we use a special pre-handler 2626 no strict; 2627 no warnings 'once'; 2628 *next_line = (main::NON_ASCII_PLATFORM) 2629 ? *_next_line_with_remapped_range 2630 : *_next_line; 2631 } 2632 2633 &{$construction_time_handler{$addr}}($self) 2634 if $construction_time_handler{$addr}; 2635 2636 return $self; 2637 } 2638 2639 2640 use overload 2641 fallback => 0, 2642 qw("") => "_operator_stringify", 2643 "." => \&main::_operator_dot, 2644 ".=" => \&main::_operator_dot_equal, 2645 ; 2646 2647 sub _operator_stringify($self, $other="", $reversed=0) { 2648 return __PACKAGE__ . " object for " . $self->file; 2649 } 2650 2651 sub run($self) { 2652 # Process the input object $self. This opens and closes the file and 2653 # calls all the handlers for it. Currently, this can only be called 2654 # once per file, as it destroy's the EOF handlers 2655 2656 # flag to make sure extracted files are processed early 2657 state $seen_non_extracted = 0; 2658 2659 my $addr = pack 'J', refaddr $self; 2660 2661 my $file = $file{$addr}; 2662 2663 if (! $file) { 2664 $handle{$addr} = 'pretend_is_open'; 2665 } 2666 else { 2667 if ($seen_non_extracted) { 2668 if ($file =~ /$EXTRACTED/i) # Some platforms may change the 2669 # case of the file's name 2670 { 2671 Carp::my_carp_bug(main::join_lines(<<END 2672$file should be processed just after the 'Prop...Alias' files, and before 2673anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may 2674have subtle problems 2675END 2676 )); 2677 } 2678 } 2679 elsif ($EXTRACTED_DIR 2680 2681 # We only do this check for generic property files 2682 && $handler{$addr} == \&main::process_generic_property_file 2683 2684 && $file !~ /$EXTRACTED/i) 2685 { 2686 # We don't set this (by the 'if' above) if we have no 2687 # extracted directory, so if running on an early version, 2688 # this test won't work. Not worth worrying about. 2689 $seen_non_extracted = 1; 2690 } 2691 2692 # Mark the file as having being processed, and warn if it 2693 # isn't a file we are expecting. As we process the files, 2694 # they are deleted from the hash, so any that remain at the 2695 # end of the program are files that we didn't process. 2696 my $fkey = File::Spec->rel2abs($file); 2697 my $exists = delete $potential_files{lc($fkey)}; 2698 2699 Carp::my_carp("Was not expecting '$file'.") 2700 if $exists && ! $in_this_release{$addr}; 2701 2702 # If there is special handling for compiling Unicode releases 2703 # earlier than the first one in which Unicode defines this 2704 # property ... 2705 if ($early{$addr}->@* > 1) { 2706 2707 # Mark as processed any substitute file that would be used in 2708 # such a release 2709 $fkey = File::Spec->rel2abs($early{$addr}[1]); 2710 delete $potential_files{lc($fkey)}; 2711 2712 # As commented in the constructor code, when using the 2713 # official property, we still have to allow the publicly 2714 # inaccessible early name so that the core code which uses it 2715 # will work regardless. 2716 if ( ! $only_early{$addr} 2717 && ! $early{$addr}[0] 2718 && $early{$addr}->@* > 2) 2719 { 2720 my $early_property_name = $early{$addr}[2]; 2721 if ($property{$addr} ne $early_property_name) { 2722 main::property_ref($property{$addr}) 2723 ->add_alias($early_property_name); 2724 } 2725 } 2726 } 2727 2728 # We may be skipping this file ... 2729 if (defined $skip{$addr}) { 2730 2731 # If the file isn't supposed to be in this release, there is 2732 # nothing to do 2733 if ($in_this_release{$addr}) { 2734 2735 # But otherwise, we may print a message 2736 if ($debug_skip) { 2737 print STDERR "Skipping input file '$file'", 2738 " because '$skip{$addr}'\n"; 2739 } 2740 2741 # And add it to the list of skipped files, which is later 2742 # used to make the pod 2743 $skipped_files{$file} = $skip{$addr}; 2744 2745 # The 'optional' list contains properties that are also to 2746 # be skipped along with the file. (There may also be 2747 # digits which are just placeholders to make sure it isn't 2748 # an empty list 2749 foreach my $property ($optional{$addr}->@*) { 2750 next unless $property =~ /\D/; 2751 my $prop_object = main::property_ref($property); 2752 next unless defined $prop_object; 2753 $prop_object->set_fate($SUPPRESSED, $skip{$addr}); 2754 } 2755 } 2756 2757 return; 2758 } 2759 2760 # Here, we are going to process the file. Open it, converting the 2761 # slashes used in this program into the proper form for the OS 2762 my $file_handle; 2763 if (not open $file_handle, "<", $file) { 2764 Carp::my_carp("Can't open $file. Skipping: $!"); 2765 return; 2766 } 2767 $handle{$addr} = $file_handle; # Cache the open file handle 2768 2769 # If possible, make sure that the file is the correct version. 2770 # (This data isn't available on early Unicode releases or in 2771 # UnicodeData.txt.) We don't do this check if we are using a 2772 # substitute file instead of the official one (though the code 2773 # could be extended to do so). 2774 if ($in_this_release{$addr} 2775 && ! $early{$addr}[0] 2776 && lc($file) ne 'unicodedata.txt') 2777 { 2778 my $this_version; 2779 2780 if ($file !~ /^Unihan/i) { 2781 2782 # The non-Unihan files started getting version numbers in 2783 # 3.2, but some files in 4.0 are unchanged from 3.2, and 2784 # marked as 3.2. 4.0.1 is the first version where there 2785 # are no files marked as being from less than 4.0, though 2786 # some are marked as 4.0. In versions after that, the 2787 # numbers are correct. 2788 if ($v_version ge v4.0.1) { 2789 $_ = <$file_handle>; # The version number is in the 2790 # very first line if it is a 2791 # UCD file; otherwise, it 2792 # might be 2793 goto valid_version if $_ =~ / - $string_version \. /x; 2794 chomp; 2795 if ($ucd{$addr}) { 2796 $_ =~ s/^#\s*//; 2797 2798 # 4.0.1 had some valid files that weren't updated. 2799 goto valid_version 2800 if $v_version eq v4.0.1 && $_ =~ /4\.0\.0/; 2801 $this_version = $_; 2802 goto wrong_version; 2803 } 2804 else { 2805 my $BOM = "\x{FEFF}"; 2806 utf8::encode($BOM); 2807 my $BOM_re = qr/ ^ (?:$BOM)? /x; 2808 2809 do { 2810 chomp; 2811 2812 # BOM; seems to be on many lines in some 2813 # files!! 2814 $_ =~ s/$BOM_re//; 2815 2816 if (/./) { 2817 2818 # Only look for the version if in the 2819 # first comment block. 2820 goto no_version unless $_ =~ /^#/; 2821 2822 if ($_ =~ /Version:? (\S*)/) { 2823 $this_version = $1; 2824 goto valid_version 2825 if $this_version eq $string_version; 2826 goto valid_version 2827 if "$this_version.0" 2828 eq $string_version; 2829 } 2830 } 2831 } while (<$file_handle>); 2832 2833 goto no_version; 2834 } 2835 } 2836 } 2837 elsif ($v_version ge v6.0.0) { # Unihan 2838 2839 # Unihan files didn't get accurate version numbers until 2840 # 6.0. The version is somewhere in the first comment 2841 # block 2842 while (<$file_handle>) { 2843 goto no_version if $_ !~ /^#/; 2844 chomp; 2845 $_ =~ s/^#\s*//; 2846 next if $_ !~ / version: /x; 2847 goto valid_version if $_ =~ /$string_version/; 2848 goto wrong_version; 2849 } 2850 goto no_version; 2851 } 2852 else { # Old Unihan; have to assume is valid 2853 goto valid_version; 2854 } 2855 2856 wrong_version: 2857 die Carp::my_carp("File '$file' is version " 2858 . "'$this_version'. It should be " 2859 . "version $string_version"); 2860 no_version: 2861 Carp::my_carp_bug("Could not find the expected " 2862 . "version info in file '$file'"); 2863 } 2864 } 2865 2866 valid_version: 2867 print "$progress_message{$addr}\n" if $verbosity >= $PROGRESS; 2868 2869 # Call any special handler for before the file. 2870 &{$pre_handler{$addr}}($self) if $pre_handler{$addr}; 2871 2872 # Then the main handler 2873 &{$handler{$addr}}($self); 2874 2875 # Then any special post-file handler. 2876 &{$post_handler{$addr}}($self) if $post_handler{$addr}; 2877 2878 # If any errors have been accumulated, output the counts (as the first 2879 # error message in each class was output when it was encountered). 2880 if ($errors{$addr}) { 2881 my $total = 0; 2882 my $types = 0; 2883 foreach my $error (keys %{$errors{$addr}}) { 2884 $total += $errors{$addr}->{$error}; 2885 delete $errors{$addr}->{$error}; 2886 $types++; 2887 } 2888 if ($total > 1) { 2889 my $message 2890 = "A total of $total lines had errors in $file. "; 2891 2892 $message .= ($types == 1) 2893 ? '(Only the first one was displayed.)' 2894 : '(Only the first of each type was displayed.)'; 2895 Carp::my_carp($message); 2896 } 2897 } 2898 2899 if (@{$missings{$addr}}) { 2900 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong"); 2901 } 2902 2903 # If a real file handle, close it. 2904 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if 2905 ref $handle{$addr}; 2906 $handle{$addr} = ""; # Uses empty to indicate that has already seen 2907 # the file, as opposed to undef 2908 return; 2909 } 2910 2911 sub _next_line($self) { 2912 # Sets $_ to be the next logical input line, if any. Returns non-zero 2913 # if such a line exists. 'logical' means that any lines that have 2914 # been added via insert_lines() will be returned in $_ before the file 2915 # is read again. 2916 2917 my $addr = pack 'J', refaddr $self; 2918 2919 # Here the file is open (or if the handle is not a ref, is an open 2920 # 'virtual' file). Get the next line; any inserted lines get priority 2921 # over the file itself. 2922 my $adjusted; 2923 2924 LINE: 2925 while (1) { # Loop until find non-comment, non-empty line 2926 #local $to_trace = 1 if main::DEBUG; 2927 my $inserted_ref = shift @{$added_lines{$addr}}; 2928 if (defined $inserted_ref) { 2929 ($adjusted, $_) = @{$inserted_ref}; 2930 trace $adjusted, $_ if main::DEBUG && $to_trace; 2931 return 1 if $adjusted; 2932 } 2933 else { 2934 last if ! ref $handle{$addr}; # Don't read unless is real file 2935 last if ! defined ($_ = readline $handle{$addr}); 2936 } 2937 chomp; 2938 trace $_ if main::DEBUG && $to_trace; 2939 2940 # See if this line is the comment line that defines what property 2941 # value that code points that are not listed in the file should 2942 # have. The format or existence of these lines is not guaranteed 2943 # by Unicode since they are comments, but the documentation says 2944 # that this was added for machine-readability, so probably won't 2945 # change. This works starting in Unicode Version 5.0. They look 2946 # like: 2947 # 2948 # @missing: 0000..10FFFF; Not_Reordered 2949 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point> 2950 # @missing: 0000..10FFFF; ; NaN 2951 # 2952 # Save the line for a later get_missings() call. 2953 if (/$missing_defaults_prefix/) { 2954 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) { 2955 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries"); 2956 } 2957 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) { 2958 my $start = $1; # The pattern saves the beginning and 2959 my $end = $2; # end points of the range the default 2960 # is for 2961 my @defaults = split /\s* ; \s*/x, $_; 2962 2963 # The first field is the @missing, which ends in a 2964 # semi-colon, so can safely shift. 2965 shift @defaults; 2966 2967 # Some of these lines may have empty field placeholders 2968 # which get in the way. An example is: 2969 # @missing: 0000..10FFFF; ; NaN 2970 # Remove them. Process starting from the top so the 2971 # splice doesn't affect things still to be looked at. 2972 for (my $i = @defaults - 1; $i >= 0; $i--) { 2973 next if $defaults[$i] ne ""; 2974 splice @defaults, $i, 1; 2975 } 2976 2977 # What's left should be just the property (maybe) and the 2978 # default. Having only one element means it doesn't have 2979 # the property. 2980 my $default; 2981 my $property; 2982 if (@defaults >= 1) { 2983 if (@defaults == 1) { 2984 $default = $defaults[0]; 2985 } 2986 else { 2987 $property = $defaults[0]; 2988 $default = $defaults[1]; 2989 } 2990 } 2991 2992 if (@defaults < 1 2993 || @defaults > 2 2994 || ($default =~ /^</ 2995 && $default !~ /^<code *point>$/i 2996 && $default !~ /^<none>$/i 2997 && $default !~ /^<script>$/i)) 2998 { 2999 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries"); 3000 } 3001 else { 3002 3003 # If the property is missing from the line, it should 3004 # be the one for the whole file 3005 $property = $property{$addr} if ! defined $property; 3006 3007 # Change <none> to the null string, which is what it 3008 # really means. If the default is the code point 3009 # itself, set it to <code point>, which is what 3010 # Unicode uses (but sometimes they've forgotten the 3011 # space) 3012 if ($default =~ /^<none>$/i) { 3013 $default = ""; 3014 } 3015 elsif ($default =~ /^<code *point>$/i) { 3016 $default = $CODE_POINT; 3017 } 3018 elsif ($default =~ /^<script>$/i) { 3019 3020 # Special case this one. Currently is from 3021 # ScriptExtensions.txt, and means for all unlisted 3022 # code points, use their Script property values. 3023 # For the code points not listed in that file, the 3024 # default value is 'Unknown'. 3025 $default = "Unknown"; 3026 } 3027 3028 # Store them as a sub-hash as part of an array, with 3029 # both components. 3030 push @{$missings{$addr}}, { start => hex $start, 3031 end => hex $end, 3032 default => $default, 3033 property => $property 3034 }; 3035 } 3036 } 3037 3038 # There is nothing for the caller to process on this comment 3039 # line. 3040 next; 3041 } 3042 3043 # Unless to keep, remove comments. If to keep, ignore 3044 # comment-only lines 3045 if ($retain_trailing_comments{$addr}) { 3046 next if / ^ \s* \# /x; 3047 3048 # But escape any single quotes (done in both the comment and 3049 # non-comment portion; this could be a bug someday, but not 3050 # likely) 3051 s/'/\\'/g; 3052 } 3053 else { 3054 s/#.*//; 3055 } 3056 3057 # Remove trailing space, and skip this line if the result is empty 3058 s/\s+$//; 3059 next if /^$/; 3060 3061 # Call any handlers for this line, and skip further processing of 3062 # the line if the handler sets the line to null. 3063 foreach my $sub_ref (@{$each_line_handler{$addr}}) { 3064 &{$sub_ref}($self); 3065 next LINE if /^$/; 3066 } 3067 3068 # Here the line is ok. return success. 3069 return 1; 3070 } # End of looping through lines. 3071 3072 # If there are EOF handlers, call each (only once) and if it generates 3073 # more lines to process go back in the loop to handle them. 3074 while ($eof_handler{$addr}->@*) { 3075 &{$eof_handler{$addr}[0]}($self); 3076 shift $eof_handler{$addr}->@*; # Currently only get one shot at it. 3077 goto LINE if $added_lines{$addr}; 3078 } 3079 3080 # Return failure -- no more lines. 3081 return 0; 3082 3083 } 3084 3085 sub _next_line_with_remapped_range($self) { 3086 # like _next_line(), but for use on non-ASCII platforms. It sets $_ 3087 # to be the next logical input line, if any. Returns non-zero if such 3088 # a line exists. 'logical' means that any lines that have been added 3089 # via insert_lines() will be returned in $_ before the file is read 3090 # again. 3091 # 3092 # The difference from _next_line() is that this remaps the Unicode 3093 # code points in the input to those of the native platform. Each 3094 # input line contains a single code point, or a single contiguous 3095 # range of them This routine splits each range into its individual 3096 # code points and caches them. It returns the cached values, 3097 # translated into their native equivalents, one at a time, for each 3098 # call, before reading the next line. Since native values can only be 3099 # a single byte wide, no translation is needed for code points above 3100 # 0xFF, and ranges that are entirely above that number are not split. 3101 # If an input line contains the range 254-1000, it would be split into 3102 # three elements: 254, 255, and 256-1000. (The downstream table 3103 # insertion code will sort and coalesce the individual code points 3104 # into appropriate ranges.) 3105 3106 my $addr = pack 'J', refaddr $self; 3107 3108 while (1) { 3109 3110 # Look in cache before reading the next line. Return any cached 3111 # value, translated 3112 my $inserted = shift @{$remapped_lines{$addr}}; 3113 if (defined $inserted) { 3114 trace $inserted if main::DEBUG && $to_trace; 3115 $_ = $inserted =~ s/^ ( \d+ ) /sprintf("%04X", utf8::unicode_to_native($1))/xer; 3116 trace $_ if main::DEBUG && $to_trace; 3117 return 1; 3118 } 3119 3120 # Get the next line. 3121 return 0 unless _next_line($self); 3122 3123 # If there is a special handler for it, return the line, 3124 # untranslated. This should happen only for files that are 3125 # special, not being code-point related, such as property names. 3126 return 1 if $handler{$addr} 3127 != \&main::process_generic_property_file; 3128 3129 my ($range, $property_name, $map, @remainder) 3130 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 3131 3132 if (@remainder 3133 || ! defined $property_name 3134 || $range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x) 3135 { 3136 Carp::my_carp_bug("Unrecognized input line '$_'. Ignored"); 3137 } 3138 3139 my $low = hex $1; 3140 my $high = (defined $2) ? hex $2 : $low; 3141 3142 # If the input maps the range to another code point, remap the 3143 # target if it is between 0 and 255. 3144 my $tail; 3145 if (defined $map) { 3146 $map =~ s/\b 00 ( [0-9A-F]{2} ) \b/sprintf("%04X", utf8::unicode_to_native(hex $1))/gxe; 3147 $tail = "$property_name; $map"; 3148 $_ = "$range; $tail"; 3149 } 3150 else { 3151 $tail = $property_name; 3152 } 3153 3154 # If entire range is above 255, just return it, unchanged (except 3155 # any mapped-to code point, already changed above) 3156 return 1 if $low > 255; 3157 3158 # Cache an entry for every code point < 255. For those in the 3159 # range above 255, return a dummy entry for just that portion of 3160 # the range. Note that this will be out-of-order, but that is not 3161 # a problem. 3162 foreach my $code_point ($low .. $high) { 3163 if ($code_point > 255) { 3164 $_ = sprintf "%04X..%04X; $tail", $code_point, $high; 3165 return 1; 3166 } 3167 push @{$remapped_lines{$addr}}, "$code_point; $tail"; 3168 } 3169 } # End of looping through lines. 3170 3171 # NOTREACHED 3172 } 3173 3174# Not currently used, not fully tested. 3175# sub peek { 3176# # Non-destructive lookahead one non-adjusted, non-comment, non-blank 3177# # record. Not callable from an each_line_handler(), nor does it call 3178# # an each_line_handler() on the line. 3179# 3180# my $self = shift; 3181# my $addr = pack 'J', refaddr $self; 3182# 3183# foreach my $inserted_ref (@{$added_lines{$addr}}) { 3184# my ($adjusted, $line) = @{$inserted_ref}; 3185# next if $adjusted; 3186# 3187# # Remove comments and trailing space, and return a non-empty 3188# # resulting line 3189# $line =~ s/#.*//; 3190# $line =~ s/\s+$//; 3191# return $line if $line ne ""; 3192# } 3193# 3194# return if ! ref $handle{$addr}; # Don't read unless is real file 3195# while (1) { # Loop until find non-comment, non-empty line 3196# local $to_trace = 1 if main::DEBUG; 3197# trace $_ if main::DEBUG && $to_trace; 3198# return if ! defined (my $line = readline $handle{$addr}); 3199# chomp $line; 3200# push @{$added_lines{$addr}}, [ 0, $line ]; 3201# 3202# $line =~ s/#.*//; 3203# $line =~ s/\s+$//; 3204# return $line if $line ne ""; 3205# } 3206# 3207# return; 3208# } 3209 3210 3211 sub insert_lines($self, @lines) { 3212 # Lines can be inserted so that it looks like they were in the input 3213 # file at the place it was when this routine is called. See also 3214 # insert_adjusted_lines(). Lines inserted via this routine go through 3215 # any each_line_handler() 3216 3217 # Each inserted line is an array, with the first element being 0 to 3218 # indicate that this line hasn't been adjusted, and needs to be 3219 # processed. 3220 push @{$added_lines{pack 'J', refaddr $self}}, map { [ 0, $_ ] } @lines; 3221 return; 3222 } 3223 3224 sub insert_adjusted_lines($self, @lines) { 3225 # Lines can be inserted so that it looks like they were in the input 3226 # file at the place it was when this routine is called. See also 3227 # insert_lines(). Lines inserted via this routine are already fully 3228 # adjusted, ready to be processed; each_line_handler()s handlers will 3229 # not be called. This means this is not a completely general 3230 # facility, as only the last each_line_handler on the stack should 3231 # call this. It could be made more general, by passing to each of the 3232 # line_handlers their position on the stack, which they would pass on 3233 # to this routine, and that would replace the boolean first element in 3234 # the anonymous array pushed here, so that the next_line routine could 3235 # use that to call only those handlers whose index is after it on the 3236 # stack. But this is overkill for what is needed now. 3237 3238 trace $self if main::DEBUG && $to_trace; 3239 3240 # Each inserted line is an array, with the first element being 1 to 3241 # indicate that this line has been adjusted 3242 push @{$added_lines{pack 'J', refaddr $self}}, map { [ 1, $_ ] } @lines; 3243 return; 3244 } 3245 3246 sub get_missings($self) { 3247 # Returns the stored up @missings lines' values, and clears the list. 3248 # The values are in a hash, consisting of 'default' and 'property'. 3249 # However, since these lines can be stacked up, the return is an array 3250 # of all these hashes. 3251 3252 my $addr = pack 'J', refaddr $self; 3253 3254 # If not accepting a list return, just return the first one. 3255 return shift @{$missings{$addr}} unless wantarray; 3256 3257 my @return = @{$missings{$addr}}; 3258 undef @{$missings{$addr}}; 3259 return @return; 3260 } 3261 3262 sub _exclude_unassigned($self) { 3263 3264 # Takes the range in $_ and excludes code points that aren't assigned 3265 # in this release 3266 3267 state $skip_inserted_count = 0; 3268 3269 # Ignore recursive calls. 3270 if ($skip_inserted_count) { 3271 $skip_inserted_count--; 3272 return; 3273 } 3274 3275 # Find what code points are assigned in this release 3276 main::calculate_Assigned() if ! defined $Assigned; 3277 3278 my ($range, @remainder) 3279 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 3280 3281 # Examine the range. 3282 if ($range =~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x) 3283 { 3284 my $low = hex $1; 3285 my $high = (defined $2) ? hex $2 : $low; 3286 3287 # Split the range into subranges of just those code points in it 3288 # that are assigned. 3289 my @ranges = (Range_List->new(Initialize 3290 => Range->new($low, $high)) & $Assigned)->ranges; 3291 3292 # Do nothing if nothing in the original range is assigned in this 3293 # release; handle normally if everything is in this release. 3294 if (! @ranges) { 3295 $_ = ""; 3296 } 3297 elsif (@ranges != 1) { 3298 3299 # Here, some code points in the original range aren't in this 3300 # release; @ranges gives the ones that are. Create fake input 3301 # lines for each of the ranges, and set things up so that when 3302 # this routine is called on that fake input, it will do 3303 # nothing. 3304 $skip_inserted_count = @ranges; 3305 my $remainder = join ";", @remainder; 3306 for my $range (@ranges) { 3307 $self->insert_lines(sprintf("%04X..%04X;%s", 3308 $range->start, $range->end, $remainder)); 3309 } 3310 $_ = ""; # The original range is now defunct. 3311 } 3312 } 3313 3314 return; 3315 } 3316 3317 sub _fixup_obsolete_hanguls($self) { 3318 3319 # This is called only when compiling Unicode version 1. All Unicode 3320 # data for subsequent releases assumes that the code points that were 3321 # Hangul syllables in this release only are something else, so if 3322 # using such data, we have to override it 3323 3324 my $addr = pack 'J', refaddr $self; 3325 3326 my $object = main::property_ref($property{$addr}); 3327 $object->add_map($FIRST_REMOVED_HANGUL_SYLLABLE, 3328 $FINAL_REMOVED_HANGUL_SYLLABLE, 3329 $early{$addr}[3], # Passed-in value for these 3330 Replace => $UNCONDITIONALLY); 3331 } 3332 3333 sub _insert_property_into_line($self) { 3334 # Add a property field to $_, if this file requires it. 3335 3336 my $property = $property{pack 'J', refaddr $self}; 3337 $_ =~ s/(;|$)/; $property$1/; 3338 return; 3339 } 3340 3341 sub carp_bad_line($self, $message="") { 3342 # Output consistent error messages, using either a generic one, or the 3343 # one given by the optional parameter. To avoid gazillions of the 3344 # same message in case the syntax of a file is way off, this routine 3345 # only outputs the first instance of each message, incrementing a 3346 # count so the totals can be output at the end of the file. 3347 3348 my $addr = pack 'J', refaddr $self; 3349 3350 $message = 'Unexpected line' unless $message; 3351 3352 # No trailing punctuation so as to fit with our addenda. 3353 $message =~ s/[.:;,]$//; 3354 3355 # If haven't seen this exact message before, output it now. Otherwise 3356 # increment the count of how many times it has occurred 3357 unless ($errors{$addr}->{$message}) { 3358 Carp::my_carp("$message in '$_' in " 3359 . $file{$addr} 3360 . " at line $.. Skipping this line;"); 3361 $errors{$addr}->{$message} = 1; 3362 } 3363 else { 3364 $errors{$addr}->{$message}++; 3365 } 3366 3367 # Clear the line to prevent any further (meaningful) processing of it. 3368 $_ = ""; 3369 3370 return; 3371 } 3372} # End closure 3373 3374package Multi_Default; 3375 3376sub trace { return main::trace(@_); } 3377 3378# Certain properties in early versions of Unicode had more than one possible 3379# default for code points missing from the files. In these cases, one 3380# default applies to everything left over after all the others are applied, 3381# and for each of the others, there is a description of which class of code 3382# points applies to it. This object helps implement this by storing the 3383# defaults, and for all but that final default, an eval string that generates 3384# the class that it applies to. That class must be a Range_List, or contains 3385# a Range_List that the overloaded operators recognize as to be operated on. 3386# A string is used because this is called early when we know symbolically what 3387# needs to be done, but typically before any data is gathered. Thus the 3388# evaluation gets delayed until we have at hand all the needed information. 3389 3390{ # Closure 3391 3392 main::setup_package(); 3393 3394 my %class_defaults; 3395 # The defaults structure for the classes 3396 main::set_access('class_defaults', \%class_defaults, 'readable_array'); 3397 3398 my %other_default; 3399 # The default that applies to everything left over. 3400 main::set_access('other_default', \%other_default, 'r'); 3401 3402 my %iterator; 3403 3404 sub new { 3405 # The constructor is called with default => eval pairs, terminated by 3406 # the left-over default. e.g. 3407 # Multi_Default->new( 3408 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C 3409 # - 0x200D', 3410 # 'R' => 'some other expression that evaluates to code points', 3411 # . 3412 # . 3413 # . 3414 # 'U')); 3415 # It is best to leave the final value be the one that matches the 3416 # above-Unicode code points. 3417 3418 my $class = shift; 3419 3420 my $self = bless \do{my $anonymous_scalar}, $class; 3421 my $addr = pack 'J', refaddr $self; 3422 $iterator{$addr} = 0; 3423 3424 return $self unless @_; 3425 3426 while (@_ > 1) { 3427 $self->append_default(shift, shift); 3428 } 3429 3430 $self->set_final_default(shift); 3431 3432 return $self; 3433 } 3434 3435 sub append_default($self, $new_default, $eval) { 3436 my $addr = pack 'J', refaddr $self; 3437 3438 # Pushes a default setting to the current list 3439 push $class_defaults{$addr}->@*, [ $new_default, $eval ]; 3440 } 3441 3442 sub set_final_default($self, $new_default) { 3443 my $addr = pack 'J', refaddr $self; 3444 $other_default{$addr} = $new_default; 3445 } 3446 3447 sub get_next_defaults($self) { 3448 # Iterates and returns the next class of defaults. 3449 3450 my $addr = pack 'J', refaddr $self; 3451 if ($iterator{$addr}++ < $class_defaults{$addr}->@*) { 3452 return $class_defaults{$addr}->[$iterator{$addr}-1]->@*; 3453 } 3454 3455 $iterator{$addr} = 0; 3456 return undef; 3457 } 3458} 3459 3460package Alias; 3461 3462# An alias is one of the names that a table goes by. This class defines them 3463# including some attributes. Everything is currently setup in the 3464# constructor. 3465 3466 3467{ # Closure 3468 3469 main::setup_package(); 3470 3471 my %name; 3472 main::set_access('name', \%name, 'r'); 3473 3474 my %loose_match; 3475 # Should this name match loosely or not. 3476 main::set_access('loose_match', \%loose_match, 'r'); 3477 3478 my %make_re_pod_entry; 3479 # Some aliases should not get their own entries in the re section of the 3480 # pod, because they are covered by a wild-card, and some we want to 3481 # discourage use of. Binary 3482 main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's'); 3483 3484 my %ucd; 3485 # Is this documented to be accessible via Unicode::UCD 3486 main::set_access('ucd', \%ucd, 'r', 's'); 3487 3488 my %status; 3489 # Aliases have a status, like deprecated, or even suppressed (which means 3490 # they don't appear in documentation). Enum 3491 main::set_access('status', \%status, 'r'); 3492 3493 my %ok_as_filename; 3494 # Similarly, some aliases should not be considered as usable ones for 3495 # external use, such as file names, or we don't want documentation to 3496 # recommend them. Boolean 3497 main::set_access('ok_as_filename', \%ok_as_filename, 'r'); 3498 3499 sub new { 3500 my $class = shift; 3501 3502 my $self = bless \do { my $anonymous_scalar }, $class; 3503 my $addr = pack 'J', refaddr $self; 3504 3505 $name{$addr} = shift; 3506 $loose_match{$addr} = shift; 3507 $make_re_pod_entry{$addr} = shift; 3508 $ok_as_filename{$addr} = shift; 3509 $status{$addr} = shift; 3510 $ucd{$addr} = shift; 3511 3512 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3513 3514 # Null names are never ok externally 3515 $ok_as_filename{$addr} = 0 if $name{$addr} eq ""; 3516 3517 return $self; 3518 } 3519} 3520 3521package Range; 3522 3523# A range is the basic unit for storing code points, and is described in the 3524# comments at the beginning of the program. Each range has a starting code 3525# point; an ending code point (not less than the starting one); a value 3526# that applies to every code point in between the two end-points, inclusive; 3527# and an enum type that applies to the value. The type is for the user's 3528# convenience, and has no meaning here, except that a non-zero type is 3529# considered to not obey the normal Unicode rules for having standard forms. 3530# 3531# The same structure is used for both map and match tables, even though in the 3532# latter, the value (and hence type) is irrelevant and could be used as a 3533# comment. In map tables, the value is what all the code points in the range 3534# map to. Type 0 values have the standardized version of the value stored as 3535# well, so as to not have to recalculate it a lot. 3536 3537sub trace { return main::trace(@_); } 3538 3539{ # Closure 3540 3541 main::setup_package(); 3542 3543 my %start; 3544 main::set_access('start', \%start, 'r', 's'); 3545 3546 my %end; 3547 main::set_access('end', \%end, 'r', 's'); 3548 3549 my %value; 3550 main::set_access('value', \%value, 'r', 's'); 3551 3552 my %type; 3553 main::set_access('type', \%type, 'r'); 3554 3555 my %standard_form; 3556 # The value in internal standard form. Defined only if the type is 0. 3557 main::set_access('standard_form', \%standard_form); 3558 3559 # Note that if these fields change, the dump() method should as well 3560 3561 sub new($class, $_addr, $_end, @_args) { 3562 my $self = bless \do { my $anonymous_scalar }, $class; 3563 my $addr = pack 'J', refaddr $self; 3564 3565 $start{$addr} = $_addr; 3566 $end{$addr} = $_end; 3567 3568 my %args = @_args; 3569 3570 my $value = delete $args{'Value'}; # Can be 0 3571 $value = "" unless defined $value; 3572 $value{$addr} = $value; 3573 3574 $type{$addr} = delete $args{'Type'} || 0; 3575 3576 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 3577 3578 return $self; 3579 } 3580 3581 use overload 3582 fallback => 0, 3583 qw("") => "_operator_stringify", 3584 "." => \&main::_operator_dot, 3585 ".=" => \&main::_operator_dot_equal, 3586 ; 3587 3588 sub _operator_stringify($self, $other="", $reversed=0) { 3589 my $addr = pack 'J', refaddr $self; 3590 3591 # Output it like '0041..0065 (value)' 3592 my $return = sprintf("%04X", $start{$addr}) 3593 . '..' 3594 . sprintf("%04X", $end{$addr}); 3595 my $value = $value{$addr}; 3596 my $type = $type{$addr}; 3597 $return .= ' ('; 3598 $return .= "$value"; 3599 $return .= ", Type=$type" if $type != 0; 3600 $return .= ')'; 3601 3602 return $return; 3603 } 3604 3605 sub standard_form($self) { 3606 # Calculate the standard form only if needed, and cache the result. 3607 # The standard form is the value itself if the type is special. 3608 # This represents a considerable CPU and memory saving - at the time 3609 # of writing there are 368676 non-special objects, but the standard 3610 # form is only requested for 22047 of them - ie about 6%. 3611 3612 my $addr = pack 'J', refaddr $self; 3613 3614 return $standard_form{$addr} if defined $standard_form{$addr}; 3615 3616 my $value = $value{$addr}; 3617 return $value if $type{$addr}; 3618 return $standard_form{$addr} = main::standardize($value); 3619 } 3620 3621 sub dump($self, $indent) { 3622 # Human, not machine readable. For machine readable, comment out this 3623 # entire routine and let the standard one take effect. 3624 my $addr = pack 'J', refaddr $self; 3625 3626 my $return = $indent 3627 . sprintf("%04X", $start{$addr}) 3628 . '..' 3629 . sprintf("%04X", $end{$addr}) 3630 . " '$value{$addr}';"; 3631 if (! defined $standard_form{$addr}) { 3632 $return .= "(type=$type{$addr})"; 3633 } 3634 elsif ($standard_form{$addr} ne $value{$addr}) { 3635 $return .= "(standard '$standard_form{$addr}')"; 3636 } 3637 return $return; 3638 } 3639} # End closure 3640 3641package _Range_List_Base; 3642 3643# Base class for range lists. A range list is simply an ordered list of 3644# ranges, so that the ranges with the lowest starting numbers are first in it. 3645# 3646# When a new range is added that is adjacent to an existing range that has the 3647# same value and type, it merges with it to form a larger range. 3648# 3649# Ranges generally do not overlap, except that there can be multiple entries 3650# of single code point ranges. This is because of NameAliases.txt. 3651# 3652# In this program, there is a standard value such that if two different 3653# values, have the same standard value, they are considered equivalent. This 3654# value was chosen so that it gives correct results on Unicode data 3655 3656# There are a number of methods to manipulate range lists, and some operators 3657# are overloaded to handle them. 3658 3659sub trace { return main::trace(@_); } 3660 3661{ # Closure 3662 3663 our $addr; 3664 3665 # Max is initialized to a negative value that isn't adjacent to 0, for 3666 # simpler tests 3667 my $max_init = -2; 3668 3669 main::setup_package(); 3670 3671 my %ranges; 3672 # The list of ranges 3673 main::set_access('ranges', \%ranges, 'readable_array'); 3674 3675 my %max; 3676 # The highest code point in the list. This was originally a method, but 3677 # actual measurements said it was used a lot. 3678 main::set_access('max', \%max, 'r'); 3679 3680 my %each_range_iterator; 3681 # Iterator position for each_range() 3682 main::set_access('each_range_iterator', \%each_range_iterator); 3683 3684 my %owner_name_of; 3685 # Name of parent this is attached to, if any. Solely for better error 3686 # messages. 3687 main::set_access('owner_name_of', \%owner_name_of, 'p_r'); 3688 3689 my %_search_ranges_cache; 3690 # A cache of the previous result from _search_ranges(), for better 3691 # performance 3692 main::set_access('_search_ranges_cache', \%_search_ranges_cache); 3693 3694 sub new { 3695 my $class = shift; 3696 my %args = @_; 3697 3698 # Optional initialization data for the range list. NOTE: For large 3699 # ranges, it is better to use Range object rather than 3700 # [ low .. high ] 3701 # as it iterates through each one individually in the latter case. 3702 my $initialize = delete $args{'Initialize'}; 3703 3704 my $self; 3705 3706 # Use _union() to initialize. _union() returns an object of this 3707 # class, which means that it will call this constructor recursively. 3708 # But it won't have this $initialize parameter so that it won't 3709 # infinitely loop on this. 3710 return _union($class, $initialize, %args) if defined $initialize; 3711 3712 $self = bless \do { my $anonymous_scalar }, $class; 3713 my $addr = pack 'J', refaddr $self; 3714 3715 # Optional parent object, only for debug info. 3716 $owner_name_of{$addr} = delete $args{'Owner'}; 3717 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr}; 3718 3719 # Stringify, in case it is an object. 3720 $owner_name_of{$addr} = "$owner_name_of{$addr}"; 3721 3722 # This is used only for error messages, and so a colon is added 3723 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne ""; 3724 3725 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 3726 3727 $max{$addr} = $max_init; 3728 3729 $_search_ranges_cache{$addr} = 0; 3730 $ranges{$addr} = []; 3731 3732 return $self; 3733 } 3734 3735 use overload 3736 fallback => 0, 3737 qw("") => "_operator_stringify", 3738 "." => \&main::_operator_dot, 3739 ".=" => \&main::_operator_dot_equal, 3740 ; 3741 3742 sub _operator_stringify($self, $other="", $reversed=0) { 3743 my $addr = pack 'J', refaddr $self; 3744 3745 return "Range_List attached to '$owner_name_of{$addr}'" 3746 if $owner_name_of{$addr}; 3747 return "anonymous Range_List " . \$self; 3748 } 3749 3750 sub _union { 3751 # Returns the union of the input code points. It can be called as 3752 # either a constructor or a method. If called as a method, the result 3753 # will be a new() instance of the calling object, containing the union 3754 # of that object with the other parameter's code points; if called as 3755 # a constructor, the first parameter gives the class that the new object 3756 # should be, and the second parameter gives the code points to go into 3757 # it. 3758 # In either case, there are two parameters looked at by this routine; 3759 # any additional parameters are passed to the new() constructor. 3760 # 3761 # The code points can come in the form of some object that contains 3762 # ranges, and has a conventionally named method to access them; or 3763 # they can be an array of individual code points (as integers); or 3764 # just a single code point. 3765 # 3766 # If they are ranges, this routine doesn't make any effort to preserve 3767 # the range values and types of one input over the other. Therefore 3768 # this base class should not allow _union to be called from other than 3769 # initialization code, so as to prevent two tables from being added 3770 # together where the range values matter. The general form of this 3771 # routine therefore belongs in a derived class, but it was moved here 3772 # to avoid duplication of code. The failure to overload this in this 3773 # class keeps it safe. 3774 # 3775 # It does make the effort during initialization to accept tables with 3776 # multiple values for the same code point, and to preserve the order 3777 # of these. If there is only one input range or range set, it doesn't 3778 # sort (as it should already be sorted to the desired order), and will 3779 # accept multiple values per code point. Otherwise it will merge 3780 # multiple values into a single one. 3781 3782 my $self; 3783 my @args; # Arguments to pass to the constructor 3784 3785 my $class = shift; 3786 3787 # If a method call, will start the union with the object itself, and 3788 # the class of the new object will be the same as self. 3789 if (ref $class) { 3790 $self = $class; 3791 $class = ref $self; 3792 push @args, $self; 3793 } 3794 3795 # Add the other required parameter. 3796 push @args, shift; 3797 # Rest of parameters are passed on to the constructor 3798 3799 # Accumulate all records from both lists. 3800 my @records; 3801 my $input_count = 0; 3802 for my $arg (@args) { 3803 #local $to_trace = 0 if main::DEBUG; 3804 trace "argument = $arg" if main::DEBUG && $to_trace; 3805 if (! defined $arg) { 3806 my $message = ""; 3807 if (defined $self) { 3808 $message .= $owner_name_of{pack 'J', refaddr $self}; 3809 } 3810 Carp::my_carp_bug($message . "Undefined argument to _union. No union done."); 3811 return; 3812 } 3813 3814 $arg = [ $arg ] if ! ref $arg; 3815 my $type = ref $arg; 3816 if ($type eq 'ARRAY') { 3817 foreach my $element (@$arg) { 3818 push @records, Range->new($element, $element); 3819 $input_count++; 3820 } 3821 } 3822 elsif ($arg->isa('Range')) { 3823 push @records, $arg; 3824 $input_count++; 3825 } 3826 elsif ($arg->can('ranges')) { 3827 push @records, $arg->ranges; 3828 $input_count++; 3829 } 3830 else { 3831 my $message = ""; 3832 if (defined $self) { 3833 $message .= $owner_name_of{pack 'J', refaddr $self}; 3834 } 3835 Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done."); 3836 return; 3837 } 3838 } 3839 3840 # Sort with the range containing the lowest ordinal first, but if 3841 # two ranges start at the same code point, sort with the bigger range 3842 # of the two first, because it takes fewer cycles. 3843 if ($input_count > 1) { 3844 @records = sort { ($a->start <=> $b->start) 3845 or 3846 # if b is shorter than a, b->end will be 3847 # less than a->end, and we want to select 3848 # a, so want to return -1 3849 ($b->end <=> $a->end) 3850 } @records; 3851 } 3852 3853 my $new = $class->new(@_); 3854 3855 # Fold in records so long as they add new information. 3856 for my $set (@records) { 3857 my $start = $set->start; 3858 my $end = $set->end; 3859 my $value = $set->value; 3860 my $type = $set->type; 3861 if ($start > $new->max) { 3862 $new->_add_delete('+', $start, $end, $value, Type => $type); 3863 } 3864 elsif ($end > $new->max) { 3865 $new->_add_delete('+', $new->max +1, $end, $value, 3866 Type => $type); 3867 } 3868 elsif ($input_count == 1) { 3869 # Here, overlaps existing range, but is from a single input, 3870 # so preserve the multiple values from that input. 3871 $new->_add_delete('+', $start, $end, $value, Type => $type, 3872 Replace => $MULTIPLE_AFTER); 3873 } 3874 } 3875 3876 return $new; 3877 } 3878 3879 sub range_count($self) { # Return the number of ranges in the range list 3880 return scalar @{$ranges{pack 'J', refaddr $self}}; 3881 } 3882 3883 sub min($self) { 3884 # Returns the minimum code point currently in the range list, or if 3885 # the range list is empty, 2 beyond the max possible. This is a 3886 # method because used so rarely, that not worth saving between calls, 3887 # and having to worry about changing it as ranges are added and 3888 # deleted. 3889 3890 my $addr = pack 'J', refaddr $self; 3891 3892 # If the range list is empty, return a large value that isn't adjacent 3893 # to any that could be in the range list, for simpler tests 3894 return $MAX_WORKING_CODEPOINT + 2 unless scalar @{$ranges{$addr}}; 3895 return $ranges{$addr}->[0]->start; 3896 } 3897 3898 sub contains($self, $codepoint) { 3899 # Boolean: Is argument in the range list? If so returns $i such that: 3900 # range[$i]->end < $codepoint <= range[$i+1]->end 3901 # which is one beyond what you want; this is so that the 0th range 3902 # doesn't return false 3903 3904 my $i = $self->_search_ranges($codepoint); 3905 return 0 unless defined $i; 3906 3907 # The search returns $i, such that 3908 # range[$i-1]->end < $codepoint <= range[$i]->end 3909 # So is in the table if and only iff it is at least the start position 3910 # of range $i. 3911 return 0 if $ranges{pack 'J', refaddr $self}->[$i]->start > $codepoint; 3912 return $i + 1; 3913 } 3914 3915 sub containing_range($self, $codepoint) { 3916 # Returns the range object that contains the code point, undef if none 3917 my $i = $self->contains($codepoint); 3918 return unless $i; 3919 3920 # contains() returns 1 beyond where we should look 3921 return $ranges{pack 'J', refaddr $self}->[$i-1]; 3922 } 3923 3924 sub value_of($self, $codepoint) { 3925 # Returns the value associated with the code point, undef if none 3926 my $range = $self->containing_range($codepoint); 3927 return unless defined $range; 3928 3929 return $range->value; 3930 } 3931 3932 sub type_of($self, $codepoint) { 3933 # Returns the type of the range containing the code point, undef if 3934 # the code point is not in the table 3935 my $range = $self->containing_range($codepoint); 3936 return unless defined $range; 3937 3938 return $range->type; 3939 } 3940 3941 sub _search_ranges($self, $code_point) { 3942 # Find the range in the list which contains a code point, or where it 3943 # should go if were to add it. That is, it returns $i, such that: 3944 # range[$i-1]->end < $codepoint <= range[$i]->end 3945 # Returns undef if no such $i is possible (e.g. at end of table), or 3946 # if there is an error. 3947 my $addr = pack 'J', refaddr $self; 3948 3949 return if $code_point > $max{$addr}; 3950 my $r = $ranges{$addr}; # The current list of ranges 3951 my $range_list_size = scalar @$r; 3952 my $i; 3953 3954 use integer; # want integer division 3955 3956 # Use the cached result as the starting guess for this one, because, 3957 # an experiment on 5.1 showed that 90% of the time the cache was the 3958 # same as the result on the next call (and 7% it was one less). 3959 $i = $_search_ranges_cache{$addr}; 3960 $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob. 3961 # from an intervening deletion 3962 #local $to_trace = 1 if main::DEBUG; 3963 trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point); 3964 return $i if $code_point <= $r->[$i]->end 3965 && ($i == 0 || $r->[$i-1]->end < $code_point); 3966 3967 # Here the cache doesn't yield the correct $i. Try adding 1. 3968 if ($i < $range_list_size - 1 3969 && $r->[$i]->end < $code_point && 3970 $code_point <= $r->[$i+1]->end) 3971 { 3972 $i++; 3973 trace "next \$i is correct: $i" if main::DEBUG && $to_trace; 3974 $_search_ranges_cache{$addr} = $i; 3975 return $i; 3976 } 3977 3978 # Here, adding 1 also didn't work. We do a binary search to 3979 # find the correct position, starting with current $i 3980 my $lower = 0; 3981 my $upper = $range_list_size - 1; 3982 while (1) { 3983 trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace; 3984 3985 if ($code_point <= $r->[$i]->end) { 3986 3987 # Here we have met the upper constraint. We can quit if we 3988 # also meet the lower one. 3989 last if $i == 0 || $r->[$i-1]->end < $code_point; 3990 3991 $upper = $i; # Still too high. 3992 3993 } 3994 else { 3995 3996 # Here, $r[$i]->end < $code_point, so look higher up. 3997 $lower = $i; 3998 } 3999 4000 # Split search domain in half to try again. 4001 my $temp = ($upper + $lower) / 2; 4002 4003 # No point in continuing unless $i changes for next time 4004 # in the loop. 4005 if ($temp == $i) { 4006 4007 # We can't reach the highest element because of the averaging. 4008 # So if one below the upper edge, force it there and try one 4009 # more time. 4010 if ($i == $range_list_size - 2) { 4011 4012 trace "Forcing to upper edge" if main::DEBUG && $to_trace; 4013 $i = $range_list_size - 1; 4014 4015 # Change $lower as well so if fails next time through, 4016 # taking the average will yield the same $i, and we will 4017 # quit with the error message just below. 4018 $lower = $i; 4019 next; 4020 } 4021 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken."); 4022 return; 4023 } 4024 $i = $temp; 4025 } # End of while loop 4026 4027 if (main::DEBUG && $to_trace) { 4028 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i; 4029 trace "i= [ $i ]", $r->[$i]; 4030 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1; 4031 } 4032 4033 # Here we have found the offset. Cache it as a starting point for the 4034 # next call. 4035 $_search_ranges_cache{$addr} = $i; 4036 return $i; 4037 } 4038 4039 sub _add_delete { 4040 # Add, replace or delete ranges to or from a list. The $type 4041 # parameter gives which: 4042 # '+' => insert or replace a range, returning a list of any changed 4043 # ranges. 4044 # '-' => delete a range, returning a list of any deleted ranges. 4045 # 4046 # The next three parameters give respectively the start, end, and 4047 # value associated with the range. 'value' should be null unless the 4048 # operation is '+'; 4049 # 4050 # The range list is kept sorted so that the range with the lowest 4051 # starting position is first in the list, and generally, adjacent 4052 # ranges with the same values are merged into a single larger one (see 4053 # exceptions below). 4054 # 4055 # There are more parameters; all are key => value pairs: 4056 # Type gives the type of the value. It is only valid for '+'. 4057 # All ranges have types; if this parameter is omitted, 0 is 4058 # assumed. Ranges with type 0 are assumed to obey the 4059 # Unicode rules for casing, etc; ranges with other types are 4060 # not. Otherwise, the type is arbitrary, for the caller's 4061 # convenience, and looked at only by this routine to keep 4062 # adjacent ranges of different types from being merged into 4063 # a single larger range, and when Replace => 4064 # $IF_NOT_EQUIVALENT is specified (see just below). 4065 # Replace determines what to do if the range list already contains 4066 # ranges which coincide with all or portions of the input 4067 # range. It is only valid for '+': 4068 # => $NO means that the new value is not to replace 4069 # any existing ones, but any empty gaps of the 4070 # range list coinciding with the input range 4071 # will be filled in with the new value. 4072 # => $UNCONDITIONALLY means to replace the existing values with 4073 # this one unconditionally. However, if the 4074 # new and old values are identical, the 4075 # replacement is skipped to save cycles 4076 # => $IF_NOT_EQUIVALENT means to replace the existing values 4077 # (the default) with this one if they are not equivalent. 4078 # Ranges are equivalent if their types are the 4079 # same, and they are the same string; or if 4080 # both are type 0 ranges, if their Unicode 4081 # standard forms are identical. In this last 4082 # case, the routine chooses the more "modern" 4083 # one to use. This is because some of the 4084 # older files are formatted with values that 4085 # are, for example, ALL CAPs, whereas the 4086 # derived files have a more modern style, 4087 # which looks better. By looking for this 4088 # style when the pre-existing and replacement 4089 # standard forms are the same, we can move to 4090 # the modern style 4091 # => $MULTIPLE_BEFORE means that if this range duplicates an 4092 # existing one, but has a different value, 4093 # don't replace the existing one, but insert 4094 # this one so that the same range can occur 4095 # multiple times. They are stored LIFO, so 4096 # that the final one inserted is the first one 4097 # returned in an ordered search of the table. 4098 # If this is an exact duplicate, including the 4099 # value, the original will be moved to be 4100 # first, before any other duplicate ranges 4101 # with different values. 4102 # => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored 4103 # FIFO, so that this one is inserted after all 4104 # others that currently exist. If this is an 4105 # exact duplicate, including value, of an 4106 # existing range, this one is discarded 4107 # (leaving the existing one in its original, 4108 # higher priority position 4109 # => $CROAK Die with an error if is already there 4110 # => anything else is the same as => $IF_NOT_EQUIVALENT 4111 # 4112 # "same value" means identical for non-type-0 ranges, and it means 4113 # having the same standard forms for type-0 ranges. 4114 4115 return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5; 4116 4117 my $self = shift; 4118 my $operation = shift; # '+' for add/replace; '-' for delete; 4119 my $start = shift; 4120 my $end = shift; 4121 my $value = shift; 4122 4123 my %args = @_; 4124 4125 $value = "" if not defined $value; # warning: $value can be "0" 4126 4127 my $replace = delete $args{'Replace'}; 4128 $replace = $IF_NOT_EQUIVALENT unless defined $replace; 4129 4130 my $type = delete $args{'Type'}; 4131 $type = 0 unless defined $type; 4132 4133 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 4134 4135 my $addr = pack 'J', refaddr $self; 4136 4137 if ($operation ne '+' && $operation ne '-') { 4138 Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken."); 4139 return; 4140 } 4141 unless (defined $start && defined $end) { 4142 Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken."); 4143 return; 4144 } 4145 unless ($end >= $start) { 4146 Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken."); 4147 return; 4148 } 4149 #local $to_trace = 1 if main::DEBUG; 4150 4151 if ($operation eq '-') { 4152 if ($replace != $IF_NOT_EQUIVALENT) { 4153 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT."); 4154 $replace = $IF_NOT_EQUIVALENT; 4155 } 4156 if ($type) { 4157 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0."); 4158 $type = 0; 4159 } 4160 if ($value ne "") { 4161 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\"."); 4162 $value = ""; 4163 } 4164 } 4165 4166 my $r = $ranges{$addr}; # The current list of ranges 4167 my $range_list_size = scalar @$r; # And its size 4168 my $max = $max{$addr}; # The current high code point in 4169 # the list of ranges 4170 4171 # Do a special case requiring fewer machine cycles when the new range 4172 # starts after the current highest point. The Unicode input data is 4173 # structured so this is common. 4174 if ($start > $max) { 4175 4176 trace "$owner_name_of{$addr} $operation", sprintf("%04X..%04X (%s) type=%d; prev max=%04X", $start, $end, $value, $type, $max) if main::DEBUG && $to_trace; 4177 return if $operation eq '-'; # Deleting a non-existing range is a 4178 # no-op 4179 4180 # If the new range doesn't logically extend the current final one 4181 # in the range list, create a new range at the end of the range 4182 # list. (max cleverly is initialized to a negative number not 4183 # adjacent to 0 if the range list is empty, so even adding a range 4184 # to an empty range list starting at 0 will have this 'if' 4185 # succeed.) 4186 if ($start > $max + 1 # non-adjacent means can't extend. 4187 || @{$r}[-1]->value ne $value # values differ, can't extend. 4188 || @{$r}[-1]->type != $type # types differ, can't extend. 4189 ) { 4190 push @$r, Range->new($start, $end, 4191 Value => $value, 4192 Type => $type); 4193 } 4194 else { 4195 4196 # Here, the new range starts just after the current highest in 4197 # the range list, and they have the same type and value. 4198 # Extend the existing range to incorporate the new one. 4199 @{$r}[-1]->set_end($end); 4200 } 4201 4202 # This becomes the new maximum. 4203 $max{$addr} = $end; 4204 4205 return; 4206 } 4207 #local $to_trace = 0 if main::DEBUG; 4208 4209 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace; 4210 4211 # Here, the input range isn't after the whole rest of the range list. 4212 # Most likely 'splice' will be needed. The rest of the routine finds 4213 # the needed splice parameters, and if necessary, does the splice. 4214 # First, find the offset parameter needed by the splice function for 4215 # the input range. Note that the input range may span multiple 4216 # existing ones, but we'll worry about that later. For now, just find 4217 # the beginning. If the input range is to be inserted starting in a 4218 # position not currently in the range list, it must (obviously) come 4219 # just after the range below it, and just before the range above it. 4220 # Slightly less obviously, it will occupy the position currently 4221 # occupied by the range that is to come after it. More formally, we 4222 # are looking for the position, $i, in the array of ranges, such that: 4223 # 4224 # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end 4225 # 4226 # (The ordered relationships within existing ranges are also shown in 4227 # the equation above). However, if the start of the input range is 4228 # within an existing range, the splice offset should point to that 4229 # existing range's position in the list; that is $i satisfies a 4230 # somewhat different equation, namely: 4231 # 4232 #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end 4233 # 4234 # More briefly, $start can come before or after r[$i]->start, and at 4235 # this point, we don't know which it will be. However, these 4236 # two equations share these constraints: 4237 # 4238 # r[$i-1]->end < $start <= r[$i]->end 4239 # 4240 # And that is good enough to find $i. 4241 4242 my $i = $self->_search_ranges($start); 4243 if (! defined $i) { 4244 Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed"); 4245 return; 4246 } 4247 4248 # The search function returns $i such that: 4249 # 4250 # r[$i-1]->end < $start <= r[$i]->end 4251 # 4252 # That means that $i points to the first range in the range list 4253 # that could possibly be affected by this operation. We still don't 4254 # know if the start of the input range is within r[$i], or if it 4255 # points to empty space between r[$i-1] and r[$i]. 4256 trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace; 4257 4258 # Special case the insertion of data that is not to replace any 4259 # existing data. 4260 if ($replace == $NO) { # If $NO, has to be operation '+' 4261 #local $to_trace = 1 if main::DEBUG; 4262 trace "Doesn't replace" if main::DEBUG && $to_trace; 4263 4264 # Here, the new range is to take effect only on those code points 4265 # that aren't already in an existing range. This can be done by 4266 # looking through the existing range list and finding the gaps in 4267 # the ranges that this new range affects, and then calling this 4268 # function recursively on each of those gaps, leaving untouched 4269 # anything already in the list. Gather up a list of the changed 4270 # gaps first so that changes to the internal state as new ranges 4271 # are added won't be a problem. 4272 my @gap_list; 4273 4274 # First, if the starting point of the input range is outside an 4275 # existing one, there is a gap from there to the beginning of the 4276 # existing range -- add a span to fill the part that this new 4277 # range occupies 4278 if ($start < $r->[$i]->start) { 4279 push @gap_list, Range->new($start, 4280 main::min($end, 4281 $r->[$i]->start - 1), 4282 Type => $type); 4283 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace; 4284 } 4285 4286 # Then look through the range list for other gaps until we reach 4287 # the highest range affected by the input one. 4288 my $j; 4289 for ($j = $i+1; $j < $range_list_size; $j++) { 4290 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace; 4291 last if $end < $r->[$j]->start; 4292 4293 # If there is a gap between when this range starts and the 4294 # previous one ends, add a span to fill it. Note that just 4295 # because there are two ranges doesn't mean there is a 4296 # non-zero gap between them. It could be that they have 4297 # different values or types 4298 if ($r->[$j-1]->end + 1 != $r->[$j]->start) { 4299 push @gap_list, 4300 Range->new($r->[$j-1]->end + 1, 4301 $r->[$j]->start - 1, 4302 Type => $type); 4303 trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace; 4304 } 4305 } 4306 4307 # Here, we have either found an existing range in the range list, 4308 # beyond the area affected by the input one, or we fell off the 4309 # end of the loop because the input range affects the whole rest 4310 # of the range list. In either case, $j is 1 higher than the 4311 # highest affected range. If $j == $i, it means that there are no 4312 # affected ranges, that the entire insertion is in the gap between 4313 # r[$i-1], and r[$i], which we already have taken care of before 4314 # the loop. 4315 # On the other hand, if there are affected ranges, it might be 4316 # that there is a gap that needs filling after the final such 4317 # range to the end of the input range 4318 if ($r->[$j-1]->end < $end) { 4319 push @gap_list, Range->new(main::max($start, 4320 $r->[$j-1]->end + 1), 4321 $end, 4322 Type => $type); 4323 trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace; 4324 } 4325 4326 # Call recursively to fill in all the gaps. 4327 foreach my $gap (@gap_list) { 4328 $self->_add_delete($operation, 4329 $gap->start, 4330 $gap->end, 4331 $value, 4332 Type => $type); 4333 } 4334 4335 return; 4336 } 4337 4338 # Here, we have taken care of the case where $replace is $NO. 4339 # Remember that here, r[$i-1]->end < $start <= r[$i]->end 4340 # If inserting a multiple record, this is where it goes, before the 4341 # first (if any) existing one if inserting LIFO. (If this is to go 4342 # afterwards, FIFO, we below move the pointer to there.) These imply 4343 # an insertion, and no change to any existing ranges. Note that $i 4344 # can be -1 if this new range doesn't actually duplicate any existing, 4345 # and comes at the beginning of the list. 4346 if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) { 4347 4348 if ($start != $end) { 4349 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point. No action taken."); 4350 return; 4351 } 4352 4353 # If the new code point is within a current range ... 4354 if ($end >= $r->[$i]->start) { 4355 4356 # Don't add an exact duplicate, as it isn't really a multiple 4357 my $existing_value = $r->[$i]->value; 4358 my $existing_type = $r->[$i]->type; 4359 return if $value eq $existing_value && $type eq $existing_type; 4360 4361 # If the multiple value is part of an existing range, we want 4362 # to split up that range, so that only the single code point 4363 # is affected. To do this, we first call ourselves 4364 # recursively to delete that code point from the table, having 4365 # preserved its current data above. Then we call ourselves 4366 # recursively again to add the new multiple, which we know by 4367 # the test just above is different than the current code 4368 # point's value, so it will become a range containing a single 4369 # code point: just itself. Finally, we add back in the 4370 # pre-existing code point, which will again be a single code 4371 # point range. Because 'i' likely will have changed as a 4372 # result of these operations, we can't just continue on, but 4373 # do this operation recursively as well. If we are inserting 4374 # LIFO, the pre-existing code point needs to go after the new 4375 # one, so use MULTIPLE_AFTER; and vice versa. 4376 if ($r->[$i]->start != $r->[$i]->end) { 4377 $self->_add_delete('-', $start, $end, ""); 4378 $self->_add_delete('+', $start, $end, $value, Type => $type); 4379 return $self->_add_delete('+', 4380 $start, $end, 4381 $existing_value, 4382 Type => $existing_type, 4383 Replace => ($replace == $MULTIPLE_BEFORE) 4384 ? $MULTIPLE_AFTER 4385 : $MULTIPLE_BEFORE); 4386 } 4387 } 4388 4389 # If to place this new record after, move to beyond all existing 4390 # ones; but don't add this one if identical to any of them, as it 4391 # isn't really a multiple. This leaves the original order, so 4392 # that the current request is ignored. The reasoning is that the 4393 # previous request that wanted this record to have high priority 4394 # should have precedence. 4395 if ($replace == $MULTIPLE_AFTER) { 4396 while ($i < @$r && $r->[$i]->start == $start) { 4397 return if $value eq $r->[$i]->value 4398 && $type eq $r->[$i]->type; 4399 $i++; 4400 } 4401 } 4402 else { 4403 # If instead we are to place this new record before any 4404 # existing ones, remove any identical ones that come after it. 4405 # This changes the existing order so that the new one is 4406 # first, as is being requested. 4407 for (my $j = $i + 1; 4408 $j < @$r && $r->[$j]->start == $start; 4409 $j++) 4410 { 4411 if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) { 4412 splice @$r, $j, 1; 4413 last; # There should only be one instance, so no 4414 # need to keep looking 4415 } 4416 } 4417 } 4418 4419 trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace; 4420 my @return = splice @$r, 4421 $i, 4422 0, 4423 Range->new($start, 4424 $end, 4425 Value => $value, 4426 Type => $type); 4427 if (main::DEBUG && $to_trace) { 4428 trace "After splice:"; 4429 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2; 4430 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1; 4431 trace "i =[", $i, "]", $r->[$i] if $i >= 0; 4432 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1; 4433 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2; 4434 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3; 4435 } 4436 return @return; 4437 } 4438 4439 # Here, we have taken care of $NO and $MULTIPLE_foo replaces. This 4440 # leaves delete, insert, and replace either unconditionally or if not 4441 # equivalent. $i still points to the first potential affected range. 4442 # Now find the highest range affected, which will determine the length 4443 # parameter to splice. (The input range can span multiple existing 4444 # ones.) If this isn't a deletion, while we are looking through the 4445 # range list, see also if this is a replacement rather than a clean 4446 # insertion; that is if it will change the values of at least one 4447 # existing range. Start off assuming it is an insert, until find it 4448 # isn't. 4449 my $clean_insert = $operation eq '+'; 4450 my $j; # This will point to the highest affected range 4451 4452 # For non-zero types, the standard form is the value itself; 4453 my $standard_form = ($type) ? $value : main::standardize($value); 4454 4455 for ($j = $i; $j < $range_list_size; $j++) { 4456 trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace; 4457 4458 # If find a range that it doesn't overlap into, we can stop 4459 # searching 4460 last if $end < $r->[$j]->start; 4461 4462 # Here, overlaps the range at $j. If the values don't match, 4463 # and so far we think this is a clean insertion, it becomes a 4464 # non-clean insertion, i.e., a 'change' or 'replace' instead. 4465 if ($clean_insert) { 4466 if ($r->[$j]->standard_form ne $standard_form) { 4467 $clean_insert = 0; 4468 if ($replace == $CROAK) { 4469 main::croak("The range to add " 4470 . sprintf("%04X", $start) 4471 . '-' 4472 . sprintf("%04X", $end) 4473 . " with value '$value' overlaps an existing range $r->[$j]"); 4474 } 4475 } 4476 else { 4477 4478 # Here, the two values are essentially the same. If the 4479 # two are actually identical, replacing wouldn't change 4480 # anything so skip it. 4481 my $pre_existing = $r->[$j]->value; 4482 if ($pre_existing ne $value) { 4483 4484 # Here the new and old standardized values are the 4485 # same, but the non-standardized values aren't. If 4486 # replacing unconditionally, then replace 4487 if( $replace == $UNCONDITIONALLY) { 4488 $clean_insert = 0; 4489 } 4490 else { 4491 4492 # Here, are replacing conditionally. Decide to 4493 # replace or not based on which appears to look 4494 # the "nicest". If one is mixed case and the 4495 # other isn't, choose the mixed case one. 4496 my $new_mixed = $value =~ /[A-Z]/ 4497 && $value =~ /[a-z]/; 4498 my $old_mixed = $pre_existing =~ /[A-Z]/ 4499 && $pre_existing =~ /[a-z]/; 4500 4501 if ($old_mixed != $new_mixed) { 4502 $clean_insert = 0 if $new_mixed; 4503 if (main::DEBUG && $to_trace) { 4504 if ($clean_insert) { 4505 trace "Retaining $pre_existing over $value"; 4506 } 4507 else { 4508 trace "Replacing $pre_existing with $value"; 4509 } 4510 } 4511 } 4512 else { 4513 4514 # Here casing wasn't different between the two. 4515 # If one has hyphens or underscores and the 4516 # other doesn't, choose the one with the 4517 # punctuation. 4518 my $new_punct = $value =~ /[-_]/; 4519 my $old_punct = $pre_existing =~ /[-_]/; 4520 4521 if ($old_punct != $new_punct) { 4522 $clean_insert = 0 if $new_punct; 4523 if (main::DEBUG && $to_trace) { 4524 if ($clean_insert) { 4525 trace "Retaining $pre_existing over $value"; 4526 } 4527 else { 4528 trace "Replacing $pre_existing with $value"; 4529 } 4530 } 4531 } # else existing one is just as "good"; 4532 # retain it to save cycles. 4533 } 4534 } 4535 } 4536 } 4537 } 4538 } # End of loop looking for highest affected range. 4539 4540 # Here, $j points to one beyond the highest range that this insertion 4541 # affects (hence to beyond the range list if that range is the final 4542 # one in the range list). 4543 4544 # The splice length is all the affected ranges. Get it before 4545 # subtracting, for efficiency, so we don't have to later add 1. 4546 my $length = $j - $i; 4547 4548 $j--; # $j now points to the highest affected range. 4549 trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace; 4550 4551 # Here, have taken care of $NO and $MULTIPLE_foo replaces. 4552 # $j points to the highest affected range. But it can be < $i or even 4553 # -1. These happen only if the insertion is entirely in the gap 4554 # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop 4555 # above exited first time through with $end < $r->[$i]->start. (And 4556 # then we subtracted one from j) This implies also that $start < 4557 # $r->[$i]->start, but we know from above that $r->[$i-1]->end < 4558 # $start, so the entire input range is in the gap. 4559 if ($j < $i) { 4560 4561 # Here the entire input range is in the gap before $i. 4562 4563 if (main::DEBUG && $to_trace) { 4564 if ($i) { 4565 trace "Entire range is between $r->[$i-1] and $r->[$i]"; 4566 } 4567 else { 4568 trace "Entire range is before $r->[$i]"; 4569 } 4570 } 4571 return if $operation ne '+'; # Deletion of a non-existent range is 4572 # a no-op 4573 } 4574 else { 4575 4576 # Here part of the input range is not in the gap before $i. Thus, 4577 # there is at least one affected one, and $j points to the highest 4578 # such one. 4579 4580 # At this point, here is the situation: 4581 # This is not an insertion of a multiple, nor of tentative ($NO) 4582 # data. 4583 # $i points to the first element in the current range list that 4584 # may be affected by this operation. In fact, we know 4585 # that the range at $i is affected because we are in 4586 # the else branch of this 'if' 4587 # $j points to the highest affected range. 4588 # In other words, 4589 # r[$i-1]->end < $start <= r[$i]->end 4590 # And: 4591 # r[$i-1]->end < $start <= $end < r[$j+1]->start 4592 # 4593 # Also: 4594 # $clean_insert is a boolean which is set true if and only if 4595 # this is a "clean insertion", i.e., not a change nor a 4596 # deletion (multiple was handled above). 4597 4598 # We now have enough information to decide if this call is a no-op 4599 # or not. It is a no-op if this is an insertion of already 4600 # existing data. To be so, it must be contained entirely in one 4601 # range. 4602 4603 if (main::DEBUG && $to_trace && $clean_insert 4604 && $start >= $r->[$i]->start 4605 && $end <= $r->[$i]->end) 4606 { 4607 trace "no-op"; 4608 } 4609 return if $clean_insert 4610 && $start >= $r->[$i]->start 4611 && $end <= $r->[$i]->end; 4612 } 4613 4614 # Here, we know that some action will have to be taken. We have 4615 # calculated the offset and length (though adjustments may be needed) 4616 # for the splice. Now start constructing the replacement list. 4617 my @replacement; 4618 my $splice_start = $i; 4619 4620 my $extends_below; 4621 my $extends_above; 4622 4623 # See if should extend any adjacent ranges. 4624 if ($operation eq '-') { # Don't extend deletions 4625 $extends_below = $extends_above = 0; 4626 } 4627 else { # Here, should extend any adjacent ranges. See if there are 4628 # any. 4629 $extends_below = ($i > 0 4630 # can't extend unless adjacent 4631 && $r->[$i-1]->end == $start -1 4632 # can't extend unless are same standard value 4633 && $r->[$i-1]->standard_form eq $standard_form 4634 # can't extend unless share type 4635 && $r->[$i-1]->type == $type); 4636 $extends_above = ($j+1 < $range_list_size 4637 && $r->[$j+1]->start == $end +1 4638 && $r->[$j+1]->standard_form eq $standard_form 4639 && $r->[$j+1]->type == $type); 4640 } 4641 if ($extends_below && $extends_above) { # Adds to both 4642 $splice_start--; # start replace at element below 4643 $length += 2; # will replace on both sides 4644 trace "Extends both below and above ranges" if main::DEBUG && $to_trace; 4645 4646 # The result will fill in any gap, replacing both sides, and 4647 # create one large range. 4648 @replacement = Range->new($r->[$i-1]->start, 4649 $r->[$j+1]->end, 4650 Value => $value, 4651 Type => $type); 4652 } 4653 else { 4654 4655 # Here we know that the result won't just be the conglomeration of 4656 # a new range with both its adjacent neighbors. But it could 4657 # extend one of them. 4658 4659 if ($extends_below) { 4660 4661 # Here the new element adds to the one below, but not to the 4662 # one above. If inserting, and only to that one range, can 4663 # just change its ending to include the new one. 4664 if ($length == 0 && $clean_insert) { 4665 $r->[$i-1]->set_end($end); 4666 trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace; 4667 return; 4668 } 4669 else { 4670 trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace; 4671 $splice_start--; # start replace at element below 4672 $length++; # will replace the element below 4673 $start = $r->[$i-1]->start; 4674 } 4675 } 4676 elsif ($extends_above) { 4677 4678 # Here the new element adds to the one above, but not below. 4679 # Mirror the code above 4680 if ($length == 0 && $clean_insert) { 4681 $r->[$j+1]->set_start($start); 4682 trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace; 4683 return; 4684 } 4685 else { 4686 trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace; 4687 $length++; # will replace the element above 4688 $end = $r->[$j+1]->end; 4689 } 4690 } 4691 4692 trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace; 4693 4694 # Finally, here we know there will have to be a splice. 4695 # If the change or delete affects only the highest portion of the 4696 # first affected range, the range will have to be split. The 4697 # splice will remove the whole range, but will replace it by a new 4698 # range containing just the unaffected part. So, in this case, 4699 # add to the replacement list just this unaffected portion. 4700 if (! $extends_below 4701 && $start > $r->[$i]->start && $start <= $r->[$i]->end) 4702 { 4703 push @replacement, 4704 Range->new($r->[$i]->start, 4705 $start - 1, 4706 Value => $r->[$i]->value, 4707 Type => $r->[$i]->type); 4708 } 4709 4710 # In the case of an insert or change, but not a delete, we have to 4711 # put in the new stuff; this comes next. 4712 if ($operation eq '+') { 4713 push @replacement, Range->new($start, 4714 $end, 4715 Value => $value, 4716 Type => $type); 4717 } 4718 4719 trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i; 4720 #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace; 4721 4722 # And finally, if we're changing or deleting only a portion of the 4723 # highest affected range, it must be split, as the lowest one was. 4724 if (! $extends_above 4725 && $j >= 0 # Remember that j can be -1 if before first 4726 # current element 4727 && $end >= $r->[$j]->start 4728 && $end < $r->[$j]->end) 4729 { 4730 push @replacement, 4731 Range->new($end + 1, 4732 $r->[$j]->end, 4733 Value => $r->[$j]->value, 4734 Type => $r->[$j]->type); 4735 } 4736 } 4737 4738 # And do the splice, as calculated above 4739 if (main::DEBUG && $to_trace) { 4740 trace "replacing $length element(s) at $i with "; 4741 foreach my $replacement (@replacement) { 4742 trace " $replacement"; 4743 } 4744 trace "Before splice:"; 4745 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2; 4746 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1; 4747 trace "i =[", $i, "]", $r->[$i]; 4748 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1; 4749 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2; 4750 } 4751 4752 my @return = splice @$r, $splice_start, $length, @replacement; 4753 4754 if (main::DEBUG && $to_trace) { 4755 trace "After splice:"; 4756 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2; 4757 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1; 4758 trace "i =[", $i, "]", $r->[$i]; 4759 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1; 4760 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2; 4761 trace "removed ", @return if @return; 4762 } 4763 4764 # An actual deletion could have changed the maximum in the list. 4765 # There was no deletion if the splice didn't return something, but 4766 # otherwise recalculate it. This is done too rarely to worry about 4767 # performance. 4768 if ($operation eq '-' && @return) { 4769 if (@$r) { 4770 $max{$addr} = $r->[-1]->end; 4771 } 4772 else { # Now empty 4773 $max{$addr} = $max_init; 4774 } 4775 } 4776 return @return; 4777 } 4778 4779 sub reset_each_range($self) { # reset the iterator for each_range(); 4780 undef $each_range_iterator{pack 'J', refaddr $self}; 4781 return; 4782 } 4783 4784 sub each_range($self) { 4785 # Iterate over each range in a range list. Results are undefined if 4786 # the range list is changed during the iteration. 4787 my $addr = pack 'J', refaddr $self; 4788 4789 return if $self->is_empty; 4790 4791 $each_range_iterator{$addr} = -1 4792 if ! defined $each_range_iterator{$addr}; 4793 $each_range_iterator{$addr}++; 4794 return $ranges{$addr}->[$each_range_iterator{$addr}] 4795 if $each_range_iterator{$addr} < @{$ranges{$addr}}; 4796 undef $each_range_iterator{$addr}; 4797 return; 4798 } 4799 4800 sub count($self) { # Returns count of code points in range list 4801 my $addr = pack 'J', refaddr $self; 4802 4803 my $count = 0; 4804 foreach my $range (@{$ranges{$addr}}) { 4805 $count += $range->end - $range->start + 1; 4806 } 4807 return $count; 4808 } 4809 4810 sub delete_range($self, $start, $end) { # Delete a range 4811 return $self->_add_delete('-', $start, $end, ""); 4812 } 4813 4814 sub is_empty($self) { # Returns boolean as to if a range list is empty 4815 return scalar @{$ranges{pack 'J', refaddr $self}} == 0; 4816 } 4817 4818 sub hash($self) { 4819 # Quickly returns a scalar suitable for separating tables into 4820 # buckets, i.e. it is a hash function of the contents of a table, so 4821 # there are relatively few conflicts. 4822 my $addr = pack 'J', refaddr $self; 4823 4824 # These are quickly computable. Return looks like 'min..max;count' 4825 return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}}; 4826 } 4827} # End closure for _Range_List_Base 4828 4829package Range_List; 4830use parent '-norequire', '_Range_List_Base'; 4831 4832# A Range_List is a range list for match tables; i.e. the range values are 4833# not significant. Thus a number of operations can be safely added to it, 4834# such as inversion, intersection. Note that union is also an unsafe 4835# operation when range values are cared about, and that method is in the base 4836# class, not here. But things are set up so that that method is callable only 4837# during initialization. Only in this derived class, is there an operation 4838# that combines two tables. A Range_Map can thus be used to initialize a 4839# Range_List, and its mappings will be in the list, but are not significant to 4840# this class. 4841 4842sub trace { return main::trace(@_); } 4843 4844{ # Closure 4845 4846 use overload 4847 fallback => 0, 4848 '+' => sub { my $self = shift; 4849 my $other = shift; 4850 4851 return $self->_union($other) 4852 }, 4853 '+=' => sub { my $self = shift; 4854 my $other = shift; 4855 my $reversed = shift; 4856 4857 if ($reversed) { 4858 Carp::my_carp_bug("Bad news. Can't cope with '" 4859 . ref($other) 4860 . ' += ' 4861 . ref($self) 4862 . "'. undef returned."); 4863 return; 4864 } 4865 4866 return $self->_union($other) 4867 }, 4868 '&' => sub { my $self = shift; 4869 my $other = shift; 4870 4871 return $self->_intersect($other, 0); 4872 }, 4873 '&=' => sub { my $self = shift; 4874 my $other = shift; 4875 my $reversed = shift; 4876 4877 if ($reversed) { 4878 Carp::my_carp_bug("Bad news. Can't cope with '" 4879 . ref($other) 4880 . ' &= ' 4881 . ref($self) 4882 . "'. undef returned."); 4883 return; 4884 } 4885 4886 return $self->_intersect($other, 0); 4887 }, 4888 '~' => "_invert", 4889 '-' => "_subtract", 4890 ; 4891 4892 sub _invert($self, @) { 4893 # Returns a new Range_List that gives all code points not in $self. 4894 my $new = Range_List->new; 4895 4896 # Go through each range in the table, finding the gaps between them 4897 my $max = -1; # Set so no gap before range beginning at 0 4898 for my $range ($self->ranges) { 4899 my $start = $range->start; 4900 my $end = $range->end; 4901 4902 # If there is a gap before this range, the inverse will contain 4903 # that gap. 4904 if ($start > $max + 1) { 4905 $new->add_range($max + 1, $start - 1); 4906 } 4907 $max = $end; 4908 } 4909 4910 # And finally, add the gap from the end of the table to the max 4911 # possible code point 4912 if ($max < $MAX_WORKING_CODEPOINT) { 4913 $new->add_range($max + 1, $MAX_WORKING_CODEPOINT); 4914 } 4915 return $new; 4916 } 4917 4918 sub _subtract($self, $other, $reversed=0) { 4919 # Returns a new Range_List with the argument deleted from it. The 4920 # argument can be a single code point, a range, or something that has 4921 # a range, with the _range_list() method on it returning them 4922 4923 if ($reversed) { 4924 Carp::my_carp_bug("Bad news. Can't cope with '" 4925 . ref($other) 4926 . ' - ' 4927 . ref($self) 4928 . "'. undef returned."); 4929 return; 4930 } 4931 4932 my $new = Range_List->new(Initialize => $self); 4933 4934 if (! ref $other) { # Single code point 4935 $new->delete_range($other, $other); 4936 } 4937 elsif ($other->isa('Range')) { 4938 $new->delete_range($other->start, $other->end); 4939 } 4940 elsif ($other->can('_range_list')) { 4941 foreach my $range ($other->_range_list->ranges) { 4942 $new->delete_range($range->start, $range->end); 4943 } 4944 } 4945 else { 4946 Carp::my_carp_bug("Can't cope with a " 4947 . ref($other) 4948 . " argument to '-'. Subtraction ignored." 4949 ); 4950 return $self; 4951 } 4952 4953 return $new; 4954 } 4955 4956 sub _intersect($a_object, $b_object, $check_if_overlapping=0) { 4957 # Returns either a boolean giving whether the two inputs' range lists 4958 # intersect (overlap), or a new Range_List containing the intersection 4959 # of the two lists. The optional final parameter being true indicates 4960 # to do the check instead of the intersection. 4961 4962 if (! defined $b_object) { 4963 my $message = ""; 4964 $message .= $a_object->_owner_name_of if defined $a_object; 4965 Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done."); 4966 return; 4967 } 4968 4969 # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b ) 4970 # Thus the intersection could be much more simply be written: 4971 # return ~(~$a_object + ~$b_object); 4972 # But, this is slower, and when taking the inverse of a large 4973 # range_size_1 table, back when such tables were always stored that 4974 # way, it became prohibitively slow, hence the code was changed to the 4975 # below 4976 4977 if ($b_object->isa('Range')) { 4978 $b_object = Range_List->new(Initialize => $b_object, 4979 Owner => $a_object->_owner_name_of); 4980 } 4981 $b_object = $b_object->_range_list if $b_object->can('_range_list'); 4982 4983 my @a_ranges = $a_object->ranges; 4984 my @b_ranges = $b_object->ranges; 4985 4986 #local $to_trace = 1 if main::DEBUG; 4987 trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace; 4988 4989 # Start with the first range in each list 4990 my $a_i = 0; 4991 my $range_a = $a_ranges[$a_i]; 4992 my $b_i = 0; 4993 my $range_b = $b_ranges[$b_i]; 4994 4995 my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of) 4996 if ! $check_if_overlapping; 4997 4998 # If either list is empty, there is no intersection and no overlap 4999 if (! defined $range_a || ! defined $range_b) { 5000 return $check_if_overlapping ? 0 : $new; 5001 } 5002 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace; 5003 5004 # Otherwise, must calculate the intersection/overlap. Start with the 5005 # very first code point in each list 5006 my $a = $range_a->start; 5007 my $b = $range_b->start; 5008 5009 # Loop through all the ranges of each list; in each iteration, $a and 5010 # $b are the current code points in their respective lists 5011 while (1) { 5012 5013 # If $a and $b are the same code point, ... 5014 if ($a == $b) { 5015 5016 # it means the lists overlap. If just checking for overlap 5017 # know the answer now, 5018 return 1 if $check_if_overlapping; 5019 5020 # The intersection includes this code point plus anything else 5021 # common to both current ranges. 5022 my $start = $a; 5023 my $end = main::min($range_a->end, $range_b->end); 5024 if (! $check_if_overlapping) { 5025 trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace; 5026 $new->add_range($start, $end); 5027 } 5028 5029 # Skip ahead to the end of the current intersect 5030 $a = $b = $end; 5031 5032 # If the current intersect ends at the end of either range (as 5033 # it must for at least one of them), the next possible one 5034 # will be the beginning code point in it's list's next range. 5035 if ($a == $range_a->end) { 5036 $range_a = $a_ranges[++$a_i]; 5037 last unless defined $range_a; 5038 $a = $range_a->start; 5039 } 5040 if ($b == $range_b->end) { 5041 $range_b = $b_ranges[++$b_i]; 5042 last unless defined $range_b; 5043 $b = $range_b->start; 5044 } 5045 5046 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace; 5047 } 5048 elsif ($a < $b) { 5049 5050 # Not equal, but if the range containing $a encompasses $b, 5051 # change $a to be the middle of the range where it does equal 5052 # $b, so the next iteration will get the intersection 5053 if ($range_a->end >= $b) { 5054 $a = $b; 5055 } 5056 else { 5057 5058 # Here, the current range containing $a is entirely below 5059 # $b. Go try to find a range that could contain $b. 5060 $a_i = $a_object->_search_ranges($b); 5061 5062 # If no range found, quit. 5063 last unless defined $a_i; 5064 5065 # The search returns $a_i, such that 5066 # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end 5067 # Set $a to the beginning of this new range, and repeat. 5068 $range_a = $a_ranges[$a_i]; 5069 $a = $range_a->start; 5070 } 5071 } 5072 else { # Here, $b < $a. 5073 5074 # Mirror image code to the leg just above 5075 if ($range_b->end >= $a) { 5076 $b = $a; 5077 } 5078 else { 5079 $b_i = $b_object->_search_ranges($a); 5080 last unless defined $b_i; 5081 $range_b = $b_ranges[$b_i]; 5082 $b = $range_b->start; 5083 } 5084 } 5085 } # End of looping through ranges. 5086 5087 # Intersection fully computed, or now know that there is no overlap 5088 return $check_if_overlapping ? 0 : $new; 5089 } 5090 5091 sub overlaps($self, $other) { 5092 # Returns boolean giving whether the two arguments overlap somewhere 5093 return $self->_intersect($other, 1); 5094 } 5095 5096 sub add_range($self, $start, $end) { 5097 # Add a range to the list. 5098 return $self->_add_delete('+', $start, $end, ""); 5099 } 5100 5101 sub matches_identically_to($self, $other) { 5102 # Return a boolean as to whether or not two Range_Lists match identical 5103 # sets of code points. 5104 # These are ordered in increasing real time to figure out (at least 5105 # until a patch changes that and doesn't change this) 5106 return 0 if $self->max != $other->max; 5107 return 0 if $self->min != $other->min; 5108 return 0 if $self->range_count != $other->range_count; 5109 return 0 if $self->count != $other->count; 5110 5111 # Here they could be identical because all the tests above passed. 5112 # The loop below is somewhat simpler since we know they have the same 5113 # number of elements. Compare range by range, until reach the end or 5114 # find something that differs. 5115 my @a_ranges = $self->ranges; 5116 my @b_ranges = $other->ranges; 5117 for my $i (0 .. @a_ranges - 1) { 5118 my $a = $a_ranges[$i]; 5119 my $b = $b_ranges[$i]; 5120 trace "self $a; other $b" if main::DEBUG && $to_trace; 5121 return 0 if ! defined $b 5122 || $a->start != $b->start 5123 || $a->end != $b->end; 5124 } 5125 return 1; 5126 } 5127 5128 sub is_code_point_usable($code, $try_hard) { 5129 # This used only for making the test script. See if the input 5130 # proposed trial code point is one that Perl will handle. If second 5131 # parameter is 0, it won't select some code points for various 5132 # reasons, noted below. 5133 return 0 if $code < 0; # Never use a negative 5134 5135 # shun null. I'm (khw) not sure why this was done, but NULL would be 5136 # the character very frequently used. 5137 return $try_hard if $code == 0x0000; 5138 5139 # shun non-character code points. 5140 return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF; 5141 return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF 5142 5143 return $try_hard if $code > $MAX_UNICODE_CODEPOINT; # keep in range 5144 return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate 5145 5146 return 1; 5147 } 5148 5149 sub get_valid_code_point($self) { 5150 # Return a code point that's part of the range list. Returns nothing 5151 # if the table is empty or we can't find a suitable code point. This 5152 # used only for making the test script. 5153 5154 # On first pass, don't choose less desirable code points; if no good 5155 # one is found, repeat, allowing a less desirable one to be selected. 5156 for my $try_hard (0, 1) { 5157 5158 # Look through all the ranges for a usable code point. 5159 for my $set (reverse $self->ranges) { 5160 5161 # Try the edge cases first, starting with the end point of the 5162 # range. 5163 my $end = $set->end; 5164 return $end if is_code_point_usable($end, $try_hard); 5165 $end = $MAX_UNICODE_CODEPOINT + 1 if $end > $MAX_UNICODE_CODEPOINT; 5166 5167 # End point didn't, work. Start at the beginning and try 5168 # every one until find one that does work. 5169 for my $trial ($set->start .. $end - 1) { 5170 return $trial if is_code_point_usable($trial, $try_hard); 5171 } 5172 } 5173 } 5174 return (); # If none found, give up. 5175 } 5176 5177 sub get_invalid_code_point($self) { 5178 # Return a code point that's not part of the table. Returns nothing 5179 # if the table covers all code points or a suitable code point can't 5180 # be found. This used only for making the test script. 5181 5182 # Just find a valid code point of the inverse, if any. 5183 return Range_List->new(Initialize => ~ $self)->get_valid_code_point; 5184 } 5185} # end closure for Range_List 5186 5187package Range_Map; 5188use parent '-norequire', '_Range_List_Base'; 5189 5190# A Range_Map is a range list in which the range values (called maps) are 5191# significant, and hence shouldn't be manipulated by our other code, which 5192# could be ambiguous or lose things. For example, in taking the union of two 5193# lists, which share code points, but which have differing values, which one 5194# has precedence in the union? 5195# It turns out that these operations aren't really necessary for map tables, 5196# and so this class was created to make sure they aren't accidentally 5197# applied to them. 5198 5199{ # Closure 5200 5201 sub add_map($self, @add) { 5202 # Add a range containing a mapping value to the list 5203 return $self->_add_delete('+', @add); 5204 } 5205 5206 sub replace_map($self, @list) { 5207 # Replace a range 5208 return $self->_add_delete('+', @list, Replace => $UNCONDITIONALLY); 5209 } 5210 5211 sub add_duplicate { 5212 # Adds entry to a range list which can duplicate an existing entry 5213 5214 my $self = shift; 5215 my $code_point = shift; 5216 my $value = shift; 5217 my %args = @_; 5218 my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE; 5219 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 5220 5221 return $self->add_map($code_point, $code_point, 5222 $value, Replace => $replace); 5223 } 5224} # End of closure for package Range_Map 5225 5226package _Base_Table; 5227 5228# A table is the basic data structure that gets written out into a file for 5229# use by the Perl core. This is the abstract base class implementing the 5230# common elements from the derived ones. A list of the methods to be 5231# furnished by an implementing class is just after the constructor. 5232 5233sub standardize { return main::standardize($_[0]); } 5234sub trace { return main::trace(@_); } 5235 5236{ # Closure 5237 5238 main::setup_package(); 5239 5240 my %range_list; 5241 # Object containing the ranges of the table. 5242 main::set_access('range_list', \%range_list, 'p_r', 'p_s'); 5243 5244 my %full_name; 5245 # The full table name. 5246 main::set_access('full_name', \%full_name, 'r'); 5247 5248 my %name; 5249 # The table name, almost always shorter 5250 main::set_access('name', \%name, 'r'); 5251 5252 my %short_name; 5253 # The shortest of all the aliases for this table, with underscores removed 5254 main::set_access('short_name', \%short_name); 5255 5256 my %nominal_short_name_length; 5257 # The length of short_name before removing underscores 5258 main::set_access('nominal_short_name_length', 5259 \%nominal_short_name_length); 5260 5261 my %complete_name; 5262 # The complete name, including property. 5263 main::set_access('complete_name', \%complete_name, 'r'); 5264 5265 my %property; 5266 # Parent property this table is attached to. 5267 main::set_access('property', \%property, 'r'); 5268 5269 my %aliases; 5270 # Ordered list of alias objects of the table's name. The first ones in 5271 # the list are output first in comments 5272 main::set_access('aliases', \%aliases, 'readable_array'); 5273 5274 my %comment; 5275 # A comment associated with the table for human readers of the files 5276 main::set_access('comment', \%comment, 's'); 5277 5278 my %description; 5279 # A comment giving a short description of the table's meaning for human 5280 # readers of the files. 5281 main::set_access('description', \%description, 'readable_array'); 5282 5283 my %note; 5284 # A comment giving a short note about the table for human readers of the 5285 # files. 5286 main::set_access('note', \%note, 'readable_array'); 5287 5288 my %fate; 5289 # Enum; there are a number of possibilities for what happens to this 5290 # table: it could be normal, or suppressed, or not for external use. See 5291 # values at definition for $SUPPRESSED. 5292 main::set_access('fate', \%fate, 'r'); 5293 5294 my %find_table_from_alias; 5295 # The parent property passes this pointer to a hash which this class adds 5296 # all its aliases to, so that the parent can quickly take an alias and 5297 # find this table. 5298 main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r'); 5299 5300 my %locked; 5301 # After this table is made equivalent to another one; we shouldn't go 5302 # changing the contents because that could mean it's no longer equivalent 5303 main::set_access('locked', \%locked, 'r'); 5304 5305 my %file_path; 5306 # This gives the final path to the file containing the table. Each 5307 # directory in the path is an element in the array 5308 main::set_access('file_path', \%file_path, 'readable_array'); 5309 5310 my %status; 5311 # What is the table's status, normal, $OBSOLETE, etc. Enum 5312 main::set_access('status', \%status, 'r'); 5313 5314 my %status_info; 5315 # A comment about its being obsolete, or whatever non normal status it has 5316 main::set_access('status_info', \%status_info, 'r'); 5317 5318 my %caseless_equivalent; 5319 # The table this is equivalent to under /i matching, if any. 5320 main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's'); 5321 5322 my %range_size_1; 5323 # Is the table to be output with each range only a single code point? 5324 # This is done to avoid breaking existing code that may have come to rely 5325 # on this behavior in previous versions of this program.) 5326 main::set_access('range_size_1', \%range_size_1, 'r', 's'); 5327 5328 my %perl_extension; 5329 # A boolean set iff this table is a Perl extension to the Unicode 5330 # standard. 5331 main::set_access('perl_extension', \%perl_extension, 'r'); 5332 5333 my %output_range_counts; 5334 # A boolean set iff this table is to have comments written in the 5335 # output file that contain the number of code points in the range. 5336 # The constructor can override the global flag of the same name. 5337 main::set_access('output_range_counts', \%output_range_counts, 'r'); 5338 5339 my %write_as_invlist; 5340 # A boolean set iff the output file for this table is to be in the form of 5341 # an inversion list/map. 5342 main::set_access('write_as_invlist', \%write_as_invlist, 'r'); 5343 5344 my %format; 5345 # The format of the entries of the table. This is calculated from the 5346 # data in the table (or passed in the constructor). This is an enum e.g., 5347 # $STRING_FORMAT. It is marked protected as it should not be generally 5348 # used to override calculations. 5349 main::set_access('format', \%format, 'r', 'p_s'); 5350 5351 my %has_dependency; 5352 # A boolean that gives whether some other table in this property is 5353 # defined as the complement of this table. This is a crude, but currently 5354 # sufficient, mechanism to make this table not get destroyed before what 5355 # is dependent on it is. Other dependencies could be added, so the name 5356 # was chosen to reflect a more general situation than actually is 5357 # currently the case. 5358 main::set_access('has_dependency', \%has_dependency, 'r', 's'); 5359 5360 sub new { 5361 # All arguments are key => value pairs, which you can see below, most 5362 # of which match fields documented above. Otherwise: Re_Pod_Entry, 5363 # OK_as_Filename, and Fuzzy apply to the names of the table, and are 5364 # documented in the Alias package 5365 5366 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2; 5367 5368 my $class = shift; 5369 5370 my $self = bless \do { my $anonymous_scalar }, $class; 5371 my $addr = pack 'J', refaddr $self; 5372 5373 my %args = @_; 5374 5375 $name{$addr} = delete $args{'Name'}; 5376 $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'}; 5377 $full_name{$addr} = delete $args{'Full_Name'}; 5378 my $complete_name = $complete_name{$addr} 5379 = delete $args{'Complete_Name'}; 5380 $format{$addr} = delete $args{'Format'}; 5381 $output_range_counts{$addr} = delete $args{'Output_Range_Counts'}; 5382 $property{$addr} = delete $args{'_Property'}; 5383 $range_list{$addr} = delete $args{'_Range_List'}; 5384 $status{$addr} = delete $args{'Status'} || $NORMAL; 5385 $status_info{$addr} = delete $args{'_Status_Info'} || ""; 5386 $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0; 5387 $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0; 5388 $fate{$addr} = delete $args{'Fate'} || $ORDINARY; 5389 $write_as_invlist{$addr} = delete $args{'Write_As_Invlist'};# No default 5390 my $ucd = delete $args{'UCD'}; 5391 5392 my $description = delete $args{'Description'}; 5393 my $ok_as_filename = delete $args{'OK_as_Filename'}; 5394 my $loose_match = delete $args{'Fuzzy'}; 5395 my $note = delete $args{'Note'}; 5396 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'}; 5397 my $perl_extension = delete $args{'Perl_Extension'}; 5398 my $suppression_reason = delete $args{'Suppression_Reason'}; 5399 5400 # Shouldn't have any left over 5401 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 5402 5403 # Can't use || above because conceivably the name could be 0, and 5404 # can't use // operator in case this program gets used in Perl 5.8 5405 $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr}; 5406 $output_range_counts{$addr} = $output_range_counts if 5407 ! defined $output_range_counts{$addr}; 5408 5409 $aliases{$addr} = [ ]; 5410 $comment{$addr} = [ ]; 5411 $description{$addr} = [ ]; 5412 $note{$addr} = [ ]; 5413 $file_path{$addr} = [ ]; 5414 $locked{$addr} = ""; 5415 $has_dependency{$addr} = 0; 5416 5417 push @{$description{$addr}}, $description if $description; 5418 push @{$note{$addr}}, $note if $note; 5419 5420 if ($fate{$addr} == $PLACEHOLDER) { 5421 5422 # A placeholder table doesn't get documented, is a perl extension, 5423 # and quite likely will be empty 5424 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry; 5425 $perl_extension = 1 if ! defined $perl_extension; 5426 $ucd = 0 if ! defined $ucd; 5427 push @tables_that_may_be_empty, $complete_name{$addr}; 5428 $self->add_comment(<<END); 5429This is a placeholder because it is not in Version $string_version of Unicode, 5430but is needed by the Perl core to work gracefully. Because it is not in this 5431version of Unicode, it will not be listed in $pod_file.pod 5432END 5433 } 5434 elsif (exists $why_suppressed{$complete_name} 5435 # Don't suppress if overridden 5436 && ! grep { $_ eq $complete_name{$addr} } 5437 @output_mapped_properties) 5438 { 5439 $fate{$addr} = $SUPPRESSED; 5440 } 5441 elsif ($fate{$addr} == $SUPPRESSED) { 5442 Carp::my_carp_bug("Need reason for suppressing") unless $suppression_reason; 5443 # Though currently unused 5444 } 5445 elsif ($suppression_reason) { 5446 Carp::my_carp_bug("A reason was given for suppressing, but not suppressed"); 5447 } 5448 5449 # If hasn't set its status already, see if it is on one of the 5450 # lists of properties or tables that have particular statuses; if 5451 # not, is normal. The lists are prioritized so the most serious 5452 # ones are checked first 5453 if (! $status{$addr}) { 5454 if (exists $why_deprecated{$complete_name}) { 5455 $status{$addr} = $DEPRECATED; 5456 } 5457 elsif (exists $why_stabilized{$complete_name}) { 5458 $status{$addr} = $STABILIZED; 5459 } 5460 elsif (exists $why_obsolete{$complete_name}) { 5461 $status{$addr} = $OBSOLETE; 5462 } 5463 5464 # Existence above doesn't necessarily mean there is a message 5465 # associated with it. Use the most serious message. 5466 if ($status{$addr}) { 5467 if ($why_deprecated{$complete_name}) { 5468 $status_info{$addr} 5469 = $why_deprecated{$complete_name}; 5470 } 5471 elsif ($why_stabilized{$complete_name}) { 5472 $status_info{$addr} 5473 = $why_stabilized{$complete_name}; 5474 } 5475 elsif ($why_obsolete{$complete_name}) { 5476 $status_info{$addr} 5477 = $why_obsolete{$complete_name}; 5478 } 5479 } 5480 } 5481 5482 $perl_extension{$addr} = $perl_extension || 0; 5483 5484 # Don't list a property by default that is internal only 5485 if ($fate{$addr} > $MAP_PROXIED) { 5486 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry; 5487 $ucd = 0 if ! defined $ucd; 5488 } 5489 else { 5490 $ucd = 1 if ! defined $ucd; 5491 } 5492 5493 # By convention what typically gets printed only or first is what's 5494 # first in the list, so put the full name there for good output 5495 # clarity. Other routines rely on the full name being first on the 5496 # list 5497 $self->add_alias($full_name{$addr}, 5498 OK_as_Filename => $ok_as_filename, 5499 Fuzzy => $loose_match, 5500 Re_Pod_Entry => $make_re_pod_entry, 5501 Status => $status{$addr}, 5502 UCD => $ucd, 5503 ); 5504 5505 # Then comes the other name, if meaningfully different. 5506 if (standardize($full_name{$addr}) ne standardize($name{$addr})) { 5507 $self->add_alias($name{$addr}, 5508 OK_as_Filename => $ok_as_filename, 5509 Fuzzy => $loose_match, 5510 Re_Pod_Entry => $make_re_pod_entry, 5511 Status => $status{$addr}, 5512 UCD => $ucd, 5513 ); 5514 } 5515 5516 return $self; 5517 } 5518 5519 # Here are the methods that are required to be defined by any derived 5520 # class 5521 for my $sub (qw( 5522 handle_special_range 5523 append_to_body 5524 pre_body 5525 )) 5526 # write() knows how to write out normal ranges, but it calls 5527 # handle_special_range() when it encounters a non-normal one. 5528 # append_to_body() is called by it after it has handled all 5529 # ranges to add anything after the main portion of the table. 5530 # And finally, pre_body() is called after all this to build up 5531 # anything that should appear before the main portion of the 5532 # table. Doing it this way allows things in the middle to 5533 # affect what should appear before the main portion of the 5534 # table. 5535 { 5536 no strict "refs"; 5537 *$sub = sub { 5538 Carp::my_carp_bug( __LINE__ 5539 . ": Must create method '$sub()' for " 5540 . ref shift); 5541 return; 5542 } 5543 } 5544 5545 use overload 5546 fallback => 0, 5547 "." => \&main::_operator_dot, 5548 ".=" => \&main::_operator_dot_equal, 5549 '!=' => \&main::_operator_not_equal, 5550 '==' => \&main::_operator_equal, 5551 ; 5552 5553 sub ranges { 5554 # Returns the array of ranges associated with this table. 5555 5556 return $range_list{pack 'J', refaddr shift}->ranges; 5557 } 5558 5559 sub add_alias { 5560 # Add a synonym for this table. 5561 5562 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3; 5563 5564 my $self = shift; 5565 my $name = shift; # The name to add. 5566 my $pointer = shift; # What the alias hash should point to. For 5567 # map tables, this is the parent property; 5568 # for match tables, it is the table itself. 5569 5570 my %args = @_; 5571 my $loose_match = delete $args{'Fuzzy'}; 5572 5573 my $ok_as_filename = delete $args{'OK_as_Filename'}; 5574 $ok_as_filename = 1 unless defined $ok_as_filename; 5575 5576 # An internal name does not get documented, unless overridden by the 5577 # input; same for making tests for it. 5578 my $status = delete $args{'Status'} || (($name =~ /^_/) 5579 ? $INTERNAL_ALIAS 5580 : $NORMAL); 5581 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'} 5582 // (($status ne $INTERNAL_ALIAS) 5583 ? (($name =~ /^_/) ? $NO : $YES) 5584 : $NO); 5585 my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1); 5586 5587 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 5588 5589 # Capitalize the first letter of the alias unless it is one of the CJK 5590 # ones which specifically begins with a lower 'k'. Do this because 5591 # Unicode has varied whether they capitalize first letters or not, and 5592 # have later changed their minds and capitalized them, but not the 5593 # other way around. So do it always and avoid changes from release to 5594 # release 5595 $name = ucfirst($name) unless $name =~ /^k[A-Z]/; 5596 5597 my $addr = pack 'J', refaddr $self; 5598 5599 # Figure out if should be loosely matched if not already specified. 5600 if (! defined $loose_match) { 5601 5602 # Is a loose_match if isn't null, and doesn't begin with an 5603 # underscore and isn't just a number 5604 if ($name ne "" 5605 && substr($name, 0, 1) ne '_' 5606 && $name !~ qr{^[0-9_.+-/]+$}) 5607 { 5608 $loose_match = 1; 5609 } 5610 else { 5611 $loose_match = 0; 5612 } 5613 } 5614 5615 # If this alias has already been defined, do nothing. 5616 return if defined $find_table_from_alias{$addr}->{$name}; 5617 5618 # That includes if it is standardly equivalent to an existing alias, 5619 # in which case, add this name to the list, so won't have to search 5620 # for it again. 5621 my $standard_name = main::standardize($name); 5622 if (defined $find_table_from_alias{$addr}->{$standard_name}) { 5623 $find_table_from_alias{$addr}->{$name} 5624 = $find_table_from_alias{$addr}->{$standard_name}; 5625 return; 5626 } 5627 5628 # Set the index hash for this alias for future quick reference. 5629 $find_table_from_alias{$addr}->{$name} = $pointer; 5630 $find_table_from_alias{$addr}->{$standard_name} = $pointer; 5631 local $to_trace = 0 if main::DEBUG; 5632 trace "adding alias $name to $pointer" if main::DEBUG && $to_trace; 5633 trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace; 5634 5635 5636 # Put the new alias at the end of the list of aliases unless the final 5637 # element begins with an underscore (meaning it is for internal perl 5638 # use) or is all numeric, in which case, put the new one before that 5639 # one. This floats any all-numeric or underscore-beginning aliases to 5640 # the end. This is done so that they are listed last in output lists, 5641 # to encourage the user to use a better name (either more descriptive 5642 # or not an internal-only one) instead. This ordering is relied on 5643 # implicitly elsewhere in this program, like in short_name() 5644 my $list = $aliases{$addr}; 5645 my $insert_position = (@$list == 0 5646 || (substr($list->[-1]->name, 0, 1) ne '_' 5647 && $list->[-1]->name =~ /\D/)) 5648 ? @$list 5649 : @$list - 1; 5650 splice @$list, 5651 $insert_position, 5652 0, 5653 Alias->new($name, $loose_match, $make_re_pod_entry, 5654 $ok_as_filename, $status, $ucd); 5655 5656 # This name may be shorter than any existing ones, so clear the cache 5657 # of the shortest, so will have to be recalculated. 5658 undef $short_name{pack 'J', refaddr $self}; 5659 return; 5660 } 5661 5662 sub short_name($self, $nominal_length_ptr=undef) { 5663 # Returns a name suitable for use as the base part of a file name. 5664 # That is, shorter wins. It can return undef if there is no suitable 5665 # name. The name has all non-essential underscores removed. 5666 5667 # The optional second parameter is a reference to a scalar in which 5668 # this routine will store the length the returned name had before the 5669 # underscores were removed, or undef if the return is undef. 5670 5671 # The shortest name can change if new aliases are added. So using 5672 # this should be deferred until after all these are added. The code 5673 # that does that should clear this one's cache. 5674 # Any name with alphabetics is preferred over an all numeric one, even 5675 # if longer. 5676 5677 my $addr = pack 'J', refaddr $self; 5678 5679 # For efficiency, don't recalculate, but this means that adding new 5680 # aliases could change what the shortest is, so the code that does 5681 # that needs to undef this. 5682 if (defined $short_name{$addr}) { 5683 if ($nominal_length_ptr) { 5684 $$nominal_length_ptr = $nominal_short_name_length{$addr}; 5685 } 5686 return $short_name{$addr}; 5687 } 5688 5689 # Look at each alias 5690 my $is_last_resort = 0; 5691 my $deprecated_or_discouraged 5692 = qr/ ^ (?: $DEPRECATED | $DISCOURAGED ) $/x; 5693 foreach my $alias ($self->aliases()) { 5694 5695 # Don't use an alias that isn't ok to use for an external name. 5696 next if ! $alias->ok_as_filename; 5697 5698 my $name = main::Standardize($alias->name); 5699 trace $self, $name if main::DEBUG && $to_trace; 5700 5701 # Take the first one, or any non-deprecated non-discouraged one 5702 # over one that is, or a shorter one that isn't numeric. This 5703 # relies on numeric aliases always being last in the array 5704 # returned by aliases(). Any alpha one will have precedence. 5705 if ( ! defined $short_name{$addr} 5706 || ( $is_last_resort 5707 && $alias->status !~ $deprecated_or_discouraged) 5708 || ($name =~ /\D/ 5709 && length($name) < length($short_name{$addr}))) 5710 { 5711 # Remove interior underscores. 5712 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg; 5713 5714 $nominal_short_name_length{$addr} = length $name; 5715 $is_last_resort = $alias->status =~ $deprecated_or_discouraged; 5716 } 5717 } 5718 5719 # If the short name isn't a nice one, perhaps an equivalent table has 5720 # a better one. 5721 if ( $self->can('children') 5722 && ( ! defined $short_name{$addr} 5723 || $short_name{$addr} eq "" 5724 || $short_name{$addr} eq "_")) 5725 { 5726 my $return; 5727 foreach my $follower ($self->children) { # All equivalents 5728 my $follower_name = $follower->short_name; 5729 next unless defined $follower_name; 5730 5731 # Anything (except undefined) is better than underscore or 5732 # empty 5733 if (! defined $return || $return eq "_") { 5734 $return = $follower_name; 5735 next; 5736 } 5737 5738 # If the new follower name isn't "_" and is shorter than the 5739 # current best one, prefer the new one. 5740 next if $follower_name eq "_"; 5741 next if length $follower_name > length $return; 5742 $return = $follower_name; 5743 } 5744 $short_name{$addr} = $return if defined $return; 5745 } 5746 5747 # If no suitable external name return undef 5748 if (! defined $short_name{$addr}) { 5749 $$nominal_length_ptr = undef if $nominal_length_ptr; 5750 return; 5751 } 5752 5753 # Don't allow a null short name. 5754 if ($short_name{$addr} eq "") { 5755 $short_name{$addr} = '_'; 5756 $nominal_short_name_length{$addr} = 1; 5757 } 5758 5759 trace $self, $short_name{$addr} if main::DEBUG && $to_trace; 5760 5761 if ($nominal_length_ptr) { 5762 $$nominal_length_ptr = $nominal_short_name_length{$addr}; 5763 } 5764 return $short_name{$addr}; 5765 } 5766 5767 sub external_name($self) { 5768 # Returns the external name that this table should be known by. This 5769 # is usually the short_name, but not if the short_name is undefined, 5770 # in which case the external_name is arbitrarily set to the 5771 # underscore. 5772 5773 my $short = $self->short_name; 5774 return $short if defined $short; 5775 5776 return '_'; 5777 } 5778 5779 sub add_description($self, $description) { # Adds the parameter as a short description. 5780 push @{$description{pack 'J', refaddr $self}}, $description; 5781 5782 return; 5783 } 5784 5785 sub add_note($self, $note) { # Adds the parameter as a short note. 5786 push @{$note{pack 'J', refaddr $self}}, $note; 5787 5788 return; 5789 } 5790 5791 sub add_comment($self, $comment) { # Adds the parameter as a comment. 5792 5793 return unless $debugging_build; 5794 5795 chomp $comment; 5796 5797 push @{$comment{pack 'J', refaddr $self}}, $comment; 5798 5799 return; 5800 } 5801 5802 sub comment($self) { 5803 # Return the current comment for this table. If called in list 5804 # context, returns the array of comments. In scalar, returns a string 5805 # of each element joined together with a period ending each. 5806 5807 my $addr = pack 'J', refaddr $self; 5808 my @list = @{$comment{$addr}}; 5809 return @list if wantarray; 5810 my $return = ""; 5811 foreach my $sentence (@list) { 5812 $return .= '. ' if $return; 5813 $return .= $sentence; 5814 $return =~ s/\.$//; 5815 } 5816 $return .= '.' if $return; 5817 return $return; 5818 } 5819 5820 sub initialize($self, $initialization) { 5821 # Initialize the table with the argument which is any valid 5822 # initialization for range lists. 5823 5824 my $addr = pack 'J', refaddr $self; 5825 5826 # Replace the current range list with a new one of the same exact 5827 # type. 5828 my $class = ref $range_list{$addr}; 5829 $range_list{$addr} = $class->new(Owner => $self, 5830 Initialize => $initialization); 5831 return; 5832 5833 } 5834 5835 sub header($self) { 5836 # The header that is output for the table in the file it is written 5837 # in. 5838 my $return = ""; 5839 $return .= $DEVELOPMENT_ONLY if $compare_versions; 5840 $return .= $HEADER; 5841 return $return; 5842 } 5843 5844 sub merge_single_annotation_line ($output, $annotation, $annotation_column) { 5845 5846 # This appends an annotation comment, $annotation, to $output, 5847 # starting in or after column $annotation_column, removing any 5848 # pre-existing comment from $output. 5849 5850 $annotation =~ s/^ \s* \# \ //x; 5851 $output =~ s/ \s* ( \# \N* )? \n //x; 5852 $output = Text::Tabs::expand($output); 5853 5854 my $spaces = $annotation_column - length $output; 5855 $spaces = 2 if $spaces < 0; # Have 2 blanks before the comment 5856 5857 $output = sprintf "%s%*s# %s", 5858 $output, 5859 $spaces, 5860 " ", 5861 $annotation; 5862 return Text::Tabs::unexpand $output; 5863 } 5864 5865 sub write($self, $use_adjustments=0, $suppress_value=0) { 5866 # Write a representation of the table to its file. It calls several 5867 # functions furnished by sub-classes of this abstract base class to 5868 # handle non-normal ranges, to add stuff before the table, and at its 5869 # end. If the table is to be written so that adjustments are 5870 # required, this does that conversion. 5871 5872 5873 # $use_adjustments ? output in adjusted format or not 5874 # $suppress_value Optional, if the value associated with 5875 # a range equals this one, don't write 5876 # the range 5877 5878 my $addr = pack 'J', refaddr $self; 5879 my $write_as_invlist = $write_as_invlist{$addr}; 5880 5881 # Start with the header 5882 my @HEADER = $self->header; 5883 5884 # Then the comments 5885 push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n" 5886 if $comment{$addr}; 5887 5888 # Things discovered processing the main body of the document may 5889 # affect what gets output before it, therefore pre_body() isn't called 5890 # until after all other processing of the table is done. 5891 5892 # The main body looks like a 'here' document. If there are comments, 5893 # get rid of them when processing it. 5894 my @OUT; 5895 if ($annotate || $output_range_counts) { 5896 # Use the line below in Perls that don't have /r 5897 #push @OUT, 'return join "\n", map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n"; 5898 push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n"; 5899 } else { 5900 push @OUT, "return <<'END';\n"; 5901 } 5902 5903 if ($range_list{$addr}->is_empty) { 5904 5905 # This is a kludge for empty tables to silence a warning in 5906 # utf8.c, which can't really deal with empty tables, but it can 5907 # deal with a table that matches nothing, as the inverse of 'All' 5908 # does. 5909 push @OUT, "!Unicode::UCD::All\n"; 5910 } 5911 elsif ($self->name eq 'N' 5912 5913 # To save disk space and table cache space, avoid putting out 5914 # binary N tables, but instead create a file which just inverts 5915 # the Y table. Since the file will still exist and occupy a 5916 # certain number of blocks, might as well output the whole 5917 # thing if it all will fit in one block. The number of 5918 # ranges below is an approximate number for that. 5919 && ($self->property->type == $BINARY 5920 || $self->property->type == $FORCED_BINARY) 5921 # && $self->property->tables == 2 Can't do this because the 5922 # non-binary properties, like NFDQC aren't specifiable 5923 # by the notation 5924 && $range_list{$addr}->ranges > 15 5925 && ! $annotate) # Under --annotate, want to see everything 5926 { 5927 push @OUT, "!Unicode::UCD::" . $self->property->name . "\n"; 5928 } 5929 else { 5930 my $range_size_1 = $range_size_1{$addr}; 5931 5932 # To make it more readable, use a minimum indentation 5933 my $comment_indent; 5934 5935 # These are used only in $annotate option 5936 my $format; # e.g. $HEX_ADJUST_FORMAT 5937 my $include_name; # ? Include the character's name in the 5938 # annotation? 5939 my $include_cp; # ? Include its code point 5940 5941 if (! $annotate) { 5942 $comment_indent = ($self->isa('Map_Table')) 5943 ? 24 5944 : ($write_as_invlist) 5945 ? 8 5946 : 16; 5947 } 5948 else { 5949 $format = $self->format; 5950 5951 # The name of the character is output only for tables that 5952 # don't already include the name in the output. 5953 my $property = $self->property; 5954 $include_name = 5955 ! ($property == $perl_charname 5956 || $property == main::property_ref('Unicode_1_Name') 5957 || $property == main::property_ref('Name') 5958 || $property == main::property_ref('Name_Alias') 5959 ); 5960 5961 # Don't include the code point in the annotation where all 5962 # lines are a single code point, so it can be easily found in 5963 # the first column 5964 $include_cp = ! $range_size_1; 5965 5966 if (! $self->isa('Map_Table')) { 5967 $comment_indent = ($write_as_invlist) ? 8 : 16; 5968 } 5969 else { 5970 $comment_indent = 16; 5971 5972 # There are just a few short ranges in this table, so no 5973 # need to include the code point in the annotation. 5974 $include_cp = 0 if $format eq $DECOMP_STRING_FORMAT; 5975 5976 # We're trying to get this to look good, as the whole 5977 # point is to make human-readable tables. It is easier to 5978 # read if almost all the annotation comments begin in the 5979 # same column. Map tables have varying width maps, so can 5980 # create a jagged comment appearance. This code does a 5981 # preliminary pass through these tables looking for the 5982 # maximum width map in each, and causing the comments to 5983 # begin just to the right of that. However, if the 5984 # comments begin too far to the right of most lines, it's 5985 # hard to line them up horizontally with their real data. 5986 # Therefore we ignore the longest outliers 5987 my $ignore_longest_X_percent = 2; # Discard longest X% 5988 5989 # Each key in this hash is a width of at least one of the 5990 # maps in the table. Its value is how many lines have 5991 # that width. 5992 my %widths; 5993 5994 # We won't space things further left than one tab stop 5995 # after the rest of the line; initializing it to that 5996 # number saves some work. 5997 my $max_map_width = 8; 5998 5999 # Fill in the %widths hash 6000 my $total = 0; 6001 for my $set ($range_list{$addr}->ranges) { 6002 my $value = $set->value; 6003 6004 # These range types don't appear in the main table 6005 next if $set->type == 0 6006 && defined $suppress_value 6007 && $value eq $suppress_value; 6008 next if $set->type == $MULTI_CP 6009 || $set->type == $NULL; 6010 6011 # Include 2 spaces before the beginning of the 6012 # comment 6013 my $this_width = length($value) + 2; 6014 6015 # Ranges of the remaining non-zero types usually 6016 # occupy just one line (maybe occasionally two, but 6017 # this doesn't have to be dead accurate). This is 6018 # because these ranges are like "unassigned code 6019 # points" 6020 my $count = ($set->type != 0) 6021 ? 1 6022 : $set->end - $set->start + 1; 6023 $widths{$this_width} += $count; 6024 $total += $count; 6025 $max_map_width = $this_width 6026 if $max_map_width < $this_width; 6027 } 6028 6029 # If the widest map gives us less than two tab stops 6030 # worth, just take it as-is. 6031 if ($max_map_width > 16) { 6032 6033 # Otherwise go through %widths until we have included 6034 # the desired percentage of lines in the whole table. 6035 my $running_total = 0; 6036 foreach my $width (sort { $a <=> $b } keys %widths) 6037 { 6038 $running_total += $widths{$width}; 6039 use integer; 6040 if ($running_total * 100 / $total 6041 >= 100 - $ignore_longest_X_percent) 6042 { 6043 $max_map_width = $width; 6044 last; 6045 } 6046 } 6047 } 6048 $comment_indent += $max_map_width; 6049 } 6050 } 6051 6052 # Values for previous time through the loop. Initialize to 6053 # something that won't be adjacent to the first iteration; 6054 # only $previous_end matters for that. 6055 my $previous_start; 6056 my $previous_end = -2; 6057 my $previous_value; 6058 6059 # Values for next time through the portion of the loop that splits 6060 # the range. 0 in $next_start means there is no remaining portion 6061 # to deal with. 6062 my $next_start = 0; 6063 my $next_end; 6064 my $next_value; 6065 my $offset = 0; 6066 my $invlist_count = 0; 6067 6068 my $output_value_in_hex = $self->isa('Map_Table') 6069 && ($self->format eq $HEX_ADJUST_FORMAT 6070 || $self->to_output_map == $EXTERNAL_MAP); 6071 # Use leading zeroes just for files whose format should not be 6072 # changed from what it has been. Otherwise, they just take up 6073 # space and time to process. 6074 my $hex_format = ($self->isa('Map_Table') 6075 && $self->to_output_map == $EXTERNAL_MAP) 6076 ? "%04X" 6077 : "%X"; 6078 6079 # The values for some of these tables are stored in mktables as 6080 # hex strings. Normally, these are just output as strings without 6081 # change, but when we are doing adjustments, we have to operate on 6082 # these numerically, so we convert those to decimal to do that, 6083 # and back to hex for output 6084 my $convert_map_to_from_hex = 0; 6085 my $output_map_in_hex = 0; 6086 if ($self->isa('Map_Table')) { 6087 $convert_map_to_from_hex 6088 = ($use_adjustments && $self->format eq $HEX_ADJUST_FORMAT) 6089 || ($annotate && $self->format eq $HEX_FORMAT); 6090 $output_map_in_hex = $convert_map_to_from_hex 6091 || $self->format eq $HEX_FORMAT; 6092 } 6093 6094 # To store any annotations about the characters. 6095 my @annotation; 6096 6097 # Output each range as part of the here document. 6098 RANGE: 6099 for my $set ($range_list{$addr}->ranges) { 6100 if ($set->type != 0) { 6101 $self->handle_special_range($set); 6102 next RANGE; 6103 } 6104 my $start = $set->start; 6105 my $end = $set->end; 6106 my $value = $set->value; 6107 6108 # Don't output ranges whose value is the one to suppress 6109 next RANGE if defined $suppress_value 6110 && $value eq $suppress_value; 6111 6112 $value = CORE::hex $value if $convert_map_to_from_hex; 6113 6114 6115 { # This bare block encloses the scope where we may need to 6116 # 'redo' to. Consider a table that is to be written out 6117 # using single item ranges. This is given in the 6118 # $range_size_1 boolean. To accomplish this, we split the 6119 # range each time through the loop into two portions, the 6120 # first item, and the rest. We handle that first item 6121 # this time in the loop, and 'redo' to repeat the process 6122 # for the rest of the range. 6123 # 6124 # We may also have to do it, with other special handling, 6125 # if the table has adjustments. Consider the table that 6126 # contains the lowercasing maps. mktables stores the 6127 # ASCII range ones as 26 ranges: 6128 # ord('A') => ord('a'), .. ord('Z') => ord('z') 6129 # For compactness, the table that gets written has this as 6130 # just one range 6131 # ( ord('A') .. ord('Z') ) => ord('a') 6132 # and the software that reads the tables is smart enough 6133 # to "connect the dots". This change is accomplished in 6134 # this loop by looking to see if the current iteration 6135 # fits the paradigm of the previous iteration, and if so, 6136 # we merge them by replacing the final output item with 6137 # the merged data. Repeated 25 times, this gets A-Z. But 6138 # we also have to make sure we don't screw up cases where 6139 # we have internally stored 6140 # ( 0x1C4 .. 0x1C6 ) => 0x1C5 6141 # This single internal range has to be output as 3 ranges, 6142 # which is done by splitting, like we do for $range_size_1 6143 # tables. (There are very few of such ranges that need to 6144 # be split, so the gain of doing the combining of other 6145 # ranges far outweighs the splitting of these.) The 6146 # values to use for the redo at the end of this block are 6147 # set up just below in the scalars whose names begin with 6148 # '$next_'. 6149 6150 if (($use_adjustments || $range_size_1) && $end != $start) 6151 { 6152 $next_start = $start + 1; 6153 $next_end = $end; 6154 $next_value = $value; 6155 $end = $start; 6156 } 6157 6158 if ($use_adjustments && ! $range_size_1) { 6159 6160 # If this range is adjacent to the previous one, and 6161 # the values in each are integers that are also 6162 # adjacent (differ by 1), then this range really 6163 # extends the previous one that is already in element 6164 # $OUT[-1]. So we pop that element, and pretend that 6165 # the range starts with whatever it started with. 6166 # $offset is incremented by 1 each time so that it 6167 # gives the current offset from the first element in 6168 # the accumulating range, and we keep in $value the 6169 # value of that first element. 6170 if ($start == $previous_end + 1 6171 && $value =~ /^ -? \d+ $/xa 6172 && $previous_value =~ /^ -? \d+ $/xa 6173 && ($value == ($previous_value + ++$offset))) 6174 { 6175 pop @OUT; 6176 $start = $previous_start; 6177 $value = $previous_value; 6178 } 6179 else { 6180 $offset = 0; 6181 if (@annotation == 1) { 6182 $OUT[-1] = merge_single_annotation_line( 6183 $OUT[-1], $annotation[0], $comment_indent); 6184 } 6185 else { 6186 push @OUT, @annotation; 6187 } 6188 } 6189 undef @annotation; 6190 6191 # Save the current values for the next time through 6192 # the loop. 6193 $previous_start = $start; 6194 $previous_end = $end; 6195 $previous_value = $value; 6196 } 6197 6198 if ($write_as_invlist) { 6199 if ( $previous_end > 0 6200 && $output_range_counts{$addr}) 6201 { 6202 my $complement_count = $start - $previous_end - 1; 6203 if ($complement_count > 1) { 6204 $OUT[-1] = merge_single_annotation_line( 6205 $OUT[-1], 6206 "#" 6207 . (" " x 17) 6208 . "[" 6209 . main::clarify_code_point_count( 6210 $complement_count) 6211 . "] in complement\n", 6212 $comment_indent); 6213 } 6214 } 6215 6216 # Inversion list format has a single number per line, 6217 # the starting code point of a range that matches the 6218 # property 6219 push @OUT, $start, "\n"; 6220 $invlist_count++; 6221 6222 # Add a comment with the size of the range, if 6223 # requested. 6224 if ($output_range_counts{$addr}) { 6225 $OUT[-1] = merge_single_annotation_line( 6226 $OUT[-1], 6227 "# [" 6228 . main::clarify_code_point_count($end - $start + 1) 6229 . "]\n", 6230 $comment_indent); 6231 } 6232 } 6233 elsif ($start != $end) { # If there is a range 6234 if ($end == $MAX_WORKING_CODEPOINT) { 6235 push @OUT, sprintf "$hex_format\t$hex_format", 6236 $start, 6237 $MAX_PLATFORM_CODEPOINT; 6238 } 6239 else { 6240 push @OUT, sprintf "$hex_format\t$hex_format", 6241 $start, $end; 6242 } 6243 if (length $value) { 6244 if ($convert_map_to_from_hex) { 6245 $OUT[-1] .= sprintf "\t$hex_format\n", $value; 6246 } 6247 else { 6248 $OUT[-1] .= "\t$value\n"; 6249 } 6250 } 6251 6252 # Add a comment with the size of the range, if 6253 # requested. 6254 if ($output_range_counts{$addr}) { 6255 $OUT[-1] = merge_single_annotation_line( 6256 $OUT[-1], 6257 "# [" 6258 . main::clarify_code_point_count($end - $start + 1) 6259 . "]\n", 6260 $comment_indent); 6261 } 6262 } 6263 else { # Here to output a single code point per line. 6264 6265 # Use any passed in subroutine to output. 6266 if (ref $range_size_1 eq 'CODE') { 6267 for my $i ($start .. $end) { 6268 push @OUT, &{$range_size_1}($i, $value); 6269 } 6270 } 6271 else { 6272 6273 # Here, caller is ok with default output. 6274 for (my $i = $start; $i <= $end; $i++) { 6275 if ($convert_map_to_from_hex) { 6276 push @OUT, 6277 sprintf "$hex_format\t\t$hex_format\n", 6278 $i, $value; 6279 } 6280 else { 6281 push @OUT, sprintf $hex_format, $i; 6282 $OUT[-1] .= "\t\t$value" if $value ne ""; 6283 $OUT[-1] .= "\n"; 6284 } 6285 } 6286 } 6287 } 6288 6289 if ($annotate) { 6290 for (my $i = $start; $i <= $end; $i++) { 6291 my $annotation = ""; 6292 6293 # Get character information if don't have it already 6294 main::populate_char_info($i) 6295 if ! defined $viacode[$i]; 6296 my $type = $annotate_char_type[$i]; 6297 6298 # Figure out if should output the next code points 6299 # as part of a range or not. If this is not in an 6300 # annotation range, then won't output as a range, 6301 # so returns $i. Otherwise use the end of the 6302 # annotation range, but no further than the 6303 # maximum possible end point of the loop. 6304 my $range_end = 6305 $range_size_1 6306 ? $start 6307 : main::min( 6308 $annotate_ranges->value_of($i) || $i, 6309 $end); 6310 6311 # Use a range if it is a range, and either is one 6312 # of the special annotation ranges, or the range 6313 # is at most 3 long. This last case causes the 6314 # algorithmically named code points to be output 6315 # individually in spans of at most 3, as they are 6316 # the ones whose $type is > 0. 6317 if ($range_end != $i 6318 && ( $type < 0 || $range_end - $i > 2)) 6319 { 6320 # Here is to output a range. We don't allow a 6321 # caller-specified output format--just use the 6322 # standard one. 6323 my $range_name = $viacode[$i]; 6324 6325 # For the code points which end in their hex 6326 # value, we eliminate that from the output 6327 # annotation, and capitalize only the first 6328 # letter of each word. 6329 if ($type == $CP_IN_NAME) { 6330 my $hex = sprintf $hex_format, $i; 6331 $range_name =~ s/-$hex$//; 6332 my @words = split " ", $range_name; 6333 for my $word (@words) { 6334 $word = 6335 ucfirst(lc($word)) if $word ne 'CJK'; 6336 } 6337 $range_name = join " ", @words; 6338 } 6339 elsif ($type == $HANGUL_SYLLABLE) { 6340 $range_name = "Hangul Syllable"; 6341 } 6342 6343 # If the annotation would just repeat what's 6344 # already being output as the range, skip it. 6345 # (When an inversion list is being written, it 6346 # isn't a repeat, as that always is in 6347 # decimal) 6348 if ( $write_as_invlist 6349 || $i != $start 6350 || $range_end < $end) 6351 { 6352 if ($range_end < $MAX_WORKING_CODEPOINT) 6353 { 6354 $annotation = sprintf "%04X..%04X", 6355 $i, $range_end; 6356 } 6357 else { 6358 $annotation = sprintf "%04X..INFINITY", 6359 $i; 6360 } 6361 } 6362 else { # Indent if not displaying code points 6363 $annotation = " " x 4; 6364 } 6365 6366 if ($range_name) { 6367 $annotation .= " $age[$i]" if $age[$i]; 6368 $annotation .= " $range_name"; 6369 } 6370 6371 # Include the number of code points in the 6372 # range 6373 my $count = 6374 main::clarify_code_point_count($range_end - $i + 1); 6375 $annotation .= " [$count]\n"; 6376 6377 # Skip to the end of the range 6378 $i = $range_end; 6379 } 6380 else { # Not in a range. 6381 my $comment = ""; 6382 6383 # When outputting the names of each character, 6384 # use the character itself if printable 6385 $comment .= "'" . main::display_chr($i) . "' " 6386 if $printable[$i]; 6387 6388 my $output_value = $value; 6389 6390 # Determine the annotation 6391 if ($format eq $DECOMP_STRING_FORMAT) { 6392 6393 # This is very specialized, with the type 6394 # of decomposition beginning the line 6395 # enclosed in <...>, and the code points 6396 # that the code point decomposes to 6397 # separated by blanks. Create two 6398 # strings, one of the printable 6399 # characters, and one of their official 6400 # names. 6401 (my $map = $output_value) 6402 =~ s/ \ * < .*? > \ +//x; 6403 my $tostr = ""; 6404 my $to_name = ""; 6405 my $to_chr = ""; 6406 foreach my $to (split " ", $map) { 6407 $to = CORE::hex $to; 6408 $to_name .= " + " if $to_name; 6409 $to_chr .= main::display_chr($to); 6410 main::populate_char_info($to) 6411 if ! defined $viacode[$to]; 6412 $to_name .= $viacode[$to]; 6413 } 6414 6415 $comment .= 6416 "=> '$to_chr'; $viacode[$i] => $to_name"; 6417 } 6418 else { 6419 $output_value += $i - $start 6420 if $use_adjustments 6421 # Don't try to adjust a 6422 # non-integer 6423 && $output_value !~ /[-\D]/; 6424 6425 if ($output_map_in_hex) { 6426 main::populate_char_info($output_value) 6427 if ! defined $viacode[$output_value]; 6428 $comment .= " => '" 6429 . main::display_chr($output_value) 6430 . "'; " if $printable[$output_value]; 6431 } 6432 if ($include_name && $viacode[$i]) { 6433 $comment .= " " if $comment; 6434 $comment .= $viacode[$i]; 6435 } 6436 if ($output_map_in_hex) { 6437 $comment .= 6438 " => $viacode[$output_value]" 6439 if $viacode[$output_value]; 6440 $output_value = sprintf($hex_format, 6441 $output_value); 6442 } 6443 } 6444 6445 if ($include_cp) { 6446 $annotation = sprintf "%04X %s", $i, $age[$i]; 6447 if ($use_adjustments) { 6448 $annotation .= " => $output_value"; 6449 } 6450 } 6451 6452 if ($comment ne "") { 6453 $annotation .= " " if $annotation ne ""; 6454 $annotation .= $comment; 6455 } 6456 $annotation .= "\n" if $annotation ne ""; 6457 } 6458 6459 if ($annotation ne "") { 6460 push @annotation, (" " x $comment_indent) 6461 . "# $annotation"; 6462 } 6463 } 6464 6465 # If not adjusting, we don't have to go through the 6466 # loop again to know that the annotation comes next 6467 # in the output. 6468 if (! $use_adjustments) { 6469 if (@annotation == 1) { 6470 $OUT[-1] = merge_single_annotation_line( 6471 $OUT[-1], $annotation[0], $comment_indent); 6472 } 6473 else { 6474 push @OUT, map { Text::Tabs::unexpand $_ } 6475 @annotation; 6476 } 6477 undef @annotation; 6478 } 6479 } 6480 6481 # Add the beginning of the range that doesn't match the 6482 # property, except if the just added match range extends 6483 # to infinity. We do this after any annotations for the 6484 # match range. 6485 if ($write_as_invlist && $end < $MAX_WORKING_CODEPOINT) { 6486 push @OUT, $end + 1, "\n"; 6487 $invlist_count++; 6488 } 6489 6490 # If we split the range, set up so the next time through 6491 # we get the remainder, and redo. 6492 if ($next_start) { 6493 $start = $next_start; 6494 $end = $next_end; 6495 $value = $next_value; 6496 $next_start = 0; 6497 redo; 6498 } 6499 } # End of redo block 6500 } # End of loop through all the table's ranges 6501 6502 push @OUT, @annotation; # Add orphaned annotation, if any 6503 6504 splice @OUT, 1, 0, "V$invlist_count\n" if $invlist_count; 6505 } 6506 6507 # Add anything that goes after the main body, but within the here 6508 # document, 6509 my $append_to_body = $self->append_to_body; 6510 push @OUT, $append_to_body if $append_to_body; 6511 6512 # And finish the here document. 6513 push @OUT, "END\n"; 6514 6515 # Done with the main portion of the body. Can now figure out what 6516 # should appear before it in the file. 6517 my $pre_body = $self->pre_body; 6518 push @HEADER, $pre_body, "\n" if $pre_body; 6519 6520 # All these files should have a .pl suffix added to them. 6521 my @file_with_pl = @{$file_path{$addr}}; 6522 $file_with_pl[-1] .= '.pl'; 6523 6524 main::write(\@file_with_pl, 6525 $annotate, # utf8 iff annotating 6526 \@HEADER, 6527 \@OUT); 6528 return; 6529 } 6530 6531 sub set_status($self, $status, $info) { # Set the table's status 6532 # status The status enum value 6533 # info Any message associated with it. 6534 my $addr = pack 'J', refaddr $self; 6535 6536 $status{$addr} = $status; 6537 $status_info{$addr} = $info; 6538 return; 6539 } 6540 6541 sub set_fate($self, $fate, $reason=undef) { # Set the fate of a table 6542 my $addr = pack 'J', refaddr $self; 6543 6544 return if $fate{$addr} == $fate; # If no-op 6545 6546 # Can only change the ordinary fate, except if going to $MAP_PROXIED 6547 return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED; 6548 6549 $fate{$addr} = $fate; 6550 6551 # Don't document anything to do with a non-normal fated table 6552 if ($fate != $ORDINARY) { 6553 my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0; 6554 foreach my $alias ($self->aliases) { 6555 $alias->set_ucd($put_in_pod); 6556 6557 # MAP_PROXIED doesn't affect the match tables 6558 next if $fate == $MAP_PROXIED; 6559 $alias->set_make_re_pod_entry($put_in_pod); 6560 } 6561 } 6562 6563 # Save the reason for suppression for output 6564 if ($fate >= $SUPPRESSED) { 6565 $reason = "" unless defined $reason; 6566 $why_suppressed{$complete_name{$addr}} = $reason; 6567 } 6568 6569 return; 6570 } 6571 6572 sub lock($self) { 6573 # Don't allow changes to the table from now on. This stores a stack 6574 # trace of where it was called, so that later attempts to modify it 6575 # can immediately show where it got locked. 6576 my $addr = pack 'J', refaddr $self; 6577 6578 $locked{$addr} = ""; 6579 6580 my $line = (caller(0))[2]; 6581 my $i = 1; 6582 6583 # Accumulate the stack trace 6584 while (1) { 6585 my ($pkg, $file, $caller_line, $caller) = caller $i++; 6586 6587 last unless defined $caller; 6588 6589 $locked{$addr} .= " called from $caller() at line $line\n"; 6590 $line = $caller_line; 6591 } 6592 $locked{$addr} .= " called from main at line $line\n"; 6593 6594 return; 6595 } 6596 6597 sub carp_if_locked($self) { 6598 # Return whether a table is locked or not, and, by the way, complain 6599 # if is locked 6600 my $addr = pack 'J', refaddr $self; 6601 6602 return 0 if ! $locked{$addr}; 6603 Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n"); 6604 return 1; 6605 } 6606 6607 sub set_file_path($self, @path) { # Set the final directory path for this table 6608 @{$file_path{pack 'J', refaddr $self}} = @path; 6609 return 6610 } 6611 6612 # Accessors for the range list stored in this table. First for 6613 # unconditional 6614 for my $sub (qw( 6615 containing_range 6616 contains 6617 count 6618 each_range 6619 hash 6620 is_empty 6621 matches_identically_to 6622 max 6623 min 6624 range_count 6625 reset_each_range 6626 type_of 6627 value_of 6628 )) 6629 { 6630 no strict "refs"; 6631 *$sub = sub { 6632 use strict "refs"; 6633 my $self = shift; 6634 return $self->_range_list->$sub(@_); 6635 } 6636 } 6637 6638 # Then for ones that should fail if locked 6639 for my $sub (qw( 6640 delete_range 6641 )) 6642 { 6643 no strict "refs"; 6644 *$sub = sub { 6645 use strict "refs"; 6646 my $self = shift; 6647 6648 return if $self->carp_if_locked; 6649 no overloading; 6650 return $self->_range_list->$sub(@_); 6651 } 6652 } 6653 6654} # End closure 6655 6656package Map_Table; 6657use parent '-norequire', '_Base_Table'; 6658 6659# A Map Table is a table that contains the mappings from code points to 6660# values. There are two weird cases: 6661# 1) Anomalous entries are ones that aren't maps of ranges of code points, but 6662# are written in the table's file at the end of the table nonetheless. It 6663# requires specially constructed code to handle these; utf8.c can not read 6664# these in, so they should not go in $map_directory. As of this writing, 6665# the only case that these happen is for named sequences used in 6666# charnames.pm. But this code doesn't enforce any syntax on these, so 6667# something else could come along that uses it. 6668# 2) Specials are anything that doesn't fit syntactically into the body of the 6669# table. The ranges for these have a map type of non-zero. The code below 6670# knows about and handles each possible type. In most cases, these are 6671# written as part of the header. 6672# 6673# A map table deliberately can't be manipulated at will unlike match tables. 6674# This is because of the ambiguities having to do with what to do with 6675# overlapping code points. And there just isn't a need for those things; 6676# what one wants to do is just query, add, replace, or delete mappings, plus 6677# write the final result. 6678# However, there is a method to get the list of possible ranges that aren't in 6679# this table to use for defaulting missing code point mappings. And, 6680# map_add_or_replace_non_nulls() does allow one to add another table to this 6681# one, but it is clearly very specialized, and defined that the other's 6682# non-null values replace this one's if there is any overlap. 6683 6684sub trace { return main::trace(@_); } 6685 6686{ # Closure 6687 6688 main::setup_package(); 6689 6690 my %default_map; 6691 # Many input files omit some entries; this gives what the mapping for the 6692 # missing entries should be 6693 main::set_access('default_map', \%default_map, 'r'); 6694 6695 my %anomalous_entries; 6696 # Things that go in the body of the table which don't fit the normal 6697 # scheme of things, like having a range. Not much can be done with these 6698 # once there except to output them. This was created to handle named 6699 # sequences. 6700 main::set_access('anomalous_entry', \%anomalous_entries, 'a'); 6701 main::set_access('anomalous_entries', # Append singular, read plural 6702 \%anomalous_entries, 6703 'readable_array'); 6704 my %to_output_map; 6705 # Enum as to whether or not to write out this map table, and how: 6706 # 0 don't output 6707 # $EXTERNAL_MAP means its existence is noted in the documentation, and 6708 # it should not be removed nor its format changed. This 6709 # is done for those files that have traditionally been 6710 # output. 6711 # $INTERNAL_MAP means Perl reserves the right to do anything it wants 6712 # with this file 6713 # $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of 6714 # outputting the actual mappings as-is, we adjust things 6715 # to create a much more compact table. Only those few 6716 # tables where the mapping is convertible at least to an 6717 # integer and compacting makes a big difference should 6718 # have this. Hence, the default is to not do this 6719 # unless the table's default mapping is to $CODE_POINT, 6720 # and the range size is not 1. 6721 main::set_access('to_output_map', \%to_output_map, 's'); 6722 6723 sub new { 6724 my $class = shift; 6725 my $name = shift; 6726 6727 my %args = @_; 6728 6729 # Optional initialization data for the table. 6730 my $initialize = delete $args{'Initialize'}; 6731 6732 my $default_map = delete $args{'Default_Map'}; 6733 my $property = delete $args{'_Property'}; 6734 my $full_name = delete $args{'Full_Name'}; 6735 my $to_output_map = delete $args{'To_Output_Map'}; 6736 6737 # Rest of parameters passed on 6738 6739 my $range_list = Range_Map->new(Owner => $property); 6740 6741 my $self = $class->SUPER::new( 6742 Name => $name, 6743 Complete_Name => $full_name, 6744 Full_Name => $full_name, 6745 _Property => $property, 6746 _Range_List => $range_list, 6747 Write_As_Invlist => 0, 6748 %args); 6749 6750 my $addr = pack 'J', refaddr $self; 6751 6752 $anomalous_entries{$addr} = []; 6753 $default_map{$addr} = $default_map; 6754 $to_output_map{$addr} = $to_output_map; 6755 6756 $self->initialize($initialize) if defined $initialize; 6757 6758 return $self; 6759 } 6760 6761 use overload 6762 fallback => 0, 6763 qw("") => "_operator_stringify", 6764 ; 6765 6766 sub _operator_stringify($self, $other="", $reversed=0) { 6767 6768 my $name = $self->property->full_name; 6769 $name = '""' if $name eq ""; 6770 return "Map table for Property '$name'"; 6771 } 6772 6773 sub add_alias { 6774 # Add a synonym for this table (which means the property itself) 6775 my $self = shift; 6776 my $name = shift; 6777 # Rest of parameters passed on. 6778 6779 $self->SUPER::add_alias($name, $self->property, @_); 6780 return; 6781 } 6782 6783 sub add_map { 6784 # Add a range of code points to the list of specially-handled code 6785 # points. 0 is assumed if the type of special is not passed 6786 # in. 6787 6788 my $self = shift; 6789 my $lower = shift; 6790 my $upper = shift; 6791 my $string = shift; 6792 my %args = @_; 6793 6794 my $type = delete $args{'Type'} || 0; 6795 # Rest of parameters passed on 6796 6797 # Can't change the table if locked. 6798 return if $self->carp_if_locked; 6799 6800 $self->_range_list->add_map($lower, $upper, 6801 $string, 6802 @_, 6803 Type => $type); 6804 return; 6805 } 6806 6807 sub append_to_body($self) { 6808 # Adds to the written HERE document of the table's body any anomalous 6809 # entries in the table.. 6810 my $addr = pack 'J', refaddr $self; 6811 6812 return "" unless @{$anomalous_entries{$addr}}; 6813 return join("\n", @{$anomalous_entries{$addr}}) . "\n"; 6814 } 6815 6816 sub map_add_or_replace_non_nulls($self, $other) { 6817 # This adds the mappings in the table $other to $self. Non-null 6818 # mappings from $other override those in $self. It essentially merges 6819 # the two tables, with the second having priority except for null 6820 # mappings. 6821 return if $self->carp_if_locked; 6822 6823 if (! $other->isa(__PACKAGE__)) { 6824 Carp::my_carp_bug("$other should be a " 6825 . __PACKAGE__ 6826 . ". Not a '" 6827 . ref($other) 6828 . "'. Not added;"); 6829 return; 6830 } 6831 6832 local $to_trace = 0 if main::DEBUG; 6833 6834 my $self_range_list = $self->_range_list; 6835 my $other_range_list = $other->_range_list; 6836 foreach my $range ($other_range_list->ranges) { 6837 my $value = $range->value; 6838 next if $value eq ""; 6839 $self_range_list->_add_delete('+', 6840 $range->start, 6841 $range->end, 6842 $value, 6843 Type => $range->type, 6844 Replace => $UNCONDITIONALLY); 6845 } 6846 6847 return; 6848 } 6849 6850 sub set_default_map($self, $map, $use_full_name=0) { 6851 # Define what code points that are missing from the input files should 6852 # map to. The optional second parameter 'full_name' indicates to 6853 # force using the full name of the map instead of its standard name. 6854 if ($use_full_name && $use_full_name ne 'full_name') { 6855 Carp::my_carp_bug("Second parameter to set_default_map() if" 6856 . " present, must be 'full_name'"); 6857 } 6858 6859 my $addr = pack 'J', refaddr $self; 6860 6861 # Convert the input to the standard equivalent, if any (won't have any 6862 # for $STRING properties) 6863 my $standard = $self->property->table($map); 6864 if (defined $standard) { 6865 $map = ($use_full_name) 6866 ? $standard->full_name 6867 : $standard->name; 6868 } 6869 6870 # Warn if there already is a non-equivalent default map for this 6871 # property. Note that a default map can be a ref, which means that 6872 # what it actually means is delayed until later in the program, and it 6873 # IS permissible to override it here without a message. 6874 my $default_map = $default_map{$addr}; 6875 if (defined $default_map 6876 && ! ref($default_map) 6877 && $default_map ne $map 6878 && main::Standardize($map) ne $default_map) 6879 { 6880 my $property = $self->property; 6881 my $map_table = $property->table($map); 6882 my $default_table = $property->table($default_map); 6883 if (defined $map_table 6884 && defined $default_table 6885 && $map_table != $default_table) 6886 { 6887 Carp::my_carp("Changing the default mapping for " 6888 . $property 6889 . " from $default_map to $map'"); 6890 } 6891 } 6892 6893 $default_map{$addr} = $map; 6894 6895 # Don't also create any missing table for this map at this point, 6896 # because if we did, it could get done before the main table add is 6897 # done for PropValueAliases.txt; instead the caller will have to make 6898 # sure it exists, if desired. 6899 return; 6900 } 6901 6902 sub to_output_map($self) { 6903 # Returns boolean: should we write this map table? 6904 my $addr = pack 'J', refaddr $self; 6905 6906 # If overridden, use that 6907 return $to_output_map{$addr} if defined $to_output_map{$addr}; 6908 6909 my $full_name = $self->full_name; 6910 return $global_to_output_map{$full_name} 6911 if defined $global_to_output_map{$full_name}; 6912 6913 # If table says to output, do so; if says to suppress it, do so. 6914 my $fate = $self->fate; 6915 return $INTERNAL_MAP if $fate == $INTERNAL_ONLY; 6916 return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties; 6917 return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED; 6918 6919 my $type = $self->property->type; 6920 6921 # Don't want to output binary map tables even for debugging. 6922 return 0 if $type == $BINARY; 6923 6924 # But do want to output string ones. All the ones that remain to 6925 # be dealt with (i.e. which haven't explicitly been set to external) 6926 # are for internal Perl use only. The default for those that map to 6927 # $CODE_POINT and haven't been restricted to a single element range 6928 # is to use the adjusted form. 6929 if ($type == $STRING) { 6930 return $INTERNAL_MAP if $self->range_size_1 6931 || $default_map{$addr} ne $CODE_POINT; 6932 return $OUTPUT_ADJUSTED; 6933 } 6934 6935 # Otherwise is an $ENUM, do output it, for Perl's purposes 6936 return $INTERNAL_MAP; 6937 } 6938 6939 sub inverse_list($self) { 6940 # Returns a Range_List that is gaps of the current table. That is, 6941 # the inversion 6942 my $current = Range_List->new(Initialize => $self->_range_list, 6943 Owner => $self->property); 6944 return ~ $current; 6945 } 6946 6947 sub header($self) { 6948 my $return = $self->SUPER::header(); 6949 6950 if ($self->to_output_map >= $INTERNAL_MAP) { 6951 $return .= $INTERNAL_ONLY_HEADER; 6952 } 6953 else { 6954 # Other properties have fixed formats. 6955 my $property_name = $self->property->full_name; 6956 6957 $return .= <<END; 6958 6959# !!!!!!! IT IS DEPRECATED TO USE THIS FILE !!!!!!! 6960 6961# This file is for internal use by core Perl only. It is retained for 6962# backwards compatibility with applications that may have come to rely on it, 6963# but its format and even its name or existence are subject to change without 6964# notice in a future Perl version. Don't use it directly. Instead, its 6965# contents are now retrievable through a stable API in the Unicode::UCD 6966# module: Unicode::UCD::prop_invmap('$property_name') (Values for individual 6967# code points can be retrieved via Unicode::UCD::charprop()); 6968END 6969 } 6970 return $return; 6971 } 6972 6973 sub set_final_comment($self) { 6974 # Just before output, create the comment that heads the file 6975 # containing this table. 6976 6977 return unless $debugging_build; 6978 6979 # No sense generating a comment if aren't going to write it out. 6980 return if ! $self->to_output_map; 6981 6982 my $addr = pack 'J', refaddr $self; 6983 6984 my $property = $self->property; 6985 6986 # Get all the possible names for this property. Don't use any that 6987 # aren't ok for use in a file name, etc. This is perhaps causing that 6988 # flag to do double duty, and may have to be changed in the future to 6989 # have our own flag for just this purpose; but it works now to exclude 6990 # Perl generated synonyms from the lists for properties, where the 6991 # name is always the proper Unicode one. 6992 my @property_aliases = grep { $_->ok_as_filename } $self->aliases; 6993 6994 my $count = $self->count; 6995 my $default_map = $default_map{$addr}; 6996 6997 # The ranges that map to the default aren't output, so subtract that 6998 # to get those actually output. A property with matching tables 6999 # already has the information calculated. 7000 if ($property->type != $STRING && $property->type != $FORCED_BINARY) { 7001 $count -= $property->table($default_map)->count; 7002 } 7003 elsif (defined $default_map) { 7004 7005 # But for $STRING properties, must calculate now. Subtract the 7006 # count from each range that maps to the default. 7007 foreach my $range ($self->_range_list->ranges) { 7008 if ($range->value eq $default_map) { 7009 $count -= $range->end +1 - $range->start; 7010 } 7011 } 7012 7013 } 7014 7015 # Get a string version of $count with underscores in large numbers, 7016 # for clarity. 7017 my $string_count = main::clarify_code_point_count($count); 7018 7019 my $code_points = ($count == 1) 7020 ? 'single code point' 7021 : "$string_count code points"; 7022 7023 my $mapping; 7024 my $these_mappings; 7025 my $are; 7026 if (@property_aliases <= 1) { 7027 $mapping = 'mapping'; 7028 $these_mappings = 'this mapping'; 7029 $are = 'is' 7030 } 7031 else { 7032 $mapping = 'synonymous mappings'; 7033 $these_mappings = 'these mappings'; 7034 $are = 'are' 7035 } 7036 my $cp; 7037 if ($count >= $MAX_UNICODE_CODEPOINTS) { 7038 $cp = "any code point in Unicode Version $string_version"; 7039 } 7040 else { 7041 my $map_to; 7042 if ($default_map eq "") { 7043 $map_to = 'the empty string'; 7044 } 7045 elsif ($default_map eq $CODE_POINT) { 7046 $map_to = "itself"; 7047 } 7048 else { 7049 $map_to = "'$default_map'"; 7050 } 7051 if ($count == 1) { 7052 $cp = "the single code point"; 7053 } 7054 else { 7055 $cp = "one of the $code_points"; 7056 } 7057 $cp .= " in Unicode Version $unicode_version for which the mapping is not to $map_to"; 7058 } 7059 7060 my $comment = ""; 7061 7062 my $status = $self->status; 7063 if ($status ne $NORMAL) { 7064 my $warn = uc $status_past_participles{$status}; 7065 $comment .= <<END; 7066 7067!!!!!!! $warn !!!!!!!!!!!!!!!!!!! 7068 All property or property=value combinations contained in this file are $warn. 7069 See $unicode_reference_url for what this means. 7070 7071END 7072 } 7073 $comment .= "This file returns the $mapping:\n"; 7074 7075 my $ucd_accessible_name = ""; 7076 my $has_underscore_name = 0; 7077 my $full_name = $self->property->full_name; 7078 for my $i (0 .. @property_aliases - 1) { 7079 my $name = $property_aliases[$i]->name; 7080 $has_underscore_name = 1 if $name =~ /^_/; 7081 $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)'); 7082 if ($property_aliases[$i]->ucd) { 7083 if ($name eq $full_name) { 7084 $ucd_accessible_name = $full_name; 7085 } 7086 elsif (! $ucd_accessible_name) { 7087 $ucd_accessible_name = $name; 7088 } 7089 } 7090 } 7091 $comment .= "\nwhere 'cp' is $cp."; 7092 if ($ucd_accessible_name) { 7093 $comment .= " Note that $these_mappings"; 7094 if ($has_underscore_name) { 7095 $comment .= " (except for the one(s) that begin with an underscore)"; 7096 } 7097 $comment .= " $are accessible via the functions prop_invmap('$full_name') or charprop() in Unicode::UCD"; 7098 7099 } 7100 7101 # And append any commentary already set from the actual property. 7102 $comment .= "\n\n" . $self->comment if $self->comment; 7103 if ($self->description) { 7104 $comment .= "\n\n" . join " ", $self->description; 7105 } 7106 if ($self->note) { 7107 $comment .= "\n\n" . join " ", $self->note; 7108 } 7109 $comment .= "\n"; 7110 7111 if (! $self->perl_extension) { 7112 $comment .= <<END; 7113 7114For information about what this property really means, see: 7115$unicode_reference_url 7116END 7117 } 7118 7119 if ($count) { # Format differs for empty table 7120 $comment.= "\nThe format of the "; 7121 if ($self->range_size_1) { 7122 $comment.= <<END; 7123main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT 7124is in hex; MAPPING is what CODE_POINT maps to. 7125END 7126 } 7127 else { 7128 7129 # There are tables which end up only having one element per 7130 # range, but it is not worth keeping track of for making just 7131 # this comment a little better. 7132 $comment .= <<END; 7133non-comment portions of the main body of lines of this file is: 7134START\\tSTOP\\tMAPPING where START is the starting code point of the 7135range, in hex; STOP is the ending point, or if omitted, the range has just one 7136code point; MAPPING is what each code point between START and STOP maps to. 7137END 7138 if ($self->output_range_counts) { 7139 $comment .= <<END; 7140Numbers in comments in [brackets] indicate how many code points are in the 7141range (omitted when the range is a single code point or if the mapping is to 7142the null string). 7143END 7144 } 7145 } 7146 } 7147 $self->set_comment(main::join_lines($comment)); 7148 return; 7149 } 7150 7151 my %swash_keys; # Makes sure don't duplicate swash names. 7152 7153 # The remaining variables are temporaries used while writing each table, 7154 # to output special ranges. 7155 my @multi_code_point_maps; # Map is to more than one code point. 7156 7157 sub handle_special_range($self, $range) { 7158 # Called in the middle of write when it finds a range it doesn't know 7159 # how to handle. 7160 7161 my $addr = pack 'J', refaddr $self; 7162 7163 my $type = $range->type; 7164 7165 my $low = $range->start; 7166 my $high = $range->end; 7167 my $map = $range->value; 7168 7169 # No need to output the range if it maps to the default. 7170 return if $map eq $default_map{$addr}; 7171 7172 my $property = $self->property; 7173 7174 # Switch based on the map type... 7175 if ($type == $HANGUL_SYLLABLE) { 7176 7177 # These are entirely algorithmically determinable based on 7178 # some constants furnished by Unicode; for now, just set a 7179 # flag to indicate that have them. After everything is figured 7180 # out, we will output the code that does the algorithm. (Don't 7181 # output them if not needed because we are suppressing this 7182 # property.) 7183 $has_hangul_syllables = 1 if $property->to_output_map; 7184 } 7185 elsif ($type == $CP_IN_NAME) { 7186 7187 # Code points whose name ends in their code point are also 7188 # algorithmically determinable, but need information about the map 7189 # to do so. Both the map and its inverse are stored in data 7190 # structures output in the file. They are stored in the mean time 7191 # in global lists The lists will be written out later into Name.pm, 7192 # which is created only if needed. In order to prevent duplicates 7193 # in the list, only add to them for one property, should multiple 7194 # ones need them. 7195 if ($needing_code_points_ending_in_code_point == 0) { 7196 $needing_code_points_ending_in_code_point = $property; 7197 } 7198 if ($property == $needing_code_points_ending_in_code_point) { 7199 push @{$names_ending_in_code_point{$map}->{'low'}}, $low; 7200 push @{$names_ending_in_code_point{$map}->{'high'}}, $high; 7201 7202 my $squeezed = $map =~ s/[-\s]+//gr; 7203 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}}, 7204 $low; 7205 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}}, 7206 $high; 7207 7208 # Calculate the set of legal characters in names of this 7209 # series. It includes every character in the name prefix. 7210 my %legal; 7211 $legal{$_} = 1 for split //, $map; 7212 7213 # Plus the hex code point chars, blank, and minus. Also \n 7214 # can show up as being required due to anchoring 7215 for my $i ('0' .. '9', 'A' .. 'F', '-', ' ', "\n") { 7216 $legal{$i} = 1; 7217 } 7218 my $legal = join "", sort { $a cmp $b } keys %legal; 7219 7220 # The legal chars can be used in match optimizations 7221 push @code_points_ending_in_code_point, { low => $low, 7222 high => $high, 7223 name => $map, 7224 legal => $legal, 7225 }; 7226 } 7227 } 7228 elsif ($range->type == $MULTI_CP || $range->type == $NULL) { 7229 7230 # Multi-code point maps and null string maps have an entry 7231 # for each code point in the range. They use the same 7232 # output format. 7233 for my $code_point ($low .. $high) { 7234 7235 # The pack() below can't cope with surrogates. XXX This may 7236 # no longer be true 7237 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) { 7238 Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self. No map created"); 7239 next; 7240 } 7241 7242 # Generate the hash entries for these in the form that 7243 # utf8.c understands. 7244 my $tostr = ""; 7245 my $to_name = ""; 7246 my $to_chr = ""; 7247 foreach my $to (split " ", $map) { 7248 if ($to !~ /^$code_point_re$/) { 7249 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created"); 7250 next; 7251 } 7252 $tostr .= sprintf "\\x{%s}", $to; 7253 $to = CORE::hex $to; 7254 if ($annotate) { 7255 $to_name .= " + " if $to_name; 7256 $to_chr .= main::display_chr($to); 7257 main::populate_char_info($to) 7258 if ! defined $viacode[$to]; 7259 $to_name .= $viacode[$to]; 7260 } 7261 } 7262 7263 # The unpack yields a list of the bytes that comprise the 7264 # UTF-8 of $code_point, which are each placed in \xZZ format 7265 # and output in the %s to map to $tostr, so the result looks 7266 # like: 7267 # "\xC4\xB0" => "\x{0069}\x{0307}", 7268 my $utf8 = sprintf(qq["%s" => "$tostr",], 7269 join("", map { sprintf "\\x%02X", $_ } 7270 unpack("U0C*", chr $code_point))); 7271 7272 # Add a comment so that a human reader can more easily 7273 # see what's going on. 7274 push @multi_code_point_maps, 7275 sprintf("%-45s # U+%04X", $utf8, $code_point); 7276 if (! $annotate) { 7277 $multi_code_point_maps[-1] .= " => $map"; 7278 } 7279 else { 7280 main::populate_char_info($code_point) 7281 if ! defined $viacode[$code_point]; 7282 $multi_code_point_maps[-1] .= " '" 7283 . main::display_chr($code_point) 7284 . "' => '$to_chr'; $viacode[$code_point] => $to_name"; 7285 } 7286 } 7287 } 7288 else { 7289 Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Not written"); 7290 } 7291 7292 return; 7293 } 7294 7295 sub pre_body($self) { 7296 # Returns the string that should be output in the file before the main 7297 # body of this table. It isn't called until the main body is 7298 # calculated, saving a pass. The string includes some hash entries 7299 # identifying the format of the body, and what the single value should 7300 # be for all ranges missing from it. It also includes any code points 7301 # which have map_types that don't go in the main table. 7302 7303 my $addr = pack 'J', refaddr $self; 7304 7305 my $name = $self->property->swash_name; 7306 7307 # Currently there is nothing in the pre_body unless a swash is being 7308 # generated. 7309 return unless defined $name; 7310 7311 if (defined $swash_keys{$name}) { 7312 Carp::my_carp(main::join_lines(<<END 7313Already created a swash name '$name' for $swash_keys{$name}. This means that 7314the same name desired for $self shouldn't be used. Bad News. This must be 7315fixed before production use, but proceeding anyway 7316END 7317 )); 7318 } 7319 $swash_keys{$name} = "$self"; 7320 7321 my $pre_body = ""; 7322 7323 # Here we assume we were called after have gone through the whole 7324 # file. If we actually generated anything for each map type, add its 7325 # respective header and trailer 7326 my $specials_name = ""; 7327 if (@multi_code_point_maps) { 7328 $specials_name = "Unicode::UCD::ToSpec$name"; 7329 $pre_body .= <<END; 7330 7331# Some code points require special handling because their mappings are each to 7332# multiple code points. These do not appear in the main body, but are defined 7333# in the hash below. 7334 7335# Each key is the string of N bytes that together make up the UTF-8 encoding 7336# for the code point. (i.e. the same as looking at the code point's UTF-8 7337# under "use bytes"). Each value is the UTF-8 of the translation, for speed. 7338\%$specials_name = ( 7339END 7340 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n"; 7341 } 7342 7343 my $format = $self->format; 7344 7345 my $return = ""; 7346 7347 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED); 7348 if ($output_adjusted) { 7349 if ($specials_name) { 7350 $return .= <<END; 7351# The mappings in the non-hash portion of this file must be modified to get the 7352# correct values by adding the code point ordinal number to each one that is 7353# numeric. 7354END 7355 } 7356 else { 7357 $return .= <<END; 7358# The mappings must be modified to get the correct values by adding the code 7359# point ordinal number to each one that is numeric. 7360END 7361 } 7362 } 7363 7364 $return .= <<END; 7365 7366# The name this table is to be known by, with the format of the mappings in 7367# the main body of the table, and what all code points missing from this file 7368# map to. 7369\$Unicode::UCD::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format} 7370END 7371 if ($specials_name) { 7372 $return .= <<END; 7373\$Unicode::UCD::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings 7374END 7375 } 7376 my $default_map = $default_map{$addr}; 7377 7378 # For $CODE_POINT default maps and using adjustments, instead the default 7379 # becomes zero. 7380 $return .= "\$Unicode::UCD::SwashInfo{'To$name'}{'missing'} = '" 7381 . (($output_adjusted && $default_map eq $CODE_POINT) 7382 ? "0" 7383 : $default_map) 7384 . "';"; 7385 7386 if ($default_map eq $CODE_POINT) { 7387 $return .= ' # code point maps to itself'; 7388 } 7389 elsif ($default_map eq "") { 7390 $return .= ' # code point maps to the empty string'; 7391 } 7392 $return .= "\n"; 7393 7394 $return .= $pre_body; 7395 7396 return $return; 7397 } 7398 7399 sub write($self) { 7400 # Write the table to the file. 7401 7402 my $addr = pack 'J', refaddr $self; 7403 7404 # Clear the temporaries 7405 undef @multi_code_point_maps; 7406 7407 # Calculate the format of the table if not already done. 7408 my $format = $self->format; 7409 my $type = $self->property->type; 7410 my $default_map = $self->default_map; 7411 if (! defined $format) { 7412 if ($type == $BINARY) { 7413 7414 # Don't bother checking the values, because we elsewhere 7415 # verify that a binary table has only 2 values. 7416 $format = $BINARY_FORMAT; 7417 } 7418 else { 7419 my @ranges = $self->_range_list->ranges; 7420 7421 # default an empty table based on its type and default map 7422 if (! @ranges) { 7423 7424 # But it turns out that the only one we can say is a 7425 # non-string (besides binary, handled above) is when the 7426 # table is a string and the default map is to a code point 7427 if ($type == $STRING && $default_map eq $CODE_POINT) { 7428 $format = $HEX_FORMAT; 7429 } 7430 else { 7431 $format = $STRING_FORMAT; 7432 } 7433 } 7434 else { 7435 7436 # Start with the most restrictive format, and as we find 7437 # something that doesn't fit with that, change to the next 7438 # most restrictive, and so on. 7439 $format = $DECIMAL_FORMAT; 7440 foreach my $range (@ranges) { 7441 next if $range->type != 0; # Non-normal ranges don't 7442 # affect the main body 7443 my $map = $range->value; 7444 if ($map ne $default_map) { 7445 last if $format eq $STRING_FORMAT; # already at 7446 # least 7447 # restrictive 7448 $format = $INTEGER_FORMAT 7449 if $format eq $DECIMAL_FORMAT 7450 && $map !~ / ^ [0-9] $ /x; 7451 $format = $FLOAT_FORMAT 7452 if $format eq $INTEGER_FORMAT 7453 && $map !~ / ^ -? [0-9]+ $ /x; 7454 $format = $RATIONAL_FORMAT 7455 if $format eq $FLOAT_FORMAT 7456 && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x; 7457 $format = $HEX_FORMAT 7458 if ($format eq $RATIONAL_FORMAT 7459 && $map !~ 7460 m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x) 7461 # Assume a leading zero means hex, 7462 # even if all digits are 0-9 7463 || ($format eq $INTEGER_FORMAT 7464 && $map =~ /^0[0-9A-F]/); 7465 $format = $STRING_FORMAT if $format eq $HEX_FORMAT 7466 && $map =~ /[^0-9A-F]/; 7467 } 7468 } 7469 } 7470 } 7471 } # end of calculating format 7472 7473 if ($default_map eq $CODE_POINT 7474 && $format ne $HEX_FORMAT 7475 && ! defined $self->format) # manual settings are always 7476 # considered ok 7477 { 7478 Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'") 7479 } 7480 7481 # If the output is to be adjusted, the format of the table that gets 7482 # output is actually 'a' or 'ax' instead of whatever it is stored 7483 # internally as. 7484 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED); 7485 if ($output_adjusted) { 7486 if ($default_map eq $CODE_POINT) { 7487 $format = $HEX_ADJUST_FORMAT; 7488 } 7489 else { 7490 $format = $ADJUST_FORMAT; 7491 } 7492 } 7493 7494 $self->_set_format($format); 7495 7496 return $self->SUPER::write( 7497 $output_adjusted, 7498 $default_map); # don't write defaulteds 7499 } 7500 7501 # Accessors for the underlying list that should fail if locked. 7502 for my $sub (qw( 7503 add_duplicate 7504 replace_map 7505 )) 7506 { 7507 no strict "refs"; 7508 *$sub = sub { 7509 use strict "refs"; 7510 my $self = shift; 7511 7512 return if $self->carp_if_locked; 7513 return $self->_range_list->$sub(@_); 7514 } 7515 } 7516} # End closure for Map_Table 7517 7518package Match_Table; 7519use parent '-norequire', '_Base_Table'; 7520 7521# A Match table is one which is a list of all the code points that have 7522# the same property and property value, for use in \p{property=value} 7523# constructs in regular expressions. It adds very little data to the base 7524# structure, but many methods, as these lists can be combined in many ways to 7525# form new ones. 7526# There are only a few concepts added: 7527# 1) Equivalents and Relatedness. 7528# Two tables can match the identical code points, but have different names. 7529# This always happens when there is a perl single form extension 7530# \p{IsProperty} for the Unicode compound form \P{Property=True}. The two 7531# tables are set to be related, with the Perl extension being a child, and 7532# the Unicode property being the parent. 7533# 7534# It may be that two tables match the identical code points and we don't 7535# know if they are related or not. This happens most frequently when the 7536# Block and Script properties have the exact range. But note that a 7537# revision to Unicode could add new code points to the script, which would 7538# now have to be in a different block (as the block was filled, or there 7539# would have been 'Unknown' script code points in it and they wouldn't have 7540# been identical). So we can't rely on any two properties from Unicode 7541# always matching the same code points from release to release, and thus 7542# these tables are considered coincidentally equivalent--not related. When 7543# two tables are unrelated but equivalent, one is arbitrarily chosen as the 7544# 'leader', and the others are 'equivalents'. This concept is useful 7545# to minimize the number of tables written out. Only one file is used for 7546# any identical set of code points, with entries in UCD.pl mapping all 7547# the involved tables to it. 7548# 7549# Related tables will always be identical; we set them up to be so. Thus 7550# if the Unicode one is deprecated, the Perl one will be too. Not so for 7551# unrelated tables. Relatedness makes generating the documentation easier. 7552# 7553# 2) Complement. 7554# Like equivalents, two tables may be the inverses of each other, the 7555# intersection between them is null, and the union is every Unicode code 7556# point. The two tables that occupy a binary property are necessarily like 7557# this. By specifying one table as the complement of another, we can avoid 7558# storing it on disk (using the other table and performing a fast 7559# transform), and some memory and calculations. 7560# 7561# 3) Conflicting. It may be that there will eventually be name clashes, with 7562# the same name meaning different things. For a while, there actually were 7563# conflicts, but they have so far been resolved by changing Perl's or 7564# Unicode's definitions to match the other, but when this code was written, 7565# it wasn't clear that that was what was going to happen. (Unicode changed 7566# because of protests during their beta period.) Name clashes are warned 7567# about during compilation, and the documentation. The generated tables 7568# are sane, free of name clashes, because the code suppresses the Perl 7569# version. But manual intervention to decide what the actual behavior 7570# should be may be required should this happen. The introductory comments 7571# have more to say about this. 7572# 7573# 4) Definition. This is a string for human consumption that specifies the 7574# code points that this table matches. This is used only for the generated 7575# pod file. It may be specified explicitly, or automatically computed. 7576# Only the first portion of complicated definitions is computed and 7577# displayed. 7578 7579sub standardize { return main::standardize($_[0]); } 7580sub trace { return main::trace(@_); } 7581 7582 7583{ # Closure 7584 7585 main::setup_package(); 7586 7587 my %leader; 7588 # The leader table of this one; initially $self. 7589 main::set_access('leader', \%leader, 'r'); 7590 7591 my %equivalents; 7592 # An array of any tables that have this one as their leader 7593 main::set_access('equivalents', \%equivalents, 'readable_array'); 7594 7595 my %parent; 7596 # The parent table to this one, initially $self. This allows us to 7597 # distinguish between equivalent tables that are related (for which this 7598 # is set to), and those which may not be, but share the same output file 7599 # because they match the exact same set of code points in the current 7600 # Unicode release. 7601 main::set_access('parent', \%parent, 'r'); 7602 7603 my %children; 7604 # An array of any tables that have this one as their parent 7605 main::set_access('children', \%children, 'readable_array'); 7606 7607 my %conflicting; 7608 # Array of any tables that would have the same name as this one with 7609 # a different meaning. This is used for the generated documentation. 7610 main::set_access('conflicting', \%conflicting, 'readable_array'); 7611 7612 my %matches_all; 7613 # Set in the constructor for tables that are expected to match all code 7614 # points. 7615 main::set_access('matches_all', \%matches_all, 'r'); 7616 7617 my %complement; 7618 # Points to the complement that this table is expressed in terms of; 0 if 7619 # none. 7620 main::set_access('complement', \%complement, 'r'); 7621 7622 my %definition; 7623 # Human readable string of the first few ranges of code points matched by 7624 # this table 7625 main::set_access('definition', \%definition, 'r', 's'); 7626 7627 sub new { 7628 my $class = shift; 7629 7630 my %args = @_; 7631 7632 # The property for which this table is a listing of property values. 7633 my $property = delete $args{'_Property'}; 7634 7635 my $name = delete $args{'Name'}; 7636 my $full_name = delete $args{'Full_Name'}; 7637 $full_name = $name if ! defined $full_name; 7638 7639 # Optional 7640 my $initialize = delete $args{'Initialize'}; 7641 my $matches_all = delete $args{'Matches_All'} || 0; 7642 my $format = delete $args{'Format'}; 7643 my $definition = delete $args{'Definition'} // ""; 7644 # Rest of parameters passed on. 7645 7646 my $range_list = Range_List->new(Initialize => $initialize, 7647 Owner => $property); 7648 7649 my $complete = $full_name; 7650 $complete = '""' if $complete eq ""; # A null name shouldn't happen, 7651 # but this helps debug if it 7652 # does 7653 # The complete name for a match table includes it's property in a 7654 # compound form 'property=table', except if the property is the 7655 # pseudo-property, perl, in which case it is just the single form, 7656 # 'table' (If you change the '=' must also change the ':' in lots of 7657 # places in this program that assume an equal sign) 7658 $complete = $property->full_name . "=$complete" if $property != $perl; 7659 7660 my $self = $class->SUPER::new(%args, 7661 Name => $name, 7662 Complete_Name => $complete, 7663 Full_Name => $full_name, 7664 _Property => $property, 7665 _Range_List => $range_list, 7666 Format => $EMPTY_FORMAT, 7667 Write_As_Invlist => 1, 7668 ); 7669 my $addr = pack 'J', refaddr $self; 7670 7671 $conflicting{$addr} = [ ]; 7672 $equivalents{$addr} = [ ]; 7673 $children{$addr} = [ ]; 7674 $matches_all{$addr} = $matches_all; 7675 $leader{$addr} = $self; 7676 $parent{$addr} = $self; 7677 $complement{$addr} = 0; 7678 $definition{$addr} = $definition; 7679 7680 if (defined $format && $format ne $EMPTY_FORMAT) { 7681 Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'. Using '$EMPTY_FORMAT'"); 7682 } 7683 7684 return $self; 7685 } 7686 7687 # See this program's beginning comment block about overloading these. 7688 use overload 7689 fallback => 0, 7690 qw("") => "_operator_stringify", 7691 '=' => sub { 7692 my $self = shift; 7693 7694 return if $self->carp_if_locked; 7695 return $self; 7696 }, 7697 7698 '+' => sub { 7699 my $self = shift; 7700 my $other = shift; 7701 7702 return $self->_range_list + $other; 7703 }, 7704 '&' => sub { 7705 my $self = shift; 7706 my $other = shift; 7707 7708 return $self->_range_list & $other; 7709 }, 7710 '+=' => sub { 7711 my $self = shift; 7712 my $other = shift; 7713 my $reversed = shift; 7714 7715 if ($reversed) { 7716 Carp::my_carp_bug("Bad news. Can't cope with '" 7717 . ref($other) 7718 . ' += ' 7719 . ref($self) 7720 . "'. undef returned."); 7721 return; 7722 } 7723 7724 return if $self->carp_if_locked; 7725 7726 if (ref $other) { 7727 7728 # Change the range list of this table to be the 7729 # union of the two. 7730 $self->_set_range_list($self->_range_list 7731 + $other); 7732 } 7733 else { # $other is just a simple value 7734 $self->add_range($other, $other); 7735 } 7736 return $self; 7737 }, 7738 '&=' => sub { 7739 my $self = shift; 7740 my $other = shift; 7741 my $reversed = shift; 7742 7743 if ($reversed) { 7744 Carp::my_carp_bug("Bad news. Can't cope with '" 7745 . ref($other) 7746 . ' &= ' 7747 . ref($self) 7748 . "'. undef returned."); 7749 return; 7750 } 7751 7752 return if $self->carp_if_locked; 7753 $self->_set_range_list($self->_range_list & $other); 7754 return $self; 7755 }, 7756 '-' => sub { my $self = shift; 7757 my $other = shift; 7758 my $reversed = shift; 7759 if ($reversed) { 7760 Carp::my_carp_bug("Bad news. Can't cope with '" 7761 . ref($other) 7762 . ' - ' 7763 . ref($self) 7764 . "'. undef returned."); 7765 return; 7766 } 7767 7768 return $self->_range_list - $other; 7769 }, 7770 '~' => sub { my $self = shift; 7771 return ~ $self->_range_list; 7772 }, 7773 ; 7774 7775 sub _operator_stringify($self, $other="", $reversed=0) { 7776 7777 my $name = $self->complete_name; 7778 return "Table '$name'"; 7779 } 7780 7781 sub _range_list { 7782 # Returns the range list associated with this table, which will be the 7783 # complement's if it has one. 7784 7785 my $self = shift; 7786 my $complement = $self->complement; 7787 7788 # In order to avoid re-complementing on each access, only do the 7789 # complement the first time, and store the result in this table's 7790 # range list to use henceforth. However, this wouldn't work if the 7791 # controlling (complement) table changed after we do this, so lock it. 7792 # Currently, the value of the complement isn't needed until after it 7793 # is fully constructed, so this works. If this were to change, the 7794 # each_range iteration functionality would no longer work on this 7795 # complement. 7796 if ($complement != 0 && $self->SUPER::_range_list->count == 0) { 7797 $self->_set_range_list($self->SUPER::_range_list 7798 + ~ $complement->_range_list); 7799 $complement->lock; 7800 } 7801 7802 return $self->SUPER::_range_list; 7803 } 7804 7805 sub add_alias { 7806 # Add a synonym for this table. See the comments in the base class 7807 7808 my $self = shift; 7809 my $name = shift; 7810 # Rest of parameters passed on. 7811 7812 $self->SUPER::add_alias($name, $self, @_); 7813 return; 7814 } 7815 7816 sub add_conflicting { 7817 # Add the name of some other object to the list of ones that name 7818 # clash with this match table. 7819 7820 my $self = shift; 7821 my $conflicting_name = shift; # The name of the conflicting object 7822 my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ? 7823 my $conflicting_object = shift; # Optional, the conflicting object 7824 # itself. This is used to 7825 # disambiguate the text if the input 7826 # name is identical to any of the 7827 # aliases $self is known by. 7828 # Sometimes the conflicting object is 7829 # merely hypothetical, so this has to 7830 # be an optional parameter. 7831 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7832 7833 my $addr = pack 'J', refaddr $self; 7834 7835 # Check if the conflicting name is exactly the same as any existing 7836 # alias in this table (as long as there is a real object there to 7837 # disambiguate with). 7838 if (defined $conflicting_object) { 7839 foreach my $alias ($self->aliases) { 7840 if (standardize($alias->name) eq standardize($conflicting_name)) { 7841 7842 # Here, there is an exact match. This results in 7843 # ambiguous comments, so disambiguate by changing the 7844 # conflicting name to its object's complete equivalent. 7845 $conflicting_name = $conflicting_object->complete_name; 7846 last; 7847 } 7848 } 7849 } 7850 7851 # Convert to the \p{...} final name 7852 $conflicting_name = "\\$p" . "{$conflicting_name}"; 7853 7854 # Only add once 7855 return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}}; 7856 7857 push @{$conflicting{$addr}}, $conflicting_name; 7858 7859 return; 7860 } 7861 7862 sub is_set_equivalent_to($self, $other=undef) { 7863 # Return boolean of whether or not the other object is a table of this 7864 # type and has been marked equivalent to this one. 7865 7866 return 0 if ! defined $other; # Can happen for incomplete early 7867 # releases 7868 unless ($other->isa(__PACKAGE__)) { 7869 my $ref_other = ref $other; 7870 my $ref_self = ref $self; 7871 Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self."); 7872 return 0; 7873 } 7874 7875 # Two tables are equivalent if they have the same leader. 7876 return $leader{pack 'J', refaddr $self} == $leader{pack 'J', refaddr $other}; 7877 return; 7878 } 7879 7880 sub set_equivalent_to { 7881 # Set $self equivalent to the parameter table. 7882 # The required Related => 'x' parameter is a boolean indicating 7883 # whether these tables are related or not. If related, $other becomes 7884 # the 'parent' of $self; if unrelated it becomes the 'leader' 7885 # 7886 # Related tables share all characteristics except names; equivalents 7887 # not quite so many. 7888 # If they are related, one must be a perl extension. This is because 7889 # we can't guarantee that Unicode won't change one or the other in a 7890 # later release even if they are identical now. 7891 7892 my $self = shift; 7893 my $other = shift; 7894 7895 my %args = @_; 7896 my $related = delete $args{'Related'}; 7897 7898 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 7899 7900 return if ! defined $other; # Keep on going; happens in some early 7901 # Unicode releases. 7902 7903 if (! defined $related) { 7904 Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other"); 7905 $related = 0; 7906 } 7907 7908 # If already are equivalent, no need to re-do it; if subroutine 7909 # returns null, it found an error, also do nothing 7910 my $are_equivalent = $self->is_set_equivalent_to($other); 7911 return if ! defined $are_equivalent || $are_equivalent; 7912 7913 my $addr = pack 'J', refaddr $self; 7914 my $current_leader = ($related) ? $parent{$addr} : $leader{$addr}; 7915 7916 if ($related) { 7917 if ($current_leader->perl_extension) { 7918 if ($other->perl_extension) { 7919 Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent."); 7920 return; 7921 } 7922 } elsif ($self->property != $other->property # Depending on 7923 # situation, might 7924 # be better to use 7925 # add_alias() 7926 # instead for same 7927 # property 7928 && ! $other->perl_extension 7929 7930 # We allow the sc and scx properties to be marked as 7931 # related. They are in fact related, and this allows 7932 # the pod to show that better. This test isn't valid 7933 # if this is an early Unicode release without the scx 7934 # property (having that also implies the sc property 7935 # exists, so don't have to test for no 'sc') 7936 && ( ! defined $scx 7937 && ! ( ( $self->property == $script 7938 || $self->property == $scx) 7939 && ( $self->property == $script 7940 || $self->property == $scx)))) 7941 { 7942 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other"); 7943 $related = 0; 7944 } 7945 } 7946 7947 if (! $self->is_empty && ! $self->matches_identically_to($other)) { 7948 Carp::my_carp_bug("$self should be empty or match identically to $other. Not setting equivalent"); 7949 return; 7950 } 7951 7952 my $leader = pack 'J', refaddr $current_leader; 7953 my $other_addr = pack 'J', refaddr $other; 7954 7955 # Any tables that are equivalent to or children of this table must now 7956 # instead be equivalent to or (children) to the new leader (parent), 7957 # still equivalent. The equivalency includes their matches_all info, 7958 # and for related tables, their fate and status. 7959 # All related tables are of necessity equivalent, but the converse 7960 # isn't necessarily true 7961 my $status = $other->status; 7962 my $status_info = $other->status_info; 7963 my $fate = $other->fate; 7964 my $matches_all = $matches_all{other_addr}; 7965 my $caseless_equivalent = $other->caseless_equivalent; 7966 foreach my $table ($current_leader, @{$equivalents{$leader}}) { 7967 next if $table == $other; 7968 trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace; 7969 7970 my $table_addr = pack 'J', refaddr $table; 7971 $leader{$table_addr} = $other; 7972 $matches_all{$table_addr} = $matches_all; 7973 $self->_set_range_list($other->_range_list); 7974 push @{$equivalents{$other_addr}}, $table; 7975 if ($related) { 7976 $parent{$table_addr} = $other; 7977 push @{$children{$other_addr}}, $table; 7978 $table->set_status($status, $status_info); 7979 7980 # This reason currently doesn't get exposed outside; otherwise 7981 # would have to look up the parent's reason and use it instead. 7982 $table->set_fate($fate, "Parent's fate"); 7983 7984 $self->set_caseless_equivalent($caseless_equivalent); 7985 } 7986 } 7987 7988 # Now that we've declared these to be equivalent, any changes to one 7989 # of the tables would invalidate that equivalency. 7990 $self->lock; 7991 $other->lock; 7992 return; 7993 } 7994 7995 sub set_complement($self, $other) { 7996 # Set $self to be the complement of the parameter table. $self is 7997 # locked, as what it contains should all come from the other table. 7998 7999 if ($other->complement != 0) { 8000 Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement); 8001 return; 8002 } 8003 $complement{pack 'J', refaddr $self} = $other; 8004 8005 # Be sure the other property knows we are depending on them; or the 8006 # other table if it is one in the current property. 8007 if ($self->property != $other->property) { 8008 $other->property->set_has_dependency(1); 8009 } 8010 else { 8011 $other->set_has_dependency(1); 8012 } 8013 $self->lock; 8014 return; 8015 } 8016 8017 sub add_range($self, @range) { # Add a range to the list for this table. 8018 # Rest of parameters passed on 8019 8020 return if $self->carp_if_locked; 8021 return $self->_range_list->add_range(@range); 8022 } 8023 8024 sub header($self) { 8025 # All match tables are to be used only by the Perl core. 8026 return $self->SUPER::header() . $INTERNAL_ONLY_HEADER; 8027 } 8028 8029 sub pre_body { # Does nothing for match tables. 8030 return 8031 } 8032 8033 sub append_to_body { # Does nothing for match tables. 8034 return 8035 } 8036 8037 sub set_fate($self, $fate, $reason=undef) { 8038 $self->SUPER::set_fate($fate, $reason); 8039 8040 # All children share this fate 8041 foreach my $child ($self->children) { 8042 $child->set_fate($fate, $reason); 8043 } 8044 return; 8045 } 8046 8047 sub calculate_table_definition 8048 { 8049 # Returns a human-readable string showing some or all of the code 8050 # points matched by this table. The string will include a 8051 # bracketed-character class for all characters matched in the 00-FF 8052 # range, and the first few ranges matched beyond that. 8053 my $max_ranges = 6; 8054 8055 my $self = shift; 8056 my $definition = $self->definition || ""; 8057 8058 # Skip this if already have a definition. 8059 return $definition if $definition; 8060 8061 my $lows_string = ""; # The string representation of the 0-FF 8062 # characters 8063 my $string_range = ""; # The string rep. of the above FF ranges 8064 my $range_count = 0; # How many ranges in $string_rage 8065 8066 my @lows_invlist; # The inversion list of the 0-FF code points 8067 my $first_non_control = ord(" "); # Everything below this is a 8068 # control, on ASCII or EBCDIC 8069 my $max_table_code_point = $self->max; 8070 8071 # On ASCII platforms, the range 80-FF contains no printables. 8072 my $highest_printable = ((main::NON_ASCII_PLATFORM) ? 255 : 126); 8073 8074 8075 # Look through the first few ranges matched by this table. 8076 $self->reset_each_range; # Defensive programming 8077 while (defined (my $range = $self->each_range())) { 8078 my $start = $range->start; 8079 my $end = $range->end; 8080 8081 # Accumulate an inversion list of the 00-FF code points 8082 if ($start < 256 && ($start > 0 || $end < 256)) { 8083 push @lows_invlist, $start; 8084 push @lows_invlist, 1 + (($end < 256) ? $end : 255); 8085 8086 # Get next range if there are more ranges below 256 8087 next if $end < 256 && $end < $max_table_code_point; 8088 8089 # If the range straddles the 255/256 boundary, we split it 8090 # there. We already added above the low portion to the 8091 # inversion list 8092 $start = 256 if $end > 256; 8093 } 8094 8095 # Here, @lows_invlist contains the code points below 256, and 8096 # there is no other range, or the current one starts at or above 8097 # 256. Generate the [char class] for the 0-255 ones. 8098 while (@lows_invlist) { 8099 8100 # If this range (necessarily the first one, by the way) starts 8101 # at 0 ... 8102 if ($lows_invlist[0] == 0) { 8103 8104 # If it ends within the block of controls, that means that 8105 # some controls are in it and some aren't. Since Unicode 8106 # properties pretty much only know about a few of the 8107 # controls, like \n, \t, this means that its one of them 8108 # that isn't in the range. Complement the inversion list 8109 # which will likely cause these to be output using their 8110 # mnemonics, hence being clearer. 8111 if ($lows_invlist[1] < $first_non_control) { 8112 $lows_string .= '^'; 8113 shift @lows_invlist; 8114 push @lows_invlist, 256; 8115 } 8116 elsif ($lows_invlist[1] <= $highest_printable) { 8117 8118 # Here, it extends into the printables block. Split 8119 # into two ranges so that the controls are separate. 8120 $lows_string .= sprintf "\\x00-\\x%02x", 8121 $first_non_control - 1; 8122 $lows_invlist[0] = $first_non_control; 8123 } 8124 } 8125 8126 # If the range completely contains the printables, don't 8127 # individually spell out the printables. 8128 if ( $lows_invlist[0] <= $first_non_control 8129 && $lows_invlist[1] > $highest_printable) 8130 { 8131 $lows_string .= sprintf "\\x%02x-\\x%02x", 8132 $lows_invlist[0], $lows_invlist[1] - 1; 8133 shift @lows_invlist; 8134 shift @lows_invlist; 8135 next; 8136 } 8137 8138 # Here, the range may include some but not all printables. 8139 # Look at each one individually 8140 foreach my $ord (shift @lows_invlist .. shift(@lows_invlist) - 1) { 8141 my $char = chr $ord; 8142 8143 # If there is already something in the list, an 8144 # alphanumeric char could be the next in sequence. If so, 8145 # we start or extend a range. That is, we could have so 8146 # far something like 'a-c', and the next char is a 'd', so 8147 # we change it to 'a-d'. We use native_to_unicode() 8148 # because a-z on EBCDIC means 26 chars, and excludes the 8149 # gap ones. 8150 if ($lows_string ne "" && $char =~ /[[:alnum:]]/) { 8151 my $prev = substr($lows_string, -1); 8152 if ( $prev !~ /[[:alnum:]]/ 8153 || utf8::native_to_unicode(ord $prev) + 1 8154 != utf8::native_to_unicode(ord $char)) 8155 { 8156 # Not extending the range 8157 $lows_string .= $char; 8158 } 8159 elsif ( length $lows_string > 1 8160 && substr($lows_string, -2, 1) eq '-') 8161 { 8162 # We had a sequence like '-c' and the current 8163 # character is 'd'. Extend the range. 8164 substr($lows_string, -1, 1) = $char; 8165 } 8166 else { 8167 # We had something like 'd' and this is 'e'. 8168 # Start a range. 8169 $lows_string .= "-$char"; 8170 } 8171 } 8172 elsif ($char =~ /[[:graph:]]/) { 8173 8174 # We output a graphic char as-is, preceded by a 8175 # backslash if it is a metacharacter 8176 $lows_string .= '\\' 8177 if $char =~ /[\\\^\$\@\%\|()\[\]\{\}\-\/"']/; 8178 $lows_string .= $char; 8179 } # Otherwise use mnemonic for any that have them 8180 elsif ($char =~ /[\a]/) { 8181 $lows_string .= '\a'; 8182 } 8183 elsif ($char =~ /[\b]/) { 8184 $lows_string .= '\b'; 8185 } 8186 elsif ($char eq "\e") { 8187 $lows_string .= '\e'; 8188 } 8189 elsif ($char eq "\f") { 8190 $lows_string .= '\f'; 8191 } 8192 elsif ($char eq "\cK") { 8193 $lows_string .= '\cK'; 8194 } 8195 elsif ($char eq "\n") { 8196 $lows_string .= '\n'; 8197 } 8198 elsif ($char eq "\r") { 8199 $lows_string .= '\r'; 8200 } 8201 elsif ($char eq "\t") { 8202 $lows_string .= '\t'; 8203 } 8204 else { 8205 8206 # Here is a non-graphic without a mnemonic. We use \x 8207 # notation. But if the ordinal of this is one above 8208 # the previous, create or extend the range 8209 my $hex_representation = sprintf("%02x", ord $char); 8210 if ( length $lows_string >= 4 8211 && substr($lows_string, -4, 2) eq '\\x' 8212 && hex(substr($lows_string, -2)) + 1 == ord $char) 8213 { 8214 if ( length $lows_string >= 5 8215 && substr($lows_string, -5, 1) eq '-' 8216 && ( length $lows_string == 5 8217 || substr($lows_string, -6, 1) ne '\\')) 8218 { 8219 substr($lows_string, -2) = $hex_representation; 8220 } 8221 else { 8222 $lows_string .= '-\\x' . $hex_representation; 8223 } 8224 } 8225 else { 8226 $lows_string .= '\\x' . $hex_representation; 8227 } 8228 } 8229 } 8230 } 8231 8232 # Done with assembling the string of all lows. If there are only 8233 # lows in the property, are completely done. 8234 if ($max_table_code_point < 256) { 8235 $self->reset_each_range; 8236 last; 8237 } 8238 8239 # Otherwise, quit if reached max number of non-lows ranges. If 8240 # there are lows, count them as one unit towards the maximum. 8241 $range_count++; 8242 if ($range_count > (($lows_string eq "") ? $max_ranges : $max_ranges - 1)) { 8243 $string_range .= " ..."; 8244 $self->reset_each_range; 8245 last; 8246 } 8247 8248 # Otherwise add this range. 8249 $string_range .= ", " if $string_range ne ""; 8250 if ($start == $end) { 8251 $string_range .= sprintf("U+%04X", $start); 8252 } 8253 elsif ($end >= $MAX_WORKING_CODEPOINT) { 8254 $string_range .= sprintf("U+%04X..infinity", $start); 8255 } 8256 else { 8257 $string_range .= sprintf("U+%04X..%04X", 8258 $start, $end); 8259 } 8260 } 8261 8262 # Done with all the ranges we're going to look at. Assemble the 8263 # definition from the lows + non-lows. 8264 8265 if ($lows_string ne "" || $string_range ne "") { 8266 if ($lows_string ne "") { 8267 $definition .= "[$lows_string]"; 8268 $definition .= ", " if $string_range; 8269 } 8270 $definition .= $string_range; 8271 } 8272 8273 return $definition; 8274 } 8275 8276 sub write($self) { 8277 return $self->SUPER::write(0); # No adjustments 8278 } 8279 8280 # $leader - Should only be called on the leader table of an equivalent group 8281 sub set_final_comment($leader) { 8282 # This creates a comment for the file that is to hold the match table 8283 # $self. It is somewhat convoluted to make the English read nicely, 8284 # but, heh, it's just a comment. 8285 # This should be called only with the leader match table of all the 8286 # ones that share the same file. It lists all such tables, ordered so 8287 # that related ones are together. 8288 8289 return unless $debugging_build; 8290 8291 my $addr = pack 'J', refaddr $leader; 8292 8293 if ($leader{$addr} != $leader) { 8294 Carp::my_carp_bug(<<END 8295set_final_comment() must be called on a leader table, which $leader is not. 8296It is equivalent to $leader{$addr}. No comment created 8297END 8298 ); 8299 return; 8300 } 8301 8302 # Get the number of code points matched by each of the tables in this 8303 # file, and add underscores for clarity. 8304 my $count = $leader->count; 8305 my $unicode_count; 8306 my $non_unicode_string; 8307 if ($count > $MAX_UNICODE_CODEPOINTS) { 8308 $unicode_count = $count - ($MAX_WORKING_CODEPOINT 8309 - $MAX_UNICODE_CODEPOINT); 8310 $non_unicode_string = "All above-Unicode code points match as well, and are also returned"; 8311 } 8312 else { 8313 $unicode_count = $count; 8314 $non_unicode_string = ""; 8315 } 8316 my $string_count = main::clarify_code_point_count($unicode_count); 8317 8318 my $loose_count = 0; # how many aliases loosely matched 8319 my $compound_name = ""; # ? Are any names compound?, and if so, an 8320 # example 8321 my $properties_with_compound_names = 0; # count of these 8322 8323 8324 my %flags; # The status flags used in the file 8325 my $total_entries = 0; # number of entries written in the comment 8326 my $matches_comment = ""; # The portion of the comment about the 8327 # \p{}'s 8328 my @global_comments; # List of all the tables' comments that are 8329 # there before this routine was called. 8330 my $has_ucd_alias = 0; # If there is an alias that is accessible via 8331 # Unicode::UCD. If not, then don't say it is 8332 # in the comment 8333 8334 # Get list of all the parent tables that are equivalent to this one 8335 # (including itself). 8336 my @parents = grep { $parent{main::objaddr $_} == $_ } 8337 main::uniques($leader, @{$equivalents{$addr}}); 8338 my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated 8339 # tables 8340 for my $parent (@parents) { 8341 8342 my $property = $parent->property; 8343 8344 # Special case 'N' tables in properties with two match tables when 8345 # the other is a 'Y' one. These are likely to be binary tables, 8346 # but not necessarily. In either case, \P{} will match the 8347 # complement of \p{}, and so if something is a synonym of \p, the 8348 # complement of that something will be the synonym of \P. This 8349 # would be true of any property with just two match tables, not 8350 # just those whose values are Y and N; but that would require a 8351 # little extra work, and there are none such so far in Unicode. 8352 my $perl_p = 'p'; # which is it? \p{} or \P{} 8353 my @yes_perl_synonyms; # list of any synonyms for the 'Y' table 8354 8355 if (scalar $property->tables == 2 8356 && $parent == $property->table('N') 8357 && defined (my $yes = $property->table('Y'))) 8358 { 8359 my $yes_addr = pack 'J', refaddr $yes; 8360 @yes_perl_synonyms 8361 = grep { $_->property == $perl } 8362 main::uniques($yes, 8363 $parent{$yes_addr}, 8364 $parent{$yes_addr}->children); 8365 8366 # But these synonyms are \P{} ,not \p{} 8367 $perl_p = 'P'; 8368 } 8369 8370 my @description; # Will hold the table description 8371 my @note; # Will hold the table notes. 8372 my @conflicting; # Will hold the table conflicts. 8373 8374 # Look at the parent, any yes synonyms, and all the children 8375 my $parent_addr = pack 'J', refaddr $parent; 8376 for my $table ($parent, 8377 @yes_perl_synonyms, 8378 @{$children{$parent_addr}}) 8379 { 8380 my $table_addr = pack 'J', refaddr $table; 8381 my $table_property = $table->property; 8382 8383 # Tables are separated by a blank line to create a grouping. 8384 $matches_comment .= "\n" if $matches_comment; 8385 8386 # The table is named based on the property and value 8387 # combination it is for, like script=greek. But there may be 8388 # a number of synonyms for each side, like 'sc' for 'script', 8389 # and 'grek' for 'greek'. Any combination of these is a valid 8390 # name for this table. In this case, there are three more, 8391 # 'sc=grek', 'sc=greek', and 'script='grek'. Rather than 8392 # listing all possible combinations in the comment, we make 8393 # sure that each synonym occurs at least once, and add 8394 # commentary that the other combinations are possible. 8395 # Because regular expressions don't recognize things like 8396 # \p{jsn=}, only look at non-null right-hand-sides 8397 my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table_property->aliases; 8398 my @table_aliases = grep { $_->name ne "" } $table->aliases; 8399 8400 # The alias lists above are already ordered in the order we 8401 # want to output them. To ensure that each synonym is listed, 8402 # we must use the max of the two numbers. But if there are no 8403 # legal synonyms (nothing in @table_aliases), then we don't 8404 # list anything. 8405 my $listed_combos = (@table_aliases) 8406 ? main::max(scalar @table_aliases, 8407 scalar @property_aliases) 8408 : 0; 8409 trace "$listed_combos, tables=", scalar @table_aliases, "; property names=", scalar @property_aliases if main::DEBUG; 8410 8411 my $property_had_compound_name = 0; 8412 8413 for my $i (0 .. $listed_combos - 1) { 8414 $total_entries++; 8415 8416 # The current alias for the property is the next one on 8417 # the list, or if beyond the end, start over. Similarly 8418 # for the table (\p{prop=table}) 8419 my $property_alias = $property_aliases 8420 [$i % @property_aliases]->name; 8421 my $table_alias_object = $table_aliases 8422 [$i % @table_aliases]; 8423 my $table_alias = $table_alias_object->name; 8424 my $loose_match = $table_alias_object->loose_match; 8425 $has_ucd_alias |= $table_alias_object->ucd; 8426 8427 if ($table_alias !~ /\D/) { # Clarify large numbers. 8428 $table_alias = main::clarify_number($table_alias) 8429 } 8430 8431 # Add a comment for this alias combination 8432 my $current_match_comment; 8433 if ($table_property == $perl) { 8434 $current_match_comment = "\\$perl_p" 8435 . "{$table_alias}"; 8436 } 8437 else { 8438 $current_match_comment 8439 = "\\p{$property_alias=$table_alias}"; 8440 $property_had_compound_name = 1; 8441 } 8442 8443 # Flag any abnormal status for this table. 8444 my $flag = $property->status 8445 || $table->status 8446 || $table_alias_object->status; 8447 if ($flag && $flag ne $PLACEHOLDER) { 8448 $flags{$flag} = $status_past_participles{$flag}; 8449 } 8450 8451 $loose_count++; 8452 8453 # Pretty up the comment. Note the \b; it says don't make 8454 # this line a continuation. 8455 $matches_comment .= sprintf("\b%-1s%-s%s\n", 8456 $flag, 8457 " " x 7, 8458 $current_match_comment); 8459 } # End of generating the entries for this table. 8460 8461 # Save these for output after this group of related tables. 8462 push @description, $table->description; 8463 push @note, $table->note; 8464 push @conflicting, $table->conflicting; 8465 8466 # And this for output after all the tables. 8467 push @global_comments, $table->comment; 8468 8469 # Compute an alternate compound name using the final property 8470 # synonym and the first table synonym with a colon instead of 8471 # the equal sign used elsewhere. 8472 if ($property_had_compound_name) { 8473 $properties_with_compound_names ++; 8474 if (! $compound_name || @property_aliases > 1) { 8475 $compound_name = $property_aliases[-1]->name 8476 . ': ' 8477 . $table_aliases[0]->name; 8478 } 8479 } 8480 } # End of looping through all children of this table 8481 8482 # Here have assembled in $matches_comment all the related tables 8483 # to the current parent (preceded by the same info for all the 8484 # previous parents). Put out information that applies to all of 8485 # the current family. 8486 if (@conflicting) { 8487 8488 # But output the conflicting information now, as it applies to 8489 # just this table. 8490 my $conflicting = join ", ", @conflicting; 8491 if ($conflicting) { 8492 $matches_comment .= <<END; 8493 8494 Note that contrary to what you might expect, the above is NOT the same as 8495END 8496 $matches_comment .= "any of: " if @conflicting > 1; 8497 $matches_comment .= "$conflicting\n"; 8498 } 8499 } 8500 if (@description) { 8501 $matches_comment .= "\n Meaning: " 8502 . join('; ', @description) 8503 . "\n"; 8504 } 8505 if (@note) { 8506 $matches_comment .= "\n Note: " 8507 . join("\n ", @note) 8508 . "\n"; 8509 } 8510 } # End of looping through all tables 8511 8512 $matches_comment .= "\n$non_unicode_string\n" if $non_unicode_string; 8513 8514 8515 my $code_points; 8516 my $match; 8517 my $any_of_these; 8518 if ($unicode_count == 1) { 8519 $match = 'matches'; 8520 $code_points = 'single code point'; 8521 } 8522 else { 8523 $match = 'match'; 8524 $code_points = "$string_count code points"; 8525 } 8526 8527 my $synonyms; 8528 my $entries; 8529 if ($total_entries == 1) { 8530 $synonyms = ""; 8531 $entries = 'entry'; 8532 $any_of_these = 'this' 8533 } 8534 else { 8535 $synonyms = " any of the following regular expression constructs"; 8536 $entries = 'entries'; 8537 $any_of_these = 'any of these' 8538 } 8539 8540 my $comment = ""; 8541 if ($has_ucd_alias) { 8542 $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n"; 8543 } 8544 if ($has_unrelated) { 8545 $comment .= <<END; 8546This file is for tables that are not necessarily related: To conserve 8547resources, every table that matches the identical set of code points in this 8548version of Unicode uses this file. Each one is listed in a separate group 8549below. It could be that the tables will match the same set of code points in 8550other Unicode releases, or it could be purely coincidence that they happen to 8551be the same in Unicode $unicode_version, and hence may not in other versions. 8552 8553END 8554 } 8555 8556 if (%flags) { 8557 foreach my $flag (sort keys %flags) { 8558 $comment .= <<END; 8559'$flag' below means that this form is $flags{$flag}. 8560END 8561 if ($flag eq $INTERNAL_ALIAS) { 8562 $comment .= "DO NOT USE!!!"; 8563 } 8564 else { 8565 $comment .= "Consult $pod_file.pod"; 8566 } 8567 $comment .= "\n"; 8568 } 8569 $comment .= "\n"; 8570 } 8571 8572 if ($total_entries == 0) { 8573 Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string. Creating file anyway."); 8574 $comment .= <<END; 8575This file returns the $code_points in Unicode Version 8576$unicode_version for 8577$leader, but it is inaccessible through Perl regular expressions, as 8578"\\p{prop=}" is not recognized. 8579END 8580 8581 } else { 8582 $comment .= <<END; 8583This file returns the $code_points in Unicode Version 8584$unicode_version that 8585$match$synonyms: 8586 8587$matches_comment 8588$pod_file.pod should be consulted for the syntax rules for $any_of_these, 8589including if adding or subtracting white space, underscore, and hyphen 8590characters matters or doesn't matter, and other permissible syntactic 8591variants. Upper/lower case distinctions never matter. 8592END 8593 8594 } 8595 if ($compound_name) { 8596 $comment .= <<END; 8597 8598A colon can be substituted for the equals sign, and 8599END 8600 if ($properties_with_compound_names > 1) { 8601 $comment .= <<END; 8602within each group above, 8603END 8604 } 8605 $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name); 8606 8607 # Note the \b below, it says don't make that line a continuation. 8608 $comment .= <<END; 8609anything to the left of the equals (or colon) can be combined with anything to 8610the right. Thus, for example, 8611$compound_name 8612\bis also valid. 8613END 8614 } 8615 8616 # And append any comment(s) from the actual tables. They are all 8617 # gathered here, so may not read all that well. 8618 if (@global_comments) { 8619 $comment .= "\n" . join("\n\n", @global_comments) . "\n"; 8620 } 8621 8622 if ($count) { # The format differs if no code points, and needs no 8623 # explanation in that case 8624 if ($leader->write_as_invlist) { 8625 $comment.= <<END; 8626 8627The first data line of this file begins with the letter V to indicate it is in 8628inversion list format. The number following the V gives the number of lines 8629remaining. Each of those remaining lines is a single number representing the 8630starting code point of a range which goes up to but not including the number 8631on the next line; The 0th, 2nd, 4th... ranges are for code points that match 8632the property; the 1st, 3rd, 5th... are ranges of code points that don't match 8633the property. The final line's range extends to the platform's infinity. 8634END 8635 } 8636 else { 8637 $comment.= <<END; 8638The format of the lines of this file is: 8639START\\tSTOP\\twhere START is the starting code point of the range, in hex; 8640STOP is the ending point, or if omitted, the range has just one code point. 8641END 8642 } 8643 if ($leader->output_range_counts) { 8644 $comment .= <<END; 8645Numbers in comments in [brackets] indicate how many code points are in the 8646range. 8647END 8648 } 8649 } 8650 8651 $leader->set_comment(main::join_lines($comment)); 8652 return; 8653 } 8654 8655 # Accessors for the underlying list 8656 for my $sub (qw( 8657 get_valid_code_point 8658 get_invalid_code_point 8659 )) 8660 { 8661 no strict "refs"; 8662 *$sub = sub { 8663 use strict "refs"; 8664 my $self = shift; 8665 8666 return $self->_range_list->$sub(@_); 8667 } 8668 } 8669} # End closure for Match_Table 8670 8671package Property; 8672 8673# The Property class represents a Unicode property, or the $perl 8674# pseudo-property. It contains a map table initialized empty at construction 8675# time, and for properties accessible through regular expressions, various 8676# match tables, created through the add_match_table() method, and referenced 8677# by the table('NAME') or tables() methods, the latter returning a list of all 8678# of the match tables. Otherwise table operations implicitly are for the map 8679# table. 8680# 8681# Most of the data in the property is actually about its map table, so it 8682# mostly just uses that table's accessors for most methods. The two could 8683# have been combined into one object, but for clarity because of their 8684# differing semantics, they have been kept separate. It could be argued that 8685# the 'file' and 'directory' fields should be kept with the map table. 8686# 8687# Each property has a type. This can be set in the constructor, or in the 8688# set_type accessor, but mostly it is figured out by the data. Every property 8689# starts with unknown type, overridden by a parameter to the constructor, or 8690# as match tables are added, or ranges added to the map table, the data is 8691# inspected, and the type changed. After the table is mostly or entirely 8692# filled, compute_type() should be called to finalize the analysis. 8693# 8694# There are very few operations defined. One can safely remove a range from 8695# the map table, and property_add_or_replace_non_nulls() adds the maps from another 8696# table to this one, replacing any in the intersection of the two. 8697 8698sub standardize { return main::standardize($_[0]); } 8699sub trace { return main::trace(@_) if main::DEBUG && $to_trace } 8700 8701{ # Closure 8702 8703 # This hash will contain as keys, all the aliases of all properties, and 8704 # as values, pointers to their respective property objects. This allows 8705 # quick look-up of a property from any of its names. 8706 my %alias_to_property_of; 8707 8708 sub dump_alias_to_property_of { 8709 # For debugging 8710 8711 print "\n", main::simple_dumper (\%alias_to_property_of), "\n"; 8712 return; 8713 } 8714 8715 sub property_ref($name) { 8716 # This is a package subroutine, not called as a method. 8717 # If the single parameter is a literal '*' it returns a list of all 8718 # defined properties. 8719 # Otherwise, the single parameter is a name, and it returns a pointer 8720 # to the corresponding property object, or undef if none. 8721 # 8722 # Properties can have several different names. The 'standard' form of 8723 # each of them is stored in %alias_to_property_of as they are defined. 8724 # But it's possible that this subroutine will be called with some 8725 # variant, so if the initial lookup fails, it is repeated with the 8726 # standardized form of the input name. If found, besides returning the 8727 # result, the input name is added to the list so future calls won't 8728 # have to do the conversion again. 8729 8730 if (! defined $name) { 8731 Carp::my_carp_bug("Undefined input property. No action taken."); 8732 return; 8733 } 8734 8735 return main::uniques(values %alias_to_property_of) if $name eq '*'; 8736 8737 # Return cached result if have it. 8738 my $result = $alias_to_property_of{$name}; 8739 return $result if defined $result; 8740 8741 # Convert the input to standard form. 8742 my $standard_name = standardize($name); 8743 8744 $result = $alias_to_property_of{$standard_name}; 8745 return unless defined $result; # Don't cache undefs 8746 8747 # Cache the result before returning it. 8748 $alias_to_property_of{$name} = $result; 8749 return $result; 8750 } 8751 8752 8753 main::setup_package(); 8754 8755 my %map; 8756 # A pointer to the map table object for this property 8757 main::set_access('map', \%map); 8758 8759 my %full_name; 8760 # The property's full name. This is a duplicate of the copy kept in the 8761 # map table, but is needed because stringify needs it during 8762 # construction of the map table, and then would have a chicken before egg 8763 # problem. 8764 main::set_access('full_name', \%full_name, 'r'); 8765 8766 my %table_ref; 8767 # This hash will contain as keys, all the aliases of any match tables 8768 # attached to this property, and as values, the pointers to their 8769 # respective tables. This allows quick look-up of a table from any of its 8770 # names. 8771 main::set_access('table_ref', \%table_ref); 8772 8773 my %type; 8774 # The type of the property, $ENUM, $BINARY, etc 8775 main::set_access('type', \%type, 'r'); 8776 8777 my %file; 8778 # The filename where the map table will go (if actually written). 8779 # Normally defaulted, but can be overridden. 8780 main::set_access('file', \%file, 'r', 's'); 8781 8782 my %directory; 8783 # The directory where the map table will go (if actually written). 8784 # Normally defaulted, but can be overridden. 8785 main::set_access('directory', \%directory, 's'); 8786 8787 my %pseudo_map_type; 8788 # This is used to affect the calculation of the map types for all the 8789 # ranges in the table. It should be set to one of the values that signify 8790 # to alter the calculation. 8791 main::set_access('pseudo_map_type', \%pseudo_map_type, 'r'); 8792 8793 my %has_only_code_point_maps; 8794 # A boolean used to help in computing the type of data in the map table. 8795 main::set_access('has_only_code_point_maps', \%has_only_code_point_maps); 8796 8797 my %unique_maps; 8798 # A list of the first few distinct mappings this property has. This is 8799 # used to disambiguate between binary and enum property types, so don't 8800 # have to keep more than three. 8801 main::set_access('unique_maps', \%unique_maps); 8802 8803 my %pre_declared_maps; 8804 # A boolean that gives whether the input data should declare all the 8805 # tables used, or not. If the former, unknown ones raise a warning. 8806 main::set_access('pre_declared_maps', 8807 \%pre_declared_maps, 'r', 's'); 8808 8809 my %match_subdir; 8810 # For properties whose shortest names are too long for a DOS 8.3 8811 # filesystem to distinguish between, this is used to manually give short 8812 # names for the directory name immediately under $match_tables that the 8813 # match tables for this property should be placed in. 8814 main::set_access('match_subdir', \%match_subdir, 'r'); 8815 8816 my %has_dependency; 8817 # A boolean that gives whether some table somewhere is defined as the 8818 # complement of a table in this property. This is a crude, but currently 8819 # sufficient, mechanism to make this property not get destroyed before 8820 # what is dependent on it is. Other dependencies could be added, so the 8821 # name was chosen to reflect a more general situation than actually is 8822 # currently the case. 8823 main::set_access('has_dependency', \%has_dependency, 'r', 's'); 8824 8825 sub new { 8826 # The only required parameter is the positionally first, name. All 8827 # other parameters are key => value pairs. See the documentation just 8828 # above for the meanings of the ones not passed directly on to the map 8829 # table constructor. 8830 8831 my $class = shift; 8832 my $name = shift || ""; 8833 8834 my $self = property_ref($name); 8835 if (defined $self) { 8836 my $options_string = join ", ", @_; 8837 $options_string = ". Ignoring options $options_string" if $options_string; 8838 Carp::my_carp("$self is already in use. Using existing one$options_string;"); 8839 return $self; 8840 } 8841 8842 my %args = @_; 8843 8844 $self = bless \do { my $anonymous_scalar }, $class; 8845 my $addr = pack 'J', refaddr $self; 8846 8847 $directory{$addr} = delete $args{'Directory'}; 8848 $file{$addr} = delete $args{'File'}; 8849 $full_name{$addr} = delete $args{'Full_Name'} || $name; 8850 $type{$addr} = delete $args{'Type'} || $UNKNOWN; 8851 $pseudo_map_type{$addr} = delete $args{'Map_Type'}; 8852 $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'} 8853 # Starting in this release, property 8854 # values should be defined for all 8855 # properties, except those overriding this 8856 // $v_version ge v5.1.0; 8857 $match_subdir{$addr} = delete $args{'Match_SubDir'}; 8858 8859 # Rest of parameters passed on. 8860 8861 $has_only_code_point_maps{$addr} = 1; 8862 $table_ref{$addr} = { }; 8863 $unique_maps{$addr} = { }; 8864 $has_dependency{$addr} = 0; 8865 8866 $map{$addr} = Map_Table->new($name, 8867 Full_Name => $full_name{$addr}, 8868 _Alias_Hash => \%alias_to_property_of, 8869 _Property => $self, 8870 %args); 8871 return $self; 8872 } 8873 8874 # See this program's beginning comment block about overloading the copy 8875 # constructor. Few operations are defined on properties, but a couple are 8876 # useful. It is safe to take the inverse of a property, and to remove a 8877 # single code point from it. 8878 use overload 8879 fallback => 0, 8880 qw("") => "_operator_stringify", 8881 "." => \&main::_operator_dot, 8882 ".=" => \&main::_operator_dot_equal, 8883 '==' => \&main::_operator_equal, 8884 '!=' => \&main::_operator_not_equal, 8885 '=' => sub { return shift }, 8886 '-=' => "_minus_and_equal", 8887 ; 8888 8889 sub _operator_stringify($self, $other="", $reversed=0) { 8890 return "Property '" . $self->full_name . "'"; 8891 } 8892 8893 sub _minus_and_equal($self, $other, $reversed=0) { 8894 # Remove a single code point from the map table of a property. 8895 if (ref $other) { 8896 Carp::my_carp_bug("Bad news. Can't cope with a " 8897 . ref($other) 8898 . " argument to '-='. Subtraction ignored."); 8899 return $self; 8900 } 8901 elsif ($reversed) { # Shouldn't happen in a -=, but just in case 8902 Carp::my_carp_bug("Bad news. Can't cope with subtracting a " 8903 . ref $self 8904 . " from a non-object. undef returned."); 8905 return; 8906 } 8907 else { 8908 $map{pack 'J', refaddr $self}->delete_range($other, $other); 8909 } 8910 return $self; 8911 } 8912 8913 sub add_match_table { 8914 # Add a new match table for this property, with name given by the 8915 # parameter. It returns a pointer to the table. 8916 8917 my $self = shift; 8918 my $name = shift; 8919 my %args = @_; 8920 8921 my $addr = pack 'J', refaddr $self; 8922 8923 my $table = $table_ref{$addr}{$name}; 8924 my $standard_name = main::standardize($name); 8925 if (defined $table 8926 || (defined ($table = $table_ref{$addr}{$standard_name}))) 8927 { 8928 Carp::my_carp("Table '$name' in $self is already in use. Using existing one"); 8929 $table_ref{$addr}{$name} = $table; 8930 return $table; 8931 } 8932 else { 8933 8934 # See if this is a perl extension, if not passed in. 8935 my $perl_extension = delete $args{'Perl_Extension'}; 8936 $perl_extension 8937 = $self->perl_extension if ! defined $perl_extension; 8938 8939 my $fate; 8940 my $suppression_reason = ""; 8941 if ($self->name =~ /^_/) { 8942 $fate = $SUPPRESSED; 8943 $suppression_reason = "Parent property is internal only"; 8944 } 8945 elsif ($self->fate >= $SUPPRESSED) { 8946 $fate = $self->fate; 8947 $suppression_reason = $why_suppressed{$self->complete_name}; 8948 8949 } 8950 elsif ($name =~ /^_/) { 8951 $fate = $INTERNAL_ONLY; 8952 } 8953 $table = Match_Table->new( 8954 Name => $name, 8955 Perl_Extension => $perl_extension, 8956 _Alias_Hash => $table_ref{$addr}, 8957 _Property => $self, 8958 Fate => $fate, 8959 Suppression_Reason => $suppression_reason, 8960 Status => $self->status, 8961 _Status_Info => $self->status_info, 8962 %args); 8963 return unless defined $table; 8964 } 8965 8966 # Save the names for quick look up 8967 $table_ref{$addr}{$standard_name} = $table; 8968 $table_ref{$addr}{$name} = $table; 8969 8970 # Perhaps we can figure out the type of this property based on the 8971 # fact of adding this match table. First, string properties don't 8972 # have match tables; second, a binary property can't have 3 match 8973 # tables 8974 if ($type{$addr} == $UNKNOWN) { 8975 $type{$addr} = $NON_STRING; 8976 } 8977 elsif ($type{$addr} == $STRING) { 8978 Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News."); 8979 $type{$addr} = $NON_STRING; 8980 } 8981 elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) { 8982 if (scalar main::uniques(values %{$table_ref{$addr}}) > 2) { 8983 if ($type{$addr} == $BINARY) { 8984 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News."); 8985 } 8986 $type{$addr} = $ENUM; 8987 } 8988 } 8989 8990 return $table; 8991 } 8992 8993 sub delete_match_table($self, $table_to_remove) { 8994 # Delete the table referred to by $2 from the property $1. 8995 my $addr = pack 'J', refaddr $self; 8996 8997 # Remove all names that refer to it. 8998 foreach my $key (keys %{$table_ref{$addr}}) { 8999 delete $table_ref{$addr}{$key} 9000 if $table_ref{$addr}{$key} == $table_to_remove; 9001 } 9002 9003 $table_to_remove->DESTROY; 9004 return; 9005 } 9006 9007 sub table($self, $name) { 9008 # Return a pointer to the match table (with name given by the 9009 # parameter) associated with this property; undef if none. 9010 my $addr = pack 'J', refaddr $self; 9011 9012 return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name}; 9013 9014 # If quick look-up failed, try again using the standard form of the 9015 # input name. If that succeeds, cache the result before returning so 9016 # won't have to standardize this input name again. 9017 my $standard_name = main::standardize($name); 9018 return unless defined $table_ref{$addr}{$standard_name}; 9019 9020 $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name}; 9021 return $table_ref{$addr}{$name}; 9022 } 9023 9024 sub tables { 9025 # Return a list of pointers to all the match tables attached to this 9026 # property 9027 9028 return main::uniques(values %{$table_ref{pack 'J', refaddr shift}}); 9029 } 9030 9031 sub directory { 9032 # Returns the directory the map table for this property should be 9033 # output in. If a specific directory has been specified, that has 9034 # priority; 'undef' is returned if the type isn't defined; 9035 # or $map_directory for everything else. 9036 9037 my $addr = pack 'J', refaddr shift; 9038 9039 return $directory{$addr} if defined $directory{$addr}; 9040 return undef if $type{$addr} == $UNKNOWN; 9041 return $map_directory; 9042 } 9043 9044 sub swash_name($self) { 9045 # Return the name that is used to both: 9046 # 1) Name the file that the map table is written to. 9047 # 2) The name of swash related stuff inside that file. 9048 # The reason for this is that the Perl core historically has used 9049 # certain names that aren't the same as the Unicode property names. 9050 # To continue using these, $file is hard-coded in this file for those, 9051 # but otherwise the standard name is used. This is different from the 9052 # external_name, so that the rest of the files, like in lib can use 9053 # the standard name always, without regard to historical precedent. 9054 my $addr = pack 'J', refaddr $self; 9055 9056 # Swash names are used only on either 9057 # 1) regular or internal-only map tables 9058 # 2) otherwise there should be no access to the 9059 # property map table from other parts of Perl. 9060 return if $map{$addr}->fate != $ORDINARY 9061 && ! ($map{$addr}->name =~ /^_/ 9062 && $map{$addr}->fate == $INTERNAL_ONLY); 9063 9064 return $file{$addr} if defined $file{$addr}; 9065 return $map{$addr}->external_name; 9066 } 9067 9068 sub to_create_match_tables($self) { 9069 # Returns a boolean as to whether or not match tables should be 9070 # created for this property. 9071 9072 # The whole point of this pseudo property is match tables. 9073 return 1 if $self == $perl; 9074 9075 my $addr = pack 'J', refaddr $self; 9076 9077 # Don't generate tables of code points that match the property values 9078 # of a string property. Such a list would most likely have many 9079 # property values, each with just one or very few code points mapping 9080 # to it. 9081 return 0 if $type{$addr} == $STRING; 9082 9083 # Otherwise, do. 9084 return 1; 9085 } 9086 9087 sub property_add_or_replace_non_nulls($self, $other) { 9088 # This adds the mappings in the property $other to $self. Non-null 9089 # mappings from $other override those in $self. It essentially merges 9090 # the two properties, with the second having priority except for null 9091 # mappings. 9092 9093 if (! $other->isa(__PACKAGE__)) { 9094 Carp::my_carp_bug("$other should be a " 9095 . __PACKAGE__ 9096 . ". Not a '" 9097 . ref($other) 9098 . "'. Not added;"); 9099 return; 9100 } 9101 9102 return $map{pack 'J', refaddr $self}->map_add_or_replace_non_nulls($map{pack 'J', refaddr $other}); 9103 } 9104 9105 sub set_proxy_for { 9106 # Certain tables are not generally written out to files, but 9107 # Unicode::UCD has the intelligence to know that the file for $self 9108 # can be used to reconstruct those tables. This routine just changes 9109 # things so that UCD pod entries for those suppressed tables are 9110 # generated, so the fact that a proxy is used is invisible to the 9111 # user. 9112 9113 my $self = shift; 9114 9115 foreach my $property_name (@_) { 9116 my $ref = property_ref($property_name); 9117 next if $ref->to_output_map; 9118 $ref->set_fate($MAP_PROXIED); 9119 } 9120 } 9121 9122 sub set_type($self, $type) { 9123 # Set the type of the property. Mostly this is figured out by the 9124 # data in the table. But this is used to set it explicitly. The 9125 # reason it is not a standard accessor is that when setting a binary 9126 # property, we need to make sure that all the true/false aliases are 9127 # present, as they were omitted in early Unicode releases. 9128 9129 if ($type != $ENUM 9130 && $type != $BINARY 9131 && $type != $FORCED_BINARY 9132 && $type != $STRING) 9133 { 9134 Carp::my_carp("Unrecognized type '$type'. Type not set"); 9135 return; 9136 } 9137 9138 $type{pack 'J', refaddr $self} = $type; 9139 return if $type != $BINARY && $type != $FORCED_BINARY; 9140 9141 my $yes = $self->table('Y'); 9142 $yes = $self->table('Yes') if ! defined $yes; 9143 $yes = $self->add_match_table('Y', Full_Name => 'Yes') 9144 if ! defined $yes; 9145 9146 # Add aliases in order wanted, duplicates will be ignored. We use a 9147 # binary property present in all releases for its ordered lists of 9148 # true/false aliases. Note, that could run into problems in 9149 # outputting things in that we don't distinguish between the name and 9150 # full name of these. Hopefully, if the table was already created 9151 # before this code is executed, it was done with these set properly. 9152 my $bm = property_ref("Bidi_Mirrored"); 9153 foreach my $alias ($bm->table("Y")->aliases) { 9154 $yes->add_alias($alias->name); 9155 } 9156 my $no = $self->table('N'); 9157 $no = $self->table('No') if ! defined $no; 9158 $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no; 9159 foreach my $alias ($bm->table("N")->aliases) { 9160 $no->add_alias($alias->name); 9161 } 9162 9163 return; 9164 } 9165 9166 sub add_map { 9167 # Add a map to the property's map table. This also keeps 9168 # track of the maps so that the property type can be determined from 9169 # its data. 9170 9171 my $self = shift; 9172 my $start = shift; # First code point in range 9173 my $end = shift; # Final code point in range 9174 my $map = shift; # What the range maps to. 9175 # Rest of parameters passed on. 9176 9177 my $addr = pack 'J', refaddr $self; 9178 9179 # If haven't the type of the property, gather information to figure it 9180 # out. 9181 if ($type{$addr} == $UNKNOWN) { 9182 9183 # If the map contains an interior blank or dash, or most other 9184 # nonword characters, it will be a string property. This 9185 # heuristic may actually miss some string properties. If so, they 9186 # may need to have explicit set_types called for them. This 9187 # happens in the Unihan properties. 9188 if ($map =~ / (?<= . ) [ -] (?= . ) /x 9189 || $map =~ / [^\w.\/\ -] /x) 9190 { 9191 $self->set_type($STRING); 9192 9193 # $unique_maps is used for disambiguating between ENUM and 9194 # BINARY later; since we know the property is not going to be 9195 # one of those, no point in keeping the data around 9196 undef $unique_maps{$addr}; 9197 } 9198 else { 9199 9200 # Not necessarily a string. The final decision has to be 9201 # deferred until all the data are in. We keep track of if all 9202 # the values are code points for that eventual decision. 9203 $has_only_code_point_maps{$addr} &= 9204 $map =~ / ^ $code_point_re $/x; 9205 9206 # For the purposes of disambiguating between binary and other 9207 # enumerations at the end, we keep track of the first three 9208 # distinct property values. Once we get to three, we know 9209 # it's not going to be binary, so no need to track more. 9210 if (scalar keys %{$unique_maps{$addr}} < 3) { 9211 $unique_maps{$addr}{main::standardize($map)} = 1; 9212 } 9213 } 9214 } 9215 9216 # Add the mapping by calling our map table's method 9217 return $map{$addr}->add_map($start, $end, $map, @_); 9218 } 9219 9220 sub compute_type($self) { 9221 # Compute the type of the property: $ENUM, $STRING, or $BINARY. This 9222 # should be called after the property is mostly filled with its maps. 9223 # We have been keeping track of what the property values have been, 9224 # and now have the necessary information to figure out the type. 9225 9226 my $addr = pack 'J', refaddr $self; 9227 9228 my $type = $type{$addr}; 9229 9230 # If already have figured these out, no need to do so again, but we do 9231 # a double check on ENUMS to make sure that a string property hasn't 9232 # improperly been classified as an ENUM, so continue on with those. 9233 return if $type == $STRING 9234 || $type == $BINARY 9235 || $type == $FORCED_BINARY; 9236 9237 # If every map is to a code point, is a string property. 9238 if ($type == $UNKNOWN 9239 && ($has_only_code_point_maps{$addr} 9240 || (defined $map{$addr}->default_map 9241 && $map{$addr}->default_map eq ""))) 9242 { 9243 $self->set_type($STRING); 9244 } 9245 else { 9246 9247 # Otherwise, it is to some sort of enumeration. (The case where 9248 # it is a Unicode miscellaneous property, and treated like a 9249 # string in this program is handled in add_map()). Distinguish 9250 # between binary and some other enumeration type. Of course, if 9251 # there are more than two values, it's not binary. But more 9252 # subtle is the test that the default mapping is defined means it 9253 # isn't binary. This in fact may change in the future if Unicode 9254 # changes the way its data is structured. But so far, no binary 9255 # properties ever have @missing lines for them, so the default map 9256 # isn't defined for them. The few properties that are two-valued 9257 # and aren't considered binary have the default map defined 9258 # starting in Unicode 5.0, when the @missing lines appeared; and 9259 # this program has special code to put in a default map for them 9260 # for earlier than 5.0 releases. 9261 if ($type == $ENUM 9262 || scalar keys %{$unique_maps{$addr}} > 2 9263 || defined $self->default_map) 9264 { 9265 my $tables = $self->tables; 9266 my $count = $self->count; 9267 if ($verbosity && $tables > 500 && $tables/$count > .1) { 9268 Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $tables\n"); 9269 } 9270 $self->set_type($ENUM); 9271 } 9272 else { 9273 $self->set_type($BINARY); 9274 } 9275 } 9276 undef $unique_maps{$addr}; # Garbage collect 9277 return; 9278 } 9279 9280 # $reaons - Ignored unless suppressing 9281 sub set_fate($self, $fate, $reason=undef) { 9282 my $addr = pack 'J', refaddr $self; 9283 if ($fate >= $SUPPRESSED) { 9284 $why_suppressed{$self->complete_name} = $reason; 9285 } 9286 9287 # Each table shares the property's fate, except that MAP_PROXIED 9288 # doesn't affect match tables 9289 $map{$addr}->set_fate($fate, $reason); 9290 if ($fate != $MAP_PROXIED) { 9291 foreach my $table ($map{$addr}, $self->tables) { 9292 $table->set_fate($fate, $reason); 9293 } 9294 } 9295 return; 9296 } 9297 9298 9299 # Most of the accessors for a property actually apply to its map table. 9300 # Setup up accessor functions for those, referring to %map 9301 for my $sub (qw( 9302 add_alias 9303 add_anomalous_entry 9304 add_comment 9305 add_conflicting 9306 add_description 9307 add_duplicate 9308 add_note 9309 aliases 9310 comment 9311 complete_name 9312 containing_range 9313 count 9314 default_map 9315 definition 9316 delete_range 9317 description 9318 each_range 9319 external_name 9320 fate 9321 file_path 9322 format 9323 initialize 9324 inverse_list 9325 is_empty 9326 name 9327 note 9328 perl_extension 9329 property 9330 range_count 9331 ranges 9332 range_size_1 9333 replace_map 9334 reset_each_range 9335 set_comment 9336 set_default_map 9337 set_file_path 9338 set_final_comment 9339 _set_format 9340 set_range_size_1 9341 set_status 9342 set_to_output_map 9343 short_name 9344 status 9345 status_info 9346 to_output_map 9347 type_of 9348 value_of 9349 write 9350 )) 9351 # 'property' above is for symmetry, so that one can take 9352 # the property of a property and get itself, and so don't 9353 # have to distinguish between properties and tables in 9354 # calling code 9355 { 9356 no strict "refs"; 9357 *$sub = sub { 9358 use strict "refs"; 9359 my $self = shift; 9360 return $map{pack 'J', refaddr $self}->$sub(@_); 9361 } 9362 } 9363 9364 9365} # End closure 9366 9367package main; 9368 9369sub display_chr { 9370 # Converts an ordinal printable character value to a displayable string, 9371 # using a dotted circle to hold combining characters. 9372 9373 my $ord = shift; 9374 my $chr = chr $ord; 9375 return $chr if $ccc->table(0)->contains($ord); 9376 return "\x{25CC}$chr"; 9377} 9378 9379sub join_lines($input) { 9380 # Returns lines of the input joined together, so that they can be folded 9381 # properly. 9382 # This causes continuation lines to be joined together into one long line 9383 # for folding. A continuation line is any line that doesn't begin with a 9384 # space or "\b" (the latter is stripped from the output). This is so 9385 # lines can be in a HERE document so as to fit nicely in the terminal 9386 # width, but be joined together in one long line, and then folded with 9387 # indents, '#' prefixes, etc, properly handled. 9388 # A blank separates the joined lines except if there is a break; an extra 9389 # blank is inserted after a period ending a line. 9390 9391 # Initialize the return with the first line. 9392 my ($return, @lines) = split "\n", $input; 9393 9394 # If the first line is null, it was an empty line, add the \n back in 9395 $return = "\n" if $return eq ""; 9396 9397 # Now join the remainder of the physical lines. 9398 for my $line (@lines) { 9399 9400 # An empty line means wanted a blank line, so add two \n's to get that 9401 # effect, and go to the next line. 9402 if (length $line == 0) { 9403 $return .= "\n\n"; 9404 next; 9405 } 9406 9407 # Look at the last character of what we have so far. 9408 my $previous_char = substr($return, -1, 1); 9409 9410 # And at the next char to be output. 9411 my $next_char = substr($line, 0, 1); 9412 9413 if ($previous_char ne "\n") { 9414 9415 # Here didn't end wth a nl. If the next char a blank or \b, it 9416 # means that here there is a break anyway. So add a nl to the 9417 # output. 9418 if ($next_char eq " " || $next_char eq "\b") { 9419 $previous_char = "\n"; 9420 $return .= $previous_char; 9421 } 9422 9423 # Add an extra space after periods. 9424 $return .= " " if $previous_char eq '.'; 9425 } 9426 9427 # Here $previous_char is still the latest character to be output. If 9428 # it isn't a nl, it means that the next line is to be a continuation 9429 # line, with a blank inserted between them. 9430 $return .= " " if $previous_char ne "\n"; 9431 9432 # Get rid of any \b 9433 substr($line, 0, 1) = "" if $next_char eq "\b"; 9434 9435 # And append this next line. 9436 $return .= $line; 9437 } 9438 9439 return $return; 9440} 9441 9442sub simple_fold( $line, $prefix="", $hanging_indent=0, $right_margin=0) { 9443 # Returns a string of the input (string or an array of strings) folded 9444 # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus 9445 # a \n 9446 # This is tailored for the kind of text written by this program, 9447 # especially the pod file, which can have very long names with 9448 # underscores in the middle, or words like AbcDefgHij.... We allow 9449 # breaking in the middle of such constructs if the line won't fit 9450 # otherwise. The break in such cases will come either just after an 9451 # underscore, or just before one of the Capital letters. 9452 9453 local $to_trace = 0 if main::DEBUG; 9454 9455 # $prefix Optional string to prepend to each output line 9456 # $hanging_indent Optional number of spaces to indent 9457 # continuation lines 9458 # $right_margin Optional number of spaces to narrow the 9459 # total width by. 9460 9461 # The space available doesn't include what's automatically prepended 9462 # to each line, or what's reserved on the right. 9463 my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin; 9464 # XXX Instead of using the 'nofold' perhaps better to look up the stack 9465 9466 if (DEBUG && $hanging_indent >= $max) { 9467 Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold'); 9468 $hanging_indent = 0; 9469 } 9470 9471 # First, split into the current physical lines. 9472 my @line; 9473 if (ref $line) { # Better be an array, because not bothering to 9474 # test 9475 foreach my $line (@{$line}) { 9476 push @line, split /\n/, $line; 9477 } 9478 } 9479 else { 9480 @line = split /\n/, $line; 9481 } 9482 9483 #local $to_trace = 1 if main::DEBUG; 9484 trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace; 9485 9486 # Look at each current physical line. 9487 for (my $i = 0; $i < @line; $i++) { 9488 Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/; 9489 #local $to_trace = 1 if main::DEBUG; 9490 trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace; 9491 9492 # Remove prefix, because will be added back anyway, don't want 9493 # doubled prefix 9494 $line[$i] =~ s/^$prefix//; 9495 9496 # Remove trailing space 9497 $line[$i] =~ s/\s+\Z//; 9498 9499 # If the line is too long, fold it. 9500 if (length $line[$i] > $max) { 9501 my $remainder; 9502 9503 # Here needs to fold. Save the leading space in the line for 9504 # later. 9505 $line[$i] =~ /^ ( \s* )/x; 9506 my $leading_space = $1; 9507 trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace; 9508 9509 # If character at final permissible position is white space, 9510 # fold there, which will delete that white space 9511 if (substr($line[$i], $max - 1, 1) =~ /\s/) { 9512 $remainder = substr($line[$i], $max); 9513 $line[$i] = substr($line[$i], 0, $max - 1); 9514 } 9515 else { 9516 9517 # Otherwise fold at an acceptable break char closest to 9518 # the max length. Look at just the maximal initial 9519 # segment of the line 9520 my $segment = substr($line[$i], 0, $max - 1); 9521 if ($segment =~ 9522 /^ ( .{$hanging_indent} # Don't look before the 9523 # indent. 9524 \ * # Don't look in leading 9525 # blanks past the indent 9526 [^ ] .* # Find the right-most 9527 (?: # acceptable break: 9528 [ \s = ] # space or equal 9529 | - (?! [.0-9] ) # or non-unary minus. 9530 | [^\\[(] (?= \\ )# break before single backslash 9531 # not immediately after opening 9532 # punctuation 9533 ) # $1 includes the character 9534 )/x) 9535 { 9536 # Split into the initial part that fits, and remaining 9537 # part of the input 9538 $remainder = substr($line[$i], length $1); 9539 $line[$i] = $1; 9540 trace $line[$i] if DEBUG && $to_trace; 9541 trace $remainder if DEBUG && $to_trace; 9542 } 9543 9544 # If didn't find a good breaking spot, see if there is a 9545 # not-so-good breaking spot. These are just after 9546 # underscores or where the case changes from lower to 9547 # upper. Use \a as a soft hyphen, but give up 9548 # and don't break the line if there is actually a \a 9549 # already in the input. We use an ascii character for the 9550 # soft-hyphen to avoid any attempt by miniperl to try to 9551 # access the files that this program is creating. 9552 elsif ($segment !~ /\a/ 9553 && ($segment =~ s/_/_\a/g 9554 || $segment =~ s/ ( (?!\\) [a-z] ) (?= [A-Z] )/$1\a/xg)) 9555 { 9556 # Here were able to find at least one place to insert 9557 # our substitute soft hyphen. Find the right-most one 9558 # and replace it by a real hyphen. 9559 trace $segment if DEBUG && $to_trace; 9560 substr($segment, 9561 rindex($segment, "\a"), 9562 1) = '-'; 9563 9564 # Then remove the soft hyphen substitutes. 9565 $segment =~ s/\a//g; 9566 trace $segment if DEBUG && $to_trace; 9567 9568 # And split into the initial part that fits, and 9569 # remainder of the line 9570 my $pos = rindex($segment, '-'); 9571 $remainder = substr($line[$i], $pos); 9572 trace $remainder if DEBUG && $to_trace; 9573 $line[$i] = substr($segment, 0, $pos + 1); 9574 } 9575 } 9576 9577 # Here we know if we can fold or not. If we can, $remainder 9578 # is what remains to be processed in the next iteration. 9579 if (defined $remainder) { 9580 trace "folded='$line[$i]'" if main::DEBUG && $to_trace; 9581 9582 # Insert the folded remainder of the line as a new element 9583 # of the array. (It may still be too long, but we will 9584 # deal with that next time through the loop.) Omit any 9585 # leading space in the remainder. 9586 $remainder =~ s/^\s+//; 9587 trace "remainder='$remainder'" if main::DEBUG && $to_trace; 9588 9589 # But then indent by whichever is larger of: 9590 # 1) the leading space on the input line; 9591 # 2) the hanging indent. 9592 # This preserves indentation in the original line. 9593 my $lead = ($leading_space) 9594 ? length $leading_space 9595 : $hanging_indent; 9596 $lead = max($lead, $hanging_indent); 9597 splice @line, $i+1, 0, (" " x $lead) . $remainder; 9598 } 9599 } 9600 9601 # Ready to output the line. Get rid of any trailing space 9602 # And prefix by the required $prefix passed in. 9603 $line[$i] =~ s/\s+$//; 9604 $line[$i] = "$prefix$line[$i]\n"; 9605 } # End of looping through all the lines. 9606 9607 return join "", @line; 9608} 9609 9610sub property_ref { # Returns a reference to a property object. 9611 return Property::property_ref(@_); 9612} 9613 9614sub force_unlink ($filename) { 9615 return unless file_exists($filename); 9616 return if CORE::unlink($filename); 9617 9618 # We might need write permission 9619 chmod 0777, $filename; 9620 CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!"); 9621 return; 9622} 9623 9624sub write ($file, $use_utf8, @lines) { 9625 # Given a filename and references to arrays of lines, write the lines of 9626 # each array to the file 9627 # Filename can be given as an arrayref of directory names 9628 9629 # Get into a single string if an array, and get rid of, in Unix terms, any 9630 # leading '.' 9631 $file= File::Spec->join(@$file) if ref $file eq 'ARRAY'; 9632 $file = File::Spec->canonpath($file); 9633 9634 # If has directories, make sure that they all exist 9635 (undef, my $directories, undef) = File::Spec->splitpath($file); 9636 File::Path::mkpath($directories) if $directories && ! -d $directories; 9637 9638 push @files_actually_output, $file; 9639 9640 force_unlink ($file); 9641 9642 my $OUT; 9643 if (not open $OUT, ">", $file) { 9644 Carp::my_carp("can't open $file for output. Skipping this file: $!"); 9645 return; 9646 } 9647 9648 binmode $OUT, ":utf8" if $use_utf8; 9649 9650 foreach my $lines_ref (@lines) { 9651 unless (@$lines_ref) { 9652 Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;"); 9653 } 9654 9655 print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!"); 9656 } 9657 close $OUT or die Carp::my_carp("close '$file' failed: $!"); 9658 9659 print "$file written.\n" if $verbosity >= $VERBOSE; 9660 9661 return; 9662} 9663 9664 9665sub Standardize($name=undef) { 9666 # This converts the input name string into a standardized equivalent to 9667 # use internally. 9668 9669 unless (defined $name) { 9670 Carp::my_carp_bug("Standardize() called with undef. Returning undef."); 9671 return; 9672 } 9673 9674 # Remove any leading or trailing white space 9675 $name =~ s/^\s+//g; 9676 $name =~ s/\s+$//g; 9677 9678 # Convert interior white space and hyphens into underscores. 9679 $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg; 9680 9681 # Capitalize the letter following an underscore, and convert a sequence of 9682 # multiple underscores to a single one 9683 $name =~ s/ (?<= .) _+ (.) /_\u$1/xg; 9684 9685 # And capitalize the first letter, but not for the special cjk ones. 9686 $name = ucfirst($name) unless $name =~ /^k[A-Z]/; 9687 return $name; 9688} 9689 9690sub standardize ($str=undef) { 9691 # Returns a lower-cased standardized name, without underscores. This form 9692 # is chosen so that it can distinguish between any real versus superficial 9693 # Unicode name differences. It relies on the fact that Unicode doesn't 9694 # have interior underscores, white space, nor dashes in any 9695 # stricter-matched name. It should not be used on Unicode code point 9696 # names (the Name property), as they mostly, but not always follow these 9697 # rules. 9698 9699 my $name = Standardize($str); 9700 return if !defined $name; 9701 9702 $name =~ s/ (?<= .) _ (?= . ) //xg; 9703 return lc $name; 9704} 9705 9706sub UCD_name ($table, $alias) { 9707 # Returns the name that Unicode::UCD will use to find a table. XXX 9708 # perhaps this function should be placed somewhere, like UCD.pm so that 9709 # Unicode::UCD can use it directly without duplicating code that can get 9710 # out-of sync. 9711 9712 my $property = $table->property; 9713 $property = ($property == $perl) 9714 ? "" # 'perl' is never explicitly stated 9715 : standardize($property->name) . '='; 9716 if ($alias->loose_match) { 9717 return $property . standardize($alias->name); 9718 } 9719 else { 9720 return lc ($property . $alias->name); 9721 } 9722 9723 return; 9724} 9725 9726{ # Closure 9727 9728 my $indent_increment = " " x ( $debugging_build ? 2 : 0); 9729 %main::already_output = (); 9730 9731 $main::simple_dumper_nesting = 0; 9732 9733 sub simple_dumper( $item, $indent = "" ) { 9734 # Like Simple Data::Dumper. Good enough for our needs. We can't use 9735 # the real thing as we have to run under miniperl. 9736 9737 # It is designed so that on input it is at the beginning of a line, 9738 # and the final thing output in any call is a trailing ",\n". 9739 9740 $indent = "" if ! $debugging_build; 9741 9742 # nesting level is localized, so that as the call stack pops, it goes 9743 # back to the prior value. 9744 local $main::simple_dumper_nesting = $main::simple_dumper_nesting; 9745 local %main::already_output = %main::already_output; 9746 $main::simple_dumper_nesting++; 9747 #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n"; 9748 9749 # Determine the indent for recursive calls. 9750 my $next_indent = $indent . $indent_increment; 9751 9752 my $output; 9753 if (! ref $item) { 9754 9755 # Dump of scalar: just output it in quotes if not a number. To do 9756 # so we must escape certain characters, and therefore need to 9757 # operate on a copy to avoid changing the original 9758 my $copy = $item; 9759 $copy = $UNDEF unless defined $copy; 9760 9761 # Quote non-integers (integers also have optional leading '-') 9762 if ($copy eq "" || $copy !~ /^ -? \d+ $/x) { 9763 9764 # Escape apostrophe and backslash 9765 $copy =~ s/ ( ['\\] ) /\\$1/xg; 9766 $copy = "'$copy'"; 9767 } 9768 $output = "$indent$copy,\n"; 9769 } 9770 else { 9771 9772 # Keep track of cycles in the input, and refuse to infinitely loop 9773 my $addr = pack 'J', refaddr $item; 9774 if (defined $main::already_output{$addr}) { 9775 return "${indent}ALREADY OUTPUT: $item\n"; 9776 } 9777 $main::already_output{$addr} = $item; 9778 9779 if (ref $item eq 'ARRAY') { 9780 my $using_brackets; 9781 $output = $indent; 9782 if ($main::simple_dumper_nesting > 1) { 9783 $output .= '['; 9784 $using_brackets = 1; 9785 } 9786 else { 9787 $using_brackets = 0; 9788 } 9789 9790 # If the array is empty, put the closing bracket on the same 9791 # line. Otherwise, recursively add each array element 9792 if (@$item == 0) { 9793 $output .= " "; 9794 } 9795 else { 9796 $output .= "\n"; 9797 for (my $i = 0; $i < @$item; $i++) { 9798 9799 # Indent array elements one level 9800 $output .= &simple_dumper($item->[$i], $next_indent); 9801 next if ! $debugging_build; 9802 $output =~ s/\n$//; # Remove any trailing nl so 9803 $output .= " # [$i]\n"; # as to add a comment giving 9804 # the array index 9805 } 9806 $output .= $indent; # Indent closing ']' to orig level 9807 } 9808 $output .= ']' if $using_brackets; 9809 $output .= ",\n"; 9810 } 9811 elsif (ref $item eq 'HASH') { 9812 my $is_first_line; 9813 my $using_braces; 9814 my $body_indent; 9815 9816 # No surrounding braces at top level 9817 $output .= $indent; 9818 if ($main::simple_dumper_nesting > 1) { 9819 $output .= "{\n"; 9820 $is_first_line = 0; 9821 $body_indent = $next_indent; 9822 $next_indent .= $indent_increment; 9823 $using_braces = 1; 9824 } 9825 else { 9826 $is_first_line = 1; 9827 $body_indent = $indent; 9828 $using_braces = 0; 9829 } 9830 9831 # Output hashes sorted alphabetically instead of apparently 9832 # random. Use caseless alphabetic sort 9833 foreach my $key (sort { lc $a cmp lc $b } keys %$item) 9834 { 9835 if ($is_first_line) { 9836 $is_first_line = 0; 9837 } 9838 else { 9839 $output .= "$body_indent"; 9840 } 9841 9842 # The key must be a scalar, but this recursive call quotes 9843 # it 9844 $output .= &simple_dumper($key); 9845 9846 # And change the trailing comma and nl to the hash fat 9847 # comma for clarity, and so the value can be on the same 9848 # line 9849 $output =~ s/,\n$/ => /; 9850 9851 # Recursively call to get the value's dump. 9852 my $next = &simple_dumper($item->{$key}, $next_indent); 9853 9854 # If the value is all on one line, remove its indent, so 9855 # will follow the => immediately. If it takes more than 9856 # one line, start it on a new line. 9857 if ($next !~ /\n.*\n/) { 9858 $next =~ s/^ *//; 9859 } 9860 else { 9861 $output .= "\n"; 9862 } 9863 $output .= $next; 9864 } 9865 9866 $output .= "$indent},\n" if $using_braces; 9867 } 9868 elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') { 9869 $output = $indent . ref($item) . "\n"; 9870 # XXX see if blessed 9871 } 9872 elsif ($item->can('dump')) { 9873 9874 # By convention in this program, objects furnish a 'dump' 9875 # method. Since not doing any output at this level, just pass 9876 # on the input indent 9877 $output = $item->dump($indent); 9878 } 9879 else { 9880 Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping."); 9881 } 9882 } 9883 return $output; 9884 } 9885} 9886 9887sub dump_inside_out( $object, $fields_ref, @args ) { 9888 # Dump inside-out hashes in an object's state by converting them to a 9889 # regular hash and then calling simple_dumper on that. 9890 9891 my $addr = pack 'J', refaddr $object; 9892 9893 my %hash; 9894 foreach my $key (keys %$fields_ref) { 9895 $hash{$key} = $fields_ref->{$key}{$addr}; 9896 } 9897 9898 return simple_dumper(\%hash, @args); 9899} 9900 9901sub _operator_dot($self, $other="", $reversed=0) { 9902 # Overloaded '.' method that is common to all packages. It uses the 9903 # package's stringify method. 9904 9905 foreach my $which (\$self, \$other) { 9906 next unless ref $$which; 9907 if ($$which->can('_operator_stringify')) { 9908 $$which = $$which->_operator_stringify; 9909 } 9910 else { 9911 my $ref = ref $$which; 9912 my $addr = pack 'J', refaddr $$which; 9913 $$which = "$ref ($addr)"; 9914 } 9915 } 9916 return ($reversed) 9917 ? "$other$self" 9918 : "$self$other"; 9919} 9920 9921sub _operator_dot_equal($self, $other="", $reversed=0) { 9922 # Overloaded '.=' method that is common to all packages. 9923 9924 if ($reversed) { 9925 return $other .= "$self"; 9926 } 9927 else { 9928 return "$self" . "$other"; 9929 } 9930} 9931 9932sub _operator_equal($self, $other, @) { 9933 # Generic overloaded '==' routine. To be equal, they must be the exact 9934 # same object 9935 9936 return 0 unless defined $other; 9937 return 0 unless ref $other; 9938 no overloading; 9939 return $self == $other; 9940} 9941 9942sub _operator_not_equal($self, $other, @) { 9943 return ! _operator_equal($self, $other); 9944} 9945 9946sub substitute_PropertyAliases($file_object) { 9947 # Deal with early releases that don't have the crucial PropertyAliases.txt 9948 # file. 9949 9950 $file_object->insert_lines(get_old_property_aliases()); 9951 9952 process_PropertyAliases($file_object); 9953} 9954 9955 9956sub process_PropertyAliases($file) { 9957 # This reads in the PropertyAliases.txt file, which contains almost all 9958 # the character properties in Unicode and their equivalent aliases: 9959 # scf ; Simple_Case_Folding ; sfc 9960 # 9961 # Field 0 is the preferred short name for the property. 9962 # Field 1 is the full name. 9963 # Any succeeding ones are other accepted names. 9964 9965 # Add any cjk properties that may have been defined. 9966 $file->insert_lines(@cjk_properties); 9967 9968 while ($file->next_line) { 9969 9970 my @data = split /\s*;\s*/; 9971 9972 my $full = $data[1]; 9973 9974 # This line is defective in early Perls. The property in Unihan.txt 9975 # is kRSUnicode. 9976 if ($full eq 'Unicode_Radical_Stroke' && @data < 3) { 9977 push @data, qw(cjkRSUnicode kRSUnicode); 9978 } 9979 9980 my $this = Property->new($data[0], Full_Name => $full); 9981 9982 $this->set_fate($SUPPRESSED, $why_suppressed{$full}) 9983 if $why_suppressed{$full}; 9984 9985 # Start looking for more aliases after these two. 9986 for my $i (2 .. @data - 1) { 9987 $this->add_alias($data[$i]); 9988 } 9989 9990 } 9991 9992 my $scf = property_ref("Simple_Case_Folding"); 9993 $scf->add_alias("scf"); 9994 $scf->add_alias("sfc"); 9995 9996 return; 9997} 9998 9999sub finish_property_setup($file) { 10000 # Finishes setting up after PropertyAliases. 10001 10002 # This entry was missing from this file in earlier Unicode versions 10003 if (-e 'Jamo.txt' && ! defined property_ref('JSN')) { 10004 Property->new('JSN', Full_Name => 'Jamo_Short_Name'); 10005 } 10006 10007 # These are used so much, that we set globals for them. 10008 $gc = property_ref('General_Category'); 10009 $block = property_ref('Block'); 10010 $script = property_ref('Script'); 10011 $age = property_ref('Age'); 10012 10013 # Perl adds this alias. 10014 $gc->add_alias('Category'); 10015 10016 # Unicode::Normalize expects this file with this name and directory. 10017 $ccc = property_ref('Canonical_Combining_Class'); 10018 if (defined $ccc) { 10019 $ccc->set_file('CombiningClass'); 10020 $ccc->set_directory(File::Spec->curdir()); 10021 } 10022 10023 # These two properties aren't actually used in the core, but unfortunately 10024 # the names just above that are in the core interfere with these, so 10025 # choose different names. These aren't a problem unless the map tables 10026 # for these files get written out. 10027 my $lowercase = property_ref('Lowercase'); 10028 $lowercase->set_file('IsLower') if defined $lowercase; 10029 my $uppercase = property_ref('Uppercase'); 10030 $uppercase->set_file('IsUpper') if defined $uppercase; 10031 10032 # Set up the hard-coded default mappings, but only on properties defined 10033 # for this release 10034 foreach my $property (keys %default_mapping) { 10035 my $property_object = property_ref($property); 10036 next if ! defined $property_object; 10037 my $default_map = $default_mapping{$property}; 10038 $property_object->set_default_map($default_map); 10039 10040 # A map of <code point> implies the property is string. 10041 if ($property_object->type == $UNKNOWN 10042 && $default_map eq $CODE_POINT) 10043 { 10044 $property_object->set_type($STRING); 10045 } 10046 } 10047 10048 # For backwards compatibility with applications that may read the mapping 10049 # file directly (it was documented in 5.12 and 5.14 as being thusly 10050 # usable), keep it from being adjusted. (range_size_1 is 10051 # used to force the traditional format.) 10052 if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) { 10053 $nfkc_cf->set_to_output_map($EXTERNAL_MAP); 10054 $nfkc_cf->set_range_size_1(1); 10055 } 10056 if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) { 10057 $bmg->set_to_output_map($EXTERNAL_MAP); 10058 $bmg->set_range_size_1(1); 10059 } 10060 10061 property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED); 10062 10063 # The rest of this sub is for properties that need the Multi_Default class 10064 # to create objects for defaults. As of v15.0, this is no longer needed. 10065 10066 return if $v_version ge v15.0.0; 10067 10068 # Bidi class has a complicated default, but the derived file takes care of 10069 # the complications, leaving just 'L'. 10070 if (file_exists("${EXTRACTED}DBidiClass.txt")) { 10071 property_ref('Bidi_Class')->set_default_map('L'); 10072 } 10073 else { 10074 my $default; 10075 10076 # The derived file was introduced in 3.1.1. The values below are 10077 # taken from table 3-8, TUS 3.0 10078 my $default_R = 10079 'my $default = Range_List->new; 10080 $default->add_range(0x0590, 0x05FF); 10081 $default->add_range(0xFB1D, 0xFB4F);' 10082 ; 10083 10084 # The defaults apply only to unassigned characters 10085 $default_R .= '$gc->table("Unassigned") & $default;'; 10086 10087 if ($v_version lt v3.0.0) { 10088 $default = Multi_Default->new(R => $default_R, 'L'); 10089 } 10090 else { 10091 10092 # AL apparently not introduced until 3.0: TUS 2.x references are 10093 # not on-line to check it out 10094 my $default_AL = 10095 'my $default = Range_List->new; 10096 $default->add_range(0x0600, 0x07BF); 10097 $default->add_range(0xFB50, 0xFDFF); 10098 $default->add_range(0xFE70, 0xFEFF);' 10099 ; 10100 10101 # Non-character code points introduced in this release; aren't AL 10102 if ($v_version ge 3.1.0) { 10103 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);'; 10104 } 10105 $default_AL .= '$gc->table("Unassigned") & $default'; 10106 $default = Multi_Default->new(AL => $default_AL, 10107 R => $default_R, 10108 'L'); 10109 } 10110 property_ref('Bidi_Class')->set_default_map($default); 10111 } 10112 10113 # Joining type has a complicated default, but the derived file takes care 10114 # of the complications, leaving just 'U' (or Non_Joining), except the file 10115 # is bad in 3.1.0 10116 if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') { 10117 if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) { 10118 property_ref('Joining_Type')->set_default_map('Non_Joining'); 10119 } 10120 else { 10121 10122 # Otherwise, there are not one, but two possibilities for the 10123 # missing defaults: T and U. 10124 # The missing defaults that evaluate to T are given by: 10125 # T = Mn + Cf - ZWNJ - ZWJ 10126 # where Mn and Cf are the general category values. In other words, 10127 # any non-spacing mark or any format control character, except 10128 # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO 10129 # WIDTH JOINER (joining type C). 10130 my $default = Multi_Default->new( 10131 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D', 10132 'Non_Joining'); 10133 property_ref('Joining_Type')->set_default_map($default); 10134 } 10135 } 10136 10137 # Line break has a complicated default in early releases. It is 'Unknown' 10138 # for non-assigned code points; 'AL' for assigned. 10139 if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') { 10140 my $lb = property_ref('Line_Break'); 10141 if (file_exists("${EXTRACTED}DLineBreak.txt")) { 10142 $lb->set_default_map('Unknown'); 10143 } 10144 else { 10145 my $default = Multi_Default->new('AL' => '~ $gc->table("Cn")', 10146 'Unknown', 10147 ); 10148 $lb->set_default_map($default); 10149 } 10150 } 10151 10152 return; 10153} 10154 10155sub get_old_property_aliases() { 10156 # Returns what would be in PropertyAliases.txt if it existed in very old 10157 # versions of Unicode. It was derived from the one in 3.2, and pared 10158 # down based on the data that was actually in the older releases. 10159 # An attempt was made to use the existence of files to mean inclusion or 10160 # not of various aliases, but if this was not sufficient, using version 10161 # numbers was resorted to. 10162 10163 my @return; 10164 10165 # These are to be used in all versions (though some are constructed by 10166 # this program if missing) 10167 push @return, split /\n/, <<'END'; 10168bc ; Bidi_Class 10169Bidi_M ; Bidi_Mirrored 10170cf ; Case_Folding 10171ccc ; Canonical_Combining_Class 10172dm ; Decomposition_Mapping 10173dt ; Decomposition_Type 10174gc ; General_Category 10175isc ; ISO_Comment 10176lc ; Lowercase_Mapping 10177na ; Name 10178na1 ; Unicode_1_Name 10179nt ; Numeric_Type 10180nv ; Numeric_Value 10181scf ; Simple_Case_Folding 10182slc ; Simple_Lowercase_Mapping 10183stc ; Simple_Titlecase_Mapping 10184suc ; Simple_Uppercase_Mapping 10185tc ; Titlecase_Mapping 10186uc ; Uppercase_Mapping 10187END 10188 10189 if (-e 'Blocks.txt') { 10190 push @return, "blk ; Block\n"; 10191 } 10192 if (-e 'ArabicShaping.txt') { 10193 push @return, split /\n/, <<'END'; 10194jg ; Joining_Group 10195jt ; Joining_Type 10196END 10197 } 10198 if (-e 'PropList.txt') { 10199 10200 # This first set is in the original old-style proplist. 10201 push @return, split /\n/, <<'END'; 10202Bidi_C ; Bidi_Control 10203Dash ; Dash 10204Dia ; Diacritic 10205Ext ; Extender 10206Hex ; Hex_Digit 10207Hyphen ; Hyphen 10208IDC ; ID_Continue 10209Ideo ; Ideographic 10210Join_C ; Join_Control 10211Math ; Math 10212QMark ; Quotation_Mark 10213Term ; Terminal_Punctuation 10214WSpace ; White_Space 10215END 10216 # The next sets were added later 10217 if ($v_version ge v3.0.0) { 10218 push @return, split /\n/, <<'END'; 10219Upper ; Uppercase 10220Lower ; Lowercase 10221END 10222 } 10223 if ($v_version ge v3.0.1) { 10224 push @return, split /\n/, <<'END'; 10225NChar ; Noncharacter_Code_Point 10226END 10227 } 10228 # The next sets were added in the new-style 10229 if ($v_version ge v3.1.0) { 10230 push @return, split /\n/, <<'END'; 10231OAlpha ; Other_Alphabetic 10232OLower ; Other_Lowercase 10233OMath ; Other_Math 10234OUpper ; Other_Uppercase 10235END 10236 } 10237 if ($v_version ge v3.1.1) { 10238 push @return, "AHex ; ASCII_Hex_Digit\n"; 10239 } 10240 } 10241 if (-e 'EastAsianWidth.txt') { 10242 push @return, "ea ; East_Asian_Width\n"; 10243 } 10244 if (-e 'CompositionExclusions.txt') { 10245 push @return, "CE ; Composition_Exclusion\n"; 10246 } 10247 if (-e 'LineBreak.txt') { 10248 push @return, "lb ; Line_Break\n"; 10249 } 10250 if (-e 'BidiMirroring.txt') { 10251 push @return, "bmg ; Bidi_Mirroring_Glyph\n"; 10252 } 10253 if (-e 'Scripts.txt') { 10254 push @return, "sc ; Script\n"; 10255 } 10256 if (-e 'DNormalizationProps.txt') { 10257 push @return, split /\n/, <<'END'; 10258Comp_Ex ; Full_Composition_Exclusion 10259FC_NFKC ; FC_NFKC_Closure 10260NFC_QC ; NFC_Quick_Check 10261NFD_QC ; NFD_Quick_Check 10262NFKC_QC ; NFKC_Quick_Check 10263NFKD_QC ; NFKD_Quick_Check 10264XO_NFC ; Expands_On_NFC 10265XO_NFD ; Expands_On_NFD 10266XO_NFKC ; Expands_On_NFKC 10267XO_NFKD ; Expands_On_NFKD 10268END 10269 } 10270 if (-e 'DCoreProperties.txt') { 10271 push @return, split /\n/, <<'END'; 10272Alpha ; Alphabetic 10273IDS ; ID_Start 10274XIDC ; XID_Continue 10275XIDS ; XID_Start 10276END 10277 # These can also appear in some versions of PropList.txt 10278 push @return, "Lower ; Lowercase\n" 10279 unless grep { $_ =~ /^Lower\b/} @return; 10280 push @return, "Upper ; Uppercase\n" 10281 unless grep { $_ =~ /^Upper\b/} @return; 10282 } 10283 10284 # This flag requires the DAge.txt file to be copied into the directory. 10285 if (DEBUG && $compare_versions) { 10286 push @return, 'age ; Age'; 10287 } 10288 10289 return @return; 10290} 10291 10292sub substitute_PropValueAliases($file_object) { 10293 # Deal with early releases that don't have the crucial 10294 # PropValueAliases.txt file. 10295 10296 $file_object->insert_lines(get_old_property_value_aliases()); 10297 10298 process_PropValueAliases($file_object); 10299} 10300 10301sub process_PropValueAliases($file) { 10302 # This file contains values that properties look like: 10303 # bc ; AL ; Arabic_Letter 10304 # blk; n/a ; Greek_And_Coptic ; Greek 10305 # 10306 # Field 0 is the property. 10307 # Field 1 is the short name of a property value or 'n/a' if no 10308 # short name exists; 10309 # Field 2 is the full property value name; 10310 # Any other fields are more synonyms for the property value. 10311 # Purely numeric property values are omitted from the file; as are some 10312 # others, fewer and fewer in later releases 10313 10314 # Entries for the ccc property have an extra field before the 10315 # abbreviation: 10316 # ccc; 0; NR ; Not_Reordered 10317 # It is the numeric value that the names are synonyms for. 10318 10319 # There are comment entries for values missing from this file: 10320 # # @missing: 0000..10FFFF; ISO_Comment; <none> 10321 # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point> 10322 10323 if ($v_version lt 4.0.0) { 10324 $file->insert_lines(split /\n/, <<'END' 10325Hangul_Syllable_Type; L ; Leading_Jamo 10326Hangul_Syllable_Type; LV ; LV_Syllable 10327Hangul_Syllable_Type; LVT ; LVT_Syllable 10328Hangul_Syllable_Type; NA ; Not_Applicable 10329Hangul_Syllable_Type; T ; Trailing_Jamo 10330Hangul_Syllable_Type; V ; Vowel_Jamo 10331END 10332 ); 10333 } 10334 if ($v_version lt 4.1.0) { 10335 $file->insert_lines(split /\n/, <<'END' 10336_Perl_GCB; CN ; Control 10337_Perl_GCB; CR ; CR 10338_Perl_GCB; EX ; Extend 10339_Perl_GCB; L ; L 10340_Perl_GCB; LF ; LF 10341_Perl_GCB; LV ; LV 10342_Perl_GCB; LVT ; LVT 10343_Perl_GCB; T ; T 10344_Perl_GCB; V ; V 10345_Perl_GCB; XX ; Other 10346END 10347 ); 10348 } 10349 10350 # Add any explicit cjk values 10351 $file->insert_lines(@cjk_property_values); 10352 10353 # This line is used only for testing the code that checks for name 10354 # conflicts. There is a script Inherited, and when this line is executed 10355 # it causes there to be a name conflict with the 'Inherited' that this 10356 # program generates for this block property value 10357 #$file->insert_lines('blk; n/a; Herited'); 10358 10359 # Process each line of the file ... 10360 while ($file->next_line) { 10361 10362 # Fix typo in input file 10363 s/CCC133/CCC132/g if $v_version eq v6.1.0; 10364 10365 my ($property, @data) = split /\s*;\s*/; 10366 10367 # The ccc property has an extra field at the beginning, which is the 10368 # numeric value. Move it to be after the other two, mnemonic, fields, 10369 # so that those will be used as the property value's names, and the 10370 # number will be an extra alias. (Rightmost splice removes field 1-2, 10371 # returning them in a slice; left splice inserts that before anything, 10372 # thus shifting the former field 0 to after them.) 10373 splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc'; 10374 10375 if ($v_version le v5.0.0 && $property eq 'blk' && $data[1] =~ /-/) { 10376 my $new_style = $data[1] =~ s/-/_/gr; 10377 splice @data, 1, 0, $new_style; 10378 } 10379 10380 # Field 0 is a short name unless "n/a"; field 1 is the full name. If 10381 # there is no short name, use the full one in element 1 10382 if ($data[0] eq "n/a") { 10383 $data[0] = $data[1]; 10384 } 10385 elsif ($data[0] ne $data[1] 10386 && standardize($data[0]) eq standardize($data[1]) 10387 && $data[1] !~ /[[:upper:]]/) 10388 { 10389 # Also, there is a bug in the file in which "n/a" is omitted, and 10390 # the two fields are identical except for case, and the full name 10391 # is all lower case. Copy the "short" name unto the full one to 10392 # give it some upper case. 10393 10394 $data[1] = $data[0]; 10395 } 10396 10397 # Earlier releases had the pseudo property 'qc' that should expand to 10398 # the ones that replace it below. 10399 if ($property eq 'qc') { 10400 if (lc $data[0] eq 'y') { 10401 $file->insert_lines('NFC_QC; Y ; Yes', 10402 'NFD_QC; Y ; Yes', 10403 'NFKC_QC; Y ; Yes', 10404 'NFKD_QC; Y ; Yes', 10405 ); 10406 } 10407 elsif (lc $data[0] eq 'n') { 10408 $file->insert_lines('NFC_QC; N ; No', 10409 'NFD_QC; N ; No', 10410 'NFKC_QC; N ; No', 10411 'NFKD_QC; N ; No', 10412 ); 10413 } 10414 elsif (lc $data[0] eq 'm') { 10415 $file->insert_lines('NFC_QC; M ; Maybe', 10416 'NFKC_QC; M ; Maybe', 10417 ); 10418 } 10419 else { 10420 $file->carp_bad_line("qc followed by unexpected '$data[0]"); 10421 } 10422 next; 10423 } 10424 10425 # The first field is the short name, 2nd is the full one. 10426 my $property_object = property_ref($property); 10427 my $table = $property_object->add_match_table($data[0], 10428 Full_Name => $data[1]); 10429 10430 # Start looking for more aliases after these two. 10431 for my $i (2 .. @data - 1) { 10432 $table->add_alias($data[$i]); 10433 } 10434 } # End of looping through the file 10435 10436 # As noted in the comments early in the program, it generates tables for 10437 # the default values for all releases, even those for which the concept 10438 # didn't exist at the time. Here we add those if missing. 10439 if (defined $age && ! defined $age->table('Unassigned')) { 10440 $age->add_match_table('Unassigned'); 10441 } 10442 $block->add_match_table('No_Block') if -e 'Blocks.txt' 10443 && ! defined $block->table('No_Block'); 10444 10445 10446 # Now set the default mappings of the properties from the file. This is 10447 # done after the loop because a number of properties have only @missings 10448 # entries in the file, and may not show up until the end. 10449 my @defaults = $file->get_missings; 10450 foreach my $default_ref (@defaults) { 10451 my $default = $default_ref->{default}; 10452 my $property = property_ref($default_ref->{property}); 10453 $property->set_default_map($default); 10454 } 10455 10456 return; 10457} 10458 10459sub get_old_property_value_aliases () { 10460 # Returns what would be in PropValueAliases.txt if it existed in very old 10461 # versions of Unicode. It was derived from the one in 3.2, and pared 10462 # down. An attempt was made to use the existence of files to mean 10463 # inclusion or not of various aliases, but if this was not sufficient, 10464 # using version numbers was resorted to. 10465 10466 my @return = split /\n/, <<'END'; 10467bc ; AN ; Arabic_Number 10468bc ; B ; Paragraph_Separator 10469bc ; CS ; Common_Separator 10470bc ; EN ; European_Number 10471bc ; ES ; European_Separator 10472bc ; ET ; European_Terminator 10473bc ; L ; Left_To_Right 10474bc ; ON ; Other_Neutral 10475bc ; R ; Right_To_Left 10476bc ; WS ; White_Space 10477 10478Bidi_M; N; No; F; False 10479Bidi_M; Y; Yes; T; True 10480 10481# The standard combining classes are very much different in v1, so only use 10482# ones that look right (not checked thoroughly) 10483ccc; 0; NR ; Not_Reordered 10484ccc; 1; OV ; Overlay 10485ccc; 7; NK ; Nukta 10486ccc; 8; KV ; Kana_Voicing 10487ccc; 9; VR ; Virama 10488ccc; 202; ATBL ; Attached_Below_Left 10489ccc; 216; ATAR ; Attached_Above_Right 10490ccc; 218; BL ; Below_Left 10491ccc; 220; B ; Below 10492ccc; 222; BR ; Below_Right 10493ccc; 224; L ; Left 10494ccc; 228; AL ; Above_Left 10495ccc; 230; A ; Above 10496ccc; 232; AR ; Above_Right 10497ccc; 234; DA ; Double_Above 10498 10499dt ; can ; canonical 10500dt ; enc ; circle 10501dt ; fin ; final 10502dt ; font ; font 10503dt ; fra ; fraction 10504dt ; init ; initial 10505dt ; iso ; isolated 10506dt ; med ; medial 10507dt ; n/a ; none 10508dt ; nb ; noBreak 10509dt ; sqr ; square 10510dt ; sub ; sub 10511dt ; sup ; super 10512 10513gc ; C ; Other # Cc | Cf | Cn | Co | Cs 10514gc ; Cc ; Control 10515gc ; Cn ; Unassigned 10516gc ; Co ; Private_Use 10517gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu 10518gc ; LC ; Cased_Letter # Ll | Lt | Lu 10519gc ; Ll ; Lowercase_Letter 10520gc ; Lm ; Modifier_Letter 10521gc ; Lo ; Other_Letter 10522gc ; Lu ; Uppercase_Letter 10523gc ; M ; Mark # Mc | Me | Mn 10524gc ; Mc ; Spacing_Mark 10525gc ; Mn ; Nonspacing_Mark 10526gc ; N ; Number # Nd | Nl | No 10527gc ; Nd ; Decimal_Number 10528gc ; No ; Other_Number 10529gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps 10530gc ; Pd ; Dash_Punctuation 10531gc ; Pe ; Close_Punctuation 10532gc ; Po ; Other_Punctuation 10533gc ; Ps ; Open_Punctuation 10534gc ; S ; Symbol # Sc | Sk | Sm | So 10535gc ; Sc ; Currency_Symbol 10536gc ; Sm ; Math_Symbol 10537gc ; So ; Other_Symbol 10538gc ; Z ; Separator # Zl | Zp | Zs 10539gc ; Zl ; Line_Separator 10540gc ; Zp ; Paragraph_Separator 10541gc ; Zs ; Space_Separator 10542 10543nt ; de ; Decimal 10544nt ; di ; Digit 10545nt ; n/a ; None 10546nt ; nu ; Numeric 10547END 10548 10549 if (-e 'ArabicShaping.txt') { 10550 push @return, split /\n/, <<'END'; 10551jg ; n/a ; AIN 10552jg ; n/a ; ALEF 10553jg ; n/a ; DAL 10554jg ; n/a ; GAF 10555jg ; n/a ; LAM 10556jg ; n/a ; MEEM 10557jg ; n/a ; NO_JOINING_GROUP 10558jg ; n/a ; NOON 10559jg ; n/a ; QAF 10560jg ; n/a ; SAD 10561jg ; n/a ; SEEN 10562jg ; n/a ; TAH 10563jg ; n/a ; WAW 10564 10565jt ; C ; Join_Causing 10566jt ; D ; Dual_Joining 10567jt ; L ; Left_Joining 10568jt ; R ; Right_Joining 10569jt ; U ; Non_Joining 10570jt ; T ; Transparent 10571END 10572 if ($v_version ge v3.0.0) { 10573 push @return, split /\n/, <<'END'; 10574jg ; n/a ; ALAPH 10575jg ; n/a ; BEH 10576jg ; n/a ; BETH 10577jg ; n/a ; DALATH_RISH 10578jg ; n/a ; E 10579jg ; n/a ; FEH 10580jg ; n/a ; FINAL_SEMKATH 10581jg ; n/a ; GAMAL 10582jg ; n/a ; HAH 10583jg ; n/a ; HAMZA_ON_HEH_GOAL 10584jg ; n/a ; HE 10585jg ; n/a ; HEH 10586jg ; n/a ; HEH_GOAL 10587jg ; n/a ; HETH 10588jg ; n/a ; KAF 10589jg ; n/a ; KAPH 10590jg ; n/a ; KNOTTED_HEH 10591jg ; n/a ; LAMADH 10592jg ; n/a ; MIM 10593jg ; n/a ; NUN 10594jg ; n/a ; PE 10595jg ; n/a ; QAPH 10596jg ; n/a ; REH 10597jg ; n/a ; REVERSED_PE 10598jg ; n/a ; SADHE 10599jg ; n/a ; SEMKATH 10600jg ; n/a ; SHIN 10601jg ; n/a ; SWASH_KAF 10602jg ; n/a ; TAW 10603jg ; n/a ; TEH_MARBUTA 10604jg ; n/a ; TETH 10605jg ; n/a ; YEH 10606jg ; n/a ; YEH_BARREE 10607jg ; n/a ; YEH_WITH_TAIL 10608jg ; n/a ; YUDH 10609jg ; n/a ; YUDH_HE 10610jg ; n/a ; ZAIN 10611END 10612 } 10613 } 10614 10615 10616 if (-e 'EastAsianWidth.txt') { 10617 push @return, split /\n/, <<'END'; 10618ea ; A ; Ambiguous 10619ea ; F ; Fullwidth 10620ea ; H ; Halfwidth 10621ea ; N ; Neutral 10622ea ; Na ; Narrow 10623ea ; W ; Wide 10624END 10625 } 10626 10627 if (-e 'LineBreak.txt' || -e 'LBsubst.txt') { 10628 my @lb = split /\n/, <<'END'; 10629lb ; AI ; Ambiguous 10630lb ; AL ; Alphabetic 10631lb ; B2 ; Break_Both 10632lb ; BA ; Break_After 10633lb ; BB ; Break_Before 10634lb ; BK ; Mandatory_Break 10635lb ; CB ; Contingent_Break 10636lb ; CL ; Close_Punctuation 10637lb ; CM ; Combining_Mark 10638lb ; CR ; Carriage_Return 10639lb ; EX ; Exclamation 10640lb ; GL ; Glue 10641lb ; HY ; Hyphen 10642lb ; ID ; Ideographic 10643lb ; IN ; Inseperable 10644lb ; IS ; Infix_Numeric 10645lb ; LF ; Line_Feed 10646lb ; NS ; Nonstarter 10647lb ; NU ; Numeric 10648lb ; OP ; Open_Punctuation 10649lb ; PO ; Postfix_Numeric 10650lb ; PR ; Prefix_Numeric 10651lb ; QU ; Quotation 10652lb ; SA ; Complex_Context 10653lb ; SG ; Surrogate 10654lb ; SP ; Space 10655lb ; SY ; Break_Symbols 10656lb ; XX ; Unknown 10657lb ; ZW ; ZWSpace 10658END 10659 # If this Unicode version predates the lb property, we use our 10660 # substitute one 10661 if (-e 'LBsubst.txt') { 10662 $_ = s/^lb/_Perl_LB/r for @lb; 10663 } 10664 push @return, @lb; 10665 } 10666 10667 if (-e 'DNormalizationProps.txt') { 10668 push @return, split /\n/, <<'END'; 10669qc ; M ; Maybe 10670qc ; N ; No 10671qc ; Y ; Yes 10672END 10673 } 10674 10675 if (-e 'Scripts.txt') { 10676 push @return, split /\n/, <<'END'; 10677sc ; Arab ; Arabic 10678sc ; Armn ; Armenian 10679sc ; Beng ; Bengali 10680sc ; Bopo ; Bopomofo 10681sc ; Cans ; Canadian_Aboriginal 10682sc ; Cher ; Cherokee 10683sc ; Cyrl ; Cyrillic 10684sc ; Deva ; Devanagari 10685sc ; Dsrt ; Deseret 10686sc ; Ethi ; Ethiopic 10687sc ; Geor ; Georgian 10688sc ; Goth ; Gothic 10689sc ; Grek ; Greek 10690sc ; Gujr ; Gujarati 10691sc ; Guru ; Gurmukhi 10692sc ; Hang ; Hangul 10693sc ; Hani ; Han 10694sc ; Hebr ; Hebrew 10695sc ; Hira ; Hiragana 10696sc ; Ital ; Old_Italic 10697sc ; Kana ; Katakana 10698sc ; Khmr ; Khmer 10699sc ; Knda ; Kannada 10700sc ; Laoo ; Lao 10701sc ; Latn ; Latin 10702sc ; Mlym ; Malayalam 10703sc ; Mong ; Mongolian 10704sc ; Mymr ; Myanmar 10705sc ; Ogam ; Ogham 10706sc ; Orya ; Oriya 10707sc ; Qaai ; Inherited 10708sc ; Runr ; Runic 10709sc ; Sinh ; Sinhala 10710sc ; Syrc ; Syriac 10711sc ; Taml ; Tamil 10712sc ; Telu ; Telugu 10713sc ; Thaa ; Thaana 10714sc ; Thai ; Thai 10715sc ; Tibt ; Tibetan 10716sc ; Yiii ; Yi 10717sc ; Zyyy ; Common 10718END 10719 } 10720 10721 if ($v_version ge v2.0.0) { 10722 push @return, split /\n/, <<'END'; 10723dt ; com ; compat 10724dt ; nar ; narrow 10725dt ; sml ; small 10726dt ; vert ; vertical 10727dt ; wide ; wide 10728 10729gc ; Cf ; Format 10730gc ; Cs ; Surrogate 10731gc ; Lt ; Titlecase_Letter 10732gc ; Me ; Enclosing_Mark 10733gc ; Nl ; Letter_Number 10734gc ; Pc ; Connector_Punctuation 10735gc ; Sk ; Modifier_Symbol 10736END 10737 } 10738 if ($v_version ge v2.1.2) { 10739 push @return, "bc ; S ; Segment_Separator\n"; 10740 } 10741 if ($v_version ge v2.1.5) { 10742 push @return, split /\n/, <<'END'; 10743gc ; Pf ; Final_Punctuation 10744gc ; Pi ; Initial_Punctuation 10745END 10746 } 10747 if ($v_version ge v2.1.8) { 10748 push @return, "ccc; 240; IS ; Iota_Subscript\n"; 10749 } 10750 10751 if ($v_version ge v3.0.0) { 10752 push @return, split /\n/, <<'END'; 10753bc ; AL ; Arabic_Letter 10754bc ; BN ; Boundary_Neutral 10755bc ; LRE ; Left_To_Right_Embedding 10756bc ; LRO ; Left_To_Right_Override 10757bc ; NSM ; Nonspacing_Mark 10758bc ; PDF ; Pop_Directional_Format 10759bc ; RLE ; Right_To_Left_Embedding 10760bc ; RLO ; Right_To_Left_Override 10761 10762ccc; 233; DB ; Double_Below 10763END 10764 } 10765 10766 if ($v_version ge v3.1.0) { 10767 push @return, "ccc; 226; R ; Right\n"; 10768 } 10769 10770 return @return; 10771} 10772 10773sub process_NormalizationsTest($file) { 10774 10775 # Each line looks like: 10776 # source code point; NFC; NFD; NFKC; NFKD 10777 # e.g. 10778 # 1E0A;1E0A;0044 0307;1E0A;0044 0307; 10779 10780 # Process each line of the file ... 10781 while ($file->next_line) { 10782 10783 next if /^@/; 10784 10785 my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/; 10786 10787 foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) { 10788 $$var = pack "U0U*", map { hex } split " ", $$var; 10789 $$var =~ s/(\\)/$1$1/g; 10790 } 10791 10792 push @normalization_tests, 10793 "Test_N(q$c1, q$c2, q$c3, q$c4, q$c5);\n"; 10794 } # End of looping through the file 10795} 10796 10797sub output_perl_charnames_line ($code_point, $name) { 10798 10799 # Output the entries in Perl_charnames specially, using 5 digits instead 10800 # of four. This makes the entries a constant length, and simplifies 10801 # charnames.pm which this table is for. Unicode can have 6 digit 10802 # ordinals, but they are all private use or noncharacters which do not 10803 # have names, so won't be in this table. 10804 10805 return sprintf "%05X\n%s\n\n", $code_point, $name; 10806} 10807 10808{ # Closure 10809 10810 # These are constants to the $property_info hash in this subroutine, to 10811 # avoid using a quoted-string which might have a typo. 10812 my $TYPE = 'type'; 10813 my $DEFAULT_MAP = 'default_map'; 10814 my $DEFAULT_TABLE = 'default_table'; 10815 my $PSEUDO_MAP_TYPE = 'pseudo_map_type'; 10816 my $MISSINGS = 'missings'; 10817 10818 sub process_generic_property_file($file) { 10819 # This processes a file containing property mappings and puts them 10820 # into internal map tables. It should be used to handle any property 10821 # files that have mappings from a code point or range thereof to 10822 # something else. This means almost all the UCD .txt files. 10823 # each_line_handlers() should be set to adjust the lines of these 10824 # files, if necessary, to what this routine understands: 10825 # 10826 # 0374 ; NFD_QC; N 10827 # 003C..003E ; Math 10828 # 10829 # the fields are: "codepoint-range ; property; map" 10830 # 10831 # meaning the codepoints in the range all have the value 'map' under 10832 # 'property'. 10833 # Beginning and trailing white space in each field are not significant. 10834 # Note there is not a trailing semi-colon in the above. A trailing 10835 # semi-colon means the map is a null-string. An omitted map, as 10836 # opposed to a null-string, is assumed to be 'Y', based on Unicode 10837 # table syntax. (This could have been hidden from this routine by 10838 # doing it in the $file object, but that would require parsing of the 10839 # line there, so would have to parse it twice, or change the interface 10840 # to pass this an array. So not done.) 10841 # 10842 # The map field may begin with a sequence of commands that apply to 10843 # this range. Each such command begins and ends with $CMD_DELIM. 10844 # These are used to indicate, for example, that the mapping for a 10845 # range has a non-default type. 10846 # 10847 # This loops through the file, calling its next_line() method, and 10848 # then taking the map and adding it to the property's table. 10849 # Complications arise because any number of properties can be in the 10850 # file, in any order, interspersed in any way. The first time a 10851 # property is seen, it gets information about that property and 10852 # caches it for quick retrieval later. It also normalizes the maps 10853 # so that only one of many synonyms is stored. The Unicode input 10854 # files do use some multiple synonyms. 10855 10856 my %property_info; # To keep track of what properties 10857 # have already had entries in the 10858 # current file, and info about each, 10859 # so don't have to recompute. 10860 my $property_name; # property currently being worked on 10861 my $property_type; # and its type 10862 my $previous_property_name = ""; # name from last time through loop 10863 my $property_object; # pointer to the current property's 10864 # object 10865 my $property_addr; # the address of that object 10866 my $default_map; # the string that code points missing 10867 # from the file map to 10868 my $default_table; # For non-string properties, a 10869 # reference to the match table that 10870 # will contain the list of code 10871 # points that map to $default_map. 10872 10873 # Get the next real non-comment line 10874 LINE: 10875 while ($file->next_line) { 10876 10877 # Default replacement type; means that if parts of the range have 10878 # already been stored in our tables, the new map overrides them if 10879 # they differ more than cosmetically 10880 my $replace = $IF_NOT_EQUIVALENT; 10881 my $map_type; # Default type for the map of this range 10882 10883 #local $to_trace = 1 if main::DEBUG; 10884 trace $_ if main::DEBUG && $to_trace; 10885 10886 # Split the line into components 10887 my ($range, $property_name, $map, @remainder) 10888 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 10889 10890 # If more or less on the line than we are expecting, warn and skip 10891 # the line 10892 if (@remainder) { 10893 $file->carp_bad_line('Extra fields'); 10894 next LINE; 10895 } 10896 elsif ( ! defined $property_name) { 10897 $file->carp_bad_line('Missing property'); 10898 next LINE; 10899 } 10900 10901 # Examine the range. 10902 if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x) 10903 { 10904 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)"); 10905 next LINE; 10906 } 10907 my $low = hex $1; 10908 my $high = (defined $2) ? hex $2 : $low; 10909 10910 # If changing to a new property, get the things constant per 10911 # property 10912 if ($previous_property_name ne $property_name) { 10913 10914 $property_object = property_ref($property_name); 10915 if (! defined $property_object) { 10916 $file->carp_bad_line("Unexpected property '$property_name'. Skipped"); 10917 next LINE; 10918 } 10919 $property_addr = pack 'J', refaddr $property_object; 10920 10921 # Defer changing names until have a line that is acceptable 10922 # (the 'next' statement above means is unacceptable) 10923 $previous_property_name = $property_name; 10924 10925 # If not the first time for this property, retrieve info about 10926 # it from the cache 10927 my $this_property_info = $property_info{$property_addr}; 10928 if (defined ($this_property_info->{$TYPE})) { 10929 $property_type = $this_property_info->{$TYPE}; 10930 $default_map = $this_property_info->{$DEFAULT_MAP}; 10931 $map_type = $this_property_info->{$PSEUDO_MAP_TYPE}; 10932 $default_table = $this_property_info->{$DEFAULT_TABLE}; 10933 } 10934 else { 10935 10936 # Here, is the first time for this property. Set up the 10937 # cache. 10938 $property_type = $this_property_info->{$TYPE} 10939 = $property_object->type; 10940 $map_type 10941 = $this_property_info->{$PSEUDO_MAP_TYPE} 10942 = $property_object->pseudo_map_type; 10943 10944 # The Unicode files are set up so that if the map is not 10945 # defined, it is a binary property 10946 if (! defined $map && $property_type != $BINARY) { 10947 if ($property_type != $UNKNOWN 10948 && $property_type != $NON_STRING) 10949 { 10950 $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map"); 10951 } 10952 else { 10953 $property_object->set_type($BINARY); 10954 $property_type = $this_property_info->{$TYPE} 10955 = $BINARY; 10956 } 10957 } 10958 10959 # Get any @missings default for this property. This 10960 # should precede the first entry for the property in the 10961 # input file, and is located in a comment that has been 10962 # stored by the Input_file class until we access it here. 10963 # It's possible that there is more than one such line 10964 # waiting for us; collect them all, and parse 10965 my @missings_list; 10966 @missings_list = $file->get_missings 10967 if $file->has_missings_defaults; 10968 10969 foreach my $default_ref (@missings_list) { 10970 10971 # For now, we are only interested in the fallback 10972 # default for the entire property. i.e., an @missing 10973 # line that is for the whole Unicode range. 10974 next if $default_ref->{start} != 0 10975 || $default_ref->{end} != $MAX_UNICODE_CODEPOINT; 10976 10977 $default_map = $default_ref->{default}; 10978 10979 # For string properties, the default is just what the 10980 # file says, but non-string properties should already 10981 # have set up a table for the default property value; 10982 # use the table for these, so can resolve synonyms 10983 # later to a single standard one. 10984 if ($property_type == $STRING 10985 || $property_type == $UNKNOWN) 10986 { 10987 $this_property_info->{$MISSINGS} = $default_map; 10988 } 10989 else { 10990 $default_map = 10991 $property_object->table($default_map)->full_name; 10992 $this_property_info->{$MISSINGS} = $default_map; 10993 $this_property_info->{$DEFAULT_MAP} = $default_map; 10994 if (! defined $property_object->default_map) { 10995 $property_object->set_default_map($default_map); 10996 } 10997 } 10998 } 10999 11000 # For later Unicode versions, multiple @missing lines for 11001 # a single property can appear in the files. The first 11002 # always applies to the entire Unicode range, and was 11003 # handled above. The subsequent ones are for smaller 11004 # ranges, and can be read as "But for this range, the 11005 # default is ...". So each overrides all the preceding 11006 # ones for the range it applies to. Typically they apply 11007 # to disjoint ranges, but don't have to. What we do is to 11008 # set them up to work in reverse order, so that after the 11009 # rest of the table is filled, the highest priority 11010 # default range fills in any code points that haven't been 11011 # specified; then the next highest priority one is 11012 # applied, and so forth. 11013 if (@missings_list > 1 && $v_version ge v15.0.0) { 11014 if ($property_type != $ENUM) { 11015 Carp::my_carp_bug("Multiple \@missings lines only" 11016 . " make sense for ENUM-type" 11017 . " properties. Changing type to" 11018 . " that"); 11019 $property_type = $this_property_info->{$TYPE} 11020 = $ENUM; 11021 $property_object->set_type($ENUM); 11022 } 11023 11024 my $multi = Multi_Default->new(); 11025 11026 # The overall default should be first on this list, 11027 # and is handled differently than the rest. 11028 $default_map = shift @missings_list; 11029 Carp::my_carp_bug("\@missings needs to be entire range") 11030 if $default_map->{start} != 0 11031 || $default_map->{end} != $MAX_UNICODE_CODEPOINT; 11032 11033 # We already have looked at this line above. Use that 11034 # result 11035 $multi->set_final_default($this_property_info-> 11036 {$MISSINGS}); 11037 11038 # Now get the individual range elements, and add them 11039 # to Multi_Default object 11040 while (@missings_list) { 11041 my $this_entry = pop @missings_list; 11042 my $subrange_default = $this_entry->{default}; 11043 11044 # Use the short name as a standard 11045 $subrange_default = $property_object-> 11046 table($subrange_default)->short_name; 11047 $multi->append_default($subrange_default, 11048 "Range_List->new(Initialize => Range->new(" 11049 . "$this_entry->{start}, $this_entry->{end}))"); 11050 } 11051 11052 # Override the property's simple default with this. 11053 $property_object->set_default_map($multi); 11054 } 11055 11056 if (! $default_map || $property_type != $ENUM) { 11057 11058 # Finished storing all the @missings defaults in the 11059 # input file so far. Get the one for the current 11060 # property. 11061 my $missings = $this_property_info->{$MISSINGS}; 11062 11063 # But we likely have separately stored what the 11064 # default should be. (This is to accommodate versions 11065 # of the standard where the @missings lines are absent 11066 # or incomplete.) Hopefully the two will match. But 11067 # check it out. 11068 $default_map = $property_object->default_map; 11069 11070 # If the map is a ref, it means that the default won't 11071 # be processed until later, so undef it, so next few 11072 # lines will redefine it to something that nothing 11073 # will match 11074 undef $default_map if ref $default_map; 11075 11076 # Create a $default_map if don't have one; maybe a 11077 # dummy that won't match anything. 11078 if (! defined $default_map) { 11079 11080 # Use any @missings line in the file. 11081 if (defined $missings) { 11082 if (ref $missings) { 11083 $default_map = $missings->full_name; 11084 $default_table = $missings; 11085 } 11086 else { 11087 $default_map = $missings; 11088 } 11089 11090 # And store it with the property for outside 11091 # use. 11092 $property_object->set_default_map($default_map); 11093 } 11094 else { 11095 11096 # Neither an @missings nor a default map. 11097 # Create a dummy one, so won't have to test 11098 # definedness in the main loop. 11099 $default_map = '_Perl This will never be in a' 11100 . ' file from Unicode'; 11101 } 11102 } 11103 11104 # Here, we have $default_map defined, possibly in 11105 # terms of $missings, but maybe not, and possibly is a 11106 # dummy one. 11107 if (defined $missings) { 11108 11109 # Make sure there is no conflict between the two. 11110 # $missings has priority. 11111 if (ref $missings) { 11112 $default_table 11113 = $property_object->table($default_map); 11114 if ( ! defined $default_table 11115 || $default_table != $missings) 11116 { 11117 if (! defined $default_table) { 11118 $default_table = $UNDEF; 11119 } 11120 $file->carp_bad_line(<<END 11121The \@missings line for $property_name in $file says that missings default to 11122$missings, but we expect it to be $default_table. $missings used. 11123END 11124 ); 11125 $default_table = $missings; 11126 $default_map = $missings->full_name; 11127 } 11128 $this_property_info->{$DEFAULT_TABLE} 11129 = $default_table; 11130 } 11131 elsif ($default_map ne $missings) { 11132 $file->carp_bad_line(<<END 11133The \@missings line for $property_name in $file says that missings default to 11134$missings, but we expect it to be $default_map. $missings used. 11135END 11136 ); 11137 $default_map = $missings; 11138 } 11139 } 11140 11141 $this_property_info->{$DEFAULT_MAP} = $default_map; 11142 11143 # If haven't done so already, find the table 11144 # corresponding to this map for non-string properties. 11145 if (! defined $default_table 11146 && $property_type != $STRING 11147 && $property_type != $UNKNOWN) 11148 { 11149 $default_table 11150 = $this_property_info->{$DEFAULT_TABLE} 11151 = $property_object->table($default_map); 11152 } 11153 } 11154 } # End of is first time for this property 11155 } # End of switching properties. 11156 11157 # Ready to process the line. 11158 # The Unicode files are set up so that if the map is not defined, 11159 # it is a binary property with value 'Y' 11160 if (! defined $map) { 11161 $map = 'Y'; 11162 } 11163 else { 11164 11165 # If the map begins with a special command to us (enclosed in 11166 # delimiters), extract the command(s). 11167 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) { 11168 my $command = $1; 11169 if ($command =~ / ^ $REPLACE_CMD= (.*) /x) { 11170 $replace = $1; 11171 } 11172 elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) { 11173 $map_type = $1; 11174 } 11175 else { 11176 $file->carp_bad_line("Unknown command line: '$1'"); 11177 next LINE; 11178 } 11179 } 11180 } 11181 11182 if ( $default_map eq $CODE_POINT 11183 && $map =~ / ^ $code_point_re $/x) 11184 { 11185 11186 # Here, we have a map to a particular code point, and the 11187 # default map is to a code point itself. If the range 11188 # includes the particular code point, change that portion of 11189 # the range to the default. This makes sure that in the final 11190 # table only the non-defaults are listed. 11191 my $decimal_map = hex $map; 11192 if ($low <= $decimal_map && $decimal_map <= $high) { 11193 11194 # If the range includes stuff before or after the map 11195 # we're changing, split it and process the split-off parts 11196 # later. 11197 if ($low < $decimal_map) { 11198 $file->insert_adjusted_lines( 11199 sprintf("%04X..%04X; %s; %s", 11200 $low, 11201 $decimal_map - 1, 11202 $property_name, 11203 $map)); 11204 } 11205 if ($high > $decimal_map) { 11206 $file->insert_adjusted_lines( 11207 sprintf("%04X..%04X; %s; %s", 11208 $decimal_map + 1, 11209 $high, 11210 $property_name, 11211 $map)); 11212 } 11213 $low = $high = $decimal_map; 11214 $map = $CODE_POINT; 11215 } 11216 } 11217 11218 if ($property_type != $STRING && $property_type != $UNKNOWN) { 11219 my $table = $property_object->table($map); 11220 if (defined $table) { 11221 11222 # Unicode isn't very consistent about which synonym they 11223 # use in their .txt files, even within the same file, or 11224 # two files that are for the same property. For enum 11225 # properties, we know already what all the synonyms are 11226 # (because we processed PropValueAliases already). 11227 # Therefore we can take the input and map it to a uniform 11228 # value now, saving us trouble later. 11229 # 11230 # Only if the map is well-behaved do we try this: 11231 # non-empty, all non-blank. 11232 if ($property_type == $ENUM && $map =~ / ^ \S+ $ /x) { 11233 11234 # Use existing practice as much as easily practicable, 11235 # so that code that has assumptions about spelling 11236 # doesn't have to change 11237 my $short_name = $property_object->short_name; 11238 if ($short_name =~ / ^ (BC | EA | GC |HST | JT | 11239 Lb | BT | BPT | NFCQC | 11240 NFKCQC) $ /ix) 11241 { 11242 $map = $table->short_name; 11243 } 11244 elsif ($short_name !~ / ^ ( Ccc | Age | InSC | JG | 11245 SB) $ /ix) 11246 { 11247 $map = $table->full_name; 11248 } 11249 } 11250 elsif ($table == $default_table) { 11251 11252 # When it isn't an ENUM, we we can still tell if 11253 # this is a synonym for the default map. If so, use 11254 # the default one instead. 11255 $map = $default_map; 11256 } 11257 } 11258 } 11259 11260 # And figure out the map type if not known. 11261 if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) { 11262 if ($map eq "") { # Nulls are always $NULL map type 11263 $map_type = $NULL; 11264 } # Otherwise, non-strings, and those that don't allow 11265 # $MULTI_CP, and those that aren't multiple code points are 11266 # 0 11267 elsif 11268 (($property_type != $STRING && $property_type != $UNKNOWN) 11269 || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP) 11270 || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x) 11271 { 11272 $map_type = 0; 11273 } 11274 else { 11275 $map_type = $MULTI_CP; 11276 } 11277 } 11278 11279 $property_object->add_map($low, $high, 11280 $map, 11281 Type => $map_type, 11282 Replace => $replace); 11283 } # End of loop through file's lines 11284 11285 return; 11286 } 11287} 11288 11289{ # Closure for UnicodeData.txt handling 11290 11291 # This file was the first one in the UCD; its design leads to some 11292 # awkwardness in processing. Here is a sample line: 11293 # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061; 11294 # The fields in order are: 11295 my $i = 0; # The code point is in field 0, and is shifted off. 11296 my $CHARNAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A") 11297 my $CATEGORY = $i++; # category (e.g. "Lu") 11298 my $CCC = $i++; # Canonical combining class (e.g. "230") 11299 my $BIDI = $i++; # directional class (e.g. "L") 11300 my $PERL_DECOMPOSITION = $i++; # decomposition mapping 11301 my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value 11302 my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript 11303 # Dual-use in this program; see below 11304 my $NUMERIC = $i++; # numeric value 11305 my $MIRRORED = $i++; # ? mirrored 11306 my $UNICODE_1_NAME = $i++; # name in Unicode 1.0 11307 my $COMMENT = $i++; # iso comment 11308 my $UPPER = $i++; # simple uppercase mapping 11309 my $LOWER = $i++; # simple lowercase mapping 11310 my $TITLE = $i++; # simple titlecase mapping 11311 my $input_field_count = $i; 11312 11313 # This routine in addition outputs these extra fields: 11314 11315 my $DECOMP_TYPE = $i++; # Decomposition type 11316 11317 # These fields are modifications of ones above, and are usually 11318 # suppressed; they must come last, as for speed, the loop upper bound is 11319 # normally set to ignore them 11320 my $NAME = $i++; # This is the strict name field, not the one that 11321 # charnames uses. 11322 my $DECOMP_MAP = $i++; # Strict decomposition mapping; not the one used 11323 # by Unicode::Normalize 11324 my $last_field = $i - 1; 11325 11326 # All these are read into an array for each line, with the indices defined 11327 # above. The empty fields in the example line above indicate that the 11328 # value is defaulted. The handler called for each line of the input 11329 # changes these to their defaults. 11330 11331 # Here are the official names of the properties, in a parallel array: 11332 my @field_names; 11333 $field_names[$BIDI] = 'Bidi_Class'; 11334 $field_names[$CATEGORY] = 'General_Category'; 11335 $field_names[$CCC] = 'Canonical_Combining_Class'; 11336 $field_names[$CHARNAME] = 'Perl_Charnames'; 11337 $field_names[$COMMENT] = 'ISO_Comment'; 11338 $field_names[$DECOMP_MAP] = 'Decomposition_Mapping'; 11339 $field_names[$DECOMP_TYPE] = 'Decomposition_Type'; 11340 $field_names[$LOWER] = 'Lowercase_Mapping'; 11341 $field_names[$MIRRORED] = 'Bidi_Mirrored'; 11342 $field_names[$NAME] = 'Name'; 11343 $field_names[$NUMERIC] = 'Numeric_Value'; 11344 $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type'; 11345 $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit'; 11346 $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping'; 11347 $field_names[$TITLE] = 'Titlecase_Mapping'; 11348 $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name'; 11349 $field_names[$UPPER] = 'Uppercase_Mapping'; 11350 11351 # Some of these need a little more explanation: 11352 # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode 11353 # property, but is used in calculating the Numeric_Type. Perl however, 11354 # creates a file from this field, so a Perl property is created from it. 11355 # Similarly, the Other_Digit field is used only for calculating the 11356 # Numeric_Type, and so it can be safely re-used as the place to store 11357 # the value for Numeric_Type; hence it is referred to as 11358 # $NUMERIC_TYPE_OTHER_DIGIT. 11359 # The input field named $PERL_DECOMPOSITION is a combination of both the 11360 # decomposition mapping and its type. Perl creates a file containing 11361 # exactly this field, so it is used for that. The two properties are 11362 # separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE. 11363 # $DECOMP_MAP is usually suppressed (unless the lists are changed to 11364 # output it), as Perl doesn't use it directly. 11365 # The input field named here $CHARNAME is used to construct the 11366 # Perl_Charnames property, which is a combination of the Name property 11367 # (which the input field contains), and the Unicode_1_Name property, and 11368 # others from other files. Since, the strict Name property is not used 11369 # by Perl, this field is used for the table that Perl does use. The 11370 # strict Name property table is usually suppressed (unless the lists are 11371 # changed to output it), so it is accumulated in a separate field, 11372 # $NAME, which to save time is discarded unless the table is actually to 11373 # be output 11374 11375 # This file is processed like most in this program. Control is passed to 11376 # process_generic_property_file() which calls filter_UnicodeData_line() 11377 # for each input line. This filter converts the input into line(s) that 11378 # process_generic_property_file() understands. There is also a setup 11379 # routine called before any of the file is processed, and a handler for 11380 # EOF processing, all in this closure. 11381 11382 # A huge speed-up occurred at the cost of some added complexity when these 11383 # routines were altered to buffer the outputs into ranges. Almost all the 11384 # lines of the input file apply to just one code point, and for most 11385 # properties, the map for the next code point up is the same as the 11386 # current one. So instead of creating a line for each property for each 11387 # input line, filter_UnicodeData_line() remembers what the previous map 11388 # of a property was, and doesn't generate a line to pass on until it has 11389 # to, as when the map changes; and that passed-on line encompasses the 11390 # whole contiguous range of code points that have the same map for that 11391 # property. This means a slight amount of extra setup, and having to 11392 # flush these buffers on EOF, testing if the maps have changed, plus 11393 # remembering state information in the closure. But it means a lot less 11394 # real time in not having to change the data base for each property on 11395 # each line. 11396 11397 # Another complication is that there are already a few ranges designated 11398 # in the input. There are two lines for each, with the same maps except 11399 # the code point and name on each line. This was actually the hardest 11400 # thing to design around. The code points in those ranges may actually 11401 # have real maps not given by these two lines. These maps will either 11402 # be algorithmically determinable, or be in the extracted files furnished 11403 # with the UCD. In the event of conflicts between these extracted files, 11404 # and this one, Unicode says that this one prevails. But it shouldn't 11405 # prevail for conflicts that occur in these ranges. The data from the 11406 # extracted files prevails in those cases. So, this program is structured 11407 # so that those files are processed first, storing maps. Then the other 11408 # files are processed, generally overwriting what the extracted files 11409 # stored. But just the range lines in this input file are processed 11410 # without overwriting. This is accomplished by adding a special string to 11411 # the lines output to tell process_generic_property_file() to turn off the 11412 # overwriting for just this one line. 11413 # A similar mechanism is used to tell it that the map is of a non-default 11414 # type. 11415 11416 sub setup_UnicodeData($file) { # Called before any lines of the input are read 11417 11418 # Create a new property specially located that is a combination of 11419 # various Name properties: Name, Unicode_1_Name, Named Sequences, and 11420 # _Perl_Name_Alias properties. (The final one duplicates elements of the 11421 # first, and starting in v6.1, is the same as the 'Name_Alias 11422 # property.) A comment for the new property will later be constructed 11423 # based on the actual properties present and used 11424 $perl_charname = Property->new('Perl_Charnames', 11425 Default_Map => "", 11426 Directory => File::Spec->curdir(), 11427 File => 'Name', 11428 Fate => $INTERNAL_ONLY, 11429 Perl_Extension => 1, 11430 Range_Size_1 => \&output_perl_charnames_line, 11431 Type => $STRING, 11432 ); 11433 $perl_charname->set_proxy_for('Name'); 11434 11435 my $Perl_decomp = Property->new('Perl_Decomposition_Mapping', 11436 Directory => File::Spec->curdir(), 11437 File => 'Decomposition', 11438 Format => $DECOMP_STRING_FORMAT, 11439 Fate => $INTERNAL_ONLY, 11440 Perl_Extension => 1, 11441 Default_Map => $CODE_POINT, 11442 11443 # normalize.pm can't cope with these 11444 Output_Range_Counts => 0, 11445 11446 # This is a specially formatted table 11447 # explicitly for normalize.pm, which 11448 # is expecting a particular format, 11449 # which means that mappings containing 11450 # multiple code points are in the main 11451 # body of the table 11452 Map_Type => $COMPUTE_NO_MULTI_CP, 11453 Type => $STRING, 11454 To_Output_Map => $INTERNAL_MAP, 11455 ); 11456 $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type'); 11457 $Perl_decomp->add_comment(join_lines(<<END 11458This mapping is a combination of the Unicode 'Decomposition_Type' and 11459'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is 11460identical to the official Unicode 'Decomposition_Mapping' property except for 11461two things: 11462 1) It omits the algorithmically determinable Hangul syllable decompositions, 11463which normalize.pm handles algorithmically. 11464 2) It contains the decomposition type as well. Non-canonical decompositions 11465begin with a word in angle brackets, like <super>, which denotes the 11466compatible decomposition type. If the map does not begin with the <angle 11467brackets>, the decomposition is canonical. 11468END 11469 )); 11470 11471 my $Decimal_Digit = Property->new("Perl_Decimal_Digit", 11472 Default_Map => "", 11473 Perl_Extension => 1, 11474 Directory => $map_directory, 11475 Type => $STRING, 11476 To_Output_Map => $OUTPUT_ADJUSTED, 11477 ); 11478 $Decimal_Digit->add_comment(join_lines(<<END 11479This file gives the mapping of all code points which represent a single 11480decimal digit [0-9] to their respective digits, but it has ranges of 10 code 11481points, and the mapping of each non-initial element of each range is actually 11482not to "0", but to the offset that element has from its corresponding DIGIT 0. 11483These code points are those that have Numeric_Type=Decimal; not special 11484things, like subscripts nor Roman numerals. 11485END 11486 )); 11487 11488 # These properties are not used for generating anything else, and are 11489 # usually not output. By making them last in the list, we can just 11490 # change the high end of the loop downwards to avoid the work of 11491 # generating a table(s) that is/are just going to get thrown away. 11492 if (! property_ref('Decomposition_Mapping')->to_output_map 11493 && ! property_ref('Name')->to_output_map) 11494 { 11495 $last_field = min($NAME, $DECOMP_MAP) - 1; 11496 } elsif (property_ref('Decomposition_Mapping')->to_output_map) { 11497 $last_field = $DECOMP_MAP; 11498 } elsif (property_ref('Name')->to_output_map) { 11499 $last_field = $NAME; 11500 } 11501 return; 11502 } 11503 11504 my $first_time = 1; # ? Is this the first line of the file 11505 my $in_range = 0; # ? Are we in one of the file's ranges 11506 my $previous_cp; # hex code point of previous line 11507 my $decimal_previous_cp = -1; # And its decimal equivalent 11508 my @start; # For each field, the current starting 11509 # code point in hex for the range 11510 # being accumulated. 11511 my @fields; # The input fields; 11512 my @previous_fields; # And those from the previous call 11513 11514 sub filter_UnicodeData_line($file) { 11515 # Handle a single input line from UnicodeData.txt; see comments above 11516 # Conceptually this takes a single line from the file containing N 11517 # properties, and converts it into N lines with one property per line, 11518 # which is what the final handler expects. But there are 11519 # complications due to the quirkiness of the input file, and to save 11520 # time, it accumulates ranges where the property values don't change 11521 # and only emits lines when necessary. This is about an order of 11522 # magnitude fewer lines emitted. 11523 11524 # $_ contains the input line. 11525 # -1 in split means retain trailing null fields 11526 (my $cp, @fields) = split /\s*;\s*/, $_, -1; 11527 11528 #local $to_trace = 1 if main::DEBUG; 11529 trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace; 11530 if (@fields > $input_field_count) { 11531 $file->carp_bad_line('Extra fields'); 11532 $_ = ""; 11533 return; 11534 } 11535 11536 my $decimal_cp = hex $cp; 11537 11538 # We have to output all the buffered ranges when the next code point 11539 # is not exactly one after the previous one, which means there is a 11540 # gap in the ranges. 11541 my $force_output = ($decimal_cp != $decimal_previous_cp + 1); 11542 11543 # The decomposition mapping field requires special handling. It looks 11544 # like either: 11545 # 11546 # <compat> 0032 0020 11547 # 0041 0300 11548 # 11549 # The decomposition type is enclosed in <brackets>; if missing, it 11550 # means the type is canonical. There are two decomposition mapping 11551 # tables: the one for use by Perl's normalize.pm has a special format 11552 # which is this field intact; the other, for general use is of 11553 # standard format. In either case we have to find the decomposition 11554 # type. Empty fields have None as their type, and map to the code 11555 # point itself 11556 if ($fields[$PERL_DECOMPOSITION] eq "") { 11557 $fields[$DECOMP_TYPE] = 'None'; 11558 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT; 11559 } 11560 else { 11561 ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION] 11562 =~ / < ( .+? ) > \s* ( .+ ) /x; 11563 if (! defined $fields[$DECOMP_TYPE]) { 11564 $fields[$DECOMP_TYPE] = 'Canonical'; 11565 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION]; 11566 } 11567 else { 11568 $fields[$DECOMP_MAP] = $map; 11569 } 11570 } 11571 11572 # The 3 numeric fields also require special handling. The 2 digit 11573 # fields must be either empty or match the number field. This means 11574 # that if it is empty, they must be as well, and the numeric type is 11575 # None, and the numeric value is 'Nan'. 11576 # The decimal digit field must be empty or match the other digit 11577 # field. If the decimal digit field is non-empty, the code point is 11578 # a decimal digit, and the other two fields will have the same value. 11579 # If it is empty, but the other digit field is non-empty, the code 11580 # point is an 'other digit', and the number field will have the same 11581 # value as the other digit field. If the other digit field is empty, 11582 # but the number field is non-empty, the code point is a generic 11583 # numeric type. 11584 if ($fields[$NUMERIC] eq "") { 11585 if ($fields[$PERL_DECIMAL_DIGIT] ne "" 11586 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "" 11587 ) { 11588 $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway"); 11589 } 11590 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None'; 11591 $fields[$NUMERIC] = 'NaN'; 11592 } 11593 else { 11594 $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x; 11595 if ($fields[$PERL_DECIMAL_DIGIT] ne "") { 11596 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC]; 11597 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'. Processing as Decimal") if $fields[$CATEGORY] ne "Nd"; 11598 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal'; 11599 } 11600 elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") { 11601 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC]; 11602 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit'; 11603 } 11604 else { 11605 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric'; 11606 11607 # Rationals require extra effort. 11608 if ($fields[$NUMERIC] =~ qr{/}) { 11609 reduce_fraction(\$fields[$NUMERIC]); 11610 register_fraction($fields[$NUMERIC]) 11611 } 11612 } 11613 } 11614 11615 # For the properties that have empty fields in the file, and which 11616 # mean something different from empty, change them to that default. 11617 # Certain fields just haven't been empty so far in any Unicode 11618 # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC, 11619 # $CATEGORY. This leaves just the two fields, and so we hard-code in 11620 # the defaults; which are very unlikely to ever change. 11621 $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq ""; 11622 $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq ""; 11623 11624 # UAX44 says that if title is empty, it is the same as whatever upper 11625 # is, 11626 $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq ""; 11627 11628 # There are a few pairs of lines like: 11629 # AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; 11630 # D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; 11631 # that define ranges. These should be processed after the fields are 11632 # adjusted above, as they may override some of them; but mostly what 11633 # is left is to possibly adjust the $CHARNAME field. The names of all the 11634 # paired lines start with a '<', but this is also true of '<control>, 11635 # which isn't one of these special ones. 11636 if ($fields[$CHARNAME] eq '<control>') { 11637 11638 # Some code points in this file have the pseudo-name 11639 # '<control>', but the official name for such ones is the null 11640 # string. 11641 $fields[$NAME] = $fields[$CHARNAME] = ""; 11642 11643 # We had better not be in between range lines. 11644 if ($in_range) { 11645 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway"); 11646 $in_range = 0; 11647 } 11648 } 11649 elsif (substr($fields[$CHARNAME], 0, 1) ne '<') { 11650 11651 # Here is a non-range line. We had better not be in between range 11652 # lines. 11653 if ($in_range) { 11654 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway"); 11655 $in_range = 0; 11656 } 11657 if ($fields[$CHARNAME] =~ s/- $cp $//x) { 11658 11659 # These are code points whose names end in their code points, 11660 # which means the names are algorithmically derivable from the 11661 # code points. To shorten the output Name file, the algorithm 11662 # for deriving these is placed in the file instead of each 11663 # code point, so they have map type $CP_IN_NAME 11664 $fields[$CHARNAME] = $CMD_DELIM 11665 . $MAP_TYPE_CMD 11666 . '=' 11667 . $CP_IN_NAME 11668 . $CMD_DELIM 11669 . $fields[$CHARNAME]; 11670 } 11671 $fields[$NAME] = $fields[$CHARNAME]; 11672 } 11673 elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) { 11674 $fields[$CHARNAME] = $fields[$NAME] = $1; 11675 11676 # Here we are at the beginning of a range pair. 11677 if ($in_range) { 11678 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'. Trying anyway"); 11679 } 11680 $in_range = 1; 11681 11682 # Because the properties in the range do not overwrite any already 11683 # in the db, we must flush the buffers of what's already there, so 11684 # they get handled in the normal scheme. 11685 $force_output = 1; 11686 11687 } 11688 elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) { 11689 $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME]. Ignoring this line."); 11690 $_ = ""; 11691 return; 11692 } 11693 else { # Here, we are at the last line of a range pair. 11694 11695 if (! $in_range) { 11696 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one. Ignoring this line."); 11697 $_ = ""; 11698 return; 11699 } 11700 $in_range = 0; 11701 11702 $fields[$NAME] = $fields[$CHARNAME]; 11703 11704 # Check that the input is valid: that the closing of the range is 11705 # the same as the beginning. 11706 foreach my $i (0 .. $last_field) { 11707 next if $fields[$i] eq $previous_fields[$i]; 11708 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway"); 11709 } 11710 11711 # The processing differs depending on the type of range, 11712 # determined by its $CHARNAME 11713 if ($fields[$CHARNAME] =~ /^Hangul Syllable/) { 11714 11715 # Check that the data looks right. 11716 if ($decimal_previous_cp != $SBase) { 11717 $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong"); 11718 } 11719 if ($decimal_cp != $SBase + $SCount - 1) { 11720 $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong"); 11721 } 11722 11723 # The Hangul syllable range has a somewhat complicated name 11724 # generation algorithm. Each code point in it has a canonical 11725 # decomposition also computable by an algorithm. The 11726 # perl decomposition map table built from these is used only 11727 # by normalize.pm, which has the algorithm built in it, so the 11728 # decomposition maps are not needed, and are large, so are 11729 # omitted from it. If the full decomposition map table is to 11730 # be output, the decompositions are generated for it, in the 11731 # EOF handling code for this input file. 11732 11733 $previous_fields[$DECOMP_TYPE] = 'Canonical'; 11734 11735 # This range is stored in our internal structure with its 11736 # own map type, different from all others. 11737 $previous_fields[$CHARNAME] = $previous_fields[$NAME] 11738 = $CMD_DELIM 11739 . $MAP_TYPE_CMD 11740 . '=' 11741 . $HANGUL_SYLLABLE 11742 . $CMD_DELIM 11743 . $fields[$CHARNAME]; 11744 } 11745 elsif ($fields[$CATEGORY] eq 'Lo') { # Is a letter 11746 11747 # All the CJK ranges like this have the name given as a 11748 # special case in the next code line. And for the others, we 11749 # hope that Unicode continues to use the correct name in 11750 # future releases, so we don't have to make further special 11751 # cases. 11752 my $name = ($fields[$CHARNAME] =~ /^CJK/) 11753 ? 'CJK UNIFIED IDEOGRAPH' 11754 : uc $fields[$CHARNAME]; 11755 11756 # The name for these contains the code point itself, and all 11757 # are defined to have the same base name, regardless of what 11758 # is in the file. They are stored in our internal structure 11759 # with a map type of $CP_IN_NAME 11760 $previous_fields[$CHARNAME] = $previous_fields[$NAME] 11761 = $CMD_DELIM 11762 . $MAP_TYPE_CMD 11763 . '=' 11764 . $CP_IN_NAME 11765 . $CMD_DELIM 11766 . $name; 11767 11768 } 11769 elsif ($fields[$CATEGORY] eq 'Co' 11770 || $fields[$CATEGORY] eq 'Cs') 11771 { 11772 # The names of all the code points in these ranges are set to 11773 # null, as there are no names for the private use and 11774 # surrogate code points. 11775 11776 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = ""; 11777 } 11778 else { 11779 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY]. Attempting to process it."); 11780 } 11781 11782 # The first line of the range caused everything else to be output, 11783 # and then its values were stored as the beginning values for the 11784 # next set of ranges, which this one ends. Now, for each value, 11785 # add a command to tell the handler that these values should not 11786 # replace any existing ones in our database. 11787 foreach my $i (0 .. $last_field) { 11788 $previous_fields[$i] = $CMD_DELIM 11789 . $REPLACE_CMD 11790 . '=' 11791 . $NO 11792 . $CMD_DELIM 11793 . $previous_fields[$i]; 11794 } 11795 11796 # And change things so it looks like the entire range has been 11797 # gone through with this being the final part of it. Adding the 11798 # command above to each field will cause this range to be flushed 11799 # during the next iteration, as it guaranteed that the stored 11800 # field won't match whatever value the next one has. 11801 $previous_cp = $cp; 11802 $decimal_previous_cp = $decimal_cp; 11803 11804 # We are now set up for the next iteration; so skip the remaining 11805 # code in this subroutine that does the same thing, but doesn't 11806 # know about these ranges. 11807 $_ = ""; 11808 11809 return; 11810 } 11811 11812 # On the very first line, we fake it so the code below thinks there is 11813 # nothing to output, and initialize so that when it does get output it 11814 # uses the first line's values for the lowest part of the range. 11815 # (One could avoid this by using peek(), but then one would need to 11816 # know the adjustments done above and do the same ones in the setup 11817 # routine; not worth it) 11818 if ($first_time) { 11819 $first_time = 0; 11820 @previous_fields = @fields; 11821 @start = ($cp) x scalar @fields; 11822 $decimal_previous_cp = $decimal_cp - 1; 11823 } 11824 11825 # For each field, output the stored up ranges that this code point 11826 # doesn't fit in. Earlier we figured out if all ranges should be 11827 # terminated because of changing the replace or map type styles, or if 11828 # there is a gap between this new code point and the previous one, and 11829 # that is stored in $force_output. But even if those aren't true, we 11830 # need to output the range if this new code point's value for the 11831 # given property doesn't match the stored range's. 11832 #local $to_trace = 1 if main::DEBUG; 11833 foreach my $i (0 .. $last_field) { 11834 my $field = $fields[$i]; 11835 if ($force_output || $field ne $previous_fields[$i]) { 11836 11837 # Flush the buffer of stored values. 11838 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]"); 11839 11840 # Start a new range with this code point and its value 11841 $start[$i] = $cp; 11842 $previous_fields[$i] = $field; 11843 } 11844 } 11845 11846 # Set the values for the next time. 11847 $previous_cp = $cp; 11848 $decimal_previous_cp = $decimal_cp; 11849 11850 # The input line has generated whatever adjusted lines are needed, and 11851 # should not be looked at further. 11852 $_ = ""; 11853 return; 11854 } 11855 11856 sub EOF_UnicodeData($file) { 11857 # Called upon EOF to flush the buffers, and create the Hangul 11858 # decomposition mappings if needed. 11859 11860 # Flush the buffers. 11861 foreach my $i (0 .. $last_field) { 11862 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]"); 11863 } 11864 11865 if (-e 'Jamo.txt') { 11866 11867 # The algorithm is published by Unicode, based on values in 11868 # Jamo.txt, (which should have been processed before this 11869 # subroutine), and the results left in %Jamo 11870 unless (%Jamo) { 11871 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated."); 11872 return; 11873 } 11874 11875 # If the full decomposition map table is being output, insert 11876 # into it the Hangul syllable mappings. This is to avoid having 11877 # to publish a subroutine in it to compute them. (which would 11878 # essentially be this code.) This uses the algorithm published by 11879 # Unicode. (No hangul syllables in version 1) 11880 if ($v_version ge v2.0.0 11881 && property_ref('Decomposition_Mapping')->to_output_map) { 11882 for (my $S = $SBase; $S < $SBase + $SCount; $S++) { 11883 use integer; 11884 my $SIndex = $S - $SBase; 11885 my $L = $LBase + $SIndex / $NCount; 11886 my $V = $VBase + ($SIndex % $NCount) / $TCount; 11887 my $T = $TBase + $SIndex % $TCount; 11888 11889 trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace; 11890 my $decomposition = sprintf("%04X %04X", $L, $V); 11891 $decomposition .= sprintf(" %04X", $T) if $T != $TBase; 11892 $file->insert_adjusted_lines( 11893 sprintf("%04X; Decomposition_Mapping; %s", 11894 $S, 11895 $decomposition)); 11896 } 11897 } 11898 } 11899 11900 return; 11901 } 11902 11903 sub filter_v1_ucd($file) { 11904 # Fix UCD lines in version 1. This is probably overkill, but this 11905 # fixes some glaring errors in Version 1 UnicodeData.txt. That file: 11906 # 1) had many Hangul (U+3400 - U+4DFF) code points that were later 11907 # removed. This program retains them 11908 # 2) didn't include ranges, which it should have, and which are now 11909 # added in @corrected_lines below. It was hand populated by 11910 # taking the data from Version 2, verified by analyzing 11911 # DAge.txt. 11912 # 3) There is a syntax error in the entry for U+09F8 which could 11913 # cause problems for Unicode::UCD, and so is changed. It's 11914 # numeric value was simply a minus sign, without any number. 11915 # (Eventually Unicode changed the code point to non-numeric.) 11916 # 4) The decomposition types often don't match later versions 11917 # exactly, and the whole syntax of that field is different; so 11918 # the syntax is changed as well as the types to their later 11919 # terminology. Otherwise normalize.pm would be very unhappy 11920 # 5) Many ccc classes are different. These are left intact. 11921 # 6) U+FF10..U+FF19 are missing their numeric values in all three 11922 # fields. These are unchanged because it doesn't really cause 11923 # problems for Perl. 11924 # 7) A number of code points, such as controls, don't have their 11925 # Unicode Version 1 Names in this file. These are added. 11926 # 8) A number of Symbols were marked as Lm. This changes those in 11927 # the Latin1 range, so that regexes work. 11928 # 9) The odd characters U+03DB .. U+03E1 weren't encoded but are 11929 # referred to by their lc equivalents. Not fixed. 11930 11931 my @corrected_lines = split /\n/, <<'END'; 119324E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;; 119339FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; 11934E000;<Private Use, First>;Co;0;L;;;;;N;;;;; 11935F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;; 11936F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;; 11937FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;; 11938END 11939 11940 #local $to_trace = 1 if main::DEBUG; 11941 trace $_ if main::DEBUG && $to_trace; 11942 11943 # -1 => retain trailing null fields 11944 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 11945 11946 # At the first place that is wrong in the input, insert all the 11947 # corrections, replacing the wrong line. 11948 if ($code_point eq '4E00') { 11949 my @copy = @corrected_lines; 11950 $_ = shift @copy; 11951 ($code_point, @fields) = split /\s*;\s*/, $_, -1; 11952 11953 $file->insert_lines(@copy); 11954 } 11955 elsif ($code_point =~ /^00/ && $fields[$CATEGORY] eq 'Lm') { 11956 11957 # There are no Lm characters in Latin1; these should be 'Sk', but 11958 # there isn't that in V1. 11959 $fields[$CATEGORY] = 'So'; 11960 } 11961 11962 if ($fields[$NUMERIC] eq '-') { 11963 $fields[$NUMERIC] = '-1'; # This is what 2.0 made it. 11964 } 11965 11966 if ($fields[$PERL_DECOMPOSITION] ne "") { 11967 11968 # Several entries have this change to superscript 2 or 3 in the 11969 # middle. Convert these to the modern version, which is to use 11970 # the actual U+00B2 and U+00B3 (the superscript forms) instead. 11971 # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes 11972 # 'HHHH HHHH 00B3 HHHH'. 11973 # It turns out that all of these that don't have another 11974 # decomposition defined at the beginning of the line have the 11975 # <square> decomposition in later releases. 11976 if ($code_point ne '00B2' && $code_point ne '00B3') { 11977 if ($fields[$PERL_DECOMPOSITION] 11978 =~ s/<\+sup> 003([23]) <-sup>/00B$1/) 11979 { 11980 if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') { 11981 $fields[$PERL_DECOMPOSITION] = '<square> ' 11982 . $fields[$PERL_DECOMPOSITION]; 11983 } 11984 } 11985 } 11986 11987 # If is like '<+circled> 0052 <-circled>', convert to 11988 # '<circled> 0052' 11989 $fields[$PERL_DECOMPOSITION] =~ 11990 s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/xg; 11991 11992 # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc. 11993 $fields[$PERL_DECOMPOSITION] =~ 11994 s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x 11995 or $fields[$PERL_DECOMPOSITION] =~ 11996 s/ <join> \s* (.*?) \s* <join> /<medial> $1/x 11997 or $fields[$PERL_DECOMPOSITION] =~ 11998 s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x 11999 or $fields[$PERL_DECOMPOSITION] =~ 12000 s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x; 12001 12002 # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc. 12003 $fields[$PERL_DECOMPOSITION] =~ 12004 s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x; 12005 12006 # Change names to modern form. 12007 $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g; 12008 $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g; 12009 $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g; 12010 $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g; 12011 12012 # One entry has weird braces 12013 $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g; 12014 12015 # One entry at U+2116 has an extra <sup> 12016 $fields[$PERL_DECOMPOSITION] =~ s/( < .*? > .* ) < .*? > \ * /$1/x; 12017 } 12018 12019 $_ = join ';', $code_point, @fields; 12020 trace $_ if main::DEBUG && $to_trace; 12021 return; 12022 } 12023 12024 sub filter_bad_Nd_ucd { 12025 # Early versions specified a value in the decimal digit field even 12026 # though the code point wasn't a decimal digit. Clear the field in 12027 # that situation, so that the main code doesn't think it is a decimal 12028 # digit. 12029 12030 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 12031 if ($fields[$PERL_DECIMAL_DIGIT] ne "" && $fields[$CATEGORY] ne 'Nd') { 12032 $fields[$PERL_DECIMAL_DIGIT] = ""; 12033 $_ = join ';', $code_point, @fields; 12034 } 12035 return; 12036 } 12037 12038 my @U1_control_names = split /\n/, <<'END'; 12039NULL 12040START OF HEADING 12041START OF TEXT 12042END OF TEXT 12043END OF TRANSMISSION 12044ENQUIRY 12045ACKNOWLEDGE 12046BELL 12047BACKSPACE 12048HORIZONTAL TABULATION 12049LINE FEED 12050VERTICAL TABULATION 12051FORM FEED 12052CARRIAGE RETURN 12053SHIFT OUT 12054SHIFT IN 12055DATA LINK ESCAPE 12056DEVICE CONTROL ONE 12057DEVICE CONTROL TWO 12058DEVICE CONTROL THREE 12059DEVICE CONTROL FOUR 12060NEGATIVE ACKNOWLEDGE 12061SYNCHRONOUS IDLE 12062END OF TRANSMISSION BLOCK 12063CANCEL 12064END OF MEDIUM 12065SUBSTITUTE 12066ESCAPE 12067FILE SEPARATOR 12068GROUP SEPARATOR 12069RECORD SEPARATOR 12070UNIT SEPARATOR 12071DELETE 12072BREAK PERMITTED HERE 12073NO BREAK HERE 12074INDEX 12075NEXT LINE 12076START OF SELECTED AREA 12077END OF SELECTED AREA 12078CHARACTER TABULATION SET 12079CHARACTER TABULATION WITH JUSTIFICATION 12080LINE TABULATION SET 12081PARTIAL LINE DOWN 12082PARTIAL LINE UP 12083REVERSE LINE FEED 12084SINGLE SHIFT TWO 12085SINGLE SHIFT THREE 12086DEVICE CONTROL STRING 12087PRIVATE USE ONE 12088PRIVATE USE TWO 12089SET TRANSMIT STATE 12090CANCEL CHARACTER 12091MESSAGE WAITING 12092START OF GUARDED AREA 12093END OF GUARDED AREA 12094START OF STRING 12095SINGLE CHARACTER INTRODUCER 12096CONTROL SEQUENCE INTRODUCER 12097STRING TERMINATOR 12098OPERATING SYSTEM COMMAND 12099PRIVACY MESSAGE 12100APPLICATION PROGRAM COMMAND 12101END 12102 12103 sub filter_early_U1_names { 12104 # Very early versions did not have the Unicode_1_name field specified. 12105 # They differed in which ones were present; make sure a U1 name 12106 # exists, so that Unicode::UCD::charinfo will work 12107 12108 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 12109 12110 12111 # @U1_control names above are entirely positional, so we pull them out 12112 # in the exact order required, with gaps for the ones that don't have 12113 # names. 12114 if ($code_point =~ /^00[01]/ 12115 || $code_point eq '007F' 12116 || $code_point =~ /^008[2-9A-F]/ 12117 || $code_point =~ /^009[0-8A-F]/) 12118 { 12119 my $u1_name = shift @U1_control_names; 12120 $fields[$UNICODE_1_NAME] = $u1_name unless $fields[$UNICODE_1_NAME]; 12121 $_ = join ';', $code_point, @fields; 12122 } 12123 return; 12124 } 12125 12126 sub filter_v2_1_5_ucd { 12127 # A dozen entries in this 2.1.5 file had the mirrored and numeric 12128 # columns swapped; These all had mirrored be 'N'. So if the numeric 12129 # column appears to be N, swap it back. 12130 12131 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 12132 if ($fields[$NUMERIC] eq 'N') { 12133 $fields[$NUMERIC] = $fields[$MIRRORED]; 12134 $fields[$MIRRORED] = 'N'; 12135 $_ = join ';', $code_point, @fields; 12136 } 12137 return; 12138 } 12139 12140 sub filter_v6_ucd { 12141 12142 # Unicode 6.0 co-opted the name BELL for U+1F514, but until 5.17, 12143 # it wasn't accepted, to allow for some deprecation cycles. This 12144 # function is not called after 5.16 12145 12146 return if $_ !~ /^(?:0007|1F514|070F);/; 12147 12148 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 12149 if ($code_point eq '0007') { 12150 $fields[$CHARNAME] = ""; 12151 } 12152 elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see 12153 # http://www.unicode.org/versions/corrigendum8.html 12154 $fields[$BIDI] = "AL"; 12155 } 12156 elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name 12157 $fields[$CHARNAME] = ""; 12158 } 12159 12160 $_ = join ';', $code_point, @fields; 12161 12162 return; 12163 } 12164} # End closure for UnicodeData 12165 12166sub process_GCB_test($file) { 12167 12168 while ($file->next_line) { 12169 push @backslash_X_tests, $_; 12170 } 12171 12172 return; 12173} 12174 12175sub process_LB_test($file) { 12176 12177 while ($file->next_line) { 12178 push @LB_tests, $_; 12179 } 12180 12181 return; 12182} 12183 12184sub process_SB_test($file) { 12185 12186 while ($file->next_line) { 12187 push @SB_tests, $_; 12188 } 12189 12190 return; 12191} 12192 12193sub process_WB_test($file) { 12194 12195 while ($file->next_line) { 12196 push @WB_tests, $_; 12197 } 12198 12199 return; 12200} 12201 12202sub process_NamedSequences($file) { 12203 # NamedSequences.txt entries are just added to an array. Because these 12204 # don't look like the other tables, they have their own handler. 12205 # An example: 12206 # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300 12207 # 12208 # This just adds the sequence to an array for later handling 12209 12210 while ($file->next_line) { 12211 my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1; 12212 if (@remainder) { 12213 $file->carp_bad_line( 12214 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'"); 12215 next; 12216 } 12217 12218 # Code points need to be 5 digits long like the other entries in 12219 # Name.pl, for regcomp.c parsing; and the ones below 0x0100 need to be 12220 # converted to native 12221 $sequence = join " ", map { sprintf("%05X", 12222 utf8::unicode_to_native(hex $_)) 12223 } split / /, $sequence; 12224 push @named_sequences, "$sequence\n$name\n"; 12225 } 12226 return; 12227} 12228 12229{ # Closure 12230 12231 my $first_range; 12232 12233 sub filter_early_ea_lb { 12234 # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a 12235 # third field be the name of the code point, which can be ignored in 12236 # most cases. But it can be meaningful if it marks a range: 12237 # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE 12238 # 3400;W;<CJK Ideograph Extension A, First> 12239 # 12240 # We need to see the First in the example above to know it's a range. 12241 # They did not use the later range syntaxes. This routine changes it 12242 # to use the modern syntax. 12243 # $1 is the Input_file object. 12244 12245 my @fields = split /\s*;\s*/; 12246 if ($fields[2] =~ /^<.*, First>/) { 12247 $first_range = $fields[0]; 12248 $_ = ""; 12249 } 12250 elsif ($fields[2] =~ /^<.*, Last>/) { 12251 $_ = $_ = "$first_range..$fields[0]; $fields[1]"; 12252 } 12253 else { 12254 undef $first_range; 12255 $_ = "$fields[0]; $fields[1]"; 12256 } 12257 12258 return; 12259 } 12260} 12261 12262sub filter_substitute_lb { 12263 # Used on Unicodes that predate the LB property, where there is a 12264 # substitute file. This just does the regular ea_lb handling for such 12265 # files, and then substitutes the long property value name for the short 12266 # one that comes with the file. (The other break files have the long 12267 # names in them, so this is the odd one out.) The reason for doing this 12268 # kludge is that regen/mk_invlists.pl is expecting the long name. This 12269 # also fixes the typo 'Inseperable' that leads to problems. 12270 12271 filter_early_ea_lb; 12272 return unless $_; 12273 12274 my @fields = split /\s*;\s*/; 12275 $fields[1] = property_ref('_Perl_LB')->table($fields[1])->full_name; 12276 $fields[1] = 'Inseparable' if lc $fields[1] eq 'inseperable'; 12277 $_ = join '; ', @fields; 12278} 12279 12280sub filter_old_style_arabic_shaping { 12281 # Early versions used a different term for the later one. 12282 12283 my @fields = split /\s*;\s*/; 12284 $fields[3] =~ s/<no shaping>/No_Joining_Group/; 12285 $fields[3] =~ s/\s+/_/g; # Change spaces to underscores 12286 $_ = join ';', @fields; 12287 return; 12288} 12289 12290{ # Closure 12291 my $lc; # Table for lowercase mapping 12292 my $tc; 12293 my $uc; 12294 my %special_casing_code_points; 12295 12296 sub setup_special_casing($file) { 12297 # SpecialCasing.txt contains the non-simple case change mappings. The 12298 # simple ones are in UnicodeData.txt, which should already have been 12299 # read in to the full property data structures, so as to initialize 12300 # these with the simple ones. Then the SpecialCasing.txt entries 12301 # add or overwrite the ones which have different full mappings. 12302 12303 # This routine sees if the simple mappings are to be output, and if 12304 # so, copies what has already been put into the full mapping tables, 12305 # while they still contain only the simple mappings. 12306 12307 # The reason it is done this way is that the simple mappings are 12308 # probably not going to be output, so it saves work to initialize the 12309 # full tables with the simple mappings, and then overwrite those 12310 # relatively few entries in them that have different full mappings, 12311 # and thus skip the simple mapping tables altogether. 12312 12313 $lc = property_ref('lc'); 12314 $tc = property_ref('tc'); 12315 $uc = property_ref('uc'); 12316 12317 # For each of the case change mappings... 12318 foreach my $full_casing_table ($lc, $tc, $uc) { 12319 my $full_casing_name = $full_casing_table->name; 12320 my $full_casing_full_name = $full_casing_table->full_name; 12321 unless (defined $full_casing_table 12322 && ! $full_casing_table->is_empty) 12323 { 12324 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated."); 12325 } 12326 12327 $full_casing_table->add_comment(join_lines( <<END 12328This file includes both the simple and full case changing maps. The simple 12329ones are in the main body of the table below, and the full ones adding to or 12330overriding them are in the hash. 12331END 12332 )); 12333 12334 # The simple version's name in each mapping merely has an 's' in 12335 # front of the full one's 12336 my $simple_name = 's' . $full_casing_name; 12337 my $simple = property_ref($simple_name); 12338 $simple->initialize($full_casing_table) if $simple->to_output_map(); 12339 } 12340 12341 return; 12342 } 12343 12344 sub filter_2_1_8_special_casing_line { 12345 12346 # This version had duplicate entries in this file. Delete all but the 12347 # first one 12348 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null 12349 # fields 12350 if (exists $special_casing_code_points{$fields[0]}) { 12351 $_ = ""; 12352 return; 12353 } 12354 12355 $special_casing_code_points{$fields[0]} = 1; 12356 filter_special_casing_line(@_); 12357 } 12358 12359 sub filter_special_casing_line($file) { 12360 # Change the format of $_ from SpecialCasing.txt into something that 12361 # the generic handler understands. Each input line contains three 12362 # case mappings. This will generate three lines to pass to the 12363 # generic handler for each of those. 12364 12365 # The input syntax (after stripping comments and trailing white space 12366 # is like one of the following (with the final two being entries that 12367 # we ignore): 12368 # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S 12369 # 03A3; 03C2; 03A3; 03A3; Final_Sigma; 12370 # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE 12371 # Note the trailing semi-colon, unlike many of the input files. That 12372 # means that there will be an extra null field generated by the split 12373 12374 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null 12375 # fields 12376 12377 # field #4 is when this mapping is conditional. If any of these get 12378 # implemented, it would be by hard-coding in the casing functions in 12379 # the Perl core, not through tables. But if there is a new condition 12380 # we don't know about, output a warning. We know about all the 12381 # conditions through 6.0 12382 if ($fields[4] ne "") { 12383 my @conditions = split ' ', $fields[4]; 12384 if ($conditions[0] ne 'tr' # We know that these languages have 12385 # conditions, and some are multiple 12386 && $conditions[0] ne 'az' 12387 && $conditions[0] ne 'lt' 12388 12389 # And, we know about a single condition Final_Sigma, but 12390 # nothing else. 12391 && ($v_version gt v5.2.0 12392 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma'))) 12393 { 12394 $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore"); 12395 } 12396 elsif ($conditions[0] ne 'Final_Sigma') { 12397 12398 # Don't print out a message for Final_Sigma, because we 12399 # have hard-coded handling for it. (But the standard 12400 # could change what the rule should be, but it wouldn't 12401 # show up here anyway. 12402 12403 print "# SKIPPING Special Casing: $_\n" 12404 if $verbosity >= $VERBOSE; 12405 } 12406 $_ = ""; 12407 return; 12408 } 12409 elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) { 12410 $file->carp_bad_line('Extra fields'); 12411 $_ = ""; 12412 return; 12413 } 12414 12415 my $decimal_code_point = hex $fields[0]; 12416 12417 # Loop to handle each of the three mappings in the input line, in 12418 # order, with $i indicating the current field number. 12419 my $i = 0; 12420 for my $object ($lc, $tc, $uc) { 12421 $i++; # First time through, $i = 0 ... 3rd time = 3 12422 12423 my $value = $object->value_of($decimal_code_point); 12424 $value = ($value eq $CODE_POINT) 12425 ? $decimal_code_point 12426 : hex $value; 12427 12428 # If this isn't a multi-character mapping, it should already have 12429 # been read in. 12430 if ($fields[$i] !~ / /) { 12431 if ($value != hex $fields[$i]) { 12432 Carp::my_carp("Bad news. UnicodeData.txt thinks " 12433 . $object->name 12434 . "(0x$fields[0]) is $value" 12435 . " and SpecialCasing.txt thinks it is " 12436 . hex($fields[$i]) 12437 . ". Good luck. Retaining UnicodeData value, and proceeding anyway."); 12438 } 12439 } 12440 else { 12441 12442 # The mapping is additional, beyond the simple mapping. 12443 $file->insert_adjusted_lines("$fields[0]; " 12444 . $object->name 12445 . "; " 12446 . $CMD_DELIM 12447 . "$REPLACE_CMD=$MULTIPLE_BEFORE" 12448 . $CMD_DELIM 12449 . $fields[$i]); 12450 } 12451 } 12452 12453 # Everything has been handled by the insert_adjusted_lines() 12454 $_ = ""; 12455 12456 return; 12457 } 12458} 12459 12460sub filter_old_style_case_folding($file) { 12461 # This transforms $_ containing the case folding style of 3.0.1, to 3.1 12462 # and later style. Different letters were used in the earlier. 12463 12464 my @fields = split /\s*;\s*/; 12465 12466 if ($fields[1] eq 'L') { 12467 $fields[1] = 'C'; # L => C always 12468 } 12469 elsif ($fields[1] eq 'E') { 12470 if ($fields[2] =~ / /) { # E => C if one code point; F otherwise 12471 $fields[1] = 'F' 12472 } 12473 else { 12474 $fields[1] = 'C' 12475 } 12476 } 12477 else { 12478 $file->carp_bad_line("Expecting L or E in second field"); 12479 $_ = ""; 12480 return; 12481 } 12482 $_ = join("; ", @fields) . ';'; 12483 return; 12484} 12485 12486{ # Closure for case folding 12487 12488 # Create the map for simple only if are going to output it, for otherwise 12489 # it takes no part in anything we do. 12490 my $to_output_simple; 12491 12492 sub setup_case_folding { 12493 # Read in the case foldings in CaseFolding.txt. This handles both 12494 # simple and full case folding. 12495 12496 $to_output_simple 12497 = property_ref('Simple_Case_Folding')->to_output_map; 12498 12499 if (! $to_output_simple) { 12500 property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding'); 12501 } 12502 12503 # If we ever wanted to show that these tables were combined, a new 12504 # property method could be created, like set_combined_props() 12505 property_ref('Case_Folding')->add_comment(join_lines( <<END 12506This file includes both the simple and full case folding maps. The simple 12507ones are in the main body of the table below, and the full ones adding to or 12508overriding them are in the hash. 12509END 12510 )); 12511 return; 12512 } 12513 12514 sub filter_case_folding_line($file) { 12515 # Called for each line in CaseFolding.txt 12516 # Input lines look like: 12517 # 0041; C; 0061; # LATIN CAPITAL LETTER A 12518 # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S 12519 # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S 12520 # 12521 # 'C' means that folding is the same for both simple and full 12522 # 'F' that it is only for full folding 12523 # 'S' that it is only for simple folding 12524 # 'T' is locale-dependent, and ignored 12525 # 'I' is a type of 'F' used in some early releases. 12526 # Note the trailing semi-colon, unlike many of the input files. That 12527 # means that there will be an extra null field generated by the split 12528 # below, which we ignore and hence is not an error. 12529 12530 my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1; 12531 if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) { 12532 $file->carp_bad_line('Extra fields'); 12533 $_ = ""; 12534 return; 12535 } 12536 12537 if ($type =~ / ^ [IT] $/x) { # Skip Turkic case folding, is locale dependent 12538 $_ = ""; 12539 return; 12540 } 12541 12542 # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase 12543 # I are all full foldings; S is single-char. For S, there is always 12544 # an F entry, so we must allow multiple values for the same code 12545 # point. Fortunately this table doesn't need further manipulation 12546 # which would preclude using multiple-values. The S is now included 12547 # so that _swash_inversion_hash() is able to construct closures 12548 # without having to worry about F mappings. 12549 if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') { 12550 $_ = "$range; Case_Folding; " 12551 . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map"; 12552 } 12553 else { 12554 $_ = ""; 12555 $file->carp_bad_line('Expecting C F I S or T in second field'); 12556 } 12557 12558 # C and S are simple foldings, but simple case folding is not needed 12559 # unless we explicitly want its map table output. 12560 if ($to_output_simple && $type eq 'C' || $type eq 'S') { 12561 $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map"); 12562 } 12563 12564 return; 12565 } 12566 12567} # End case fold closure 12568 12569sub filter_jamo_line { 12570 # Filter Jamo.txt lines. This routine mainly is used to populate hashes 12571 # from this file that is used in generating the Name property for Jamo 12572 # code points. But, it also is used to convert early versions' syntax 12573 # into the modern form. Here are two examples: 12574 # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax 12575 # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax 12576 # 12577 # The input is $_, the output is $_ filtered. 12578 12579 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 12580 12581 # Let the caller handle unexpected input. In earlier versions, there was 12582 # a third field which is supposed to be a comment, but did not have a '#' 12583 # before it. 12584 return if @fields > (($v_version gt v3.0.0) ? 2 : 3); 12585 12586 $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous 12587 # beginning. 12588 12589 # Some 2.1 versions had this wrong. Causes havoc with the algorithm. 12590 $fields[1] = 'R' if $fields[0] eq '1105'; 12591 12592 # Add to structure so can generate Names from it. 12593 my $cp = hex $fields[0]; 12594 my $short_name = $fields[1]; 12595 $Jamo{$cp} = $short_name; 12596 if ($cp <= $LBase + $LCount) { 12597 $Jamo_L{$short_name} = $cp - $LBase; 12598 } 12599 elsif ($cp <= $VBase + $VCount) { 12600 $Jamo_V{$short_name} = $cp - $VBase; 12601 } 12602 elsif ($cp <= $TBase + $TCount) { 12603 $Jamo_T{$short_name} = $cp - $TBase; 12604 } 12605 else { 12606 Carp::my_carp_bug("Unexpected Jamo code point in $_"); 12607 } 12608 12609 12610 # Reassemble using just the first two fields to look like a typical 12611 # property file line 12612 $_ = "$fields[0]; $fields[1]"; 12613 12614 return; 12615} 12616 12617sub register_fraction($rational) { 12618 # This registers the input rational number so that it can be passed on to 12619 # Unicode::UCD, both in rational and floating forms. 12620 12621 my $floating = eval $rational; 12622 12623 my @floats = sprintf "%.*e", $E_FLOAT_PRECISION, $floating; 12624 12625 # See if the denominator is a power of 2. 12626 $rational =~ m!.*/(.*)!; 12627 my $denominator = $1; 12628 if (defined $denominator && (($denominator & ($denominator - 1)) == 0)) { 12629 12630 # Here the denominator is a power of 2. This means it has an exact 12631 # representation in binary, so rounding could go either way. It turns 12632 # out that Windows doesn't necessarily round towards even, so output 12633 # an extra entry. This happens when the final digit we output is even 12634 # and the next digits would be 50* to the precision of the machine. 12635 my $extra_digit_float = sprintf "%e", $floating; 12636 my $q = $E_FLOAT_PRECISION - 1; 12637 if ($extra_digit_float =~ / ( .* \. \d{$q} ) 12638 ( [02468] ) 5 0* ( e .*) 12639 /ix) 12640 { 12641 push @floats, $1 . ($2 + 1) . $3; 12642 } 12643 } 12644 12645 foreach my $float (@floats) { 12646 # Strip off any leading zeros beyond 2 digits to make it C99 12647 # compliant. (Windows has 3 digit exponents, contrary to C99) 12648 $float =~ s/ ( .* e [-+] ) 0* ( \d{2,}? ) /$1$2/x; 12649 12650 if ( defined $nv_floating_to_rational{$float} 12651 && $nv_floating_to_rational{$float} ne $rational) 12652 { 12653 die Carp::my_carp_bug("Both '$rational' and" 12654 . " '$nv_floating_to_rational{$float}' evaluate to" 12655 . " the same floating point number." 12656 . " \$E_FLOAT_PRECISION must be increased"); 12657 } 12658 $nv_floating_to_rational{$float} = $rational; 12659 } 12660 return; 12661} 12662 12663sub gcd($a, $b) { # Greatest-common-divisor; from 12664 # http://en.wikipedia.org/wiki/Euclidean_algorithm 12665 use integer; 12666 12667 while ($b != 0) { 12668 my $temp = $b; 12669 $b = $a % $b; 12670 $a = $temp; 12671 } 12672 return $a; 12673} 12674 12675sub reduce_fraction($fraction_ref) { 12676 # Reduce a fraction to lowest terms. The Unicode data may be reducible, 12677 # hence this is needed. The argument is a reference to the 12678 # string denoting the fraction, which must be of the form: 12679 if ($$fraction_ref !~ / ^ (-?) (\d+) \/ (\d+) $ /ax) { 12680 Carp::my_carp_bug("Non-fraction input '$$fraction_ref'. Unchanged"); 12681 return; 12682 } 12683 12684 my $sign = $1; 12685 my $numerator = $2; 12686 my $denominator = $3; 12687 12688 use integer; 12689 12690 # Find greatest common divisor 12691 my $gcd = gcd($numerator, $denominator); 12692 12693 # And reduce using the gcd. 12694 if ($gcd != 1) { 12695 $numerator /= $gcd; 12696 $denominator /= $gcd; 12697 $$fraction_ref = "$sign$numerator/$denominator"; 12698 } 12699 12700 return; 12701} 12702 12703sub filter_numeric_value_line($file) { 12704 # DNumValues contains lines of a different syntax than the typical 12705 # property file: 12706 # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO 12707 # 12708 # This routine transforms $_ containing the anomalous syntax to the 12709 # typical, by filtering out the extra columns, and convert early version 12710 # decimal numbers to strings that look like rational numbers. 12711 12712 # Starting in 5.1, there is a rational field. Just use that, omitting the 12713 # extra columns. Otherwise convert the decimal number in the second field 12714 # to a rational, and omit extraneous columns. 12715 my @fields = split /\s*;\s*/, $_, -1; 12716 my $rational; 12717 12718 if ($v_version ge v5.1.0) { 12719 if (@fields != 4) { 12720 $file->carp_bad_line('Not 4 semi-colon separated fields'); 12721 $_ = ""; 12722 return; 12723 } 12724 reduce_fraction(\$fields[3]) if $fields[3] =~ qr{/}; 12725 $rational = $fields[3]; 12726 12727 $_ = join '; ', @fields[ 0, 3 ]; 12728 } 12729 else { 12730 12731 # Here, is an older Unicode file, which has decimal numbers instead of 12732 # rationals in it. Use the fraction to calculate the denominator and 12733 # convert to rational. 12734 12735 if (@fields != 2 && @fields != 3) { 12736 $file->carp_bad_line('Not 2 or 3 semi-colon separated fields'); 12737 $_ = ""; 12738 return; 12739 } 12740 12741 my $codepoints = $fields[0]; 12742 my $decimal = $fields[1]; 12743 if ($decimal =~ s/\.0+$//) { 12744 12745 # Anything ending with a decimal followed by nothing but 0's is an 12746 # integer 12747 $_ = "$codepoints; $decimal"; 12748 $rational = $decimal; 12749 } 12750 else { 12751 12752 my $denominator; 12753 if ($decimal =~ /\.50*$/) { 12754 $denominator = 2; 12755 } 12756 12757 # Here have the hardcoded repeating decimals in the fraction, and 12758 # the denominator they imply. There were only a few denominators 12759 # in the older Unicode versions of this file which this code 12760 # handles, so it is easy to convert them. 12761 12762 # The 4 is because of a round-off error in the Unicode 3.2 files 12763 elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) { 12764 $denominator = 3; 12765 } 12766 elsif ($decimal =~ /\.[27]50*$/) { 12767 $denominator = 4; 12768 } 12769 elsif ($decimal =~ /\.[2468]0*$/) { 12770 $denominator = 5; 12771 } 12772 elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) { 12773 $denominator = 6; 12774 } 12775 elsif ($decimal =~ /\.(12|37|62|87)50*$/) { 12776 $denominator = 8; 12777 } 12778 if ($denominator) { 12779 my $sign = ($decimal < 0) ? "-" : ""; 12780 my $numerator = int((abs($decimal) * $denominator) + .5); 12781 $rational = "$sign$numerator/$denominator"; 12782 $_ = "$codepoints; $rational"; 12783 } 12784 else { 12785 $file->carp_bad_line("Can't cope with number '$decimal'."); 12786 $_ = ""; 12787 return; 12788 } 12789 } 12790 } 12791 12792 register_fraction($rational) if $rational =~ qr{/}; 12793 return; 12794} 12795 12796{ # Closure 12797 my %unihan_properties; 12798 12799 sub construct_unihan($file_object) { 12800 12801 return unless file_exists($file_object->file); 12802 12803 if ($v_version lt v4.0.0) { 12804 push @cjk_properties, 'URS ; Unicode_Radical_Stroke'; 12805 push @cjk_property_values, split "\n", <<'END'; 12806# @missing: 0000..10FFFF; Unicode_Radical_Stroke; <none> 12807END 12808 } 12809 12810 if ($v_version ge v3.0.0) { 12811 push @cjk_properties, split "\n", <<'END'; 12812cjkIRG_GSource; kIRG_GSource 12813cjkIRG_JSource; kIRG_JSource 12814cjkIRG_KSource; kIRG_KSource 12815cjkIRG_TSource; kIRG_TSource 12816cjkIRG_VSource; kIRG_VSource 12817END 12818 push @cjk_property_values, split "\n", <<'END'; 12819# @missing: 0000..10FFFF; cjkIRG_GSource; <none> 12820# @missing: 0000..10FFFF; cjkIRG_JSource; <none> 12821# @missing: 0000..10FFFF; cjkIRG_KSource; <none> 12822# @missing: 0000..10FFFF; cjkIRG_TSource; <none> 12823# @missing: 0000..10FFFF; cjkIRG_VSource; <none> 12824END 12825 } 12826 if ($v_version ge v3.1.0) { 12827 push @cjk_properties, 'cjkIRG_HSource; kIRG_HSource'; 12828 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_HSource; <none>'; 12829 } 12830 if ($v_version ge v3.1.1) { 12831 push @cjk_properties, 'cjkIRG_KPSource; kIRG_KPSource'; 12832 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_KPSource; <none>'; 12833 } 12834 if ($v_version ge v3.2.0) { 12835 push @cjk_properties, split "\n", <<'END'; 12836cjkAccountingNumeric; kAccountingNumeric 12837cjkCompatibilityVariant; kCompatibilityVariant 12838cjkOtherNumeric; kOtherNumeric 12839cjkPrimaryNumeric; kPrimaryNumeric 12840END 12841 push @cjk_property_values, split "\n", <<'END'; 12842# @missing: 0000..10FFFF; cjkAccountingNumeric; NaN 12843# @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point> 12844# @missing: 0000..10FFFF; cjkOtherNumeric; NaN 12845# @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN 12846END 12847 } 12848 if ($v_version gt v4.0.0) { 12849 push @cjk_properties, 'cjkIRG_USource; kIRG_USource'; 12850 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_USource; <none>'; 12851 } 12852 12853 if ($v_version ge v4.1.0) { 12854 push @cjk_properties, 'cjkIICore ; kIICore'; 12855 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIICore; <none>'; 12856 } 12857 } 12858 12859 sub setup_unihan { 12860 # Do any special setup for Unihan properties. 12861 12862 # This property gives the wrong computed type, so override. 12863 my $usource = property_ref('kIRG_USource'); 12864 $usource->set_type($STRING) if defined $usource; 12865 12866 # This property is to be considered binary (it says so in 12867 # http://www.unicode.org/reports/tr38/) 12868 my $iicore = property_ref('kIICore'); 12869 if (defined $iicore) { 12870 $iicore->set_type($FORCED_BINARY); 12871 $iicore->table("Y")->add_note("Matches any code point which has a non-null value for this property; see unicode.org UAX #38."); 12872 12873 # Unicode doesn't include the maps for this property, so don't 12874 # warn that they are missing. 12875 $iicore->set_pre_declared_maps(0); 12876 $iicore->add_comment(join_lines( <<END 12877This property contains string values, but any non-empty ones are considered to 12878be 'core', so Perl creates tables for both: 1) its string values, plus 2) 12879tables so that \\p{kIICore} matches any code point which has a non-empty 12880value for this property. 12881END 12882 )); 12883 } 12884 12885 return; 12886 } 12887 12888 sub filter_unihan_line { 12889 # Change unihan db lines to look like the others in the db. Here is 12890 # an input sample: 12891 # U+341C kCangjie IEKN 12892 12893 # Tabs are used instead of semi-colons to separate fields; therefore 12894 # they may have semi-colons embedded in them. Change these to periods 12895 # so won't screw up the rest of the code. 12896 s/;/./g; 12897 12898 # Remove lines that don't look like ones we accept. 12899 if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) { 12900 $_ = ""; 12901 return; 12902 } 12903 12904 # Extract the property, and save a reference to its object. 12905 my $property = $1; 12906 if (! exists $unihan_properties{$property}) { 12907 $unihan_properties{$property} = property_ref($property); 12908 } 12909 12910 # Don't do anything unless the property is one we're handling, which 12911 # we determine by seeing if there is an object defined for it or not 12912 if (! defined $unihan_properties{$property}) { 12913 $_ = ""; 12914 return; 12915 } 12916 12917 # Convert the tab separators to our standard semi-colons, and convert 12918 # the U+HHHH notation to the rest of the standard's HHHH 12919 s/\t/;/g; 12920 s/\b U \+ (?= $code_point_re )//xg; 12921 12922 #local $to_trace = 1 if main::DEBUG; 12923 trace $_ if main::DEBUG && $to_trace; 12924 12925 return; 12926 } 12927} 12928 12929sub filter_blocks_lines($file) { 12930 # In the Blocks.txt file, the names of the blocks don't quite match the 12931 # names given in PropertyValueAliases.txt, so this changes them so they 12932 # do match: Blanks and hyphens are changed into underscores. Also makes 12933 # early release versions look like later ones 12934 # 12935 # $_ is transformed to the correct value. 12936 12937 if ($v_version lt v3.2.0) { 12938 if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted 12939 $_ = ""; 12940 return; 12941 } 12942 12943 # Old versions used a different syntax to mark the range. 12944 $_ =~ s/;\s+/../ if $v_version lt v3.1.0; 12945 } 12946 12947 my @fields = split /\s*;\s*/, $_, -1; 12948 if (@fields != 2) { 12949 $file->carp_bad_line("Expecting exactly two fields"); 12950 $_ = ""; 12951 return; 12952 } 12953 12954 # Change hyphens and blanks in the block name field only 12955 $fields[1] =~ s/[ -]/_/g; 12956 $fields[1] =~ s/_ ( [a-z] ) /_\u$1/xg; # Capitalize first letter of word 12957 12958 $_ = join("; ", @fields); 12959 return; 12960} 12961 12962{ # Closure 12963 my $current_property; 12964 12965 sub filter_old_style_proplist { 12966 # PropList.txt has been in Unicode since version 2.0. Until 3.1, it 12967 # was in a completely different syntax. Ken Whistler of Unicode says 12968 # that it was something he used as an aid for his own purposes, but 12969 # was never an official part of the standard. Many of the properties 12970 # in it were incorporated into the later PropList.txt, but some were 12971 # not. This program uses this early file to generate property tables 12972 # that are otherwise not accessible in the early UCD's. It does this 12973 # for the ones that eventually became official, and don't appear to be 12974 # too different in their contents from the later official version, and 12975 # throws away the rest. It could be argued that the ones it generates 12976 # were probably not really official at that time, so should be 12977 # ignored. You can easily modify things to skip all of them by 12978 # changing this function to just set $_ to "", and return; and to skip 12979 # certain of them by simply removing their declarations from 12980 # get_old_property_aliases(). 12981 # 12982 # Here is a list of all the ones that are thrown away: 12983 # Alphabetic The definitions for this are very 12984 # defective, so better to not mislead 12985 # people into thinking it works. 12986 # Instead the Perl extension of the 12987 # same name is constructed from first 12988 # principles. 12989 # Bidi=* duplicates UnicodeData.txt 12990 # Combining never made into official property; 12991 # is \P{ccc=0} 12992 # Composite never made into official property. 12993 # Currency Symbol duplicates UnicodeData.txt: gc=sc 12994 # Decimal Digit duplicates UnicodeData.txt: gc=nd 12995 # Delimiter never made into official property; 12996 # removed in 3.0.1 12997 # Format Control never made into official property; 12998 # similar to gc=cf 12999 # High Surrogate duplicates Blocks.txt 13000 # Ignorable Control never made into official property; 13001 # similar to di=y 13002 # ISO Control duplicates UnicodeData.txt: gc=cc 13003 # Left of Pair never made into official property; 13004 # Line Separator duplicates UnicodeData.txt: gc=zl 13005 # Low Surrogate duplicates Blocks.txt 13006 # Non-break was actually listed as a property 13007 # in 3.2, but without any code 13008 # points. Unicode denies that this 13009 # was ever an official property 13010 # Non-spacing duplicate UnicodeData.txt: gc=mn 13011 # Numeric duplicates UnicodeData.txt: gc=cc 13012 # Paired Punctuation never made into official property; 13013 # appears to be gc=ps + gc=pe 13014 # Paragraph Separator duplicates UnicodeData.txt: gc=cc 13015 # Private Use duplicates UnicodeData.txt: gc=co 13016 # Private Use High Surrogate duplicates Blocks.txt 13017 # Punctuation duplicates UnicodeData.txt: gc=p 13018 # Space different definition than eventual 13019 # one. 13020 # Titlecase duplicates UnicodeData.txt: gc=lt 13021 # Unassigned Code Value duplicates UnicodeData.txt: gc=cn 13022 # Zero-width never made into official property; 13023 # subset of gc=cf 13024 # Most of the properties have the same names in this file as in later 13025 # versions, but a couple do not. 13026 # 13027 # This subroutine filters $_, converting it from the old style into 13028 # the new style. Here's a sample of the old-style 13029 # 13030 # ******************************************* 13031 # 13032 # Property dump for: 0x100000A0 (Join Control) 13033 # 13034 # 200C..200D (2 chars) 13035 # 13036 # In the example, the property is "Join Control". It is kept in this 13037 # closure between calls to the subroutine. The numbers beginning with 13038 # 0x were internal to Ken's program that generated this file. 13039 13040 # If this line contains the property name, extract it. 13041 if (/^Property dump for: [^(]*\((.*)\)/) { 13042 $_ = $1; 13043 13044 # Convert white space to underscores. 13045 s/ /_/g; 13046 13047 # Convert the few properties that don't have the same name as 13048 # their modern counterparts 13049 s/Identifier_Part/ID_Continue/ 13050 or s/Not_a_Character/NChar/; 13051 13052 # If the name matches an existing property, use it. 13053 if (defined property_ref($_)) { 13054 trace "new property=", $_ if main::DEBUG && $to_trace; 13055 $current_property = $_; 13056 } 13057 else { # Otherwise discard it 13058 trace "rejected property=", $_ if main::DEBUG && $to_trace; 13059 undef $current_property; 13060 } 13061 $_ = ""; # The property is saved for the next lines of the 13062 # file, but this defining line is of no further use, 13063 # so clear it so that the caller won't process it 13064 # further. 13065 } 13066 elsif (! defined $current_property || $_ !~ /^$code_point_re/) { 13067 13068 # Here, the input line isn't a header defining a property for the 13069 # following section, and either we aren't in such a section, or 13070 # the line doesn't look like one that defines the code points in 13071 # such a section. Ignore this line. 13072 $_ = ""; 13073 } 13074 else { 13075 13076 # Here, we have a line defining the code points for the current 13077 # stashed property. Anything starting with the first blank is 13078 # extraneous. Otherwise, it should look like a normal range to 13079 # the caller. Append the property name so that it looks just like 13080 # a modern PropList entry. 13081 13082 $_ =~ s/\s.*//; 13083 $_ .= "; $current_property"; 13084 } 13085 trace $_ if main::DEBUG && $to_trace; 13086 return; 13087 } 13088} # End closure for old style proplist 13089 13090sub filter_old_style_normalization_lines { 13091 # For early releases of Unicode, the lines were like: 13092 # 74..2A76 ; NFKD_NO 13093 # For later releases this became: 13094 # 74..2A76 ; NFKD_QC; N 13095 # Filter $_ to look like those in later releases. 13096 # Similarly for MAYBEs 13097 13098 s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x; 13099 13100 # Also, the property FC_NFKC was abbreviated to FNC 13101 s/FNC/FC_NFKC/; 13102 return; 13103} 13104 13105sub setup_script_extensions { 13106 # The Script_Extensions property starts out with a clone of the Script 13107 # property. 13108 13109 $scx = property_ref("Script_Extensions"); 13110 return unless defined $scx; 13111 13112 $scx->_set_format($STRING_WHITE_SPACE_LIST); 13113 $scx->initialize($script); 13114 $scx->set_default_map($script->default_map); 13115 $scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these 13116 $scx->add_comment(join_lines( <<END 13117The values for code points that appear in one script are just the same as for 13118the 'Script' property. Likewise the values for those that appear in many 13119scripts are either 'Common' or 'Inherited', same as with 'Script'. But the 13120values of code points that appear in a few scripts are a space separated list 13121of those scripts. 13122END 13123 )); 13124 13125 # Initialize scx's tables and the aliases for them to be the same as sc's 13126 foreach my $table ($script->tables) { 13127 my $scx_table = $scx->add_match_table($table->name, 13128 Full_Name => $table->full_name); 13129 foreach my $alias ($table->aliases) { 13130 $scx_table->add_alias($alias->name); 13131 } 13132 } 13133} 13134 13135sub filter_script_extensions_line { 13136 # The Scripts file comes with the full name for the scripts; the 13137 # ScriptExtensions, with the short name. The final mapping file is a 13138 # combination of these, and without adjustment, would have inconsistent 13139 # entries. This filters the latter file to convert to full names. 13140 # Entries look like this: 13141 # 064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW 13142 13143 my @fields = split /\s*;\s*/; 13144 13145 # This script was erroneously omitted in this Unicode version. 13146 $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/; 13147 13148 my @full_names; 13149 foreach my $short_name (split " ", $fields[1]) { 13150 push @full_names, $script->table($short_name)->full_name; 13151 } 13152 $fields[1] = join " ", @full_names; 13153 $_ = join "; ", @fields; 13154 13155 return; 13156} 13157 13158sub setup_emojidata { 13159 my $prop_ref = Property->new('ExtPict', 13160 Full_Name => 'Extended_Pictographic', 13161 ); 13162 $prop_ref->set_fate($PLACEHOLDER, 13163 "Not part of the Unicode Character Database"); 13164} 13165 13166sub filter_emojidata_line { 13167 # We only are interested in this single property from this non-UCD data 13168 # file, and we turn it into a Perl property, so that it isn't accessible 13169 # to the users 13170 13171 $_ = "" unless /\bExtended_Pictographic\b/; 13172 13173 return; 13174} 13175 13176sub setup_IdStatus { 13177 my $ids = Property->new('Identifier_Status', 13178 Match_SubDir => 'IdStatus', 13179 Default_Map => 'Restricted', 13180 ); 13181 $ids->add_match_table('Allowed'); 13182} 13183 13184sub setup_IdType { 13185 $idt = Property->new('Identifier_Type', 13186 Match_SubDir => 'IdType', 13187 Default_Map => 'Not_Character', 13188 Format => $STRING_WHITE_SPACE_LIST, 13189 ); 13190} 13191 13192sub filter_IdType_line { 13193 13194 # Some code points have more than one type, separated by spaces on the 13195 # input. For now, we just add everything as a property value. Later when 13196 # we look for properties with format $STRING_WHITE_SPACE_LIST, we resolve 13197 # things 13198 13199 my @fields = split /\s*;\s*/; 13200 my $types = $fields[1]; 13201 $idt->add_match_table($types) unless defined $idt->table($types); 13202 13203 return; 13204} 13205 13206sub generate_hst($file) { 13207 13208 # Populates the Hangul Syllable Type property from first principles 13209 13210 # These few ranges are hard-coded in. 13211 $file->insert_lines(split /\n/, <<'END' 132121100..1159 ; L 13213115F ; L 132141160..11A2 ; V 1321511A8..11F9 ; T 13216END 13217); 13218 13219 # The Hangul syllables in version 1 are at different code points than 13220 # those that came along starting in version 2, and have different names; 13221 # they comprise about 60% of the code points of the later version. 13222 # From my (khw) research on them (see <558493EB.4000807@att.net>), the 13223 # initial set is a subset of the later version, with different English 13224 # transliterations. I did not see an easy mapping between them. The 13225 # later set includes essentially all possibilities, even ones that aren't 13226 # in modern use (if they ever were), and over 96% of the new ones are type 13227 # LVT. Mathematically, the early set must also contain a preponderance of 13228 # LVT values. In lieu of doing nothing, we just set them all to LVT, and 13229 # expect that this will be right most of the time, which is better than 13230 # not being right at all. 13231 if ($v_version lt v2.0.0) { 13232 my $property = property_ref($file->property); 13233 $file->insert_lines(sprintf("%04X..%04X; LVT\n", 13234 $FIRST_REMOVED_HANGUL_SYLLABLE, 13235 $FINAL_REMOVED_HANGUL_SYLLABLE)); 13236 push @tables_that_may_be_empty, $property->table('LV')->complete_name; 13237 return; 13238 } 13239 13240 # The algorithmically derived syllables are almost all LVT ones, so 13241 # initialize the whole range with that. 13242 $file->insert_lines(sprintf "%04X..%04X; LVT\n", 13243 $SBase, $SBase + $SCount -1); 13244 13245 # Those ones that aren't LVT are LV, and they occur at intervals of 13246 # $TCount code points, starting with the first code point, at $SBase. 13247 for (my $i = $SBase; $i < $SBase + $SCount; $i += $TCount) { 13248 $file->insert_lines(sprintf "%04X..%04X; LV\n", $i, $i); 13249 } 13250 13251 return; 13252} 13253 13254sub generate_GCB($file) { 13255 13256 # Populates the Grapheme Cluster Break property from first principles 13257 13258 # All these definitions are from 13259 # http://www.unicode.org/reports/tr29/tr29-3.html with confirmation 13260 # from http://www.unicode.org/reports/tr29/tr29-4.html 13261 13262 foreach my $range ($gc->ranges) { 13263 13264 # Extend includes gc=Me and gc=Mn, while Control includes gc=Cc 13265 # and gc=Cf 13266 if ($range->value =~ / ^ M [en] $ /x) { 13267 $file->insert_lines(sprintf "%04X..%04X; Extend", 13268 $range->start, $range->end); 13269 } 13270 elsif ($range->value =~ / ^ C [cf] $ /x) { 13271 $file->insert_lines(sprintf "%04X..%04X; Control", 13272 $range->start, $range->end); 13273 } 13274 } 13275 $file->insert_lines("2028; Control"); # Line Separator 13276 $file->insert_lines("2029; Control"); # Paragraph Separator 13277 13278 $file->insert_lines("000D; CR"); 13279 $file->insert_lines("000A; LF"); 13280 13281 # Also from http://www.unicode.org/reports/tr29/tr29-3.html. 13282 foreach my $code_point ( qw{ 13283 09BE 09D7 0B3E 0B57 0BBE 0BD7 0CC2 0CD5 0CD6 13284 0D3E 0D57 0DCF 0DDF FF9E FF9F 1D165 1D16E 1D16F 13285 } 13286 ) { 13287 my $category = $gc->value_of(hex $code_point); 13288 next if ! defined $category || $category eq 'Cn'; # But not if 13289 # unassigned in this 13290 # release 13291 $file->insert_lines("$code_point; Extend"); 13292 } 13293 13294 my $hst = property_ref('Hangul_Syllable_Type'); 13295 if ($hst->count > 0) { 13296 foreach my $range ($hst->ranges) { 13297 $file->insert_lines(sprintf "%04X..%04X; %s", 13298 $range->start, $range->end, $range->value); 13299 } 13300 } 13301 else { 13302 generate_hst($file); 13303 } 13304 13305 main::process_generic_property_file($file); 13306} 13307 13308 13309sub fixup_early_perl_name_alias($file) { 13310 13311 # Different versions of Unicode have varying support for the name synonyms 13312 # below. Just include everything. As of 6.1, all these are correct in 13313 # the Unicode-supplied file. 13314 13315 # ALERT did not come along until 6.0, at which point it became preferred 13316 # over BELL. By inserting it last in early releases, BELL is preferred 13317 # over it; and vice-vers in 6.0 13318 my $type_for_bell = ($v_version lt v6.0.0) 13319 ? 'correction' 13320 : 'alternate'; 13321 $file->insert_lines(split /\n/, <<END 133220007;BELL; $type_for_bell 13323000A;LINE FEED (LF);alternate 13324000C;FORM FEED (FF);alternate 13325000D;CARRIAGE RETURN (CR);alternate 133260085;NEXT LINE (NEL);alternate 13327END 13328 13329 ); 13330 13331 # One might think that the 'Unicode_1_Name' field, could work for most 13332 # of the above names, but sadly that field varies depending on the 13333 # release. Version 1.1.5 had no names for any of the controls; Version 13334 # 2.0 introduced names for the C0 controls, and 3.0 introduced C1 names. 13335 # 3.0.1 removed the name INDEX; and 3.2 changed some names: 13336 # changed to parenthesized versions like "NEXT LINE" to 13337 # "NEXT LINE (NEL)"; 13338 # changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD 13339 # changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;; 13340 # changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR 13341 # 13342 # All these are present in the 6.1 NameAliases.txt 13343 13344 return; 13345} 13346 13347sub filter_later_version_name_alias_line { 13348 13349 # This file has an extra entry per line for the alias type. This is 13350 # handled by creating a compound entry: "$alias: $type"; First, split 13351 # the line into components. 13352 my ($range, $alias, $type, @remainder) 13353 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 13354 13355 # This file contains multiple entries for some components, so tell the 13356 # downstream code to allow this in our internal tables; the 13357 # $MULTIPLE_AFTER preserves the input ordering. 13358 $_ = join ";", $range, $CMD_DELIM 13359 . $REPLACE_CMD 13360 . '=' 13361 . $MULTIPLE_AFTER 13362 . $CMD_DELIM 13363 . "$alias: $type", 13364 @remainder; 13365 return; 13366} 13367 13368sub filter_early_version_name_alias_line { 13369 13370 # Early versions did not have the trailing alias type field; implicitly it 13371 # was 'correction'. 13372 $_ .= "; correction"; 13373 13374 filter_later_version_name_alias_line; 13375 return; 13376} 13377 13378sub filter_all_caps_script_names { 13379 13380 # Some early Unicode releases had the script names in all CAPS. This 13381 # converts them to just the first letter of each word being capital. 13382 13383 my ($range, $script, @remainder) 13384 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 13385 my @words = split /[_-]/, $script; 13386 for my $word (@words) { 13387 $word = 13388 ucfirst(lc($word)) if $word ne 'CJK'; 13389 } 13390 $script = join "_", @words; 13391 $_ = join ";", $range, $script, @remainder; 13392} 13393 13394sub finish_Unicode() { 13395 # This routine should be called after all the Unicode files have been read 13396 # in. It: 13397 # 1) Creates properties that are missing from the version of Unicode being 13398 # compiled, and which, for whatever reason, are needed for the Perl 13399 # core to function properly. These are minimally populated as 13400 # necessary. 13401 # 2) Adds the mappings for code points missing from the files which have 13402 # defaults specified for them. 13403 # 3) At this point all mappings are known, so it computes the type of 13404 # each property whose type hasn't been determined yet. 13405 # 4) Calculates all the regular expression match tables based on the 13406 # mappings. 13407 # 5) Calculates and adds the tables which are defined by Unicode, but 13408 # which aren't derived by them, and certain derived tables that Perl 13409 # uses. 13410 13411 # Folding information was introduced later into Unicode data. To get 13412 # Perl's case ignore (/i) to work at all in releases that don't have 13413 # folding, use the best available alternative, which is lower casing. 13414 my $fold = property_ref('Case_Folding'); 13415 if ($fold->is_empty) { 13416 $fold->initialize(property_ref('Lowercase_Mapping')); 13417 $fold->add_note(join_lines(<<END 13418WARNING: This table uses lower case as a substitute for missing fold 13419information 13420END 13421 )); 13422 } 13423 13424 # Multiple-character mapping was introduced later into Unicode data, so it 13425 # is by default the simple version. If to output the simple versions and 13426 # not present, just use the regular (which in these Unicode versions is 13427 # the simple as well). 13428 foreach my $map (qw { Uppercase_Mapping 13429 Lowercase_Mapping 13430 Titlecase_Mapping 13431 Case_Folding 13432 } ) 13433 { 13434 my $comment = <<END; 13435 13436Note that although the Perl core uses this file, it has the standard values 13437for code points from U+0000 to U+00FF compiled in, so changing this table will 13438not change the core's behavior with respect to these code points. Use 13439Unicode::Casing to override this table. 13440END 13441 if ($map eq 'Case_Folding') { 13442 $comment .= <<END; 13443(/i regex matching is not overridable except by using a custom regex engine) 13444END 13445 } 13446 property_ref($map)->add_comment(join_lines($comment)); 13447 my $simple = property_ref("Simple_$map"); 13448 next if ! $simple->is_empty; 13449 if ($simple->to_output_map) { 13450 $simple->initialize(property_ref($map)); 13451 } 13452 else { 13453 property_ref($map)->set_proxy_for($simple->name); 13454 } 13455 } 13456 13457 # For each property, fill in any missing mappings, and calculate the re 13458 # match tables. If a property has more than one missing mapping, the 13459 # default is a reference to a data structure, and may require data from 13460 # other properties to resolve. The sort is used to cause these to be 13461 # processed last, after all the other properties have been calculated. 13462 # (Fortunately, the missing properties so far don't depend on each other.) 13463 foreach my $property 13464 (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 } 13465 property_ref('*')) 13466 { 13467 # $perl has been defined, but isn't one of the Unicode properties that 13468 # need to be finished up. 13469 next if $property == $perl; 13470 13471 # Nor do we need to do anything with properties that aren't going to 13472 # be output. 13473 next if $property->fate == $SUPPRESSED; 13474 13475 # Handle the properties that have more than one possible default 13476 if (ref $property->default_map) { 13477 my $default_map = $property->default_map; 13478 13479 # These properties have stored in the default_map: 13480 # One or more of: 13481 # 1) A default map which applies to all code points in a 13482 # certain class 13483 # 2) an expression which will evaluate to the list of code 13484 # points in that class 13485 # And 13486 # 3) the default map which applies to every other missing code 13487 # point. 13488 # 13489 # Go through each list. 13490 while (my ($default, $eval) = $default_map->get_next_defaults) { 13491 last unless defined $eval; 13492 13493 # Get the class list, and intersect it with all the so-far 13494 # unspecified code points yielding all the code points 13495 # in the class that haven't been specified. 13496 my $list = eval $eval; 13497 if ($@) { 13498 Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'"); 13499 last; 13500 } 13501 13502 # Narrow down the list to just those code points we don't have 13503 # maps for yet. 13504 $list = $list & $property->inverse_list; 13505 13506 # Add mappings to the property for each code point in the list 13507 foreach my $range ($list->ranges) { 13508 $property->add_map($range->start, $range->end, $default, 13509 Replace => $NO); 13510 } 13511 } 13512 13513 # All remaining code points have the other mapping. Set that up 13514 # so the normal single-default mapping code will work on them 13515 $property->set_default_map($default_map->other_default); 13516 13517 # And fall through to do that 13518 } 13519 13520 # We should have enough data now to compute the type of the property. 13521 my $property_name = $property->name; 13522 $property->compute_type; 13523 my $property_type = $property->type; 13524 13525 next if ! $property->to_create_match_tables; 13526 13527 # Here want to create match tables for this property 13528 13529 # The Unicode db always (so far, and they claim into the future) have 13530 # the default for missing entries in binary properties be 'N' (unless 13531 # there is a '@missing' line that specifies otherwise) 13532 if (! defined $property->default_map) { 13533 if ($property_type == $BINARY) { 13534 $property->set_default_map('N'); 13535 } 13536 elsif ($property_type == $ENUM) { 13537 Carp::my_carp("Property '$property_name doesn't have a default mapping. Using a fake one"); 13538 $property->set_default_map('XXX This makes sure there is a default map'); 13539 } 13540 } 13541 13542 # Add any remaining code points to the mapping, using the default for 13543 # missing code points. 13544 my $default_table; 13545 my $default_map = $property->default_map; 13546 if ($property_type == $FORCED_BINARY) { 13547 13548 # A forced binary property creates a 'Y' table that matches all 13549 # non-default values. The actual string values are also written out 13550 # as a map table. (The default value will almost certainly be the 13551 # empty string, so the pod glosses over the distinction, and just 13552 # talks about empty vs non-empty.) 13553 my $yes = $property->table("Y"); 13554 foreach my $range ($property->ranges) { 13555 next if $range->value eq $default_map; 13556 $yes->add_range($range->start, $range->end); 13557 } 13558 $property->table("N")->set_complement($yes); 13559 } 13560 else { 13561 if (defined $default_map) { 13562 13563 # Make sure there is a match table for the default 13564 if (! defined ($default_table = $property->table($default_map))) 13565 { 13566 $default_table = $property->add_match_table($default_map); 13567 } 13568 13569 # And, if the property is binary, the default table will just 13570 # be the complement of the other table. 13571 if ($property_type == $BINARY) { 13572 my $non_default_table; 13573 13574 # Find the non-default table. 13575 for my $table ($property->tables) { 13576 if ($table == $default_table) { 13577 if ($v_version le v5.0.0) { 13578 $table->add_alias($_) for qw(N No F False); 13579 } 13580 next; 13581 } elsif ($v_version le v5.0.0) { 13582 $table->add_alias($_) for qw(Y Yes T True); 13583 } 13584 $non_default_table = $table; 13585 } 13586 $default_table->set_complement($non_default_table); 13587 } 13588 else { 13589 13590 # This fills in any missing values with the default. It's 13591 # not necessary to do this with binary properties, as the 13592 # default is defined completely in terms of the Y table. 13593 $property->add_map(0, $MAX_WORKING_CODEPOINT, 13594 $default_map, Replace => $NO); 13595 } 13596 } 13597 13598 # Have all we need to populate the match tables. 13599 my $maps_should_be_defined = $property->pre_declared_maps; 13600 foreach my $range ($property->ranges) { 13601 my $map = $range->value; 13602 my $table = $property->table($map); 13603 if (! defined $table) { 13604 13605 # Integral and rational property values are not 13606 # necessarily defined in PropValueAliases, but whether all 13607 # the other ones should be depends on the property. 13608 if ($maps_should_be_defined 13609 && $map !~ /^ -? \d+ ( \/ \d+ )? $/x) 13610 { 13611 Carp::my_carp("Table '$property_name=$map' should " 13612 . "have been defined. Defining it now.") 13613 } 13614 $table = $property->add_match_table($map); 13615 } 13616 13617 next if $table->complement != 0; # Don't need to populate these 13618 $table->add_range($range->start, $range->end); 13619 } 13620 } 13621 13622 # For Perl 5.6 compatibility, all properties matchable in regexes can 13623 # have an optional 'Is_' prefix. This is now done in Unicode::UCD. 13624 # But warn if this creates a conflict with a (new) Unicode property 13625 # name, although it appears that Unicode has made a decision never to 13626 # begin a property name with 'Is_', so this shouldn't happen. 13627 foreach my $alias ($property->aliases) { 13628 my $Is_name = 'Is_' . $alias->name; 13629 if (defined (my $pre_existing = property_ref($Is_name))) { 13630 Carp::my_carp(<<END 13631There is already an alias named $Is_name (from " . $pre_existing . "), so 13632creating one for $property won't work. This is bad news. If it is not too 13633late, get Unicode to back off. Otherwise go back to the old scheme (findable 13634from the git blame log for this area of the code that suppressed individual 13635aliases that conflict with the new Unicode names. Proceeding anyway. 13636END 13637 ); 13638 } 13639 } # End of loop through aliases for this property 13640 13641 13642 # Properties that have sets of values for some characters are now 13643 # converted. For example, the Script_Extensions property started out 13644 # as a clone of the Script property. But processing its data file 13645 # caused some elements to be replaced with different data. (These 13646 # elements were for the Common and Inherited properties.) This data 13647 # is a qw() list of all the scripts that the code points in the given 13648 # range are in. An example line is: 13649 # 13650 # 060C ; Arab Syrc Thaa # Po ARABIC COMMA 13651 # 13652 # Code executed earlier has created a new match table named "Arab Syrc 13653 # Thaa" which contains 060C. (The cloned table started out with this 13654 # code point mapping to "Common".) Now we add 060C to each of the 13655 # Arab, Syrc, and Thaa match tables. Then we delete the now spurious 13656 # "Arab Syrc Thaa" match table. This is repeated for all these tables 13657 # and ranges. The map data is retained in the map table for 13658 # reference, but the spurious match tables are deleted. 13659 my $format = $property->format; 13660 if (defined $format && $format eq $STRING_WHITE_SPACE_LIST) { 13661 foreach my $table ($property->tables) { 13662 13663 # Space separates the entries which should go in multiple 13664 # tables 13665 next unless $table->name =~ /\s/; 13666 13667 # The list of the entries, hence the names of the tables that 13668 # everything in this combo table should be added to. 13669 my @list = split /\s+/, $table->name; 13670 13671 # Add the entries from the combo table to each individual 13672 # table 13673 foreach my $individual (@list) { 13674 my $existing_table = $property->table($individual); 13675 13676 # This should only be necessary if this particular entry 13677 # occurs only in combo with others. 13678 $existing_table = $property->add_match_table($individual) 13679 unless defined $existing_table; 13680 $existing_table += $table; 13681 } 13682 $property->delete_match_table($table); 13683 } 13684 } 13685 } # End of loop through all Unicode properties. 13686 13687 # Fill in the mappings that Unicode doesn't completely furnish. First the 13688 # single letter major general categories. If Unicode were to start 13689 # delivering the values, this would be redundant, but better that than to 13690 # try to figure out if should skip and not get it right. Ths could happen 13691 # if a new major category were to be introduced, and the hard-coded test 13692 # wouldn't know about it. 13693 # This routine depends on the standard names for the general categories 13694 # being what it thinks they are, like 'Cn'. The major categories are the 13695 # union of all the general category tables which have the same first 13696 # letters. eg. L = Lu + Lt + Ll + Lo + Lm 13697 foreach my $minor_table ($gc->tables) { 13698 my $minor_name = $minor_table->name; 13699 next if length $minor_name == 1; 13700 if (length $minor_name != 2) { 13701 Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped."); 13702 next; 13703 } 13704 13705 my $major_name = uc(substr($minor_name, 0, 1)); 13706 my $major_table = $gc->table($major_name); 13707 $major_table += $minor_table; 13708 } 13709 13710 # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt 13711 # defines it as LC) 13712 my $LC = $gc->table('LC'); 13713 $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards... 13714 $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility. 13715 13716 13717 if ($LC->is_empty) { # Assume if not empty that Unicode has started to 13718 # deliver the correct values in it 13719 $LC->initialize($gc->table('Ll') + $gc->table('Lu')); 13720 13721 # Lt not in release 1. 13722 if (defined $gc->table('Lt')) { 13723 $LC += $gc->table('Lt'); 13724 $gc->table('Lt')->set_caseless_equivalent($LC); 13725 } 13726 } 13727 $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]'); 13728 13729 $gc->table('Ll')->set_caseless_equivalent($LC); 13730 $gc->table('Lu')->set_caseless_equivalent($LC); 13731 13732 # Make sure this assumption in perl core code is valid in this Unicode 13733 # release, with known exceptions 13734 foreach my $range (property_ref('Numeric-Type')->table('Decimal')->ranges) { 13735 next if $range->end - $range->start == 9; 13736 next if $range->start == 0x1D7CE; # This whole range was added in 3.1 13737 next if $range->end == 0x19DA && $v_version eq v5.2.0; 13738 next if $range->end - $range->start < 9 && $v_version le 4.0.0; 13739 Carp::my_carp("Range $range unexpectedly doesn't contain 10" 13740 . " decimal digits. Code in regcomp.c assumes it does," 13741 . " and will have to be fixed. Proceeding anyway."); 13742 } 13743 13744 # Mark the scx table as the parent of the corresponding sc table for those 13745 # which are identical. This causes the pod for the script table to refer 13746 # to the corresponding scx one. This is done after everything, so as to 13747 # wait until the tables are stabilized before checking for equivalency. 13748 if (defined $scx) { 13749 if (defined $pod_directory) { 13750 foreach my $table ($scx->tables) { 13751 my $plain_sc_equiv = $script->table($table->name); 13752 if ($table->matches_identically_to($plain_sc_equiv)) { 13753 $plain_sc_equiv->set_equivalent_to($table, Related => 1); 13754 } 13755 } 13756 } 13757 } 13758 13759 return; 13760} 13761 13762sub pre_3_dot_1_Nl () { 13763 13764 # Return a range list for gc=nl for Unicode versions prior to 3.1, which 13765 # is when Unicode's became fully usable. These code points were 13766 # determined by inspection and experimentation. gc=nl is important for 13767 # certain Perl-extension properties that should be available in all 13768 # releases. 13769 13770 my $Nl = Range_List->new(); 13771 if (defined (my $official = $gc->table('Nl'))) { 13772 $Nl += $official; 13773 } 13774 else { 13775 $Nl->add_range(0x2160, 0x2182); 13776 $Nl->add_range(0x3007, 0x3007); 13777 $Nl->add_range(0x3021, 0x3029); 13778 } 13779 $Nl->add_range(0xFE20, 0xFE23); 13780 $Nl->add_range(0x16EE, 0x16F0) if $v_version ge v3.0.0; # 3.0 was when 13781 # these were added 13782 return $Nl; 13783} 13784 13785sub calculate_Assigned() { # Set $Assigned to the gc != Cn code points; may be 13786 # called before the Cn's are completely filled. 13787 # Works on Unicodes earlier than ones that 13788 # explicitly specify Cn. 13789 return if defined $Assigned; 13790 13791 if (! defined $gc || $gc->is_empty()) { 13792 Carp::my_carp_bug("calculate_Assigned() called before $gc is populated"); 13793 } 13794 13795 $Assigned = $perl->add_match_table('Assigned', 13796 Description => "All assigned code points", 13797 ); 13798 while (defined (my $range = $gc->each_range())) { 13799 my $standard_value = standardize($range->value); 13800 next if $standard_value eq 'cn' || $standard_value eq 'unassigned'; 13801 $Assigned->add_range($range->start, $range->end); 13802 } 13803} 13804 13805sub calculate_DI() { # Set $DI to a Range_List equivalent to the 13806 # Default_Ignorable_Code_Point property. Works on 13807 # Unicodes earlier than ones that explicitly specify 13808 # DI. 13809 return if defined $DI; 13810 13811 if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) { 13812 $DI = $di->table('Y'); 13813 } 13814 else { 13815 $DI = Range_List->new(Initialize => [ 0x180B .. 0x180D, 13816 0x2060 .. 0x206F, 13817 0xFE00 .. 0xFE0F, 13818 0xFFF0 .. 0xFFFB, 13819 ]); 13820 if ($v_version ge v2.0) { 13821 $DI += $gc->table('Cf') 13822 + $gc->table('Cs'); 13823 13824 # These are above the Unicode version 1 max 13825 $DI->add_range(0xE0000, 0xE0FFF); 13826 } 13827 $DI += $gc->table('Cc') 13828 - ord("\t") 13829 - utf8::unicode_to_native(0x0A) # LINE FEED 13830 - utf8::unicode_to_native(0x0B) # VERTICAL TAB 13831 - ord("\f") 13832 - utf8::unicode_to_native(0x0D) # CARRIAGE RETURN 13833 - utf8::unicode_to_native(0x85); # NEL 13834 } 13835} 13836 13837sub calculate_NChar() { # Create a Perl extension match table which is the 13838 # same as the Noncharacter_Code_Point property, and 13839 # set $NChar to point to it. Works on Unicodes 13840 # earlier than ones that explicitly specify NChar 13841 return if defined $NChar; 13842 13843 $NChar = $perl->add_match_table('_Perl_Nchar', 13844 Perl_Extension => 1, 13845 Fate => $INTERNAL_ONLY); 13846 if (defined (my $off_nchar = property_ref('NChar'))) { 13847 $NChar->initialize($off_nchar->table('Y')); 13848 } 13849 else { 13850 $NChar->initialize([ 0xFFFE .. 0xFFFF ]); 13851 if ($v_version ge v2.0) { # First release with these nchars 13852 for (my $i = 0x1FFFE; $i <= 0x10FFFE; $i += 0x10000) { 13853 $NChar += [ $i .. $i+1 ]; 13854 } 13855 } 13856 } 13857} 13858 13859sub handle_compare_versions () { 13860 # This fixes things up for the $compare_versions capability, where we 13861 # compare Unicode version X with version Y (with Y > X), and we are 13862 # running it on the Unicode Data for version Y. 13863 # 13864 # It works by calculating the code points whose meaning has been specified 13865 # after release X, by using the Age property. The complement of this set 13866 # is the set of code points whose meaning is unchanged between the 13867 # releases. This is the set the program restricts itself to. It includes 13868 # everything whose meaning has been specified by the time version X came 13869 # along, plus those still unassigned by the time of version Y. (We will 13870 # continue to use the word 'assigned' to mean 'meaning has been 13871 # specified', as it's shorter and is accurate in all cases except the 13872 # Noncharacter code points.) 13873 # 13874 # This function is run after all the properties specified by Unicode have 13875 # been calculated for release Y. This makes sure we get all the nuances 13876 # of Y's rules. (It is done before the Perl extensions are calculated, as 13877 # those are based entirely on the Unicode ones.) But doing it after the 13878 # Unicode table calculations means we have to fix up the Unicode tables. 13879 # We do this by subtracting the code points that have been assigned since 13880 # X (which is actually done by ANDing each table of assigned code points 13881 # with the set of unchanged code points). Most Unicode properties are of 13882 # the form such that all unassigned code points have a default, grab-bag, 13883 # property value which is changed when the code point gets assigned. For 13884 # these, we just remove the changed code points from the table for the 13885 # latter property value, and add them back in to the grab-bag one. A few 13886 # other properties are not entirely of this form and have values for some 13887 # or all unassigned code points that are not the grab-bag one. These have 13888 # to be handled specially, and are hard-coded in to this routine based on 13889 # manual inspection of the Unicode character database. A list of the 13890 # outlier code points is made for each of these properties, and those 13891 # outliers are excluded from adding and removing from tables. 13892 # 13893 # Note that there are glitches when comparing against Unicode 1.1, as some 13894 # Hangul syllables in it were later ripped out and eventually replaced 13895 # with other things. 13896 13897 print "Fixing up for version comparison\n" if $verbosity >= $PROGRESS; 13898 13899 my $after_first_version = "All matching code points were added after " 13900 . "Unicode $string_compare_versions"; 13901 13902 # Calculate the delta as those code points that have been newly assigned 13903 # since the first compare version. 13904 my $delta = Range_List->new(); 13905 foreach my $table ($age->tables) { 13906 use version; 13907 next if $table == $age->table('Unassigned'); 13908 next if version->parse($table->name) 13909 le version->parse($string_compare_versions); 13910 $delta += $table; 13911 } 13912 if ($delta->is_empty) { 13913 die ("No changes; perhaps you need a 'DAge.txt' file?"); 13914 } 13915 13916 my $unchanged = ~ $delta; 13917 13918 calculate_Assigned() if ! defined $Assigned; 13919 $Assigned &= $unchanged; 13920 13921 # $Assigned now contains the code points that were assigned as of Unicode 13922 # version X. 13923 13924 # A block is all or nothing. If nothing is assigned in it, it all goes 13925 # back to the No_Block pool; but if even one code point is assigned, the 13926 # block is retained. 13927 my $no_block = $block->table('No_Block'); 13928 foreach my $this_block ($block->tables) { 13929 next if $this_block == $no_block 13930 || ! ($this_block & $Assigned)->is_empty; 13931 $this_block->set_fate($SUPPRESSED, $after_first_version); 13932 foreach my $range ($this_block->ranges) { 13933 $block->replace_map($range->start, $range->end, 'No_Block') 13934 } 13935 $no_block += $this_block; 13936 } 13937 13938 my @special_delta_properties; # List of properties that have to be 13939 # handled specially. 13940 my %restricted_delta; # Keys are the entries in 13941 # @special_delta_properties; values 13942 # are the range list of the code points 13943 # that behave normally when they get 13944 # assigned. 13945 13946 # In the next three properties, the Default Ignorable code points are 13947 # outliers. 13948 calculate_DI(); 13949 $DI &= $unchanged; 13950 13951 push @special_delta_properties, property_ref('_Perl_GCB'); 13952 $restricted_delta{$special_delta_properties[-1]} = ~ $DI; 13953 13954 if (defined (my $cwnfkcc = property_ref('Changes_When_NFKC_Casefolded'))) 13955 { 13956 push @special_delta_properties, $cwnfkcc; 13957 $restricted_delta{$special_delta_properties[-1]} = ~ $DI; 13958 } 13959 13960 calculate_NChar(); # Non-character code points 13961 $NChar &= $unchanged; 13962 13963 # This may have to be updated from time-to-time to get the most accurate 13964 # results. 13965 my $default_BC_non_LtoR = Range_List->new(Initialize => 13966 # These came from the comments in v8.0 DBidiClass.txt 13967 [ # AL 13968 0x0600 .. 0x07BF, 13969 0x08A0 .. 0x08FF, 13970 0xFB50 .. 0xFDCF, 13971 0xFDF0 .. 0xFDFF, 13972 0xFE70 .. 0xFEFF, 13973 0x1EE00 .. 0x1EEFF, 13974 # R 13975 0x0590 .. 0x05FF, 13976 0x07C0 .. 0x089F, 13977 0xFB1D .. 0xFB4F, 13978 0x10800 .. 0x10FFF, 13979 0x1E800 .. 0x1EDFF, 13980 0x1EF00 .. 0x1EFFF, 13981 # ET 13982 0x20A0 .. 0x20CF, 13983 ] 13984 ); 13985 $default_BC_non_LtoR += $DI + $NChar; 13986 push @special_delta_properties, property_ref('BidiClass'); 13987 $restricted_delta{$special_delta_properties[-1]} = ~ $default_BC_non_LtoR; 13988 13989 if (defined (my $eaw = property_ref('East_Asian_Width'))) { 13990 13991 my $default_EA_width_W = Range_List->new(Initialize => 13992 # From comments in v8.0 EastAsianWidth.txt 13993 [ 13994 0x3400 .. 0x4DBF, 13995 0x4E00 .. 0x9FFF, 13996 0xF900 .. 0xFAFF, 13997 0x20000 .. 0x2A6DF, 13998 0x2A700 .. 0x2B73F, 13999 0x2B740 .. 0x2B81F, 14000 0x2B820 .. 0x2CEAF, 14001 0x2F800 .. 0x2FA1F, 14002 0x20000 .. 0x2FFFD, 14003 0x30000 .. 0x3FFFD, 14004 ] 14005 ); 14006 push @special_delta_properties, $eaw; 14007 $restricted_delta{$special_delta_properties[-1]} 14008 = ~ $default_EA_width_W; 14009 14010 # Line break came along in the same release as East_Asian_Width, and 14011 # the non-grab-bag default set is a superset of the EAW one. 14012 if (defined (my $lb = property_ref('Line_Break'))) { 14013 my $default_LB_non_XX = Range_List->new(Initialize => 14014 # From comments in v8.0 LineBreak.txt 14015 [ 0x20A0 .. 0x20CF ]); 14016 $default_LB_non_XX += $default_EA_width_W; 14017 push @special_delta_properties, $lb; 14018 $restricted_delta{$special_delta_properties[-1]} 14019 = ~ $default_LB_non_XX; 14020 } 14021 } 14022 14023 # Go through every property, skipping those we've already worked on, those 14024 # that are immutable, and the perl ones that will be calculated after this 14025 # routine has done its fixup. 14026 foreach my $property (property_ref('*')) { 14027 next if $property == $perl # Done later in the program 14028 || $property == $block # Done just above 14029 || $property == $DI # Done just above 14030 || $property == $NChar # Done just above 14031 14032 # The next two are invariant across Unicode versions 14033 || $property == property_ref('Pattern_Syntax') 14034 || $property == property_ref('Pattern_White_Space'); 14035 14036 # Find the grab-bag value. 14037 my $default_map = $property->default_map; 14038 14039 if (! $property->to_create_match_tables) { 14040 14041 # Here there aren't any match tables. So far, all such properties 14042 # have a default map, and don't require special handling. Just 14043 # change each newly assigned code point back to the default map, 14044 # as if they were unassigned. 14045 foreach my $range ($delta->ranges) { 14046 $property->add_map($range->start, 14047 $range->end, 14048 $default_map, 14049 Replace => $UNCONDITIONALLY); 14050 } 14051 } 14052 else { # Here there are match tables. Find the one (if any) for the 14053 # grab-bag value that unassigned code points go to. 14054 my $default_table; 14055 if (defined $default_map) { 14056 $default_table = $property->table($default_map); 14057 } 14058 14059 # If some code points don't go back to the grab-bag when they 14060 # are considered unassigned, exclude them from the list that does 14061 # that. 14062 my $this_delta = $delta; 14063 my $this_unchanged = $unchanged; 14064 if (grep { $_ == $property } @special_delta_properties) { 14065 $this_delta = $delta & $restricted_delta{$property}; 14066 $this_unchanged = ~ $this_delta; 14067 } 14068 14069 # Fix up each match table for this property. 14070 foreach my $table ($property->tables) { 14071 if (defined $default_table && $table == $default_table) { 14072 14073 # The code points assigned after release X (the ones we 14074 # are excluding in this routine) go back on to the default 14075 # (grab-bag) table. However, some of these tables don't 14076 # actually exist, but are specified solely by the other 14077 # tables. (In a binary property, we don't need to 14078 # actually have an 'N' table, as it's just the complement 14079 # of the 'Y' table.) Such tables will be locked, so just 14080 # skip those. 14081 $table += $this_delta unless $table->locked; 14082 } 14083 else { 14084 14085 # Here the table is not for the default value. We need to 14086 # subtract the code points we are ignoring for this 14087 # comparison (the deltas) from it. But if the table 14088 # started out with nothing, no need to exclude anything, 14089 # and want to skip it here anyway, so it gets listed 14090 # properly in the pod. 14091 next if $table->is_empty; 14092 14093 # Save the deltas for later, before we do the subtraction 14094 my $deltas = $table & $this_delta; 14095 14096 $table &= $this_unchanged; 14097 14098 # Suppress the table if the subtraction left it with 14099 # nothing in it 14100 if ($table->is_empty) { 14101 if ($property->type == $BINARY) { 14102 push @tables_that_may_be_empty, $table->complete_name; 14103 } 14104 else { 14105 $table->set_fate($SUPPRESSED, $after_first_version); 14106 } 14107 } 14108 14109 # Now we add the removed code points to the property's 14110 # map, as they should now map to the grab-bag default 14111 # property (which they did in the first comparison 14112 # version). But we don't have to do this if the map is 14113 # only for internal use. 14114 if (defined $default_map && $property->to_output_map) { 14115 14116 # The gc property has pseudo property values whose names 14117 # have length 1. These are the union of all the 14118 # property values whose name is longer than 1 and 14119 # whose first letter is all the same. The replacement 14120 # is done once for the longer-named tables. 14121 next if $property == $gc && length $table->name == 1; 14122 14123 foreach my $range ($deltas->ranges) { 14124 $property->add_map($range->start, 14125 $range->end, 14126 $default_map, 14127 Replace => $UNCONDITIONALLY); 14128 } 14129 } 14130 } 14131 } 14132 } 14133 } 14134 14135 # The above code doesn't work on 'gc=C', as it is a superset of the default 14136 # ('Cn') table. It's easiest to just special case it here. 14137 my $C = $gc->table('C'); 14138 $C += $gc->table('Cn'); 14139 14140 return; 14141} 14142 14143sub compile_perl() { 14144 # Create perl-defined tables. Almost all are part of the pseudo-property 14145 # named 'perl' internally to this program. Many of these are recommended 14146 # in UTS#18 "Unicode Regular Expressions", and their derivations are based 14147 # on those found there. 14148 # Almost all of these are equivalent to some Unicode property. 14149 # A number of these properties have equivalents restricted to the ASCII 14150 # range, with their names prefaced by 'Posix', to signify that these match 14151 # what the Posix standard says they should match. A couple are 14152 # effectively this, but the name doesn't have 'Posix' in it because there 14153 # just isn't any Posix equivalent. 'XPosix' are the Posix tables extended 14154 # to the full Unicode range, by our guesses as to what is appropriate. 14155 14156 # 'All' is all code points. As an error check, instead of just setting it 14157 # to be that, construct it to be the union of all the major categories 14158 $All = $perl->add_match_table('All', 14159 Description 14160 => "All code points, including those above Unicode. Same as qr/./s", 14161 Matches_All => 1); 14162 14163 foreach my $major_table ($gc->tables) { 14164 14165 # Major categories are the ones with single letter names. 14166 next if length($major_table->name) != 1; 14167 14168 $All += $major_table; 14169 } 14170 14171 if ($All->max != $MAX_WORKING_CODEPOINT) { 14172 Carp::my_carp_bug("Generated highest code point (" 14173 . sprintf("%X", $All->max) 14174 . ") doesn't match expected value $MAX_WORKING_CODEPOINT_STRING.") 14175 } 14176 if ($All->range_count != 1 || $All->min != 0) { 14177 Carp::my_carp_bug("Generated table 'All' doesn't match all code points.") 14178 } 14179 14180 my $Any = $perl->add_match_table('Any', 14181 Description => "All Unicode code points"); 14182 $Any->add_range(0, $MAX_UNICODE_CODEPOINT); 14183 $Any->add_alias('Unicode'); 14184 14185 calculate_Assigned(); 14186 14187 my $ASCII = $perl->add_match_table('ASCII'); 14188 if (defined $block) { # This is equivalent to the block if have it. 14189 my $Unicode_ASCII = $block->table('Basic_Latin'); 14190 if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) { 14191 $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1); 14192 } 14193 } 14194 14195 # Very early releases didn't have blocks, so initialize ASCII ourselves if 14196 # necessary 14197 if ($ASCII->is_empty) { 14198 if (! NON_ASCII_PLATFORM) { 14199 $ASCII->add_range(0, 127); 14200 } 14201 else { 14202 for my $i (0 .. 127) { 14203 $ASCII->add_range(utf8::unicode_to_native($i), 14204 utf8::unicode_to_native($i)); 14205 } 14206 } 14207 } 14208 14209 # Get the best available case definitions. Early Unicode versions didn't 14210 # have Uppercase and Lowercase defined, so use the general category 14211 # instead for them, modified by hard-coding in the code points each is 14212 # missing. 14213 my $Lower = $perl->add_match_table('XPosixLower'); 14214 my $Unicode_Lower = property_ref('Lowercase'); 14215 if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) { 14216 $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1); 14217 14218 } 14219 else { 14220 $Lower += $gc->table('Lowercase_Letter'); 14221 14222 # There are quite a few code points in Lower, that aren't in gc=lc, 14223 # and not all are in all releases. 14224 my $temp = Range_List->new(Initialize => [ 14225 utf8::unicode_to_native(0xAA), 14226 utf8::unicode_to_native(0xBA), 14227 0x02B0 .. 0x02B8, 14228 0x02C0 .. 0x02C1, 14229 0x02E0 .. 0x02E4, 14230 0x0345, 14231 0x037A, 14232 0x1D2C .. 0x1D6A, 14233 0x1D78, 14234 0x1D9B .. 0x1DBF, 14235 0x2071, 14236 0x207F, 14237 0x2090 .. 0x209C, 14238 0x2170 .. 0x217F, 14239 0x24D0 .. 0x24E9, 14240 0x2C7C .. 0x2C7D, 14241 0xA770, 14242 0xA7F8 .. 0xA7F9, 14243 ]); 14244 $Lower += $temp & $Assigned; 14245 } 14246 my $Posix_Lower = $perl->add_match_table("PosixLower", 14247 Initialize => $Lower & $ASCII, 14248 ); 14249 14250 my $Upper = $perl->add_match_table("XPosixUpper"); 14251 my $Unicode_Upper = property_ref('Uppercase'); 14252 if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) { 14253 $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1); 14254 } 14255 else { 14256 14257 # Unlike Lower, there are only two ranges in Upper that aren't in 14258 # gc=Lu, and all code points were assigned in all releases. 14259 $Upper += $gc->table('Uppercase_Letter'); 14260 $Upper->add_range(0x2160, 0x216F); # Uppercase Roman numerals 14261 $Upper->add_range(0x24B6, 0x24CF); # Circled Latin upper case letters 14262 } 14263 my $Posix_Upper = $perl->add_match_table("PosixUpper", 14264 Initialize => $Upper & $ASCII, 14265 ); 14266 14267 # Earliest releases didn't have title case. Initialize it to empty if not 14268 # otherwise present 14269 my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase', 14270 Description => '(= \p{Gc=Lt})'); 14271 my $lt = $gc->table('Lt'); 14272 14273 # Earlier versions of mktables had this related to $lt since they have 14274 # identical code points, but their caseless equivalents are not the same, 14275 # one being 'Cased' and the other being 'LC', and so now must be kept as 14276 # separate entities. 14277 if (defined $lt) { 14278 $Title += $lt; 14279 } 14280 else { 14281 push @tables_that_may_be_empty, $Title->complete_name; 14282 } 14283 14284 my $Unicode_Cased = property_ref('Cased'); 14285 if (defined $Unicode_Cased) { 14286 my $yes = $Unicode_Cased->table('Y'); 14287 my $no = $Unicode_Cased->table('N'); 14288 $Title->set_caseless_equivalent($yes); 14289 if (defined $Unicode_Upper) { 14290 $Unicode_Upper->table('Y')->set_caseless_equivalent($yes); 14291 $Unicode_Upper->table('N')->set_caseless_equivalent($no); 14292 } 14293 $Upper->set_caseless_equivalent($yes); 14294 if (defined $Unicode_Lower) { 14295 $Unicode_Lower->table('Y')->set_caseless_equivalent($yes); 14296 $Unicode_Lower->table('N')->set_caseless_equivalent($no); 14297 } 14298 $Lower->set_caseless_equivalent($yes); 14299 } 14300 else { 14301 # If this Unicode version doesn't have Cased, set up the Perl 14302 # extension from first principles. From Unicode 5.1: Definition D120: 14303 # A character C is defined to be cased if and only if C has the 14304 # Lowercase or Uppercase property or has a General_Category value of 14305 # Titlecase_Letter. 14306 my $cased = $perl->add_match_table('Cased', 14307 Initialize => $Lower + $Upper + $Title, 14308 Description => 'Uppercase or Lowercase or Titlecase', 14309 ); 14310 # $notcased is purely for the caseless equivalents below 14311 my $notcased = $perl->add_match_table('_Not_Cased', 14312 Initialize => ~ $cased, 14313 Fate => $INTERNAL_ONLY, 14314 Description => 'All not-cased code points'); 14315 $Title->set_caseless_equivalent($cased); 14316 if (defined $Unicode_Upper) { 14317 $Unicode_Upper->table('Y')->set_caseless_equivalent($cased); 14318 $Unicode_Upper->table('N')->set_caseless_equivalent($notcased); 14319 } 14320 $Upper->set_caseless_equivalent($cased); 14321 if (defined $Unicode_Lower) { 14322 $Unicode_Lower->table('Y')->set_caseless_equivalent($cased); 14323 $Unicode_Lower->table('N')->set_caseless_equivalent($notcased); 14324 } 14325 $Lower->set_caseless_equivalent($cased); 14326 } 14327 14328 # The remaining perl defined tables are mostly based on Unicode TR 18, 14329 # "Annex C: Compatibility Properties". All of these have two versions, 14330 # one whose name generally begins with Posix that is posix-compliant, and 14331 # one that matches Unicode characters beyond the Posix, ASCII range 14332 14333 my $Alpha = $perl->add_match_table('XPosixAlpha'); 14334 14335 # Alphabetic was not present in early releases 14336 my $Alphabetic = property_ref('Alphabetic'); 14337 if (defined $Alphabetic && ! $Alphabetic->is_empty) { 14338 $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1); 14339 } 14340 else { 14341 14342 # The Alphabetic property doesn't exist for early releases, so 14343 # generate it. The actual definition, in 5.2 terms is: 14344 # 14345 # gc=L + gc=Nl + Other_Alphabetic 14346 # 14347 # Other_Alphabetic is also not defined in these early releases, but it 14348 # contains one gc=So range plus most of gc=Mn and gc=Mc, so we add 14349 # those last two as well, then subtract the relatively few of them that 14350 # shouldn't have been added. (The gc=So range is the circled capital 14351 # Latin characters. Early releases mistakenly didn't also include the 14352 # lower-case versions of these characters, and so we don't either, to 14353 # maintain consistency with those releases that first had this 14354 # property. 14355 $Alpha->initialize($gc->table('Letter') 14356 + pre_3_dot_1_Nl() 14357 + $gc->table('Mn') 14358 + $gc->table('Mc') 14359 ); 14360 $Alpha->add_range(0x24D0, 0x24E9); # gc=So 14361 foreach my $range ( [ 0x0300, 0x0344 ], 14362 [ 0x0346, 0x034E ], 14363 [ 0x0360, 0x0362 ], 14364 [ 0x0483, 0x0486 ], 14365 [ 0x0591, 0x05AF ], 14366 [ 0x06DF, 0x06E0 ], 14367 [ 0x06EA, 0x06EC ], 14368 [ 0x0740, 0x074A ], 14369 0x093C, 14370 0x094D, 14371 [ 0x0951, 0x0954 ], 14372 0x09BC, 14373 0x09CD, 14374 0x0A3C, 14375 0x0A4D, 14376 0x0ABC, 14377 0x0ACD, 14378 0x0B3C, 14379 0x0B4D, 14380 0x0BCD, 14381 0x0C4D, 14382 0x0CCD, 14383 0x0D4D, 14384 0x0DCA, 14385 [ 0x0E47, 0x0E4C ], 14386 0x0E4E, 14387 [ 0x0EC8, 0x0ECC ], 14388 [ 0x0F18, 0x0F19 ], 14389 0x0F35, 14390 0x0F37, 14391 0x0F39, 14392 [ 0x0F3E, 0x0F3F ], 14393 [ 0x0F82, 0x0F84 ], 14394 [ 0x0F86, 0x0F87 ], 14395 0x0FC6, 14396 0x1037, 14397 0x1039, 14398 [ 0x17C9, 0x17D3 ], 14399 [ 0x20D0, 0x20DC ], 14400 0x20E1, 14401 [ 0x302A, 0x302F ], 14402 [ 0x3099, 0x309A ], 14403 [ 0xFE20, 0xFE23 ], 14404 [ 0x1D165, 0x1D169 ], 14405 [ 0x1D16D, 0x1D172 ], 14406 [ 0x1D17B, 0x1D182 ], 14407 [ 0x1D185, 0x1D18B ], 14408 [ 0x1D1AA, 0x1D1AD ], 14409 ) { 14410 if (ref $range) { 14411 $Alpha->delete_range($range->[0], $range->[1]); 14412 } 14413 else { 14414 $Alpha->delete_range($range, $range); 14415 } 14416 } 14417 $Alpha->add_description('Alphabetic'); 14418 $Alpha->add_alias('Alphabetic'); 14419 } 14420 my $Posix_Alpha = $perl->add_match_table("PosixAlpha", 14421 Initialize => $Alpha & $ASCII, 14422 ); 14423 $Posix_Upper->set_caseless_equivalent($Posix_Alpha); 14424 $Posix_Lower->set_caseless_equivalent($Posix_Alpha); 14425 14426 my $Alnum = $perl->add_match_table('Alnum', Full_Name => 'XPosixAlnum', 14427 Description => 'Alphabetic and (decimal) Numeric', 14428 Initialize => $Alpha + $gc->table('Decimal_Number'), 14429 ); 14430 $perl->add_match_table("PosixAlnum", 14431 Initialize => $Alnum & $ASCII, 14432 ); 14433 14434 my $Word = $perl->add_match_table('Word', Full_Name => 'XPosixWord', 14435 Description => '\w, including beyond ASCII;' 14436 . ' = \p{Alnum} + \pM + \p{Pc}' 14437 . ' + \p{Join_Control}', 14438 Initialize => $Alnum + $gc->table('Mark'), 14439 ); 14440 my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1 14441 if (defined $Pc) { 14442 $Word += $Pc; 14443 } 14444 else { 14445 $Word += ord('_'); # Make sure this is a $Word 14446 } 14447 my $JC = property_ref('Join_Control'); # Wasn't in release 1 14448 if (defined $JC) { 14449 $Word += $JC->table('Y'); 14450 } 14451 else { 14452 $Word += 0x200C + 0x200D; 14453 } 14454 14455 # This is a Perl extension, so the name doesn't begin with Posix. 14456 my $PerlWord = $perl->add_match_table('PosixWord', 14457 Description => '\w, restricted to ASCII', 14458 Initialize => $Word & $ASCII, 14459 ); 14460 $PerlWord->add_alias('PerlWord'); 14461 14462 my $Blank = $perl->add_match_table('Blank', Full_Name => 'XPosixBlank', 14463 Description => '\h, Horizontal white space', 14464 14465 # 200B is Zero Width Space which is for line 14466 # break control, and was listed as 14467 # Space_Separator in early releases 14468 Initialize => $gc->table('Space_Separator') 14469 + ord("\t") 14470 - 0x200B, # ZWSP 14471 ); 14472 $Blank->add_alias('HorizSpace'); # Another name for it. 14473 $perl->add_match_table("PosixBlank", 14474 Initialize => $Blank & $ASCII, 14475 ); 14476 14477 my $VertSpace = $perl->add_match_table('VertSpace', 14478 Description => '\v', 14479 Initialize => 14480 $gc->table('Line_Separator') 14481 + $gc->table('Paragraph_Separator') 14482 + utf8::unicode_to_native(0x0A) # LINE FEED 14483 + utf8::unicode_to_native(0x0B) # VERTICAL TAB 14484 + ord("\f") 14485 + utf8::unicode_to_native(0x0D) # CARRIAGE RETURN 14486 + utf8::unicode_to_native(0x85) # NEL 14487 ); 14488 # No Posix equivalent for vertical space 14489 14490 my $Space = $perl->add_match_table('XPosixSpace', 14491 Description => '\s including beyond ASCII and vertical tab', 14492 Initialize => $Blank + $VertSpace, 14493 ); 14494 $Space->add_alias('XPerlSpace'); # Pre-existing synonyms 14495 $Space->add_alias('SpacePerl'); 14496 $Space->add_alias('Space') if $v_version lt v4.1.0; 14497 14498 my $Posix_space = $perl->add_match_table("PosixSpace", 14499 Initialize => $Space & $ASCII, 14500 ); 14501 $Posix_space->add_alias('PerlSpace'); # A pre-existing synonym 14502 14503 my $Cntrl = $perl->add_match_table('Cntrl', Full_Name => 'XPosixCntrl', 14504 Description => 'Control characters'); 14505 $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1); 14506 $perl->add_match_table("PosixCntrl", 14507 Description => "ASCII control characters", 14508 Definition => "ACK, BEL, BS, CAN, CR, DC1, DC2," 14509 . " DC3, DC4, DEL, DLE, ENQ, EOM," 14510 . " EOT, ESC, ETB, ETX, FF, FS, GS," 14511 . " HT, LF, NAK, NUL, RS, SI, SO," 14512 . " SOH, STX, SUB, SYN, US, VT", 14513 Initialize => $Cntrl & $ASCII, 14514 ); 14515 14516 my $perl_surrogate = $perl->add_match_table('_Perl_Surrogate'); 14517 my $Cs = $gc->table('Cs'); 14518 if (defined $Cs && ! $Cs->is_empty) { 14519 $perl_surrogate += $Cs; 14520 } 14521 else { 14522 push @tables_that_may_be_empty, '_Perl_Surrogate'; 14523 } 14524 14525 # $controls is a temporary used to construct Graph. 14526 my $controls = Range_List->new(Initialize => $gc->table('Unassigned') 14527 + $gc->table('Control') 14528 + $perl_surrogate); 14529 14530 # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls) 14531 my $Graph = $perl->add_match_table('Graph', Full_Name => 'XPosixGraph', 14532 Description => 'Characters that are graphical', 14533 Initialize => ~ ($Space + $controls), 14534 ); 14535 $perl->add_match_table("PosixGraph", 14536 Initialize => $Graph & $ASCII, 14537 ); 14538 14539 $print = $perl->add_match_table('Print', Full_Name => 'XPosixPrint', 14540 Description => 'Characters that are graphical plus space characters (but no controls)', 14541 Initialize => $Blank + $Graph - $gc->table('Control'), 14542 ); 14543 $perl->add_match_table("PosixPrint", 14544 Initialize => $print & $ASCII, 14545 ); 14546 14547 my $Punct = $perl->add_match_table('Punct'); 14548 $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1); 14549 14550 # \p{punct} doesn't include the symbols, which posix does 14551 my $XPosixPunct = $perl->add_match_table('XPosixPunct', 14552 Description => '\p{Punct} + ASCII-range \p{Symbol}', 14553 Initialize => $gc->table('Punctuation') 14554 + ($ASCII & $gc->table('Symbol')), 14555 Perl_Extension => 1 14556 ); 14557 $perl->add_match_table('PosixPunct', Perl_Extension => 1, 14558 Initialize => $ASCII & $XPosixPunct, 14559 ); 14560 14561 my $Digit = $perl->add_match_table('Digit', Full_Name => 'XPosixDigit', 14562 Description => '[0-9] + all other decimal digits'); 14563 $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1); 14564 my $PosixDigit = $perl->add_match_table("PosixDigit", 14565 Initialize => $Digit & $ASCII, 14566 ); 14567 14568 # Hex_Digit was not present in first release 14569 my $Xdigit = $perl->add_match_table('XDigit', Full_Name => 'XPosixXDigit'); 14570 my $Hex = property_ref('Hex_Digit'); 14571 if (defined $Hex && ! $Hex->is_empty) { 14572 $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1); 14573 } 14574 else { 14575 $Xdigit->initialize([ ord('0') .. ord('9'), 14576 ord('A') .. ord('F'), 14577 ord('a') .. ord('f'), 14578 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]); 14579 } 14580 14581 # AHex was not present in early releases 14582 my $PosixXDigit = $perl->add_match_table('PosixXDigit'); 14583 my $AHex = property_ref('ASCII_Hex_Digit'); 14584 if (defined $AHex && ! $AHex->is_empty) { 14585 $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1); 14586 } 14587 else { 14588 $PosixXDigit->initialize($Xdigit & $ASCII); 14589 $PosixXDigit->add_alias('AHex'); 14590 $PosixXDigit->add_alias('Ascii_Hex_Digit'); 14591 } 14592 14593 my $any_folds = $perl->add_match_table("_Perl_Any_Folds", 14594 Description => "Code points that particpate in some fold", 14595 ); 14596 my $loc_problem_folds = $perl->add_match_table( 14597 "_Perl_Problematic_Locale_Folds", 14598 Description => 14599 "Code points that are in some way problematic under locale", 14600 ); 14601 14602 # This allows regexec.c to skip some work when appropriate. Some of the 14603 # entries in _Perl_Problematic_Locale_Folds are multi-character folds, 14604 my $loc_problem_folds_start = $perl->add_match_table( 14605 "_Perl_Problematic_Locale_Foldeds_Start", 14606 Description => 14607 "The first character of every sequence in _Perl_Problematic_Locale_Folds", 14608 ); 14609 14610 my $cf = property_ref('Case_Folding'); 14611 14612 # Every character 0-255 is problematic because what each folds to depends 14613 # on the current locale 14614 $loc_problem_folds->add_range(0, 255); 14615 $loc_problem_folds->add_range(0x130, 0x131); # These are problematic in 14616 # Turkic locales 14617 $loc_problem_folds_start += $loc_problem_folds; 14618 14619 # Also problematic are anything these fold to outside the range. Likely 14620 # forever the only thing folded to by these outside the 0-255 range is the 14621 # GREEK SMALL MU (from the MICRO SIGN), but it's easy to make the code 14622 # completely general, which should catch any unexpected changes or errors. 14623 # We look at each code point 0-255, and add its fold (including each part 14624 # of a multi-char fold) to the list. See commit message 14625 # 31f05a37c4e9c37a7263491f2fc0237d836e1a80 for a more complete description 14626 # of the MU issue. 14627 foreach my $range ($loc_problem_folds->ranges) { 14628 foreach my $code_point ($range->start .. $range->end) { 14629 my $fold_range = $cf->containing_range($code_point); 14630 next unless defined $fold_range; 14631 14632 # Skip if folds to itself 14633 next if $fold_range->value eq $CODE_POINT; 14634 14635 my @hex_folds = split " ", $fold_range->value; 14636 my $start_cp = $hex_folds[0]; 14637 next if $start_cp eq $CODE_POINT; 14638 $start_cp = hex $start_cp; 14639 foreach my $i (0 .. @hex_folds - 1) { 14640 my $cp = $hex_folds[$i]; 14641 next if $cp eq $CODE_POINT; 14642 $cp = hex $cp; 14643 next unless $cp > 255; # Already have the < 256 ones 14644 14645 $loc_problem_folds->add_range($cp, $cp); 14646 $loc_problem_folds_start->add_range($start_cp, $start_cp); 14647 } 14648 } 14649 } 14650 14651 my $folds_to_multi_char = $perl->add_match_table( 14652 "_Perl_Folds_To_Multi_Char", 14653 Description => 14654 "Code points whose fold is a string of more than one character", 14655 ); 14656 my $in_multi_fold = $perl->add_match_table( 14657 "_Perl_Is_In_Multi_Char_Fold", 14658 Description => 14659 "Code points that are in some multiple character fold", 14660 ); 14661 if ($v_version lt v3.0.1) { 14662 push @tables_that_may_be_empty, '_Perl_Folds_To_Multi_Char', 14663 '_Perl_Is_In_Multi_Char_Fold', 14664 '_Perl_Non_Final_Folds'; 14665 } 14666 14667 # Look through all the known folds to populate these tables. 14668 foreach my $range ($cf->ranges) { 14669 next if $range->value eq $CODE_POINT; 14670 my $start = $range->start; 14671 my $end = $range->end; 14672 $any_folds->add_range($start, $end); 14673 14674 my @hex_folds = split " ", $range->value; 14675 if (@hex_folds > 1) { # Is multi-char fold 14676 $folds_to_multi_char->add_range($start, $end); 14677 } 14678 14679 my $found_locale_problematic = 0; 14680 14681 my $folded_count = @hex_folds; 14682 if ($folded_count > 3) { 14683 die Carp::my_carp("Maximum number of characters in a fold should be 3: Instead, it's $folded_count for U+" . sprintf "%04X", $range->start); 14684 } 14685 14686 # Look at each of the folded-to characters... 14687 foreach my $i (1 .. $folded_count) { 14688 my $cp = hex $hex_folds[$i-1]; 14689 $any_folds->add_range($cp, $cp); 14690 14691 # The fold is problematic if any of the folded-to characters is 14692 # already considered problematic. 14693 if ($loc_problem_folds->contains($cp)) { 14694 $loc_problem_folds->add_range($start, $end); 14695 $found_locale_problematic = 1; 14696 } 14697 14698 if ($folded_count > 1) { 14699 $in_multi_fold->add_range($cp, $cp); 14700 } 14701 } 14702 14703 # If this is a problematic fold, add to the start chars the 14704 # folding-from characters and first folded-to character. 14705 if ($found_locale_problematic) { 14706 $loc_problem_folds_start->add_range($start, $end); 14707 my $cp = hex $hex_folds[0]; 14708 $loc_problem_folds_start->add_range($cp, $cp); 14709 } 14710 } 14711 14712 my $dt = property_ref('Decomposition_Type'); 14713 $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical', 14714 Initialize => ~ ($dt->table('None') + $dt->table('Canonical')), 14715 Perl_Extension => 1, 14716 Note => 'Union of all non-canonical decompositions', 14717 ); 14718 14719 # For backward compatibility, Perl has its own definition for IDStart. 14720 # It is regular XID_Start plus the underscore, but all characters must be 14721 # Word characters as well 14722 my $XID_Start = property_ref('XID_Start'); 14723 my $perl_xids = $perl->add_match_table('_Perl_IDStart', 14724 Perl_Extension => 1, 14725 Fate => $INTERNAL_ONLY, 14726 Initialize => ord('_') 14727 ); 14728 if (defined $XID_Start 14729 || defined ($XID_Start = property_ref('ID_Start'))) 14730 { 14731 $perl_xids += $XID_Start->table('Y'); 14732 } 14733 else { 14734 # For Unicode versions that don't have the property, construct our own 14735 # from first principles. The actual definition is: 14736 # Letters 14737 # + letter numbers (Nl) 14738 # - Pattern_Syntax 14739 # - Pattern_White_Space 14740 # + stability extensions 14741 # - NKFC modifications 14742 # 14743 # What we do in the code below is to include the identical code points 14744 # that are in the first release that had Unicode's version of this 14745 # property, essentially extrapolating backwards. There were no 14746 # stability extensions until v4.1, so none are included; likewise in 14747 # no Unicode version so far do subtracting PatSyn and PatWS make any 14748 # difference, so those also are ignored. 14749 $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl(); 14750 14751 # We do subtract the NFKC modifications that are in the first version 14752 # that had this property. We don't bother to test if they are in the 14753 # version in question, because if they aren't, the operation is a 14754 # no-op. The NKFC modifications are discussed in 14755 # http://www.unicode.org/reports/tr31/#NFKC_Modifications 14756 foreach my $range ( 0x037A, 14757 0x0E33, 14758 0x0EB3, 14759 [ 0xFC5E, 0xFC63 ], 14760 [ 0xFDFA, 0xFE70 ], 14761 [ 0xFE72, 0xFE76 ], 14762 0xFE78, 14763 0xFE7A, 14764 0xFE7C, 14765 0xFE7E, 14766 [ 0xFF9E, 0xFF9F ], 14767 ) { 14768 if (ref $range) { 14769 $perl_xids->delete_range($range->[0], $range->[1]); 14770 } 14771 else { 14772 $perl_xids->delete_range($range, $range); 14773 } 14774 } 14775 } 14776 14777 $perl_xids &= $Word; 14778 14779 my $perl_xidc = $perl->add_match_table('_Perl_IDCont', 14780 Perl_Extension => 1, 14781 Fate => $INTERNAL_ONLY); 14782 my $XIDC = property_ref('XID_Continue'); 14783 if (defined $XIDC 14784 || defined ($XIDC = property_ref('ID_Continue'))) 14785 { 14786 $perl_xidc += $XIDC->table('Y'); 14787 } 14788 else { 14789 # Similarly, we construct our own XIDC if necessary for early Unicode 14790 # versions. The definition is: 14791 # everything in XIDS 14792 # + Gc=Mn 14793 # + Gc=Mc 14794 # + Gc=Nd 14795 # + Gc=Pc 14796 # - Pattern_Syntax 14797 # - Pattern_White_Space 14798 # + stability extensions 14799 # - NFKC modifications 14800 # 14801 # The same thing applies to this as with XIDS for the PatSyn, PatWS, 14802 # and stability extensions. There is a somewhat different set of NFKC 14803 # mods to remove (and add in this case). The ones below make this 14804 # have identical code points as in the first release that defined it. 14805 $perl_xidc += $perl_xids 14806 + $gc->table('L') 14807 + $gc->table('Mn') 14808 + $gc->table('Mc') 14809 + $gc->table('Nd') 14810 + utf8::unicode_to_native(0xB7) 14811 ; 14812 if (defined (my $pc = $gc->table('Pc'))) { 14813 $perl_xidc += $pc; 14814 } 14815 else { # 1.1.5 didn't have Pc, but these should have been in it 14816 $perl_xidc += 0xFF3F; 14817 $perl_xidc->add_range(0x203F, 0x2040); 14818 $perl_xidc->add_range(0xFE33, 0xFE34); 14819 $perl_xidc->add_range(0xFE4D, 0xFE4F); 14820 } 14821 14822 # Subtract the NFKC mods 14823 foreach my $range ( 0x037A, 14824 [ 0xFC5E, 0xFC63 ], 14825 [ 0xFDFA, 0xFE1F ], 14826 0xFE70, 14827 [ 0xFE72, 0xFE76 ], 14828 0xFE78, 14829 0xFE7A, 14830 0xFE7C, 14831 0xFE7E, 14832 ) { 14833 if (ref $range) { 14834 $perl_xidc->delete_range($range->[0], $range->[1]); 14835 } 14836 else { 14837 $perl_xidc->delete_range($range, $range); 14838 } 14839 } 14840 } 14841 14842 $perl_xidc &= $Word; 14843 14844 my $charname_begin = $perl->add_match_table('_Perl_Charname_Begin', 14845 Perl_Extension => 1, 14846 Fate => $INTERNAL_ONLY, 14847 Initialize => $gc->table('Letter') & $Alpha & $perl_xids, 14848 ); 14849 14850 my $charname_continue = $perl->add_match_table('_Perl_Charname_Continue', 14851 Perl_Extension => 1, 14852 Fate => $INTERNAL_ONLY, 14853 Initialize => $perl_xidc 14854 + ord(" ") 14855 + ord("(") 14856 + ord(")") 14857 + ord("-") 14858 ); 14859 14860 my @composition = ('Name', 'Unicode_1_Name', '_Perl_Name_Alias'); 14861 14862 if (@named_sequences) { 14863 push @composition, 'Named_Sequence'; 14864 foreach my $sequence (@named_sequences) { 14865 $perl_charname->add_anomalous_entry($sequence); 14866 } 14867 } 14868 14869 my $alias_sentence = ""; 14870 my %abbreviations; 14871 my $alias = property_ref('_Perl_Name_Alias'); 14872 $perl_charname->set_proxy_for('_Perl_Name_Alias'); 14873 14874 # Add each entry in _Perl_Name_Alias to Perl_Charnames. Where these go 14875 # with respect to any existing entry depends on the entry type. 14876 # Corrections go before said entry, as they should be returned in 14877 # preference over the existing entry. (A correction to a correction 14878 # should be later in the _Perl_Name_Alias table, so it will correctly 14879 # precede the erroneous correction in Perl_Charnames.) 14880 # 14881 # Abbreviations go after everything else, so they are saved temporarily in 14882 # a hash for later. 14883 # 14884 # Everything else is added afterwards, which preserves the input 14885 # ordering 14886 14887 foreach my $range ($alias->ranges) { 14888 next if $range->value eq ""; 14889 my $code_point = $range->start; 14890 if ($code_point != $range->end) { 14891 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;"); 14892 } 14893 my ($value, $type) = split ': ', $range->value; 14894 my $replace_type; 14895 if ($type eq 'correction') { 14896 $replace_type = $MULTIPLE_BEFORE; 14897 } 14898 elsif ($type eq 'abbreviation') { 14899 14900 # Save for later 14901 $abbreviations{$value} = $code_point; 14902 next; 14903 } 14904 else { 14905 $replace_type = $MULTIPLE_AFTER; 14906 } 14907 14908 # Actually add; before or after current entry(ies) as determined 14909 # above. 14910 14911 $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type); 14912 } 14913 $alias_sentence = <<END; 14914The _Perl_Name_Alias property adds duplicate code point entries that are 14915alternatives to the original name. If an addition is a corrected 14916name, it will be physically first in the table. The original (less correct, 14917but still valid) name will be next; then any alternatives, in no particular 14918order; and finally any abbreviations, again in no particular order. 14919END 14920 14921 # Now add the Unicode_1 names for the controls. The Unicode_1 names had 14922 # precedence before 6.1, including the awful ones like "LINE FEED (LF)", 14923 # so should be first in the file; the other names have precedence starting 14924 # in 6.1, 14925 my $before_or_after = ($v_version lt v6.1.0) 14926 ? $MULTIPLE_BEFORE 14927 : $MULTIPLE_AFTER; 14928 14929 foreach my $range (property_ref('Unicode_1_Name')->ranges) { 14930 my $code_point = $range->start; 14931 my $unicode_1_value = $range->value; 14932 next if $unicode_1_value eq ""; # Skip if name doesn't exist. 14933 14934 if ($code_point != $range->end) { 14935 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;"); 14936 } 14937 14938 # To handle EBCDIC, we don't hard code in the code points of the 14939 # controls; instead realizing that all of them are below 256. 14940 last if $code_point > 255; 14941 14942 # We only add in the controls. 14943 next if $gc->value_of($code_point) ne 'Cc'; 14944 14945 # We reject this Unicode1 name for later Perls, as it is used for 14946 # another code point 14947 next if $unicode_1_value eq 'BELL' && $^V ge v5.17.0; 14948 14949 # This won't add an exact duplicate. 14950 $perl_charname->add_duplicate($code_point, $unicode_1_value, 14951 Replace => $before_or_after); 14952 } 14953 14954 # Now that have everything added, add in abbreviations after 14955 # everything else. Sort so results don't change between runs of this 14956 # program 14957 foreach my $value (sort keys %abbreviations) { 14958 $perl_charname->add_duplicate($abbreviations{$value}, $value, 14959 Replace => $MULTIPLE_AFTER); 14960 } 14961 14962 my $comment; 14963 if (@composition <= 2) { # Always at least 2 14964 $comment = join " and ", @composition; 14965 } 14966 else { 14967 $comment = join ", ", @composition[0 .. scalar @composition - 2]; 14968 $comment .= ", and $composition[-1]"; 14969 } 14970 14971 $perl_charname->add_comment(join_lines( <<END 14972This file is for charnames.pm. It is the union of the $comment properties. 14973Unicode_1_Name entries are used only for nameless code points in the Name 14974property. 14975$alias_sentence 14976This file doesn't include the algorithmically determinable names. For those, 14977use 'unicore/Name.pm' 14978END 14979 )); 14980 property_ref('Name')->add_comment(join_lines( <<END 14981This file doesn't include the algorithmically determinable names. For those, 14982use 'unicore/Name.pm' 14983END 14984 )); 14985 14986 # Construct the Present_In property from the Age property. 14987 if (-e 'DAge.txt' && defined $age) { 14988 my $default_map = $age->default_map; 14989 my $in = Property->new('In', 14990 Default_Map => $default_map, 14991 Full_Name => "Present_In", 14992 Perl_Extension => 1, 14993 Type => $ENUM, 14994 Initialize => $age, 14995 ); 14996 $in->add_comment(join_lines(<<END 14997THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE. The values in this file are the 14998same as for $age, and not for what $in really means. This is because anything 14999defined in a given release should have multiple values: that release and all 15000higher ones. But only one value per code point can be represented in a table 15001like this. 15002END 15003 )); 15004 15005 # The Age tables are named like 1.5, 2.0, 2.1, .... Sort so that the 15006 # lowest numbered (earliest) come first, with the non-numeric one 15007 # last. 15008 my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/) 15009 ? 1 15010 : ($b->name !~ /^[\d.]*$/) 15011 ? -1 15012 : $a->name <=> $b->name 15013 } $age->tables; 15014 15015 # The Present_In property is the cumulative age properties. The first 15016 # one hence is identical to the first age one. 15017 my $first_in = $in->add_match_table($first_age->name); 15018 $first_in->set_equivalent_to($first_age, Related => 1); 15019 15020 my $description_start = "Code point's usage introduced in version "; 15021 $first_age->add_description($description_start . $first_age->name); 15022 foreach my $alias ($first_age->aliases) { # Include its aliases 15023 $first_in->add_alias($alias->name); 15024 } 15025 15026 # To construct the accumulated values, for each of the age tables 15027 # starting with the 2nd earliest, merge the earliest with it, to get 15028 # all those code points existing in the 2nd earliest. Repeat merging 15029 # the new 2nd earliest with the 3rd earliest to get all those existing 15030 # in the 3rd earliest, and so on. 15031 my $previous_in = $first_in; 15032 foreach my $current_age (@rest_ages) { 15033 next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric 15034 15035 my $current_in = $in->add_match_table( 15036 $current_age->name, 15037 Initialize => $current_age + $previous_in, 15038 Description => $description_start 15039 . $current_age->name 15040 . ' or earlier', 15041 ); 15042 foreach my $alias ($current_age->aliases) { 15043 $current_in->add_alias($alias->name); 15044 } 15045 $previous_in = $current_in; 15046 15047 # Add clarifying material for the corresponding age file. This is 15048 # in part because of the confusing and contradictory information 15049 # given in the Standard's documentation itself, as of 5.2. 15050 $current_age->add_description( 15051 "Code point's usage was introduced in version " 15052 . $current_age->name); 15053 $current_age->add_note("See also $in"); 15054 15055 } 15056 15057 # And finally the code points whose usages have yet to be decided are 15058 # the same in both properties. Note that permanently unassigned code 15059 # points actually have their usage assigned (as being permanently 15060 # unassigned), so that these tables are not the same as gc=cn. 15061 my $unassigned = $in->add_match_table($default_map); 15062 my $age_default = $age->table($default_map); 15063 $age_default->add_description(<<END 15064Code point's usage has not been assigned in any Unicode release thus far. 15065END 15066 ); 15067 $unassigned->set_equivalent_to($age_default, Related => 1); 15068 foreach my $alias ($age_default->aliases) { 15069 $unassigned->add_alias($alias->name); 15070 } 15071 } 15072 15073 my $patws = $perl->add_match_table('_Perl_PatWS', 15074 Perl_Extension => 1, 15075 Fate => $INTERNAL_ONLY); 15076 if (defined (my $off_patws = property_ref('Pattern_White_Space'))) { 15077 $patws->initialize($off_patws->table('Y')); 15078 } 15079 else { 15080 $patws->initialize([ ord("\t"), 15081 ord("\n"), 15082 utf8::unicode_to_native(0x0B), # VT 15083 ord("\f"), 15084 ord("\r"), 15085 ord(" "), 15086 utf8::unicode_to_native(0x85), # NEL 15087 0x200E..0x200F, # Left, Right marks 15088 0x2028..0x2029 # Line, Paragraph seps 15089 ] ); 15090 } 15091 15092 # See L<perlfunc/quotemeta> 15093 my $quotemeta = $perl->add_match_table('_Perl_Quotemeta', 15094 Perl_Extension => 1, 15095 Fate => $INTERNAL_ONLY, 15096 15097 # Initialize to what's common in 15098 # all Unicode releases. 15099 Initialize => 15100 $gc->table('Control') 15101 + $Space 15102 + $patws 15103 + ((~ $Word) & $ASCII) 15104 ); 15105 15106 if (defined (my $patsyn = property_ref('Pattern_Syntax'))) { 15107 $quotemeta += $patsyn->table('Y'); 15108 } 15109 else { 15110 $quotemeta += ((~ $Word) & Range->new(0, 255)) 15111 - utf8::unicode_to_native(0xA8) 15112 - utf8::unicode_to_native(0xAF) 15113 - utf8::unicode_to_native(0xB2) 15114 - utf8::unicode_to_native(0xB3) 15115 - utf8::unicode_to_native(0xB4) 15116 - utf8::unicode_to_native(0xB7) 15117 - utf8::unicode_to_native(0xB8) 15118 - utf8::unicode_to_native(0xB9) 15119 - utf8::unicode_to_native(0xBC) 15120 - utf8::unicode_to_native(0xBD) 15121 - utf8::unicode_to_native(0xBE); 15122 $quotemeta += [ # These are above-Latin1 patsyn; hence should be the 15123 # same in all releases 15124 0x2010 .. 0x2027, 15125 0x2030 .. 0x203E, 15126 0x2041 .. 0x2053, 15127 0x2055 .. 0x205E, 15128 0x2190 .. 0x245F, 15129 0x2500 .. 0x2775, 15130 0x2794 .. 0x2BFF, 15131 0x2E00 .. 0x2E7F, 15132 0x3001 .. 0x3003, 15133 0x3008 .. 0x3020, 15134 0x3030 .. 0x3030, 15135 0xFD3E .. 0xFD3F, 15136 0xFE45 .. 0xFE46 15137 ]; 15138 } 15139 15140 if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) { 15141 $quotemeta += $di->table('Y') 15142 } 15143 else { 15144 if ($v_version ge v2.0) { 15145 $quotemeta += $gc->table('Cf') 15146 + $gc->table('Cs'); 15147 15148 # These are above the Unicode version 1 max 15149 $quotemeta->add_range(0xE0000, 0xE0FFF); 15150 } 15151 $quotemeta += $gc->table('Cc') 15152 - $Space; 15153 my $temp = Range_List->new(Initialize => [ 0x180B .. 0x180D, 15154 0x2060 .. 0x206F, 15155 0xFE00 .. 0xFE0F, 15156 0xFFF0 .. 0xFFFB, 15157 ]); 15158 $temp->add_range(0xE0000, 0xE0FFF) if $v_version ge v2.0; 15159 $quotemeta += $temp; 15160 } 15161 calculate_DI(); 15162 $quotemeta += $DI; 15163 15164 calculate_NChar(); 15165 15166 # Finished creating all the perl properties. All non-internal non-string 15167 # ones have a synonym of 'Is_' prefixed. (Internal properties begin with 15168 # an underscore.) These do not get a separate entry in the pod file 15169 foreach my $table ($perl->tables) { 15170 foreach my $alias ($table->aliases) { 15171 next if $alias->name =~ /^_/; 15172 $table->add_alias('Is_' . $alias->name, 15173 Re_Pod_Entry => 0, 15174 UCD => 0, 15175 Status => $alias->status, 15176 OK_as_Filename => 0); 15177 } 15178 } 15179 15180 # Perl tailors the WordBreak property so that \b{wb} doesn't split 15181 # adjacent spaces into separate words. Unicode 11.0 moved in that 15182 # direction, but left TAB, FIGURE SPACE (U+2007), and (ironically) NO 15183 # BREAK SPACE as breaking, so we retained the original Perl customization. 15184 # To do this, in the Perl copy of WB, simply replace the mappings of 15185 # horizontal space characters that otherwise would map to the default or 15186 # the 11.0 'WSegSpace' to instead map to our tailoring. 15187 my $perl_wb = property_ref('_Perl_WB'); 15188 my $default = $perl_wb->default_map; 15189 for my $range ($Blank->ranges) { 15190 for my $i ($range->start .. $range->end) { 15191 my $value = $perl_wb->value_of($i); 15192 15193 next unless $value eq $default || $value eq 'WSegSpace'; 15194 $perl_wb->add_map($i, $i, 'Perl_Tailored_HSpace', 15195 Replace => $UNCONDITIONALLY); 15196 } 15197 } 15198 15199 # Also starting in Unicode 11.0, rules for some of the boundary types are 15200 # based on a non-UCD property (which we have read in if it exists). 15201 # Recall that these boundary properties partition the code points into 15202 # equivalence classes (represented as enums). 15203 # 15204 # The loop below goes through each code point that matches the non-UCD 15205 # property, and for each current equivalence class containing such a code 15206 # point, splits it so that those that are in both are now in a newly 15207 # created equivalence class whose name is a combination of the property 15208 # and the old class name, leaving unchanged everything that doesn't match 15209 # the non-UCD property. 15210 my $ep = property_ref('ExtPict'); 15211 $ep = $ep->table('Y') if defined $ep; 15212 if (defined $ep) { 15213 foreach my $base_property (property_ref('GCB'), 15214 property_ref('WB')) 15215 { 15216 my $property = property_ref('_Perl_' . $base_property->name); 15217 foreach my $range ($ep->ranges) { 15218 foreach my $i ($range->start .. $range->end) { 15219 my $current = $property->value_of($i); 15220 $current = $property->table($current)->short_name; 15221 $property->add_map($i, $i, 'ExtPict_' . $current, 15222 Replace => $UNCONDITIONALLY); 15223 } 15224 } 15225 } 15226 } 15227 15228 # Create a version of the LineBreak property with the mappings that are 15229 # omitted in the default algorithm remapped to what 15230 # http://www.unicode.org/reports/tr14 says they should be. 15231 # 15232 # First, create a plain copy, but with all property values written out in 15233 # their long form, as regen/mk_invlist.pl expects that, and also fix 15234 # occurrences of the typo in early Unicode versions: 'inseperable'. 15235 my $perl_lb = property_ref('_Perl_LB'); 15236 if (! defined $perl_lb) { 15237 $perl_lb = Property->new('_Perl_LB', 15238 Fate => $INTERNAL_ONLY, 15239 Perl_Extension => 1, 15240 Directory => $map_directory, 15241 Type => $STRING); 15242 my $lb = property_ref('Line_Break'); 15243 15244 # Populate from $lb, but use full name and fix typo. 15245 foreach my $range ($lb->ranges) { 15246 my $full_name = $lb->table($range->value)->full_name; 15247 $full_name = 'Inseparable' 15248 if standardize($full_name) eq 'inseperable'; 15249 $perl_lb->add_map($range->start, $range->end, $full_name); 15250 } 15251 } 15252 15253 # What tr14 says is this: 15254 15255 # Original Resolved General_Category 15256 # AI, SG, XX AL Any 15257 # SA CM Only Mn or Mc 15258 # SA AL Any except Mn and Mc 15259 # CJ NS Any 15260 15261 $perl_lb->set_default_map('Alphabetic', 'full_name'); # XX -> AL 15262 15263 my $ea = property_ref('East_Asian_Width'); 15264 my $Cn_EP; 15265 $Cn_EP = $ep & $gc->table('Unassigned') if defined $ep; 15266 15267 for my $range ($perl_lb->ranges) { 15268 my $value = standardize($range->value); 15269 if ( $value eq standardize('Unknown') 15270 || $value eq standardize('Ambiguous') 15271 || $value eq standardize('Surrogate')) 15272 { 15273 $perl_lb->add_map($range->start, $range->end, 'Alphabetic', 15274 Replace => $UNCONDITIONALLY); 15275 } 15276 elsif ($value eq standardize('Conditional_Japanese_Starter')) { 15277 $perl_lb->add_map($range->start, $range->end, 'Nonstarter', 15278 Replace => $UNCONDITIONALLY); 15279 } 15280 elsif ($value eq standardize('Complex_Context')) { 15281 for my $i ($range->start .. $range->end) { 15282 my $gc_val = $gc->value_of($i); 15283 if ($gc_val eq 'Mn' || $gc_val eq 'Mc') { 15284 $perl_lb->add_map($i, $i, 'Combining_Mark', 15285 Replace => $UNCONDITIONALLY); 15286 } 15287 else { 15288 $perl_lb->add_map($i, $i, 'Alphabetic', 15289 Replace => $UNCONDITIONALLY); 15290 } 15291 } 15292 } 15293 elsif (defined $ep && $value eq standardize('Ideographic')) { 15294 15295 # Unicode 14 adds a rule to not break lines before any potential 15296 # EBase, They say that any unassigned code point that is ExtPict, 15297 # is potentially an EBase. In 14.0, all such ones are in the 15298 # ExtPict=ID category. We must split that category for the 15299 # pairwise rule table to work. 15300 for my $i ($range->start .. $range->end) { 15301 if ($Cn_EP->contains($i)) { 15302 $perl_lb->add_map($i, $i, 15303 'Unassigned_Extended_Pictographic_Ideographic', 15304 Replace => $UNCONDITIONALLY); 15305 } 15306 } 15307 } 15308 elsif ( defined $ea 15309 && ( $value eq standardize('Close_Parenthesis') 15310 || $value eq standardize('Open_Punctuation'))) 15311 { 15312 # Unicode 13 splits the OP and CP properties each into East Asian, 15313 # and non-. We retain the (now somewhat misleading) names OP and 15314 # CP for the non-East Asian variety, as there are very few East 15315 # Asian ones. 15316 my $replace = ($value eq standardize('Open_Punctuation')) 15317 ? 'East_Asian_OP' 15318 : 'East_Asian_CP'; 15319 for my $i ($range->start .. $range->end) { 15320 my $ea_val = $ea->value_of($i); 15321 if ($ea_val eq 'F' || $ea_val eq 'W' || $ea_val eq 'H') { 15322 $perl_lb->add_map($i, $i, $replace, 15323 Replace => $UNCONDITIONALLY); 15324 } 15325 } 15326 } 15327 } 15328 15329 # This property is a modification of the scx property 15330 my $perl_scx = Property->new('_Perl_SCX', 15331 Fate => $INTERNAL_ONLY, 15332 Perl_Extension => 1, 15333 Directory => $map_directory, 15334 Type => $ENUM); 15335 my $source; 15336 15337 # Use scx if available; otherwise sc; if neither is there (a very old 15338 # Unicode version, just say that everything is 'Common' 15339 if (defined $scx) { 15340 $source = $scx; 15341 $perl_scx->set_default_map('Unknown'); 15342 } 15343 elsif (defined $script) { 15344 $source = $script; 15345 15346 # Early versions of 'sc', had everything be 'Common' 15347 if (defined $script->table('Unknown')) { 15348 $perl_scx->set_default_map('Unknown'); 15349 } 15350 else { 15351 $perl_scx->set_default_map('Common'); 15352 } 15353 } else { 15354 $perl_scx->add_match_table('Common'); 15355 $perl_scx->add_map(0, $MAX_UNICODE_CODEPOINT, 'Common'); 15356 15357 $perl_scx->add_match_table('Unknown'); 15358 $perl_scx->set_default_map('Unknown'); 15359 } 15360 15361 $perl_scx->_set_format($STRING_WHITE_SPACE_LIST); 15362 $perl_scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these 15363 15364 if (defined $source) { 15365 $perl_scx->initialize($source); 15366 15367 # UTS 39 says that the scx property should be modified for these 15368 # countries where certain mixed scripts are commonly used. 15369 for my $range ($perl_scx->ranges) { 15370 my $value = $range->value; 15371 my $changed = $value =~ s/ ( \b Han i? \b ) /$1 Hanb Jpan Kore/xi; 15372 $changed |= $value =~ s/ ( \b Hira (gana)? \b ) /$1 Jpan/xi; 15373 $changed |= $value =~ s/ ( \b Kata (kana)? \b ) /$1 Jpan/xi; 15374 $changed |= $value =~ s{ ( \b Katakana_or_Hiragana \b ) } 15375 {$1 Katakana Hiragana Jpan}xi; 15376 $changed |= $value =~ s/ ( \b Hang (ul)? \b ) /$1 Kore/xi; 15377 $changed |= $value =~ s/ ( \b Bopo (mofo)? \b ) /$1 Hanb/xi; 15378 15379 if ($changed) { 15380 $value = join " ", uniques split " ", $value; 15381 $range->set_value($value) 15382 } 15383 } 15384 15385 foreach my $table ($source->tables) { 15386 my $scx_table = $perl_scx->add_match_table($table->name, 15387 Full_Name => $table->full_name); 15388 foreach my $alias ($table->aliases) { 15389 $scx_table->add_alias($alias->name); 15390 } 15391 } 15392 } 15393 15394 # Here done with all the basic stuff. Ready to populate the information 15395 # about each character if annotating them. 15396 if ($annotate) { 15397 15398 # See comments at its declaration 15399 $annotate_ranges = Range_Map->new; 15400 15401 # This separates out the non-characters from the other unassigneds, so 15402 # can give different annotations for each. 15403 $unassigned_sans_noncharacters = Range_List->new( 15404 Initialize => $gc->table('Unassigned')); 15405 $unassigned_sans_noncharacters &= (~ $NChar); 15406 15407 for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT + 1; $i++ ) { 15408 $i = populate_char_info($i); # Note sets $i so may cause skips 15409 15410 } 15411 } 15412 15413 return; 15414} 15415 15416sub add_perl_synonyms() { 15417 # A number of Unicode tables have Perl synonyms that are expressed in 15418 # the single-form, \p{name}. These are: 15419 # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and 15420 # \p{Is_Name} as synonyms 15421 # \p{Script_Extensions=Value} gets \p{Value}, \p{Is_Value} as synonyms 15422 # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms 15423 # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no 15424 # conflict, \p{Value} and \p{Is_Value} as well 15425 # 15426 # This routine generates these synonyms, warning of any unexpected 15427 # conflicts. 15428 15429 # Construct the list of tables to get synonyms for. Start with all the 15430 # binary and the General_Category ones. 15431 my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY } 15432 property_ref('*'); 15433 push @tables, $gc->tables; 15434 15435 # If the version of Unicode includes the Script Extensions (preferably), 15436 # or Script property, add its tables 15437 if (defined $scx) { 15438 push @tables, $scx->tables; 15439 } 15440 else { 15441 push @tables, $script->tables if defined $script; 15442 } 15443 15444 # The Block tables are kept separate because they are treated differently. 15445 # And the earliest versions of Unicode didn't include them, so add only if 15446 # there are some. 15447 my @blocks; 15448 push @blocks, $block->tables if defined $block; 15449 15450 # Here, have the lists of tables constructed. Process blocks last so that 15451 # if there are name collisions with them, blocks have lowest priority. 15452 # Should there ever be other collisions, manual intervention would be 15453 # required. See the comments at the beginning of the program for a 15454 # possible way to handle those semi-automatically. 15455 foreach my $table (@tables, @blocks) { 15456 15457 # For non-binary properties, the synonym is just the name of the 15458 # table, like Greek, but for binary properties the synonym is the name 15459 # of the property, and means the code points in its 'Y' table. 15460 my $nominal = $table; 15461 my $nominal_property = $nominal->property; 15462 my $actual; 15463 if (! $nominal->isa('Property')) { 15464 $actual = $table; 15465 } 15466 else { 15467 15468 # Here is a binary property. Use the 'Y' table. Verify that is 15469 # there 15470 my $yes = $nominal->table('Y'); 15471 unless (defined $yes) { # Must be defined, but is permissible to 15472 # be empty. 15473 Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping."); 15474 next; 15475 } 15476 $actual = $yes; 15477 } 15478 15479 foreach my $alias ($nominal->aliases) { 15480 15481 # Attempt to create a table in the perl directory for the 15482 # candidate table, using whatever aliases in it that don't 15483 # conflict. Also add non-conflicting aliases for all these 15484 # prefixed by 'Is_' (and/or 'In_' for Block property tables) 15485 PREFIX: 15486 foreach my $prefix ("", 'Is_', 'In_') { 15487 15488 # Only Block properties can have added 'In_' aliases. 15489 next if $prefix eq 'In_' and $nominal_property != $block; 15490 15491 my $proposed_name = $prefix . $alias->name; 15492 15493 # No Is_Is, In_In, nor combinations thereof 15494 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x; 15495 next if $proposed_name =~ /^ I [ns] _I [ns] _/x; 15496 15497 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace; 15498 15499 # Get a reference to any existing table in the perl 15500 # directory with the desired name. 15501 my $pre_existing = $perl->table($proposed_name); 15502 15503 if (! defined $pre_existing) { 15504 15505 # No name collision, so OK to add the perl synonym. 15506 15507 my $make_re_pod_entry; 15508 my $ok_as_filename; 15509 my $status = $alias->status; 15510 if ($nominal_property == $block) { 15511 15512 # For block properties, only the compound form is 15513 # preferred for external use; the others are 15514 # discouraged. The pod file contains wild cards for 15515 # the 'In' and 'Is' forms so no entries for those; and 15516 # we don't want people using the name without any 15517 # prefix, so discourage that. 15518 if ($prefix eq "") { 15519 $make_re_pod_entry = 1; 15520 $status = $status || $DISCOURAGED; 15521 $ok_as_filename = 0; 15522 } 15523 elsif ($prefix eq 'In_') { 15524 $make_re_pod_entry = 0; 15525 $status = $status || $DISCOURAGED; 15526 $ok_as_filename = 1; 15527 } 15528 else { 15529 $make_re_pod_entry = 0; 15530 $status = $status || $DISCOURAGED; 15531 $ok_as_filename = 0; 15532 } 15533 } 15534 elsif ($prefix ne "") { 15535 15536 # The 'Is' prefix is handled in the pod by a wild 15537 # card, and we won't use it for an external name 15538 $make_re_pod_entry = 0; 15539 $status = $status || $NORMAL; 15540 $ok_as_filename = 0; 15541 } 15542 else { 15543 15544 # Here, is an empty prefix, non block. This gets its 15545 # own pod entry and can be used for an external name. 15546 $make_re_pod_entry = 1; 15547 $status = $status || $NORMAL; 15548 $ok_as_filename = 1; 15549 } 15550 15551 # Here, there isn't a perl pre-existing table with the 15552 # name. Look through the list of equivalents of this 15553 # table to see if one is a perl table. 15554 foreach my $equivalent ($actual->leader->equivalents) { 15555 next if $equivalent->property != $perl; 15556 15557 # Here, have found a table for $perl. Add this alias 15558 # to it, and are done with this prefix. 15559 $equivalent->add_alias($proposed_name, 15560 Re_Pod_Entry => $make_re_pod_entry, 15561 15562 # Currently don't output these in the 15563 # ucd pod, as are strongly discouraged 15564 # from being used 15565 UCD => 0, 15566 15567 Status => $status, 15568 OK_as_Filename => $ok_as_filename); 15569 trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace; 15570 next PREFIX; 15571 } 15572 15573 # Here, $perl doesn't already have a table that is a 15574 # synonym for this property, add one. 15575 my $added_table = $perl->add_match_table($proposed_name, 15576 Re_Pod_Entry => $make_re_pod_entry, 15577 15578 # See UCD comment just above 15579 UCD => 0, 15580 15581 Status => $status, 15582 OK_as_Filename => $ok_as_filename); 15583 # And it will be related to the actual table, since it is 15584 # based on it. 15585 $added_table->set_equivalent_to($actual, Related => 1); 15586 trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace; 15587 next; 15588 } # End of no pre-existing. 15589 15590 # Here, there is a pre-existing table that has the proposed 15591 # name. We could be in trouble, but not if this is just a 15592 # synonym for another table that we have already made a child 15593 # of the pre-existing one. 15594 if ($pre_existing->is_set_equivalent_to($actual)) { 15595 trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace; 15596 $pre_existing->add_alias($proposed_name); 15597 next; 15598 } 15599 15600 # Here, there is a name collision, but it still could be OK if 15601 # the tables match the identical set of code points, in which 15602 # case, we can combine the names. Compare each table's code 15603 # point list to see if they are identical. 15604 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace; 15605 if ($pre_existing->matches_identically_to($actual)) { 15606 15607 # Here, they do match identically. Not a real conflict. 15608 # Make the perl version a child of the Unicode one, except 15609 # in the non-obvious case of where the perl name is 15610 # already a synonym of another Unicode property. (This is 15611 # excluded by the test for it being its own parent.) The 15612 # reason for this exclusion is that then the two Unicode 15613 # properties become related; and we don't really know if 15614 # they are or not. We generate documentation based on 15615 # relatedness, and this would be misleading. Code 15616 # later executed in the process will cause the tables to 15617 # be represented by a single file anyway, without making 15618 # it look in the pod like they are necessarily related. 15619 if ($pre_existing->parent == $pre_existing 15620 && ($pre_existing->property == $perl 15621 || $actual->property == $perl)) 15622 { 15623 trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace; 15624 $pre_existing->set_equivalent_to($actual, Related => 1); 15625 } 15626 elsif (main::DEBUG && $to_trace) { 15627 trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases"; 15628 trace $pre_existing->parent; 15629 } 15630 next PREFIX; 15631 } 15632 15633 # Here they didn't match identically, there is a real conflict 15634 # between our new name and a pre-existing property. 15635 $actual->add_conflicting($proposed_name, 'p', $pre_existing); 15636 $pre_existing->add_conflicting($nominal->full_name, 15637 'p', 15638 $actual); 15639 15640 # Don't output a warning for aliases for the block 15641 # properties (unless they start with 'In_') as it is 15642 # expected that there will be conflicts and the block 15643 # form loses. 15644 if ($verbosity >= $NORMAL_VERBOSITY 15645 && ($actual->property != $block || $prefix eq 'In_')) 15646 { 15647 print simple_fold(join_lines(<<END 15648There is already an alias named $proposed_name (from $pre_existing), 15649so not creating this alias for $actual 15650END 15651 ), "", 4); 15652 } 15653 15654 # Keep track for documentation purposes. 15655 $has_In_conflicts++ if $prefix eq 'In_'; 15656 $has_Is_conflicts++ if $prefix eq 'Is_'; 15657 } 15658 } 15659 } 15660 15661 # There are some properties which have No and Yes (and N and Y) as 15662 # property values, but aren't binary, and could possibly be confused with 15663 # binary ones. So create caveats for them. There are tables that are 15664 # named 'No', and tables that are named 'N', but confusion is not likely 15665 # unless they are the same table. For example, N meaning Number or 15666 # Neutral is not likely to cause confusion, so don't add caveats to things 15667 # like them. 15668 foreach my $property (grep { $_->type != $BINARY 15669 && $_->type != $FORCED_BINARY } 15670 property_ref('*')) 15671 { 15672 my $yes = $property->table('Yes'); 15673 if (defined $yes) { 15674 my $y = $property->table('Y'); 15675 if (defined $y && $yes == $y) { 15676 foreach my $alias ($property->aliases) { 15677 $yes->add_conflicting($alias->name); 15678 } 15679 } 15680 } 15681 my $no = $property->table('No'); 15682 if (defined $no) { 15683 my $n = $property->table('N'); 15684 if (defined $n && $no == $n) { 15685 foreach my $alias ($property->aliases) { 15686 $no->add_conflicting($alias->name, 'P'); 15687 } 15688 } 15689 } 15690 } 15691 15692 return; 15693} 15694 15695sub register_file_for_name($table, $directory_ref, $file) { 15696 # Given info about a table and a datafile that it should be associated 15697 # with, register that association 15698 15699 # $directory_ref # Array of the directory path for the file 15700 # $file # The file name in the final directory. 15701 15702 trace "table=$table, file=$file, directory=@$directory_ref, fate=", $table->fate if main::DEBUG && $to_trace; 15703 15704 if ($table->isa('Property')) { 15705 $table->set_file_path(@$directory_ref, $file); 15706 push @map_properties, $table; 15707 15708 # No swash means don't do the rest of this. 15709 return if $table->fate != $ORDINARY 15710 && ! ($table->name =~ /^_/ && $table->fate == $INTERNAL_ONLY); 15711 15712 # Get the path to the file 15713 my @path = $table->file_path; 15714 15715 # Use just the file name if no subdirectory. 15716 shift @path if $path[0] eq File::Spec->curdir(); 15717 15718 my $file = join '/', @path; 15719 15720 # Create a hash entry for Unicode::UCD to get the file that stores this 15721 # property's map table 15722 foreach my $alias ($table->aliases) { 15723 my $name = $alias->name; 15724 if ($name =~ /^_/) { 15725 $strict_property_to_file_of{lc $name} = $file; 15726 } 15727 else { 15728 $loose_property_to_file_of{standardize($name)} = $file; 15729 } 15730 } 15731 15732 # And a way for Unicode::UCD to find the proper key in the SwashInfo 15733 # hash for this property. 15734 $file_to_swash_name{$file} = "To" . $table->swash_name; 15735 return; 15736 } 15737 15738 # Do all of the work for all equivalent tables when called with the leader 15739 # table, so skip if isn't the leader. 15740 return if $table->leader != $table; 15741 15742 # If this is a complement of another file, use that other file instead, 15743 # with a ! prepended to it. 15744 my $complement; 15745 if (($complement = $table->complement) != 0) { 15746 my @directories = $complement->file_path; 15747 15748 # This assumes that the 0th element is something like 'lib', 15749 # the 1th element the property name (in its own directory), like 15750 # 'AHex', and the 2th element the file like 'Y' which will have a .pl 15751 # appended to it later. 15752 $directories[1] =~ s/^/!/; 15753 $file = pop @directories; 15754 $directory_ref =\@directories; 15755 } 15756 15757 # Join all the file path components together, using slashes. 15758 my $full_filename = join('/', @$directory_ref, $file); 15759 15760 # All go in the same subdirectory of unicore, or the special 15761 # pseudo-directory '#' 15762 if ($directory_ref->[0] !~ / ^ $matches_directory | \# $ /x) { 15763 Carp::my_carp("Unexpected directory in " 15764 . join('/', @{$directory_ref}, $file)); 15765 } 15766 15767 # For this table and all its equivalents ... 15768 foreach my $table ($table, $table->equivalents) { 15769 15770 # Associate it with its file internally. Don't include the 15771 # $matches_directory first component 15772 $table->set_file_path(@$directory_ref, $file); 15773 15774 # No swash means don't do the rest of this. 15775 next if $table->isa('Map_Table') && $table->fate != $ORDINARY; 15776 15777 my $sub_filename = join('/', $directory_ref->[1, -1], $file); 15778 15779 my $property = $table->property; 15780 my $property_name = ($property == $perl) 15781 ? "" # 'perl' is never explicitly stated 15782 : standardize($property->name) . '='; 15783 15784 my $is_default = 0; # Is this table the default one for the property? 15785 15786 # To calculate $is_default, we find if this table is the same as the 15787 # default one for the property. But this is complicated by the 15788 # possibility that there is a master table for this one, and the 15789 # information is stored there instead of here. 15790 my $parent = $table->parent; 15791 my $leader_prop = $parent->property; 15792 my $default_map = $leader_prop->default_map; 15793 if (defined $default_map) { 15794 my $default_table = $leader_prop->table($default_map); 15795 $is_default = 1 if defined $default_table && $parent == $default_table; 15796 } 15797 15798 # Calculate the loose name for this table. Mostly it's just its name, 15799 # standardized. But in the case of Perl tables that are single-form 15800 # equivalents to Unicode properties, it is the latter's name. 15801 my $loose_table_name = 15802 ($property != $perl || $leader_prop == $perl) 15803 ? standardize($table->name) 15804 : standardize($parent->name); 15805 15806 my $deprecated = ($table->status eq $DEPRECATED) 15807 ? $table->status_info 15808 : ""; 15809 my $caseless_equivalent = $table->caseless_equivalent; 15810 15811 # And for each of the table's aliases... This inner loop eventually 15812 # goes through all aliases in the UCD that we generate regex match 15813 # files for 15814 foreach my $alias ($table->aliases) { 15815 my $standard = UCD_name($table, $alias); 15816 15817 # Generate an entry in either the loose or strict hashes, which 15818 # will translate the property and alias names combination into the 15819 # file where the table for them is stored. 15820 if ($alias->loose_match) { 15821 if (exists $loose_to_file_of{$standard}) { 15822 Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'."); 15823 } 15824 else { 15825 $loose_to_file_of{$standard} = $sub_filename; 15826 } 15827 } 15828 else { 15829 if (exists $stricter_to_file_of{$standard}) { 15830 Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'."); 15831 } 15832 else { 15833 $stricter_to_file_of{$standard} = $sub_filename; 15834 15835 # Tightly coupled with how Unicode::UCD works, for a 15836 # floating point number that is a whole number, get rid of 15837 # the trailing decimal point and 0's, so that Unicode::UCD 15838 # will work. Also note that this assumes that such a 15839 # number is matched strictly; so if that were to change, 15840 # this would be wrong. 15841 if ((my $integer_name = $alias->name) 15842 =~ s/^ ( -? \d+ ) \.0+ $ /$1/x) 15843 { 15844 $stricter_to_file_of{$property_name . $integer_name} 15845 = $sub_filename; 15846 } 15847 } 15848 } 15849 15850 # For Unicode::UCD, create a mapping of the prop=value to the 15851 # canonical =value for that property. 15852 if ($standard =~ /=/) { 15853 15854 # This could happen if a strict name mapped into an existing 15855 # loose name. In that event, the strict names would have to 15856 # be moved to a new hash. 15857 if (exists($loose_to_standard_value{$standard})) { 15858 Carp::my_carp_bug("'$standard' conflicts with a pre-existing use. Bad News. Continuing anyway"); 15859 } 15860 $loose_to_standard_value{$standard} = $loose_table_name; 15861 } 15862 15863 # Keep a list of the deprecated properties and their filenames 15864 if ($deprecated && $complement == 0) { 15865 $Unicode::UCD::why_deprecated{$sub_filename} = $deprecated; 15866 } 15867 15868 # And a substitute table, if any, for case-insensitive matching 15869 if ($caseless_equivalent != 0) { 15870 $caseless_equivalent_to{$standard} = $caseless_equivalent; 15871 } 15872 15873 # Add to defaults list if the table this alias belongs to is the 15874 # default one 15875 $loose_defaults{$standard} = 1 if $is_default; 15876 } 15877 } 15878 15879 return; 15880} 15881 15882{ # Closure 15883 my %base_names; # Names already used for avoiding DOS 8.3 filesystem 15884 # conflicts 15885 my %full_dir_name_of; # Full length names of directories used. 15886 15887 sub construct_filename($name, $mutable, $directories_ref) { 15888 # Return a file name for a table, based on the table name, but perhaps 15889 # changed to get rid of non-portable characters in it, and to make 15890 # sure that it is unique on a file system that allows the names before 15891 # any period to be at most 8 characters (DOS). While we're at it 15892 # check and complain if there are any directory conflicts. 15893 15894 # $name # The name to start with 15895 # $mutable # Boolean: can it be changed? If no, but 15896 # yet it must be to work properly, a warning 15897 # is given 15898 # $directories_ref # A reference to an array containing the 15899 # path to the file, with each element one path 15900 # component. This is used because the same 15901 # name can be used in different directories. 15902 15903 my $warn = ! defined wantarray; # If true, then if the name is 15904 # changed, a warning is issued as well. 15905 15906 if (! defined $name) { 15907 Carp::my_carp("Undefined name in directory " 15908 . File::Spec->join(@$directories_ref) 15909 . ". '_' used"); 15910 return '_'; 15911 } 15912 15913 # Make sure that no directory names conflict with each other. Look at 15914 # each directory in the input file's path. If it is already in use, 15915 # assume it is correct, and is merely being re-used, but if we 15916 # truncate it to 8 characters, and find that there are two directories 15917 # that are the same for the first 8 characters, but differ after that, 15918 # then that is a problem. 15919 foreach my $directory (@$directories_ref) { 15920 my $short_dir = substr($directory, 0, 8); 15921 if (defined $full_dir_name_of{$short_dir}) { 15922 next if $full_dir_name_of{$short_dir} eq $directory; 15923 Carp::my_carp("Directory $directory conflicts with directory $full_dir_name_of{$short_dir}. Bad News. Continuing anyway"); 15924 } 15925 else { 15926 $full_dir_name_of{$short_dir} = $directory; 15927 } 15928 } 15929 15930 my $path = join '/', @$directories_ref; 15931 $path .= '/' if $path; 15932 15933 # Remove interior underscores. 15934 (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg; 15935 15936 # Convert the dot in floating point numbers to an underscore 15937 $filename =~ s/\./_/ if $filename =~ / ^ \d+ \. \d+ $ /x; 15938 15939 my $suffix = ""; 15940 15941 # Extract any suffix, delete any non-word character, and truncate to 3 15942 # after the dot 15943 if ($filename =~ m/ ( .*? ) ( \. .* ) /x) { 15944 $filename = $1; 15945 $suffix = $2; 15946 $suffix =~ s/\W+//g; 15947 substr($suffix, 4) = "" if length($suffix) > 4; 15948 } 15949 15950 # Change any non-word character outside the suffix into an underscore, 15951 # and truncate to 8. 15952 $filename =~ s/\W+/_/g; # eg., "L&" -> "L_" 15953 substr($filename, 8) = "" if length($filename) > 8; 15954 15955 # Make sure the basename doesn't conflict with something we 15956 # might have already written. If we have, say, 15957 # InGreekExtended1 15958 # InGreekExtended2 15959 # they become 15960 # InGreekE 15961 # InGreek2 15962 my $warned = 0; 15963 while (my $num = $base_names{$path}{lc "$filename$suffix"}++) { 15964 $num++; # so basenames with numbers start with '2', which 15965 # just looks more natural. 15966 15967 # Want to append $num, but if it'll make the basename longer 15968 # than 8 characters, pre-truncate $filename so that the result 15969 # is acceptable. 15970 my $delta = length($filename) + length($num) - 8; 15971 if ($delta > 0) { 15972 substr($filename, -$delta) = $num; 15973 } 15974 else { 15975 $filename .= $num; 15976 } 15977 if ($warn && ! $warned) { 15978 $warned = 1; 15979 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway."); 15980 } 15981 } 15982 15983 return $filename if $mutable; 15984 15985 # If not changeable, must return the input name, but warn if needed to 15986 # change it beyond shortening it. 15987 if ($name ne $filename 15988 && substr($name, 0, length($filename)) ne $filename) { 15989 Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway."); 15990 } 15991 return $name; 15992 } 15993} 15994 15995# The pod file contains a very large table. Many of the lines in that table 15996# would exceed a typical output window's size, and so need to be wrapped with 15997# a hanging indent to make them look good. The pod language is really 15998# insufficient here. There is no general construct to do that in pod, so it 15999# is done here by beginning each such line with a space to cause the result to 16000# be output without formatting, and doing all the formatting here. This leads 16001# to the result that if the eventual display window is too narrow it won't 16002# look good, and if the window is too wide, no advantage is taken of that 16003# extra width. A further complication is that the output may be indented by 16004# the formatter so that there is less space than expected. What I (khw) have 16005# done is to assume that that indent is a particular number of spaces based on 16006# what it is in my Linux system; people can always resize their windows if 16007# necessary, but this is obviously less than desirable, but the best that can 16008# be expected. 16009my $automatic_pod_indent = 8; 16010 16011# Try to format so that uses fewest lines, but few long left column entries 16012# slide into the right column. An experiment on 5.1 data yielded the 16013# following percentages that didn't cut into the other side along with the 16014# associated first-column widths 16015# 69% = 24 16016# 80% not too bad except for a few blocks 16017# 90% = 33; # , cuts 353/3053 lines from 37 = 12% 16018# 95% = 37; 16019my $indent_info_column = 27; # 75% of lines didn't have overlap 16020 16021my $FILLER = 3; # Length of initial boiler-plate columns in a pod line 16022 # The 3 is because of: 16023 # 1 for the leading space to tell the pod formatter to 16024 # output as-is 16025 # 1 for the flag 16026 # 1 for the space between the flag and the main data 16027 16028sub format_pod_line($first_column_width, $entry, $info, $status = "", $loose_match = 1 ) { 16029 # Take a pod line and return it, formatted properly 16030 16031 # $entry Contents of left column 16032 # $info Contents of right column 16033 16034 my $flags = ""; 16035 $flags .= $STRICTER if ! $loose_match; 16036 16037 $flags .= $status if $status; 16038 16039 # There is a blank in the left column to cause the pod formatter to 16040 # output the line as-is. 16041 return sprintf " %-*s%-*s %s\n", 16042 # The first * in the format is replaced by this, the -1 is 16043 # to account for the leading blank. There isn't a 16044 # hard-coded blank after this to separate the flags from 16045 # the rest of the line, so that in the unlikely event that 16046 # multiple flags are shown on the same line, they both 16047 # will get displayed at the expense of that separation, 16048 # but since they are left justified, a blank will be 16049 # inserted in the normal case. 16050 $FILLER - 1, 16051 $flags, 16052 16053 # The other * in the format is replaced by this number to 16054 # cause the first main column to right fill with blanks. 16055 # The -1 is for the guaranteed blank following it. 16056 $first_column_width - $FILLER - 1, 16057 $entry, 16058 $info; 16059} 16060 16061my @zero_match_tables; # List of tables that have no matches in this release 16062 16063sub make_re_pod_entries($input_table) { 16064 # This generates the entries for the pod file for a given table. 16065 # Also done at this time are any children tables. The output looks like: 16066 # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178) 16067 16068 # Generate parent and all its children at the same time. 16069 return if $input_table->parent != $input_table; 16070 16071 my $property = $input_table->property; 16072 my $type = $property->type; 16073 my $full_name = $property->full_name; 16074 16075 my $count = $input_table->count; 16076 my $unicode_count; 16077 my $non_unicode_string; 16078 if ($count > $MAX_UNICODE_CODEPOINTS) { 16079 $unicode_count = $count - ($MAX_WORKING_CODEPOINT 16080 - $MAX_UNICODE_CODEPOINT); 16081 $non_unicode_string = " plus all above-Unicode code points"; 16082 } 16083 else { 16084 $unicode_count = $count; 16085 $non_unicode_string = ""; 16086 } 16087 16088 my $string_count = clarify_number($unicode_count) . $non_unicode_string; 16089 16090 my $definition = $input_table->calculate_table_definition; 16091 if ($definition) { 16092 16093 # Save the definition for later use. 16094 $input_table->set_definition($definition); 16095 16096 $definition = ": $definition"; 16097 } 16098 16099 my $status = $input_table->status; 16100 my $status_info = $input_table->status_info; 16101 my $caseless_equivalent = $input_table->caseless_equivalent; 16102 16103 # Don't mention a placeholder equivalent as it isn't to be listed in the 16104 # pod 16105 $caseless_equivalent = 0 if $caseless_equivalent != 0 16106 && $caseless_equivalent->fate > $ORDINARY; 16107 16108 my $entry_for_first_table; # The entry for the first table output. 16109 # Almost certainly, it is the parent. 16110 16111 # For each related table (including itself), we will generate a pod entry 16112 # for each name each table goes by 16113 foreach my $table ($input_table, $input_table->children) { 16114 16115 # Unicode::UCD cannot deal with null string property values, so skip 16116 # any tables that have no non-null names. 16117 next if ! grep { $_->name ne "" } $table->aliases; 16118 16119 # First, gather all the info that applies to this table as a whole. 16120 16121 push @zero_match_tables, $table if $count == 0 16122 # Don't mention special tables 16123 # as being zero length 16124 && $table->fate == $ORDINARY; 16125 16126 my $table_property = $table->property; 16127 16128 # The short name has all the underscores removed, while the full name 16129 # retains them. Later, we decide whether to output a short synonym 16130 # for the full one, we need to compare apples to apples, so we use the 16131 # short name's length including underscores. 16132 my $table_property_short_name_length; 16133 my $table_property_short_name 16134 = $table_property->short_name(\$table_property_short_name_length); 16135 my $table_property_full_name = $table_property->full_name; 16136 16137 # Get how much savings there is in the short name over the full one 16138 # (delta will always be <= 0) 16139 my $table_property_short_delta = $table_property_short_name_length 16140 - length($table_property_full_name); 16141 my @table_description = $table->description; 16142 my @table_note = $table->note; 16143 16144 # Generate an entry for each alias in this table. 16145 my $entry_for_first_alias; # saves the first one encountered. 16146 foreach my $alias ($table->aliases) { 16147 16148 # Skip if not to go in pod. 16149 next unless $alias->make_re_pod_entry; 16150 16151 # Start gathering all the components for the entry 16152 my $name = $alias->name; 16153 16154 # Skip if name is empty, as can't be accessed by regexes. 16155 next if $name eq ""; 16156 16157 my $entry; # Holds the left column, may include extras 16158 my $entry_ref; # To refer to the left column's contents from 16159 # another entry; has no extras 16160 16161 # First the left column of the pod entry. Tables for the $perl 16162 # property always use the single form. 16163 if ($table_property == $perl) { 16164 $entry = "\\p{$name}"; 16165 $entry .= " \\p$name" if length $name == 1; # Show non-braced 16166 # form too 16167 $entry_ref = "\\p{$name}"; 16168 } 16169 else { # Compound form. 16170 16171 # Only generate one entry for all the aliases that mean true 16172 # or false in binary properties. Append a '*' to indicate 16173 # some are missing. (The heading comment notes this.) 16174 my $rhs; 16175 if ($type == $BINARY) { 16176 next if $name ne 'N' && $name ne 'Y'; 16177 $rhs = "$name*"; 16178 } 16179 elsif ($type != $FORCED_BINARY) { 16180 $rhs = $name; 16181 } 16182 else { 16183 16184 # Forced binary properties require special handling. It 16185 # has two sets of tables, one set is true/false; and the 16186 # other set is everything else. Entries are generated for 16187 # each set. Use the Bidi_Mirrored property (which appears 16188 # in all Unicode versions) to get a list of the aliases 16189 # for the true/false tables. Of these, only output the N 16190 # and Y ones, the same as, a regular binary property. And 16191 # output all the rest, same as a non-binary property. 16192 my $bm = property_ref("Bidi_Mirrored"); 16193 if ($name eq 'N' || $name eq 'Y') { 16194 $rhs = "$name*"; 16195 } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases, 16196 $bm->table("N")->aliases) 16197 { 16198 next; 16199 } 16200 else { 16201 $rhs = $name; 16202 } 16203 } 16204 16205 # Colon-space is used to give a little more space to be easier 16206 # to read; 16207 $entry = "\\p{" 16208 . $table_property_full_name 16209 . ": $rhs}"; 16210 16211 # But for the reference to this entry, which will go in the 16212 # right column, where space is at a premium, use equals 16213 # without a space 16214 $entry_ref = "\\p{" . $table_property_full_name . "=$name}"; 16215 } 16216 16217 # Then the right (info) column. This is stored as components of 16218 # an array for the moment, then joined into a string later. For 16219 # non-internal only properties, begin the info with the entry for 16220 # the first table we encountered (if any), as things are ordered 16221 # so that that one is the most descriptive. This leads to the 16222 # info column of an entry being a more descriptive version of the 16223 # name column 16224 my @info; 16225 if ($name =~ /^_/) { 16226 push @info, 16227 '(For internal use by Perl, not necessarily stable)'; 16228 } 16229 elsif ($entry_for_first_alias) { 16230 push @info, $entry_for_first_alias; 16231 } 16232 16233 # If this entry is equivalent to another, add that to the info, 16234 # using the first such table we encountered 16235 if ($entry_for_first_table) { 16236 if (@info) { 16237 push @info, "(= $entry_for_first_table)"; 16238 } 16239 else { 16240 push @info, $entry_for_first_table; 16241 } 16242 } 16243 16244 # If the name is a large integer, add an equivalent with an 16245 # exponent for better readability 16246 if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) { 16247 push @info, sprintf "(= %.1e)", $name 16248 } 16249 16250 my $parenthesized = ""; 16251 if (! $entry_for_first_alias) { 16252 16253 # This is the first alias for the current table. The alias 16254 # array is ordered so that this is the fullest, most 16255 # descriptive alias, so it gets the fullest info. The other 16256 # aliases are mostly merely pointers to this one, using the 16257 # information already added above. 16258 16259 # Display any status message, but only on the parent table 16260 if ($status && ! $entry_for_first_table) { 16261 push @info, $status_info; 16262 } 16263 16264 # Put out any descriptive info 16265 if (@table_description || @table_note) { 16266 push @info, join "; ", @table_description, @table_note; 16267 } 16268 16269 # Look to see if there is a shorter name we can point people 16270 # at 16271 my $standard_name = standardize($name); 16272 my $short_name; 16273 my $proposed_short = $table->short_name; 16274 if (defined $proposed_short) { 16275 my $standard_short = standardize($proposed_short); 16276 16277 # If the short name is shorter than the standard one, or 16278 # even if it's not, but the combination of it and its 16279 # short property name (as in \p{prop=short} ($perl doesn't 16280 # have this form)) saves at least two characters, then, 16281 # cause it to be listed as a shorter synonym. 16282 if (length $standard_short < length $standard_name 16283 || ($table_property != $perl 16284 && (length($standard_short) 16285 - length($standard_name) 16286 + $table_property_short_delta) # (<= 0) 16287 < -2)) 16288 { 16289 $short_name = $proposed_short; 16290 if ($table_property != $perl) { 16291 $short_name = $table_property_short_name 16292 . "=$short_name"; 16293 } 16294 $short_name = "\\p{$short_name}"; 16295 } 16296 } 16297 16298 # And if this is a compound form name, see if there is a 16299 # single form equivalent 16300 my $single_form; 16301 if ($table_property != $perl && $table_property != $block) { 16302 16303 # Special case the binary N tables, so that will print 16304 # \P{single}, but use the Y table values to populate 16305 # 'single', as we haven't likewise populated the N table. 16306 # For forced binary tables, we can't just look at the N 16307 # table, but must see if this table is equivalent to the N 16308 # one, as there are two equivalent beasts in these 16309 # properties. 16310 my $test_table; 16311 my $p; 16312 if ( ($type == $BINARY 16313 && $input_table == $property->table('No')) 16314 || ($type == $FORCED_BINARY 16315 && $property->table('No')-> 16316 is_set_equivalent_to($input_table))) 16317 { 16318 $test_table = $property->table('Yes'); 16319 $p = 'P'; 16320 } 16321 else { 16322 $test_table = $input_table; 16323 $p = 'p'; 16324 } 16325 16326 # Look for a single form amongst all the children. 16327 foreach my $table ($test_table->children) { 16328 next if $table->property != $perl; 16329 my $proposed_name = $table->short_name; 16330 next if ! defined $proposed_name; 16331 16332 # Don't mention internal-only properties as a possible 16333 # single form synonym 16334 next if substr($proposed_name, 0, 1) eq '_'; 16335 16336 $proposed_name = "\\$p\{$proposed_name}"; 16337 if (! defined $single_form 16338 || length($proposed_name) < length $single_form) 16339 { 16340 $single_form = $proposed_name; 16341 16342 # The goal here is to find a single form; not the 16343 # shortest possible one. We've already found a 16344 # short name. So, stop at the first single form 16345 # found, which is likely to be closer to the 16346 # original. 16347 last; 16348 } 16349 } 16350 } 16351 16352 # Output both short and single in the same parenthesized 16353 # expression, but with only one of 'Single', 'Short' if there 16354 # are both items. 16355 if ($short_name || $single_form || $table->conflicting) { 16356 $parenthesized .= "Short: $short_name" if $short_name; 16357 if ($short_name && $single_form) { 16358 $parenthesized .= ', '; 16359 } 16360 elsif ($single_form) { 16361 $parenthesized .= 'Single: '; 16362 } 16363 $parenthesized .= $single_form if $single_form; 16364 } 16365 } 16366 16367 if ($caseless_equivalent != 0) { 16368 $parenthesized .= '; ' if $parenthesized ne ""; 16369 $parenthesized .= "/i= " . $caseless_equivalent->complete_name; 16370 } 16371 16372 16373 # Warn if this property isn't the same as one that a 16374 # semi-casual user might expect. The other components of this 16375 # parenthesized structure are calculated only for the first entry 16376 # for this table, but the conflicting is deemed important enough 16377 # to go on every entry. 16378 my $conflicting = join " NOR ", $table->conflicting; 16379 if ($conflicting) { 16380 $parenthesized .= '; ' if $parenthesized ne ""; 16381 $parenthesized .= "NOT $conflicting"; 16382 } 16383 16384 push @info, "($parenthesized)" if $parenthesized; 16385 16386 if ($name =~ /_$/ && $alias->loose_match) { 16387 push @info, "Note the trailing '_' matters in spite of loose matching rules."; 16388 } 16389 16390 if ($table_property != $perl && $table->perl_extension) { 16391 push @info, '(Perl extension)'; 16392 } 16393 my $definition = $table->definition // ""; 16394 $definition = "" if $entry_for_first_alias; 16395 $definition = ": $definition" if $definition; 16396 push @info, "($string_count$definition)"; 16397 16398 # Now, we have both the entry and info so add them to the 16399 # list of all the properties. 16400 push @match_properties, 16401 format_pod_line($indent_info_column, 16402 $entry, 16403 join( " ", @info), 16404 $alias->status, 16405 $alias->loose_match); 16406 16407 $entry_for_first_alias = $entry_ref unless $entry_for_first_alias; 16408 } # End of looping through the aliases for this table. 16409 16410 if (! $entry_for_first_table) { 16411 $entry_for_first_table = $entry_for_first_alias; 16412 } 16413 } # End of looping through all the related tables 16414 return; 16415} 16416 16417sub make_ucd_table_pod_entries($table) { 16418 # Generate the entries for the UCD section of the pod for $table. This 16419 # also calculates if names are ambiguous, so has to be called even if the 16420 # pod is not being output 16421 16422 my $short_name = $table->name; 16423 my $standard_short_name = standardize($short_name); 16424 my $full_name = $table->full_name; 16425 my $standard_full_name = standardize($full_name); 16426 16427 my $full_info = ""; # Text of info column for full-name entries 16428 my $other_info = ""; # Text of info column for short-name entries 16429 my $short_info = ""; # Text of info column for other entries 16430 my $meaning = ""; # Synonym of this table 16431 16432 my $property = ($table->isa('Property')) 16433 ? $table 16434 : $table->parent->property; 16435 16436 my $perl_extension = $table->perl_extension; 16437 my $is_perl_extension_match_table_but_not_dollar_perl 16438 = $property != $perl 16439 && $perl_extension 16440 && $property != $table; 16441 16442 # Get the more official name for perl extensions that aren't 16443 # stand-alone properties 16444 if ($is_perl_extension_match_table_but_not_dollar_perl) { 16445 if ($property->type == $BINARY) { 16446 $meaning = $property->full_name; 16447 } 16448 else { 16449 $meaning = $table->parent->complete_name; 16450 } 16451 } 16452 16453 # There are three types of info column. One for the short name, one for 16454 # the full name, and one for everything else. They mostly are the same, 16455 # so initialize in the same loop. 16456 16457 foreach my $info_ref (\$full_info, \$short_info, \$other_info) { 16458 if ($info_ref != \$full_info) { 16459 16460 # The non-full name columns include the full name 16461 $$info_ref .= $full_name; 16462 } 16463 16464 16465 if ($is_perl_extension_match_table_but_not_dollar_perl) { 16466 16467 # Add the synonymous name for the non-full name entries; and to 16468 # the full-name entry if it adds extra information 16469 if ( standardize($meaning) ne $standard_full_name 16470 || $info_ref == \$other_info 16471 || $info_ref == \$short_info) 16472 { 16473 my $parenthesized = $info_ref != \$full_info; 16474 $$info_ref .= " " if $$info_ref && $parenthesized; 16475 $$info_ref .= "(=" if $parenthesized; 16476 $$info_ref .= "$meaning"; 16477 $$info_ref .= ")" if $parenthesized; 16478 $$info_ref .= "."; 16479 } 16480 } 16481 16482 # And the full-name entry includes the short name, if shorter 16483 if ($info_ref == \$full_info 16484 && length $standard_short_name < length $standard_full_name) 16485 { 16486 $full_info =~ s/\.\Z//; 16487 $full_info .= " " if $full_info; 16488 $full_info .= "(Short: $short_name)"; 16489 } 16490 16491 if ($table->perl_extension) { 16492 $$info_ref =~ s/\.\Z//; 16493 $$info_ref .= ". " if $$info_ref; 16494 $$info_ref .= "(Perl extension)"; 16495 } 16496 } 16497 16498 my $definition; 16499 my $definition_table; 16500 my $type = $table->property->type; 16501 if ($type == $BINARY || $type == $FORCED_BINARY) { 16502 $definition_table = $table->property->table('Y'); 16503 } 16504 elsif ($table->isa('Match_Table')) { 16505 $definition_table = $table; 16506 } 16507 16508 $definition = $definition_table->calculate_table_definition 16509 if defined $definition_table 16510 && $definition_table != 0; 16511 16512 # Add any extra annotations to the full name entry 16513 foreach my $more_info ($table->description, 16514 $definition, 16515 $table->note, 16516 $table->status_info) 16517 { 16518 next unless $more_info; 16519 $full_info =~ s/\.\Z//; 16520 $full_info .= ". " if $full_info; 16521 $full_info .= $more_info; 16522 } 16523 if ($table->property->type == $FORCED_BINARY) { 16524 if ($full_info) { 16525 $full_info =~ s/\.\Z//; 16526 $full_info .= ". "; 16527 } 16528 $full_info .= "This is a combination property which has both:" 16529 . " 1) a map to various string values; and" 16530 . " 2) a map to boolean Y/N, where 'Y' means the" 16531 . " string value is non-empty. Add the prefix 'is'" 16532 . " to the prop_invmap() call to get the latter"; 16533 } 16534 16535 # These keep track if have created full and short name pod entries for the 16536 # property 16537 my $done_full = 0; 16538 my $done_short = 0; 16539 16540 # Every possible name is kept track of, even those that aren't going to be 16541 # output. This way we can be sure to find the ambiguities. 16542 foreach my $alias ($table->aliases) { 16543 my $name = $alias->name; 16544 my $standard = standardize($name); 16545 my $info; 16546 my $output_this = $alias->ucd; 16547 16548 # If the full and short names are the same, we want to output the full 16549 # one's entry, so it has priority. 16550 if ($standard eq $standard_full_name) { 16551 next if $done_full; 16552 $done_full = 1; 16553 $info = $full_info; 16554 } 16555 elsif ($standard eq $standard_short_name) { 16556 next if $done_short; 16557 $done_short = 1; 16558 next if $standard_short_name eq $standard_full_name; 16559 $info = $short_info; 16560 } 16561 else { 16562 $info = $other_info; 16563 } 16564 16565 $combination_property{$standard} = 1 16566 if $table->property->type == $FORCED_BINARY; 16567 16568 # Here, we have set up the two columns for this entry. But if an 16569 # entry already exists for this name, we have to decide which one 16570 # we're going to later output. 16571 if (exists $ucd_pod{$standard}) { 16572 16573 # If the two entries refer to the same property, it's not going to 16574 # be ambiguous. (Likely it's because the names when standardized 16575 # are the same.) But that means if they are different properties, 16576 # there is ambiguity. 16577 if ($ucd_pod{$standard}->{'property'} != $property) { 16578 16579 # Here, we have an ambiguity. This code assumes that one is 16580 # scheduled to be output and one not and that one is a perl 16581 # extension (which is not to be output) and the other isn't. 16582 # If those assumptions are wrong, things have to be rethought. 16583 if ($ucd_pod{$standard}{'output_this'} == $output_this 16584 || $ucd_pod{$standard}{'perl_extension'} == $perl_extension 16585 || $output_this == $perl_extension) 16586 { 16587 Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations. Proceeding anyway."); 16588 } 16589 16590 # We modify the info column of the one being output to 16591 # indicate the ambiguity. Set $which to point to that one's 16592 # info. 16593 my $which; 16594 if ($ucd_pod{$standard}{'output_this'}) { 16595 $which = \$ucd_pod{$standard}->{'info'}; 16596 } 16597 else { 16598 $which = \$info; 16599 $meaning = $ucd_pod{$standard}{'meaning'}; 16600 } 16601 16602 chomp $$which; 16603 $$which =~ s/\.\Z//; 16604 $$which .= "; NOT '$standard' meaning '$meaning'"; 16605 16606 $ambiguous_names{$standard} = 1; 16607 } 16608 16609 # Use the non-perl-extension variant 16610 next unless $ucd_pod{$standard}{'perl_extension'}; 16611 } 16612 16613 # Store enough information about this entry that we can later look for 16614 # ambiguities, and output it properly. 16615 $ucd_pod{$standard} = { 'name' => $name, 16616 'info' => $info, 16617 'meaning' => $meaning, 16618 'output_this' => $output_this, 16619 'perl_extension' => $perl_extension, 16620 'property' => $property, 16621 'status' => $alias->status, 16622 }; 16623 } # End of looping through all this table's aliases 16624 16625 return; 16626} 16627 16628sub pod_alphanumeric_sort { 16629 # Sort pod entries alphanumerically. 16630 16631 # The first few character columns are filler, plus the '\p{'; and get rid 16632 # of all the trailing stuff, starting with the trailing '}', so as to sort 16633 # on just 'Name=Value' 16634 (my $a = lc $a) =~ s/^ .*? \{ //x; 16635 $a =~ s/}.*//; 16636 (my $b = lc $b) =~ s/^ .*? \{ //x; 16637 $b =~ s/}.*//; 16638 16639 # Determine if the two operands are both internal only or both not. 16640 # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3 16641 # should be the underscore that begins internal only 16642 my $a_is_internal = (substr($a, 0, 1) eq '_'); 16643 my $b_is_internal = (substr($b, 0, 1) eq '_'); 16644 16645 # Sort so the internals come last in the table instead of first (which the 16646 # leading underscore would otherwise indicate). 16647 if ($a_is_internal != $b_is_internal) { 16648 return 1 if $a_is_internal; 16649 return -1 16650 } 16651 16652 # Determine if the two operands are compound or not, and if so if are 16653 # "numeric" property values or not, like \p{Age: 3.0}. But there are also 16654 # things like \p{Canonical_Combining_Class: CCC133} and \p{Age: V10_0}, 16655 # all of which this considers numeric, and for sorting, looks just at the 16656 # numeric parts. It can also be a rational like \p{Numeric Value=-1/2}. 16657 my $split_re = qr/ 16658 ^ ( [^:=]+ ) # $1 is undef if not a compound form, otherwise is the 16659 # property name 16660 [:=] \s* # The syntax for the compound form 16661 (?: # followed by ... 16662 ( # $2 gets defined if what follows is a "numeric" 16663 # expression, which is ... 16664 ( -? \d+ (?: [.\/] \d+)? # An integer, float, or rational 16665 # number, optionally signed 16666 | [[:alpha:]]{2,} \d+ $ ) # or something like CCC131. Either 16667 # of these go into $3 16668 | ( V \d+ _ \d+ ) # or a Unicode's Age property version 16669 # number, into $4 16670 ) 16671 | .* $ # If not "numeric", accept anything so that $1 gets 16672 # defined if it is any compound form 16673 ) /ix; 16674 my ($a_initial, $a_numeric, $a_number, $a_version) = ($a =~ $split_re); 16675 my ($b_initial, $b_numeric, $b_number, $b_version) = ($b =~ $split_re); 16676 16677 # Sort alphabeticlly on the whole property name if either operand isn't 16678 # compound, or they differ. 16679 return $a cmp $b if ! defined $a_initial 16680 || ! defined $b_initial 16681 || $a_initial ne $b_initial; 16682 16683 if (! defined $a_numeric) { 16684 16685 # If neither is numeric, use alpha sort 16686 return $a cmp $b if ! defined $b_numeric; 16687 return 1; # Sort numeric ahead of alpha 16688 } 16689 16690 # Here $a is numeric 16691 return -1 if ! defined $b_numeric; # Numeric sorts before alpha 16692 16693 # Here they are both numeric in the same property. 16694 # Convert version numbers into regular numbers 16695 if (defined $a_version) { 16696 ($a_number = $a_version) =~ s/^V//i; 16697 $a_number =~ s/_/./; 16698 } 16699 else { # Otherwise get rid of the, e.g., CCC in CCC9 */ 16700 $a_number =~ s/ ^ [[:alpha:]]+ //x; 16701 } 16702 if (defined $b_version) { 16703 ($b_number = $b_version) =~ s/^V//i; 16704 $b_number =~ s/_/./; 16705 } 16706 else { 16707 $b_number =~ s/ ^ [[:alpha:]]+ //x; 16708 } 16709 16710 # Convert rationals to floating for the comparison. 16711 $a_number = eval $a_number if $a_number =~ qr{/}; 16712 $b_number = eval $b_number if $b_number =~ qr{/}; 16713 16714 return $a_number <=> $b_number || $a cmp $b; 16715} 16716 16717sub make_pod () { 16718 # Create the .pod file. This generates the various subsections and then 16719 # combines them in one big HERE document. 16720 16721 my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)"; 16722 16723 return unless defined $pod_directory; 16724 print "Making pod file\n" if $verbosity >= $PROGRESS; 16725 16726 my $exception_message = 16727 '(Any exceptions are individually noted beginning with the word NOT.)'; 16728 my @block_warning; 16729 if (-e 'Blocks.txt') { 16730 16731 # Add the line: '\p{In_*} \p{Block: *}', with the warning message 16732 # if the global $has_In_conflicts indicates we have them. 16733 push @match_properties, format_pod_line($indent_info_column, 16734 '\p{In_*}', 16735 '\p{Block: *}' 16736 . (($has_In_conflicts) 16737 ? " $exception_message" 16738 : ""), 16739 $DISCOURAGED); 16740 @block_warning = << "END"; 16741 16742In particular, matches in the Block property have single forms 16743defined by Perl that begin with C<"In_">, C<"Is_>, or even with no prefix at 16744all, Like all B<DISCOURAGED> forms, these are not stable. For example, 16745C<\\p{Block=Deseret}> can currently be written as C<\\p{In_Deseret}>, 16746C<\\p{Is_Deseret}>, or C<\\p{Deseret}>. But, a new Unicode version may 16747come along that would force Perl to change the meaning of one or more of 16748these, and your program would no longer be correct. Currently there are no 16749such conflicts with the form that begins C<"In_">, but there are many with the 16750other two shortcuts, and Unicode continues to define new properties that begin 16751with C<"In">, so it's quite possible that a conflict will occur in the future. 16752The compound form is guaranteed to not become obsolete, and its meaning is 16753clearer anyway. See L<perlunicode/"Blocks"> for more information about this. 16754 16755User-defined properties must begin with "In" or "Is". These override any 16756Unicode property of the same name. 16757END 16758 } 16759 my $text = $Is_flags_text; 16760 $text = "$exception_message $text" if $has_Is_conflicts; 16761 16762 # And the 'Is_ line'; 16763 push @match_properties, format_pod_line($indent_info_column, 16764 '\p{Is_*}', 16765 "\\p{*} $text"); 16766 push @match_properties, format_pod_line($indent_info_column, 16767 '\p{Name=*}', 16768 "Combination of Name and Name_Alias properties; has special" 16769 . " loose matching rules, for which see Unicode UAX #44"); 16770 push @match_properties, format_pod_line($indent_info_column, 16771 '\p{Na=*}', 16772 '\p{Name=*}'); 16773 16774 # Sort the properties array for output. It is sorted alphabetically 16775 # except numerically for numeric properties, and only output unique lines. 16776 @match_properties = sort pod_alphanumeric_sort uniques @match_properties; 16777 16778 my $formatted_properties = simple_fold(\@match_properties, 16779 "", 16780 # indent succeeding lines by two extra 16781 # which looks better 16782 $indent_info_column + 2, 16783 16784 # shorten the line length by how much 16785 # the formatter indents, so the folded 16786 # line will fit in the space 16787 # presumably available 16788 $automatic_pod_indent); 16789 # Add column headings, indented to be a little more centered, but not 16790 # exactly 16791 $formatted_properties = format_pod_line($indent_info_column, 16792 ' NAME', 16793 ' INFO') 16794 . "\n" 16795 . $formatted_properties; 16796 16797 # Generate pod documentation lines for the tables that match nothing 16798 my $zero_matches = ""; 16799 if (@zero_match_tables) { 16800 @zero_match_tables = uniques(@zero_match_tables); 16801 $zero_matches = join "\n\n", 16802 map { $_ = '=item \p{' . $_->complete_name . "}" } 16803 sort { $a->complete_name cmp $b->complete_name } 16804 @zero_match_tables; 16805 16806 $zero_matches = <<END; 16807 16808=head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters 16809 16810Unicode has some property-value pairs that currently don't match anything. 16811This happens generally either because they are obsolete, or they exist for 16812symmetry with other forms, but no language has yet been encoded that uses 16813them. In this version of Unicode, the following match zero code points: 16814 16815=over 4 16816 16817$zero_matches 16818 16819=back 16820 16821END 16822 } 16823 16824 # Generate list of properties that we don't accept, grouped by the reasons 16825 # why. This is so only put out the 'why' once, and then list all the 16826 # properties that have that reason under it. 16827 16828 my %why_list; # The keys are the reasons; the values are lists of 16829 # properties that have the key as their reason 16830 16831 # For each property, add it to the list that are suppressed for its reason 16832 # The sort will cause the alphabetically first properties to be added to 16833 # each list first, so each list will be sorted. 16834 foreach my $property (sort keys %why_suppressed) { 16835 next unless $why_suppressed{$property}; 16836 push @{$why_list{$why_suppressed{$property}}}, $property; 16837 } 16838 16839 # For each reason (sorted by the first property that has that reason)... 16840 my @bad_re_properties; 16841 foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] } 16842 keys %why_list) 16843 { 16844 # Add to the output, all the properties that have that reason. 16845 my $has_item = 0; # Flag if actually output anything. 16846 foreach my $name (@{$why_list{$why}}) { 16847 16848 # Split compound names into $property and $table components 16849 my $property = $name; 16850 my $table; 16851 if ($property =~ / (.*) = (.*) /x) { 16852 $property = $1; 16853 $table = $2; 16854 } 16855 16856 # This release of Unicode may not have a property that is 16857 # suppressed, so don't reference a non-existent one. 16858 $property = property_ref($property); 16859 next if ! defined $property; 16860 16861 # And since this list is only for match tables, don't list the 16862 # ones that don't have match tables. 16863 next if ! $property->to_create_match_tables; 16864 16865 # Find any abbreviation, and turn it into a compound name if this 16866 # is a property=value pair. 16867 my $short_name = $property->name; 16868 $short_name .= '=' . $property->table($table)->name if $table; 16869 16870 # Start with an empty line. 16871 push @bad_re_properties, "\n\n" unless $has_item; 16872 16873 # And add the property as an item for the reason. 16874 push @bad_re_properties, "\n=item I<$name> ($short_name)\n"; 16875 $has_item = 1; 16876 } 16877 16878 # And add the reason under the list of properties, if such a list 16879 # actually got generated. Note that the header got added 16880 # unconditionally before. But pod ignores extra blank lines, so no 16881 # harm. 16882 push @bad_re_properties, "\n$why\n" if $has_item; 16883 16884 } # End of looping through each reason. 16885 16886 if (! @bad_re_properties) { 16887 push @bad_re_properties, 16888 "*** This installation accepts ALL non-Unihan properties ***"; 16889 } 16890 else { 16891 # Add =over only if non-empty to avoid an empty =over/=back section, 16892 # which is considered bad form. 16893 unshift @bad_re_properties, "\n=over 4\n"; 16894 push @bad_re_properties, "\n=back\n"; 16895 } 16896 16897 # Similarly, generate a list of files that we don't use, grouped by the 16898 # reasons why (Don't output if the reason is empty). First, create a hash 16899 # whose keys are the reasons, and whose values are anonymous arrays of all 16900 # the files that share that reason. 16901 my %grouped_by_reason; 16902 foreach my $file (keys %skipped_files) { 16903 next unless $skipped_files{$file}; 16904 push @{$grouped_by_reason{$skipped_files{$file}}}, $file; 16905 } 16906 16907 # Then, sort each group. 16908 foreach my $group (keys %grouped_by_reason) { 16909 @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b } 16910 @{$grouped_by_reason{$group}} ; 16911 } 16912 16913 # Finally, create the output text. For each reason (sorted by the 16914 # alphabetically first file that has that reason)... 16915 my @unused_files; 16916 foreach my $reason (sort { lc $grouped_by_reason{$a}->[0] 16917 cmp lc $grouped_by_reason{$b}->[0] 16918 } 16919 keys %grouped_by_reason) 16920 { 16921 # Add all the files that have that reason to the output. Start 16922 # with an empty line. 16923 push @unused_files, "\n\n"; 16924 push @unused_files, map { "\n=item F<$_> \n" } 16925 @{$grouped_by_reason{$reason}}; 16926 # And add the reason under the list of files 16927 push @unused_files, "\n$reason\n"; 16928 } 16929 16930 # Similarly, create the output text for the UCD section of the pod 16931 my @ucd_pod; 16932 foreach my $key (keys %ucd_pod) { 16933 next unless $ucd_pod{$key}->{'output_this'}; 16934 push @ucd_pod, format_pod_line($indent_info_column, 16935 $ucd_pod{$key}->{'name'}, 16936 $ucd_pod{$key}->{'info'}, 16937 $ucd_pod{$key}->{'status'}, 16938 ); 16939 } 16940 16941 # Sort alphabetically, and fold for output 16942 @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod; 16943 my $ucd_pod = simple_fold(\@ucd_pod, 16944 ' ', 16945 $indent_info_column, 16946 $automatic_pod_indent); 16947 $ucd_pod = format_pod_line($indent_info_column, 'NAME', ' INFO') 16948 . "\n" 16949 . $ucd_pod; 16950 my $space_hex = sprintf("%02x", ord " "); 16951 local $" = ""; 16952 16953 # Everything is ready to assemble. 16954 my @OUT = << "END"; 16955=begin comment 16956 16957$HEADER 16958 16959To change this file, edit $0 instead. 16960 16961=end comment 16962 16963=head1 NAME 16964 16965$pod_file - Index of Unicode Version $unicode_version character properties in Perl 16966 16967=head1 DESCRIPTION 16968 16969This document provides information about the portion of the Unicode database 16970that deals with character properties, that is the portion that is defined on 16971single code points. (L</Other information in the Unicode data base> 16972below briefly mentions other data that Unicode provides.) 16973 16974Perl can provide access to all non-provisional Unicode character properties, 16975though not all are enabled by default. The omitted ones are the Unihan 16976properties and certain 16977deprecated or Unicode-internal properties. (An installation may choose to 16978recompile Perl's tables to change this. See L</Unicode character 16979properties that are NOT accepted by Perl>.) 16980 16981For most purposes, access to Unicode properties from the Perl core is through 16982regular expression matches, as described in the next section. 16983For some special purposes, and to access the properties that are not suitable 16984for regular expression matching, all the Unicode character properties that 16985Perl handles are accessible via the standard L<Unicode::UCD> module, as 16986described in the section L</Properties accessible through Unicode::UCD>. 16987 16988Perl also provides some additional extensions and short-cut synonyms 16989for Unicode properties. 16990 16991This document merely lists all available properties and does not attempt to 16992explain what each property really means. There is a brief description of each 16993Perl extension; see L<perlunicode/Other Properties> for more information on 16994these. There is some detail about Blocks, Scripts, General_Category, 16995and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the 16996official Unicode properties, refer to the Unicode standard. A good starting 16997place is L<$unicode_reference_url>. 16998 16999Note that you can define your own properties; see 17000L<perlunicode/"User-Defined Character Properties">. 17001 17002=head1 Properties accessible through C<\\p{}> and C<\\P{}> 17003 17004The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to 17005most of the Unicode character properties. The table below shows all these 17006constructs, both single and compound forms. 17007 17008B<Compound forms> consist of two components, separated by an equals sign or a 17009colon. The first component is the property name, and the second component is 17010the particular value of the property to match against, for example, 17011C<\\p{Script_Extensions: Greek}> and C<\\p{Script_Extensions=Greek}> both mean 17012to match characters whose Script_Extensions property value is Greek. 17013(C<Script_Extensions> is an improved version of the C<Script> property.) 17014 17015B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for 17016their equivalent compound forms. The table shows these equivalences. (In our 17017example, C<\\p{Greek}> is a just a shortcut for 17018C<\\p{Script_Extensions=Greek}>). There are also a few Perl-defined single 17019forms that are not shortcuts for a compound form. One such is C<\\p{Word}>. 17020These are also listed in the table. 17021 17022In parsing these constructs, Perl always ignores Upper/lower case differences 17023everywhere within the {braces}. Thus C<\\p{Greek}> means the same thing as 17024C<\\p{greek}>. But note that changing the case of the C<"p"> or C<"P"> before 17025the left brace completely changes the meaning of the construct, from "match" 17026(for C<\\p{}>) to "doesn't match" (for C<\\P{}>). Casing in this document is 17027for improved legibility. 17028 17029Also, white space, hyphens, and underscores are normally ignored 17030everywhere between the {braces}, and hence can be freely added or removed 17031even if the C</x> modifier hasn't been specified on the regular expression. 17032But in the table below $a_bold_stricter at the beginning of an entry 17033means that tighter (stricter) rules are used for that entry: 17034 17035=over 4 17036 17037=over 4 17038 17039=item Single form (C<\\p{name}>) tighter rules: 17040 17041White space, hyphens, and underscores ARE significant 17042except for: 17043 17044=over 4 17045 17046=item * white space adjacent to a non-word character 17047 17048=item * underscores separating digits in numbers 17049 17050=back 17051 17052That means, for example, that you can freely add or remove white space 17053adjacent to (but within) the braces without affecting the meaning. 17054 17055=item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules: 17056 17057The tighter rules given above for the single form apply to everything to the 17058right of the colon or equals; the looser rules still apply to everything to 17059the left. 17060 17061That means, for example, that you can freely add or remove white space 17062adjacent to (but within) the braces and the colon or equal sign. 17063 17064=back 17065 17066=back 17067 17068Some properties are considered obsolete by Unicode, but still available. 17069There are several varieties of obsolescence: 17070 17071=over 4 17072 17073=over 4 17074 17075=item Stabilized 17076 17077A property may be stabilized. Such a determination does not indicate 17078that the property should or should not be used; instead it is a declaration 17079that the property will not be maintained nor extended for newly encoded 17080characters. Such properties are marked with $a_bold_stabilized in the 17081table. 17082 17083=item Deprecated 17084 17085A property may be deprecated, perhaps because its original intent 17086has been replaced by another property, or because its specification was 17087somehow defective. This means that its use is strongly 17088discouraged, so much so that a warning will be issued if used, unless the 17089regular expression is in the scope of a C<S<no warnings 'deprecated'>> 17090statement. $A_bold_deprecated flags each such entry in the table, and 17091the entry there for the longest, most descriptive version of the property will 17092give the reason it is deprecated, and perhaps advice. Perl may issue such a 17093warning, even for properties that aren't officially deprecated by Unicode, 17094when there used to be characters or code points that were matched by them, but 17095no longer. This is to warn you that your program may not work like it did on 17096earlier Unicode releases. 17097 17098A deprecated property may be made unavailable in a future Perl version, so it 17099is best to move away from them. 17100 17101A deprecated property may also be stabilized, but this fact is not shown. 17102 17103=item Obsolete 17104 17105Properties marked with $a_bold_obsolete in the table are considered (plain) 17106obsolete. Generally this designation is given to properties that Unicode once 17107used for internal purposes (but not any longer). 17108 17109=item Discouraged 17110 17111This is not actually a Unicode-specified obsolescence, but applies to certain 17112Perl extensions that are present for backwards compatibility, but are 17113discouraged from being used. These are not obsolete, but their meanings are 17114not stable. Future Unicode versions could force any of these extensions to be 17115removed without warning, replaced by another property with the same name that 17116means something different. $A_bold_discouraged flags each such entry in the 17117table. Use the equivalent shown instead. 17118 17119@block_warning 17120 17121=back 17122 17123=back 17124 17125The table below has two columns. The left column contains the C<\\p{}> 17126constructs to look up, possibly preceded by the flags mentioned above; and 17127the right column contains information about them, like a description, or 17128synonyms. The table shows both the single and compound forms for each 17129property that has them. If the left column is a short name for a property, 17130the right column will give its longer, more descriptive name; and if the left 17131column is the longest name, the right column will show any equivalent shortest 17132name, in both single and compound forms if applicable. 17133 17134If braces are not needed to specify a property (e.g., C<\\pL>), the left 17135column contains both forms, with and without braces. 17136 17137The right column will also caution you if a property means something different 17138than what might normally be expected. 17139 17140All single forms are Perl extensions; a few compound forms are as well, and 17141are noted as such. 17142 17143Numbers in (parentheses) indicate the total number of Unicode code points 17144matched by the property. For the entries that give the longest, most 17145descriptive version of the property, the count is followed by a list of some 17146of the code points matched by it. The list includes all the matched 17147characters in the 0-255 range, enclosed in the familiar [brackets] the same as 17148a regular expression bracketed character class. Following that, the next few 17149higher matching ranges are also given. To avoid visual ambiguity, the SPACE 17150character is represented as C<\\x$space_hex>. 17151 17152For emphasis, those properties that match no code points at all are listed as 17153well in a separate section following the table. 17154 17155Most properties match the same code points regardless of whether C<"/i"> 17156case-insensitive matching is specified or not. But a few properties are 17157affected. These are shown with the notation S<C<(/i= I<other_property>)>> 17158in the second column. Under case-insensitive matching they match the 17159same code pode points as the property I<other_property>. 17160 17161There is no description given for most non-Perl defined properties (See 17162L<$unicode_reference_url> for that). 17163 17164For compactness, 'B<*>' is used as a wildcard instead of showing all possible 17165combinations. For example, entries like: 17166 17167 \\p{Gc: *} \\p{General_Category: *} 17168 17169mean that 'Gc' is a synonym for 'General_Category', and anything that is valid 17170for the latter is also valid for the former. Similarly, 17171 17172 \\p{Is_*} \\p{*} 17173 17174means that if and only if, for example, C<\\p{Foo}> exists, then 17175C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing. 17176And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and 17177C<\\p{IsFoo=Bar}>. "*" here is restricted to something not beginning with an 17178underscore. 17179 17180Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'. 17181And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and 17182'N*' to indicate this, and doesn't have separate entries for the other 17183possibilities. Note that not all properties which have values 'Yes' and 'No' 17184are binary, and they have all their values spelled out without using this wild 17185card, and a C<NOT> clause in their description that highlights their not being 17186binary. These also require the compound form to match them, whereas true 17187binary properties have both single and compound forms available. 17188 17189Note that all non-essential underscores are removed in the display of the 17190short names below. 17191 17192B<Legend summary:> 17193 17194=over 4 17195 17196=item Z<>B<*> is a wild-card 17197 17198=item B<(\\d+)> in the info column gives the number of Unicode code points matched 17199by this property. 17200 17201=item B<$DEPRECATED> means this is deprecated. 17202 17203=item B<$OBSOLETE> means this is obsolete. 17204 17205=item B<$STABILIZED> means this is stabilized. 17206 17207=item B<$STRICTER> means tighter (stricter) name matching applies. 17208 17209=item B<$DISCOURAGED> means use of this form is discouraged, and may not be 17210stable. 17211 17212=back 17213 17214$formatted_properties 17215 17216$zero_matches 17217 17218=head1 Properties accessible through Unicode::UCD 17219 17220The value of any Unicode (not including Perl extensions) character 17221property mentioned above for any single code point is available through 17222L<Unicode::UCD/charprop()>. L<Unicode::UCD/charprops_all()> returns the 17223values of all the Unicode properties for a given code point. 17224 17225Besides these, all the Unicode character properties mentioned above 17226(except for those marked as for internal use by Perl) are also 17227accessible by L<Unicode::UCD/prop_invlist()>. 17228 17229Due to their nature, not all Unicode character properties are suitable for 17230regular expression matches, nor C<prop_invlist()>. The remaining 17231non-provisional, non-internal ones are accessible via 17232L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation 17233hasn't included; see L<below for which those are|/Unicode character properties 17234that are NOT accepted by Perl>). 17235 17236For compatibility with other parts of Perl, all the single forms given in the 17237table in the L<section above|/Properties accessible through \\p{} and \\P{}> 17238are recognized. BUT, there are some ambiguities between some Perl extensions 17239and the Unicode properties, all of which are silently resolved in favor of the 17240official Unicode property. To avoid surprises, you should only use 17241C<prop_invmap()> for forms listed in the table below, which omits the 17242non-recommended ones. The affected forms are the Perl single form equivalents 17243of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of 17244C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property, 17245whose short name is C<sc>. The table indicates the current ambiguities in the 17246INFO column, beginning with the word C<"NOT">. 17247 17248The standard Unicode properties listed below are documented in 17249L<$unicode_reference_url>; Perl_Decimal_Digit is documented in 17250L<Unicode::UCD/prop_invmap()>. The other Perl extensions are in 17251L<perlunicode/Other Properties>; 17252 17253The first column in the table is a name for the property; the second column is 17254an alternative name, if any, plus possibly some annotations. The alternative 17255name is the property's full name, unless that would simply repeat the first 17256column, in which case the second column indicates the property's short name 17257(if different). The annotations are given only in the entry for the full 17258name. The annotations for binary properties include a list of the first few 17259ranges that the property matches. To avoid any ambiguity, the SPACE character 17260is represented as C<\\x$space_hex>. 17261 17262If a property is obsolete, etc, the entry will be flagged with the same 17263characters used in the table in the L<section above|/Properties accessible 17264through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>. 17265 17266$ucd_pod 17267 17268=head1 Properties accessible through other means 17269 17270Certain properties are accessible also via core function calls. These are: 17271 17272 Lowercase_Mapping lc() and lcfirst() 17273 Titlecase_Mapping ucfirst() 17274 Uppercase_Mapping uc() 17275 17276Also, Case_Folding is accessible through the C</i> modifier in regular 17277expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>> 17278operator. 17279 17280Besides being able to say C<\\p{Name=...}>, the Name and Name_Aliases 17281properties are accessible through the C<\\N{}> interpolation in double-quoted 17282strings and regular expressions; and functions C<charnames::viacode()>, 17283C<charnames::vianame()>, and C<charnames::string_vianame()> (which require a 17284C<use charnames ();> to be specified. 17285 17286Finally, most properties related to decomposition are accessible via 17287L<Unicode::Normalize>. 17288 17289=head1 Unicode character properties that are NOT accepted by Perl 17290 17291Perl will generate an error for a few character properties in Unicode when 17292used in a regular expression. The non-Unihan ones are listed below, with the 17293reasons they are not accepted, perhaps with work-arounds. The short names for 17294the properties are listed enclosed in (parentheses). 17295As described after the list, an installation can change the defaults and choose 17296to accept any of these. The list is machine generated based on the 17297choices made for the installation that generated this document. 17298 17299@bad_re_properties 17300 17301An installation can choose to allow any of these to be matched by downloading 17302the Unicode database from L<http://www.unicode.org/Public/> to 17303C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the 17304controlling lists contained in the program 17305C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing. 17306(C<\%Config> is available from the Config module). 17307 17308Also, perl can be recompiled to operate on an earlier version of the Unicode 17309standard. Further information is at 17310C<\$Config{privlib}>/F<unicore/README.perl>. 17311 17312=head1 Other information in the Unicode data base 17313 17314The Unicode data base is delivered in two different formats. The XML version 17315is valid for more modern Unicode releases. The other version is a collection 17316of files. The two are intended to give equivalent information. Perl uses the 17317older form; this allows you to recompile Perl to use early Unicode releases. 17318 17319The only non-character property that Perl currently supports is Named 17320Sequences, in which a sequence of code points 17321is given a name and generally treated as a single entity. (Perl supports 17322these via the C<\\N{...}> double-quotish construct, 17323L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>. 17324 17325Below is a list of the files in the Unicode data base that Perl doesn't 17326currently use, along with very brief descriptions of their purposes. 17327Some of the names of the files have been shortened from those that Unicode 17328uses, in order to allow them to be distinguishable from similarly named files 17329on file systems for which only the first 8 characters of a name are 17330significant. 17331 17332=over 4 17333 17334@unused_files 17335 17336=back 17337 17338=head1 SEE ALSO 17339 17340L<$unicode_reference_url> 17341 17342L<perlrecharclass> 17343 17344L<perlunicode> 17345 17346END 17347 17348 # And write it. The 0 means no utf8. 17349 main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT); 17350 return; 17351} 17352 17353sub make_Name_pm () { 17354 # Create and write Name.pm, which contains subroutines and data to use in 17355 # conjunction with Name.pl 17356 17357 # Maybe there's nothing to do. 17358 return unless $has_hangul_syllables || @code_points_ending_in_code_point; 17359 17360 my @name = <<END; 17361$HEADER 17362$INTERNAL_ONLY_HEADER 17363 17364END 17365 17366 # Convert these structures to output format. 17367 my $code_points_ending_in_code_point = 17368 main::simple_dumper(\@code_points_ending_in_code_point, 17369 ' ' x 8); 17370 my $names = main::simple_dumper(\%names_ending_in_code_point, 17371 ' ' x 8); 17372 my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point, 17373 ' ' x 8); 17374 17375 # Do the same with the Hangul names, 17376 my $jamo; 17377 my $jamo_l; 17378 my $jamo_v; 17379 my $jamo_t; 17380 my $jamo_re; 17381 if ($has_hangul_syllables) { 17382 17383 # Construct a regular expression of all the possible 17384 # combinations of the Hangul syllables. 17385 my @L_re; # Leading consonants 17386 for my $i ($LBase .. $LBase + $LCount - 1) { 17387 push @L_re, $Jamo{$i} 17388 } 17389 my @V_re; # Middle vowels 17390 for my $i ($VBase .. $VBase + $VCount - 1) { 17391 push @V_re, $Jamo{$i} 17392 } 17393 my @T_re; # Trailing consonants 17394 for my $i ($TBase + 1 .. $TBase + $TCount - 1) { 17395 push @T_re, $Jamo{$i} 17396 } 17397 17398 # The whole re is made up of the L V T combination. 17399 $jamo_re = '(' 17400 . join ('|', sort @L_re) 17401 . ')(' 17402 . join ('|', sort @V_re) 17403 . ')(' 17404 . join ('|', sort @T_re) 17405 . ')?'; 17406 17407 # These hashes needed by the algorithm were generated 17408 # during reading of the Jamo.txt file 17409 $jamo = main::simple_dumper(\%Jamo, ' ' x 8); 17410 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8); 17411 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8); 17412 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8); 17413 } 17414 17415 push @name, <<END; 17416 17417package charnames; 17418 17419# This module contains machine-generated tables and code for the 17420# algorithmically-determinable Unicode character names. The following 17421# routines can be used to translate between name and code point and vice versa 17422 17423{ # Closure 17424 17425 # Matches legal code point. 4-6 hex numbers, If there are 6, the first 17426 # two must be 10; if there are 5, the first must not be a 0. Written this 17427 # way to decrease backtracking. The first regex allows the code point to 17428 # be at the end of a word, but to work properly, the word shouldn't end 17429 # with a valid hex character. The second one won't match a code point at 17430 # the end of a word, and doesn't have the run-on issue 17431 my \$run_on_code_point_re = qr/$run_on_code_point_re/; 17432 my \$code_point_re = qr/$code_point_re/; 17433 17434 # In the following hash, the keys are the bases of names which include 17435 # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The value 17436 # of each key is another hash which is used to get the low and high ends 17437 # for each range of code points that apply to the name. 17438 my %names_ending_in_code_point = ( 17439$names 17440 ); 17441 17442 # The following hash is a copy of the previous one, except is for loose 17443 # matching, so each name has blanks and dashes squeezed out 17444 my %loose_names_ending_in_code_point = ( 17445$loose_names 17446 ); 17447 17448 # And the following array gives the inverse mapping from code points to 17449 # names. Lowest code points are first 17450 \@code_points_ending_in_code_point = ( 17451$code_points_ending_in_code_point 17452 ); 17453 17454 # Is exportable, make read-only 17455 Internals::SvREADONLY(\@code_points_ending_in_code_point, 1); 17456END 17457 # Earlier releases didn't have Jamos. No sense outputting 17458 # them unless will be used. 17459 if ($has_hangul_syllables) { 17460 push @name, <<END; 17461 17462 # Convert from code point to Jamo short name for use in composing Hangul 17463 # syllable names 17464 my %Jamo = ( 17465$jamo 17466 ); 17467 17468 # Leading consonant (can be null) 17469 my %Jamo_L = ( 17470$jamo_l 17471 ); 17472 17473 # Vowel 17474 my %Jamo_V = ( 17475$jamo_v 17476 ); 17477 17478 # Optional trailing consonant 17479 my %Jamo_T = ( 17480$jamo_t 17481 ); 17482 17483 # Computed re that splits up a Hangul name into LVT or LV syllables 17484 my \$syllable_re = qr/$jamo_re/; 17485 17486 my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE "; 17487 my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE"; 17488 17489 # These constants names and values were taken from the Unicode standard, 17490 # version 5.1, section 3.12. They are used in conjunction with Hangul 17491 # syllables 17492 my \$SBase = $SBase_string; 17493 my \$LBase = $LBase_string; 17494 my \$VBase = $VBase_string; 17495 my \$TBase = $TBase_string; 17496 my \$SCount = $SCount; 17497 my \$LCount = $LCount; 17498 my \$VCount = $VCount; 17499 my \$TCount = $TCount; 17500 my \$NCount = \$VCount * \$TCount; 17501END 17502 } # End of has Jamos 17503 17504 push @name, << 'END'; 17505 17506 sub name_to_code_point_special { 17507 my ($name, $loose) = @_; 17508 17509 # Returns undef if not one of the specially handled names; otherwise 17510 # returns the code point equivalent to the input name 17511 # $loose is non-zero if to use loose matching, 'name' in that case 17512 # must be input as upper case with all blanks and dashes squeezed out. 17513END 17514 if ($has_hangul_syllables) { 17515 push @name, << 'END'; 17516 17517 if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//) 17518 || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//)) 17519 { 17520 return if $name !~ qr/^$syllable_re$/; 17521 my $L = $Jamo_L{$1}; 17522 my $V = $Jamo_V{$2}; 17523 my $T = (defined $3) ? $Jamo_T{$3} : 0; 17524 return ($L * $VCount + $V) * $TCount + $T + $SBase; 17525 } 17526END 17527 } 17528 push @name, << 'END'; 17529 17530 # Name must end in 'code_point' for this to handle. 17531 return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x) 17532 || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x)); 17533 17534 my $base = $1; 17535 my $code_point = CORE::hex $2; 17536 my $names_ref; 17537 17538 if ($loose) { 17539 $names_ref = \%loose_names_ending_in_code_point; 17540 } 17541 else { 17542 return if $base !~ s/-$//; 17543 $names_ref = \%names_ending_in_code_point; 17544 } 17545 17546 # Name must be one of the ones which has the code point in it. 17547 return if ! $names_ref->{$base}; 17548 17549 # Look through the list of ranges that apply to this name to see if 17550 # the code point is in one of them. 17551 for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) { 17552 return if $names_ref->{$base}{'low'}->[$i] > $code_point; 17553 next if $names_ref->{$base}{'high'}->[$i] < $code_point; 17554 17555 # Here, the code point is in the range. 17556 return $code_point; 17557 } 17558 17559 # Here, looked like the name had a code point number in it, but 17560 # did not match one of the valid ones. 17561 return; 17562 } 17563 17564 sub code_point_to_name_special { 17565 my $code_point = shift; 17566 17567 # Returns the name of a code point if algorithmically determinable; 17568 # undef if not 17569END 17570 if ($has_hangul_syllables) { 17571 push @name, << 'END'; 17572 17573 # If in the Hangul range, calculate the name based on Unicode's 17574 # algorithm 17575 if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) { 17576 use integer; 17577 my $SIndex = $code_point - $SBase; 17578 my $L = $LBase + $SIndex / $NCount; 17579 my $V = $VBase + ($SIndex % $NCount) / $TCount; 17580 my $T = $TBase + $SIndex % $TCount; 17581 $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}"; 17582 $name .= $Jamo{$T} if $T != $TBase; 17583 return $name; 17584 } 17585END 17586 } 17587 push @name, << 'END'; 17588 17589 # Look through list of these code points for one in range. 17590 foreach my $hash (@code_points_ending_in_code_point) { 17591 return if $code_point < $hash->{'low'}; 17592 if ($code_point <= $hash->{'high'}) { 17593 return sprintf("%s-%04X", $hash->{'name'}, $code_point); 17594 } 17595 } 17596 return; # None found 17597 } 17598} # End closure 17599 176001; 17601END 17602 17603 main::write("Name.pm", 0, \@name); # The 0 means no utf8. 17604 return; 17605} 17606 17607sub make_UCD () { 17608 # Create and write UCD.pl, which passes info about the tables to 17609 # Unicode::UCD 17610 17611 # Stringify structures for output 17612 my $loose_property_name_of 17613 = simple_dumper(\%loose_property_name_of, ' ' x 4); 17614 chomp $loose_property_name_of; 17615 17616 my $strict_property_name_of 17617 = simple_dumper(\%strict_property_name_of, ' ' x 4); 17618 chomp $strict_property_name_of; 17619 17620 my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4); 17621 chomp $stricter_to_file_of; 17622 17623 my $inline_definitions = simple_dumper(\@inline_definitions, " " x 4); 17624 chomp $inline_definitions; 17625 17626 my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4); 17627 chomp $loose_to_file_of; 17628 17629 my $nv_floating_to_rational 17630 = simple_dumper(\%nv_floating_to_rational, ' ' x 4); 17631 chomp $nv_floating_to_rational; 17632 17633 my $why_deprecated = simple_dumper(\%Unicode::UCD::why_deprecated, ' ' x 4); 17634 chomp $why_deprecated; 17635 17636 # We set the key to the file when we associated files with tables, but we 17637 # couldn't do the same for the value then, as we might not have the file 17638 # for the alternate table figured out at that time. 17639 foreach my $cased (keys %caseless_equivalent_to) { 17640 my @path = $caseless_equivalent_to{$cased}->file_path; 17641 my $path; 17642 if ($path[0] eq "#") { # Pseudo-directory '#' 17643 $path = join '/', @path; 17644 } 17645 else { # Gets rid of lib/ 17646 $path = join '/', @path[1, -1]; 17647 } 17648 $caseless_equivalent_to{$cased} = $path; 17649 } 17650 my $caseless_equivalent_to 17651 = simple_dumper(\%caseless_equivalent_to, ' ' x 4); 17652 chomp $caseless_equivalent_to; 17653 17654 my $loose_property_to_file_of 17655 = simple_dumper(\%loose_property_to_file_of, ' ' x 4); 17656 chomp $loose_property_to_file_of; 17657 17658 my $strict_property_to_file_of 17659 = simple_dumper(\%strict_property_to_file_of, ' ' x 4); 17660 chomp $strict_property_to_file_of; 17661 17662 my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4); 17663 chomp $file_to_swash_name; 17664 17665 # Create a mapping from each alias of Perl single-form extensions to all 17666 # its equivalent aliases, for quick look-up. 17667 my %perlprop_to_aliases; 17668 foreach my $table ($perl->tables) { 17669 17670 # First create the list of the aliases of each extension 17671 my @aliases_list; # List of legal aliases for this extension 17672 17673 my $table_name = $table->name; 17674 my $standard_table_name = standardize($table_name); 17675 my $table_full_name = $table->full_name; 17676 my $standard_table_full_name = standardize($table_full_name); 17677 17678 # Make sure that the list has both the short and full names 17679 push @aliases_list, $table_name, $table_full_name; 17680 17681 my $found_ucd = 0; # ? Did we actually get an alias that should be 17682 # output for this table 17683 17684 # Go through all the aliases (including the two just added), and add 17685 # any new unique ones to the list 17686 foreach my $alias ($table->aliases) { 17687 17688 # Skip non-legal names 17689 next unless $alias->ok_as_filename; 17690 next unless $alias->ucd; 17691 17692 $found_ucd = 1; # have at least one legal name 17693 17694 my $name = $alias->name; 17695 my $standard = standardize($name); 17696 17697 # Don't repeat a name that is equivalent to one already on the 17698 # list 17699 next if $standard eq $standard_table_name; 17700 next if $standard eq $standard_table_full_name; 17701 17702 push @aliases_list, $name; 17703 } 17704 17705 # If there were no legal names, don't output anything. 17706 next unless $found_ucd; 17707 17708 # To conserve memory in the program reading these in, omit full names 17709 # that are identical to the short name, when those are the only two 17710 # aliases for the property. 17711 if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) { 17712 pop @aliases_list; 17713 } 17714 17715 # Here, @aliases_list is the list of all the aliases that this 17716 # extension legally has. Now can create a map to it from each legal 17717 # standardized alias 17718 foreach my $alias ($table->aliases) { 17719 next unless $alias->ucd; 17720 next unless $alias->ok_as_filename; 17721 push @{$perlprop_to_aliases{standardize($alias->name)}}, 17722 uniques @aliases_list; 17723 } 17724 } 17725 17726 # Make a list of all combinations of properties/values that are suppressed. 17727 my @suppressed; 17728 if (! $debug_skip) { # This tends to fail in this debug mode 17729 foreach my $property_name (keys %why_suppressed) { 17730 17731 # Just the value 17732 my $value_name = $1 if $property_name =~ s/ = ( .* ) //x; 17733 17734 # The hash may contain properties not in this release of Unicode 17735 next unless defined (my $property = property_ref($property_name)); 17736 17737 # Find all combinations 17738 foreach my $prop_alias ($property->aliases) { 17739 my $prop_alias_name = standardize($prop_alias->name); 17740 17741 # If no =value, there's just one combination possible for this 17742 if (! $value_name) { 17743 17744 # The property may be suppressed, but there may be a proxy 17745 # for it, so it shouldn't be listed as suppressed 17746 next if $prop_alias->ucd; 17747 push @suppressed, $prop_alias_name; 17748 } 17749 else { # Otherwise 17750 foreach my $value_alias 17751 ($property->table($value_name)->aliases) 17752 { 17753 next if $value_alias->ucd; 17754 17755 push @suppressed, "$prop_alias_name=" 17756 . standardize($value_alias->name); 17757 } 17758 } 17759 } 17760 } 17761 } 17762 @suppressed = sort @suppressed; # So doesn't change between runs of this 17763 # program 17764 17765 # Convert the structure below (designed for Name.pm) to a form that UCD 17766 # wants, so it doesn't have to modify it at all; i.e. so that it includes 17767 # an element for the Hangul syllables in the appropriate place, and 17768 # otherwise changes the name to include the "-<code point>" suffix. 17769 my @algorithm_names; 17770 my $done_hangul = $v_version lt v2.0.0; # Hanguls as we know them came 17771 # along in this version 17772 # Copy it linearly. 17773 for my $i (0 .. @code_points_ending_in_code_point - 1) { 17774 17775 # Insert the hanguls in the correct place. 17776 if (! $done_hangul 17777 && $code_points_ending_in_code_point[$i]->{'low'} > $SBase) 17778 { 17779 $done_hangul = 1; 17780 push @algorithm_names, { low => $SBase, 17781 high => $SBase + $SCount - 1, 17782 name => '<hangul syllable>', 17783 }; 17784 } 17785 17786 # Copy the current entry, modified. 17787 push @algorithm_names, { 17788 low => $code_points_ending_in_code_point[$i]->{'low'}, 17789 high => $code_points_ending_in_code_point[$i]->{'high'}, 17790 name => 17791 "$code_points_ending_in_code_point[$i]->{'name'}-<code point>", 17792 }; 17793 } 17794 17795 # Serialize these structures for output. 17796 my $loose_to_standard_value 17797 = simple_dumper(\%loose_to_standard_value, ' ' x 4); 17798 chomp $loose_to_standard_value; 17799 17800 my $string_property_loose_to_name 17801 = simple_dumper(\%string_property_loose_to_name, ' ' x 4); 17802 chomp $string_property_loose_to_name; 17803 17804 my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4); 17805 chomp $perlprop_to_aliases; 17806 17807 my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4); 17808 chomp $prop_aliases; 17809 17810 my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4); 17811 chomp $prop_value_aliases; 17812 17813 my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : ""; 17814 chomp $suppressed; 17815 17816 my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4); 17817 chomp $algorithm_names; 17818 17819 my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4); 17820 chomp $ambiguous_names; 17821 17822 my $combination_property = simple_dumper(\%combination_property, ' ' x 4); 17823 chomp $combination_property; 17824 17825 my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4); 17826 chomp $loose_defaults; 17827 17828 my @ucd = <<END; 17829$HEADER 17830$INTERNAL_ONLY_HEADER 17831 17832# This file is for the use of Unicode::UCD 17833 17834# Highest legal Unicode code point 17835\$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING; 17836 17837# Hangul syllables 17838\$Unicode::UCD::HANGUL_BEGIN = $SBase_string; 17839\$Unicode::UCD::HANGUL_COUNT = $SCount; 17840 17841# Maps Unicode (not Perl single-form extensions) property names in loose 17842# standard form to their corresponding standard names 17843\%Unicode::UCD::loose_property_name_of = ( 17844$loose_property_name_of 17845); 17846 17847# Same, but strict names 17848\%Unicode::UCD::strict_property_name_of = ( 17849$strict_property_name_of 17850); 17851 17852# Gives the definitions (in the form of inversion lists) for those properties 17853# whose definitions aren't kept in files 17854\@Unicode::UCD::inline_definitions = ( 17855$inline_definitions 17856); 17857 17858# Maps property, table to file for those using stricter matching. For paths 17859# whose directory is '#', the file is in the form of a numeric index into 17860# \@inline_definitions 17861\%Unicode::UCD::stricter_to_file_of = ( 17862$stricter_to_file_of 17863); 17864 17865# Maps property, table to file for those using loose matching. For paths 17866# whose directory is '#', the file is in the form of a numeric index into 17867# \@inline_definitions 17868\%Unicode::UCD::loose_to_file_of = ( 17869$loose_to_file_of 17870); 17871 17872# Maps floating point to fractional form 17873\%Unicode::UCD::nv_floating_to_rational = ( 17874$nv_floating_to_rational 17875); 17876 17877# If a %e floating point number doesn't have this number of digits in it after 17878# the decimal point to get this close to a fraction, it isn't considered to be 17879# that fraction even if all the digits it does have match. 17880\$Unicode::UCD::e_precision = $E_FLOAT_PRECISION; 17881 17882# Deprecated tables to generate a warning for. The key is the file containing 17883# the table, so as to avoid duplication, as many property names can map to the 17884# file, but we only need one entry for all of them. 17885\%Unicode::UCD::why_deprecated = ( 17886$why_deprecated 17887); 17888 17889# A few properties have different behavior under /i matching. This maps 17890# those to substitute files to use under /i. 17891\%Unicode::UCD::caseless_equivalent = ( 17892$caseless_equivalent_to 17893); 17894 17895# Property names to mapping files 17896\%Unicode::UCD::loose_property_to_file_of = ( 17897$loose_property_to_file_of 17898); 17899 17900# Property names to mapping files 17901\%Unicode::UCD::strict_property_to_file_of = ( 17902$strict_property_to_file_of 17903); 17904 17905# Files to the swash names within them. 17906\%Unicode::UCD::file_to_swash_name = ( 17907$file_to_swash_name 17908); 17909 17910# Keys are all the possible "prop=value" combinations, in loose form; values 17911# are the standard loose name for the 'value' part of the key 17912\%Unicode::UCD::loose_to_standard_value = ( 17913$loose_to_standard_value 17914); 17915 17916# String property loose names to standard loose name 17917\%Unicode::UCD::string_property_loose_to_name = ( 17918$string_property_loose_to_name 17919); 17920 17921# Keys are Perl extensions in loose form; values are each one's list of 17922# aliases 17923\%Unicode::UCD::loose_perlprop_to_name = ( 17924$perlprop_to_aliases 17925); 17926 17927# Keys are standard property name; values are each one's aliases 17928\%Unicode::UCD::prop_aliases = ( 17929$prop_aliases 17930); 17931 17932# Keys of top level are standard property name; values are keys to another 17933# hash, Each one is one of the property's values, in standard form. The 17934# values are that prop-val's aliases. If only one specified, the short and 17935# long alias are identical. 17936\%Unicode::UCD::prop_value_aliases = ( 17937$prop_value_aliases 17938); 17939 17940# Ordered (by code point ordinal) list of the ranges of code points whose 17941# names are algorithmically determined. Each range entry is an anonymous hash 17942# of the start and end points and a template for the names within it. 17943\@Unicode::UCD::algorithmic_named_code_points = ( 17944$algorithm_names 17945); 17946 17947# The properties that as-is have two meanings, and which must be disambiguated 17948\%Unicode::UCD::ambiguous_names = ( 17949$ambiguous_names 17950); 17951 17952# Keys are the prop-val combinations which are the default values for the 17953# given property, expressed in standard loose form 17954\%Unicode::UCD::loose_defaults = ( 17955$loose_defaults 17956); 17957 17958# The properties that are combinations, in that they have both a map table and 17959# a match table. This is actually for UCD.t, so it knows how to test for 17960# these. 17961\%Unicode::UCD::combination_property = ( 17962$combination_property 17963); 17964 17965# All combinations of names that are suppressed. 17966# This is actually for UCD.t, so it knows which properties shouldn't have 17967# entries. If it got any bigger, would probably want to put it in its own 17968# file to use memory only when it was needed, in testing. 17969\@Unicode::UCD::suppressed_properties = ( 17970$suppressed 17971); 17972 179731; 17974END 17975 17976 main::write("UCD.pl", 0, \@ucd); # The 0 means no utf8. 17977 return; 17978} 17979 17980sub write_all_tables() { 17981 # Write out all the tables generated by this program to files, as well as 17982 # the supporting data structures, pod file, and .t file. 17983 17984 my @writables; # List of tables that actually get written 17985 my %match_tables_to_write; # Used to collapse identical match tables 17986 # into one file. Each key is a hash function 17987 # result to partition tables into buckets. 17988 # Each value is an array of the tables that 17989 # fit in the bucket. 17990 17991 # For each property ... 17992 # (sort so that if there is an immutable file name, it has precedence, so 17993 # some other property can't come in and take over its file name. (We 17994 # don't care if both defined, as they had better be different anyway.) 17995 # The property named 'Perl' needs to be first (it doesn't have any 17996 # immutable file name) because empty properties are defined in terms of 17997 # its table named 'All' under the -annotate option.) We also sort by 17998 # the property's name. This is just for repeatability of the outputs 17999 # between runs of this program, but does not affect correctness. 18000 PROPERTY: 18001 foreach my $property ($perl, 18002 sort { return -1 if defined $a->file; 18003 return 1 if defined $b->file; 18004 return $a->name cmp $b->name; 18005 } grep { $_ != $perl } property_ref('*')) 18006 { 18007 my $type = $property->type; 18008 18009 # And for each table for that property, starting with the mapping 18010 # table for it ... 18011 TABLE: 18012 foreach my $table($property, 18013 18014 # and all the match tables for it (if any), sorted so 18015 # the ones with the shortest associated file name come 18016 # first. The length sorting prevents problems of a 18017 # longer file taking a name that might have to be used 18018 # by a shorter one. The alphabetic sorting prevents 18019 # differences between releases 18020 sort { my $ext_a = $a->external_name; 18021 return 1 if ! defined $ext_a; 18022 my $ext_b = $b->external_name; 18023 return -1 if ! defined $ext_b; 18024 18025 # But return the non-complement table before 18026 # the complement one, as the latter is defined 18027 # in terms of the former, and needs to have 18028 # the information for the former available. 18029 return 1 if $a->complement != 0; 18030 return -1 if $b->complement != 0; 18031 18032 # Similarly, return a subservient table after 18033 # a leader 18034 return 1 if $a->leader != $a; 18035 return -1 if $b->leader != $b; 18036 18037 my $cmp = length $ext_a <=> length $ext_b; 18038 18039 # Return result if lengths not equal 18040 return $cmp if $cmp; 18041 18042 # Alphabetic if lengths equal 18043 return $ext_a cmp $ext_b 18044 } $property->tables 18045 ) 18046 { 18047 18048 # Here we have a table associated with a property. It could be 18049 # the map table (done first for each property), or one of the 18050 # other tables. Determine which type. 18051 my $is_property = $table->isa('Property'); 18052 18053 my $name = $table->name; 18054 my $complete_name = $table->complete_name; 18055 18056 # See if should suppress the table if is empty, but warn if it 18057 # contains something. 18058 my $suppress_if_empty_warn_if_not 18059 = $why_suppress_if_empty_warn_if_not{$complete_name} || 0; 18060 18061 # Calculate if this table should have any code points associated 18062 # with it or not. 18063 my $expected_empty = 18064 18065 # $perl should be empty 18066 ($is_property && ($table == $perl)) 18067 18068 # Match tables in properties we skipped populating should be 18069 # empty 18070 || (! $is_property && ! $property->to_create_match_tables) 18071 18072 # Tables and properties that are expected to have no code 18073 # points should be empty 18074 || $suppress_if_empty_warn_if_not 18075 ; 18076 18077 # Set a boolean if this table is the complement of an empty binary 18078 # table 18079 my $is_complement_of_empty_binary = 18080 $type == $BINARY && 18081 (($table == $property->table('Y') 18082 && $property->table('N')->is_empty) 18083 || ($table == $property->table('N') 18084 && $property->table('Y')->is_empty)); 18085 18086 if ($table->is_empty) { 18087 18088 if ($suppress_if_empty_warn_if_not) { 18089 $table->set_fate($SUPPRESSED, 18090 $suppress_if_empty_warn_if_not); 18091 } 18092 18093 # Suppress (by skipping them) expected empty tables. 18094 next TABLE if $expected_empty; 18095 18096 # And setup to later output a warning for those that aren't 18097 # known to be allowed to be empty. Don't do the warning if 18098 # this table is a child of another one to avoid duplicating 18099 # the warning that should come from the parent one. 18100 if (($table == $property || $table->parent == $table) 18101 && $table->fate != $SUPPRESSED 18102 && $table->fate != $MAP_PROXIED 18103 && ! grep { $complete_name =~ /^$_$/ } 18104 @tables_that_may_be_empty) 18105 { 18106 push @unhandled_properties, "$table"; 18107 } 18108 18109 # The old way of expressing an empty match list was to 18110 # complement the list that matches everything. The new way is 18111 # to create an empty inversion list, but this doesn't work for 18112 # annotating, so use the old way then. 18113 $table->set_complement($All) if $annotate 18114 && $table != $property; 18115 } 18116 elsif ($expected_empty) { 18117 my $because = ""; 18118 if ($suppress_if_empty_warn_if_not) { 18119 $because = " because $suppress_if_empty_warn_if_not"; 18120 } 18121 18122 Carp::my_carp("Not expecting property $table$because. Generating file for it anyway."); 18123 } 18124 18125 # Some tables should match everything 18126 my $expected_full = 18127 ($table->fate == $SUPPRESSED) 18128 ? 0 18129 : ($is_property) 18130 ? # All these types of map tables will be full because 18131 # they will have been populated with defaults 18132 ($type == $ENUM) 18133 18134 : # A match table should match everything if its method 18135 # shows it should 18136 ($table->matches_all 18137 18138 # The complement of an empty binary table will match 18139 # everything 18140 || $is_complement_of_empty_binary 18141 ) 18142 ; 18143 18144 my $count = $table->count; 18145 if ($expected_full) { 18146 if ($count != $MAX_WORKING_CODEPOINTS) { 18147 Carp::my_carp("$table matches only " 18148 . clarify_number($count) 18149 . " Unicode code points but should match " 18150 . clarify_number($MAX_WORKING_CODEPOINTS) 18151 . " (off by " 18152 . clarify_number(abs($MAX_WORKING_CODEPOINTS - $count)) 18153 . "). Proceeding anyway."); 18154 } 18155 18156 # Here is expected to be full. If it is because it is the 18157 # complement of an (empty) binary table that is to be 18158 # suppressed, then suppress this one as well. 18159 if ($is_complement_of_empty_binary) { 18160 my $opposing_name = ($name eq 'Y') ? 'N' : 'Y'; 18161 my $opposing = $property->table($opposing_name); 18162 my $opposing_status = $opposing->status; 18163 if ($opposing_status) { 18164 $table->set_status($opposing_status, 18165 $opposing->status_info); 18166 } 18167 } 18168 } 18169 elsif ($count == $MAX_UNICODE_CODEPOINTS 18170 && $name ne "Any" 18171 && ($table == $property || $table->leader == $table) 18172 && $table->property->status ne $NORMAL) 18173 { 18174 Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway."); 18175 } 18176 18177 if ($table->fate >= $SUPPRESSED) { 18178 if (! $is_property) { 18179 my @children = $table->children; 18180 foreach my $child (@children) { 18181 if ($child->fate < $SUPPRESSED) { 18182 Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't"); 18183 } 18184 } 18185 } 18186 next TABLE; 18187 18188 } 18189 18190 if (! $is_property) { 18191 18192 make_ucd_table_pod_entries($table) if $table->property == $perl; 18193 18194 # Several things need to be done just once for each related 18195 # group of match tables. Do them on the parent. 18196 if ($table->parent == $table) { 18197 18198 # Add an entry in the pod file for the table; it also does 18199 # the children. 18200 make_re_pod_entries($table) if defined $pod_directory; 18201 18202 # See if the table matches identical code points with 18203 # something that has already been processed and is ready 18204 # for output. In that case, no need to have two files 18205 # with the same code points in them. We use the table's 18206 # hash() method to store these in buckets, so that it is 18207 # quite likely that if two tables are in the same bucket 18208 # they will be identical, so don't have to compare tables 18209 # frequently. The tables have to have the same status to 18210 # share a file, so add this to the bucket hash. (The 18211 # reason for this latter is that UCD.pm associates a 18212 # status with a file.) We don't check tables that are 18213 # inverses of others, as it would lead to some coding 18214 # complications, and checking all the regular ones should 18215 # find everything. 18216 if ($table->complement == 0) { 18217 my $hash = $table->hash . ';' . $table->status; 18218 18219 # Look at each table that is in the same bucket as 18220 # this one would be. 18221 foreach my $comparison 18222 (@{$match_tables_to_write{$hash}}) 18223 { 18224 # If the table doesn't point back to this one, we 18225 # see if it matches identically 18226 if ( $comparison->leader != $table 18227 && $table->matches_identically_to($comparison)) 18228 { 18229 $table->set_equivalent_to($comparison, 18230 Related => 0); 18231 next TABLE; 18232 } 18233 } 18234 18235 # Here, not equivalent, add this table to the bucket. 18236 push @{$match_tables_to_write{$hash}}, $table; 18237 } 18238 } 18239 } 18240 else { 18241 18242 # Here is the property itself. 18243 # Don't write out or make references to the $perl property 18244 next if $table == $perl; 18245 18246 make_ucd_table_pod_entries($table); 18247 18248 # There is a mapping stored of the various synonyms to the 18249 # standardized name of the property for Unicode::UCD. 18250 # Also, the pod file contains entries of the form: 18251 # \p{alias: *} \p{full: *} 18252 # rather than show every possible combination of things. 18253 18254 my @property_aliases = $property->aliases; 18255 18256 my $full_property_name = $property->full_name; 18257 my $property_name = $property->name; 18258 my $standard_property_name = standardize($property_name); 18259 my $standard_property_full_name 18260 = standardize($full_property_name); 18261 18262 # We also create for Unicode::UCD a list of aliases for 18263 # the property. The list starts with the property name; 18264 # then its full name. 18265 my @property_list; 18266 my @standard_list; 18267 if ( $property->fate <= $MAP_PROXIED) { 18268 @property_list = ($property_name, $full_property_name); 18269 @standard_list = ($standard_property_name, 18270 $standard_property_full_name); 18271 } 18272 18273 # For each synonym ... 18274 for my $i (0 .. @property_aliases - 1) { 18275 my $alias = $property_aliases[$i]; 18276 my $alias_name = $alias->name; 18277 my $alias_standard = standardize($alias_name); 18278 18279 18280 # Add other aliases to the list of property aliases 18281 if ($property->fate <= $MAP_PROXIED 18282 && ! grep { $alias_standard eq $_ } @standard_list) 18283 { 18284 push @property_list, $alias_name; 18285 push @standard_list, $alias_standard; 18286 } 18287 18288 # For Unicode::UCD, set the mapping of the alias to the 18289 # property 18290 if ($type == $STRING) { 18291 if ($property->fate <= $MAP_PROXIED) { 18292 $string_property_loose_to_name{$alias_standard} 18293 = $standard_property_name; 18294 } 18295 } 18296 else { 18297 my $hash_ref = ($alias_standard =~ /^_/) 18298 ? \%strict_property_name_of 18299 : \%loose_property_name_of; 18300 if (exists $hash_ref->{$alias_standard}) { 18301 Carp::my_carp("There already is a property with the same standard name as $alias_name: $hash_ref->{$alias_standard}. Old name is retained"); 18302 } 18303 else { 18304 $hash_ref->{$alias_standard} 18305 = $standard_property_name; 18306 } 18307 18308 # Now for the re pod entry for this alias. Skip if not 18309 # outputting a pod; skip the first one, which is the 18310 # full name so won't have an entry like: '\p{full: *} 18311 # \p{full: *}', and skip if don't want an entry for 18312 # this one. 18313 next if $i == 0 18314 || ! defined $pod_directory 18315 || ! $alias->make_re_pod_entry; 18316 18317 my $rhs = "\\p{$full_property_name: *}"; 18318 if ($property != $perl && $table->perl_extension) { 18319 $rhs .= ' (Perl extension)'; 18320 } 18321 push @match_properties, 18322 format_pod_line($indent_info_column, 18323 '\p{' . $alias->name . ': *}', 18324 $rhs, 18325 $alias->status); 18326 } 18327 } 18328 18329 # The list of all possible names is attached to each alias, so 18330 # lookup is easy 18331 if (@property_list) { 18332 push @{$prop_aliases{$standard_list[0]}}, @property_list; 18333 } 18334 18335 if ($property->fate <= $MAP_PROXIED) { 18336 18337 # Similarly, we create for Unicode::UCD a list of 18338 # property-value aliases. 18339 18340 # Look at each table in the property... 18341 foreach my $table ($property->tables) { 18342 my @values_list; 18343 my $table_full_name = $table->full_name; 18344 my $standard_table_full_name 18345 = standardize($table_full_name); 18346 my $table_name = $table->name; 18347 my $standard_table_name = standardize($table_name); 18348 18349 # The list starts with the table name and its full 18350 # name. 18351 push @values_list, $table_name, $table_full_name; 18352 18353 # We add to the table each unique alias that isn't 18354 # discouraged from use. 18355 foreach my $alias ($table->aliases) { 18356 next if $alias->status 18357 && $alias->status eq $DISCOURAGED; 18358 my $name = $alias->name; 18359 my $standard = standardize($name); 18360 next if $standard eq $standard_table_name; 18361 next if $standard eq $standard_table_full_name; 18362 push @values_list, $name; 18363 } 18364 18365 # Here @values_list is a list of all the aliases for 18366 # the table. That is, all the property-values given 18367 # by this table. By agreement with Unicode::UCD, 18368 # if the name and full name are identical, and there 18369 # are no other names, drop the duplicate entry to save 18370 # memory. 18371 if (@values_list == 2 18372 && $values_list[0] eq $values_list[1]) 18373 { 18374 pop @values_list 18375 } 18376 18377 # To save memory, unlike the similar list for property 18378 # aliases above, only the standard forms have the list. 18379 # This forces an extra step of converting from input 18380 # name to standard name, but the savings are 18381 # considerable. (There is only marginal savings if we 18382 # did this with the property aliases.) 18383 push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list; 18384 } 18385 } 18386 18387 # Don't write out a mapping file if not desired. 18388 next if ! $property->to_output_map; 18389 } 18390 18391 # Here, we know we want to write out the table, but don't do it 18392 # yet because there may be other tables that come along and will 18393 # want to share the file, and the file's comments will change to 18394 # mention them. So save for later. 18395 push @writables, $table; 18396 18397 } # End of looping through the property and all its tables. 18398 } # End of looping through all properties. 18399 18400 # Now have all the tables that will have files written for them. Do it. 18401 foreach my $table (@writables) { 18402 my @directory; 18403 my $filename; 18404 my $property = $table->property; 18405 my $is_property = ($table == $property); 18406 18407 # For very short tables, instead of writing them out to actual files, 18408 # we in-line their inversion list definitions into UCD.pm. The 18409 # definition replaces the file name, and the special pseudo-directory 18410 # '#' is used to signal this. This significantly cuts down the number 18411 # of files written at little extra cost to the hashes in UCD.pm. 18412 # And it means, no run-time files to read to get the definitions. 18413 if (! $is_property 18414 && ! $annotate # For annotation, we want to explicitly show 18415 # everything, so keep in files 18416 && $table->ranges <= 3) 18417 { 18418 my @ranges = $table->ranges; 18419 my $count = @ranges; 18420 if ($count == 0) { # 0th index reserved for 0-length lists 18421 $filename = 0; 18422 } 18423 elsif ($table->leader != $table) { 18424 18425 # Here, is a table that is equivalent to another; code 18426 # in register_file_for_name() causes its leader's definition 18427 # to be used 18428 18429 next; 18430 } 18431 else { # No equivalent table so far. 18432 18433 # Build up its definition range-by-range. 18434 my $definition = ""; 18435 while (defined (my $range = shift @ranges)) { 18436 my $end = $range->end; 18437 if ($end < $MAX_WORKING_CODEPOINT) { 18438 $count++; 18439 $end = "\n" . ($end + 1); 18440 } 18441 else { # Extends to infinity, hence no 'end' 18442 $end = ""; 18443 } 18444 $definition .= "\n" . $range->start . $end; 18445 } 18446 $definition = "V$count" . $definition; 18447 $filename = @inline_definitions; 18448 push @inline_definitions, $definition; 18449 } 18450 @directory = "#"; 18451 register_file_for_name($table, \@directory, $filename); 18452 next; 18453 } 18454 18455 if (! $is_property) { 18456 # Match tables for the property go in lib/$subdirectory, which is 18457 # the property's name. Don't use the standard file name for this, 18458 # as may get an unfamiliar alias 18459 @directory = ($matches_directory, ($property->match_subdir) 18460 ? $property->match_subdir 18461 : $property->external_name); 18462 } 18463 else { 18464 18465 @directory = $table->directory; 18466 $filename = $table->file; 18467 } 18468 18469 # Use specified filename if available, or default to property's 18470 # shortest name. We need an 8.3 safe filename (which means "an 8 18471 # safe" filename, since after the dot is only 'pl', which is < 3) 18472 # The 2nd parameter is if the filename shouldn't be changed, and 18473 # it shouldn't iff there is a hard-coded name for this table. 18474 $filename = construct_filename( 18475 $filename || $table->external_name, 18476 ! $filename, # mutable if no filename 18477 \@directory); 18478 18479 register_file_for_name($table, \@directory, $filename); 18480 18481 # Only need to write one file when shared by more than one 18482 # property 18483 next if ! $is_property 18484 && ($table->leader != $table || $table->complement != 0); 18485 18486 # Construct a nice comment to add to the file 18487 $table->set_final_comment; 18488 18489 $table->write; 18490 } 18491 18492 18493 # Write out the pod file 18494 make_pod; 18495 18496 # And Name.pm, UCD.pl 18497 make_Name_pm; 18498 make_UCD; 18499 18500 make_property_test_script() if $make_test_script; 18501 make_normalization_test_script() if $make_norm_test_script; 18502 return; 18503} 18504 18505my @white_space_separators = ( # This used only for making the test script. 18506 "", 18507 ' ', 18508 "\t", 18509 ' ' 18510 ); 18511 18512sub generate_separator($lhs) { 18513 # This used only for making the test script. It generates the colon or 18514 # equal separator between the property and property value, with random 18515 # white space surrounding the separator 18516 18517 return "" if $lhs eq ""; # No separator if there's only one (the r) side 18518 18519 # Choose space before and after randomly 18520 my $spaces_before =$white_space_separators[rand(@white_space_separators)]; 18521 my $spaces_after = $white_space_separators[rand(@white_space_separators)]; 18522 18523 # And return the whole complex, half the time using a colon, half the 18524 # equals 18525 return $spaces_before 18526 . (rand() < 0.5) ? '=' : ':' 18527 . $spaces_after; 18528} 18529 18530sub generate_tests($lhs, $rhs, $valid_code, $invalid_code, $warning) { 18531 # This used only for making the test script. It generates test cases that 18532 # are expected to compile successfully in perl. Note that the LHS and 18533 # RHS are assumed to already be as randomized as the caller wants. 18534 18535 # $lhs # The property: what's to the left of the colon 18536 # or equals separator 18537 # $rhs # The property value; what's to the right 18538 # $valid_code # A code point that's known to be in the 18539 # table given by LHS=RHS; undef if table is 18540 # empty 18541 # $invalid_code # A code point known to not be in the table; 18542 # undef if the table is all code points 18543 # $warning 18544 18545 # Get the colon or equal 18546 my $separator = generate_separator($lhs); 18547 18548 # The whole 'property=value' 18549 my $name = "$lhs$separator$rhs"; 18550 18551 my @output; 18552 # Create a complete set of tests, with complements. 18553 if (defined $valid_code) { 18554 push @output, <<"EOC" 18555Expect(1, $valid_code, '\\p{$name}', $warning); 18556Expect(0, $valid_code, '\\p{^$name}', $warning); 18557Expect(0, $valid_code, '\\P{$name}', $warning); 18558Expect(1, $valid_code, '\\P{^$name}', $warning); 18559EOC 18560 } 18561 if (defined $invalid_code) { 18562 push @output, <<"EOC" 18563Expect(0, $invalid_code, '\\p{$name}', $warning); 18564Expect(1, $invalid_code, '\\p{^$name}', $warning); 18565Expect(1, $invalid_code, '\\P{$name}', $warning); 18566Expect(0, $invalid_code, '\\P{^$name}', $warning); 18567EOC 18568 } 18569 return @output; 18570} 18571 18572sub generate_wildcard_tests($lhs, $rhs, $valid_code, $invalid_code, $warning) { 18573 # This used only for making the test script. It generates wildcardl 18574 # matching test cases that are expected to compile successfully in perl. 18575 18576 # $lhs # The property: what's to the left of the 18577 # or equals separator 18578 # $rhs # The property value; what's to the right 18579 # $valid_code # A code point that's known to be in the 18580 # table given by LHS=RHS; undef if table is 18581 # empty 18582 # $invalid_code # A code point known to not be in the table; 18583 # undef if the table is all code points 18584 # $warning 18585 18586 return if $lhs eq ""; 18587 return if $lhs =~ / ^ Is_ /x; # These are not currently supported 18588 18589 # Generate a standardized pattern, with colon being the delimitter 18590 my $wildcard = "$lhs=:\\A$rhs\\z:"; 18591 18592 my @output; 18593 push @output, "Expect(1, $valid_code, '\\p{$wildcard}', $warning);" 18594 if defined $valid_code; 18595 push @output, "Expect(0, $invalid_code, '\\p{$wildcard}', $warning);" 18596 if defined $invalid_code; 18597 return @output; 18598} 18599 18600sub generate_error($lhs, $rhs, $already_in_error=0) { 18601 # This used only for making the test script. It generates test cases that 18602 # are expected to not only not match, but to be syntax or similar errors 18603 18604 # $lhs # The property: what's to the left of the 18605 # colon or equals separator 18606 # $rhs # The property value; what's to the right 18607 # $already_in_error # Boolean; if true it's known that the 18608 # unmodified LHS and RHS will cause an error. 18609 # This routine should not force another one 18610 # Get the colon or equal 18611 my $separator = generate_separator($lhs); 18612 18613 # Since this is an error only, don't bother to randomly decide whether to 18614 # put the error on the left or right side; and assume that the RHS is 18615 # loosely matched, again for convenience rather than rigor. 18616 $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error; 18617 18618 my $property = $lhs . $separator . $rhs; 18619 18620 return <<"EOC"; 18621Error('\\p{$property}'); 18622Error('\\P{$property}'); 18623EOC 18624} 18625 18626# These are used only for making the test script 18627# XXX Maybe should also have a bad strict seps, which includes underscore. 18628 18629my @good_loose_seps = ( 18630 " ", 18631 "-", 18632 "\t", 18633 "", 18634 "_", 18635 ); 18636my @bad_loose_seps = ( 18637 "/a/", 18638 ':=', 18639 ); 18640 18641sub randomize_stricter_name($name) { 18642 # This used only for making the test script. Take the input name and 18643 # return a randomized, but valid version of it under the stricter matching 18644 # rules. 18645 18646 # If the name looks like a number (integer, floating, or rational), do 18647 # some extra work 18648 if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) { 18649 my $sign = $1; 18650 my $number = $2; 18651 my $separator = $3; 18652 18653 # If there isn't a sign, part of the time add a plus 18654 # Note: Not testing having any denominator having a minus sign 18655 if (! $sign) { 18656 $sign = '+' if rand() <= .3; 18657 } 18658 18659 # And add 0 or more leading zeros. 18660 $name = $sign . ('0' x int rand(10)) . $number; 18661 18662 if (defined $separator) { 18663 my $extra_zeros = '0' x int rand(10); 18664 18665 if ($separator eq '.') { 18666 18667 # Similarly, add 0 or more trailing zeros after a decimal 18668 # point 18669 $name .= $extra_zeros; 18670 } 18671 else { 18672 18673 # Or, leading zeros before the denominator 18674 $name =~ s,/,/$extra_zeros,; 18675 } 18676 } 18677 } 18678 18679 # For legibility of the test, only change the case of whole sections at a 18680 # time. To do this, first split into sections. The split returns the 18681 # delimiters 18682 my @sections; 18683 for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) { 18684 trace $section if main::DEBUG && $to_trace; 18685 18686 if (length $section > 1 && $section !~ /\D/) { 18687 18688 # If the section is a sequence of digits, about half the time 18689 # randomly add underscores between some of them. 18690 if (rand() > .5) { 18691 18692 # Figure out how many underscores to add. max is 1 less than 18693 # the number of digits. (But add 1 at the end to make sure 18694 # result isn't 0, and compensate earlier by subtracting 2 18695 # instead of 1) 18696 my $num_underscores = int rand(length($section) - 2) + 1; 18697 18698 # And add them evenly throughout, for convenience, not rigor 18699 use integer; 18700 my $spacing = (length($section) - 1)/ $num_underscores; 18701 my $temp = $section; 18702 $section = ""; 18703 for my $i (1 .. $num_underscores) { 18704 $section .= substr($temp, 0, $spacing, "") . '_'; 18705 } 18706 $section .= $temp; 18707 } 18708 push @sections, $section; 18709 } 18710 else { 18711 18712 # Here not a sequence of digits. Change the case of the section 18713 # randomly 18714 my $switch = int rand(4); 18715 if ($switch == 0) { 18716 push @sections, uc $section; 18717 } 18718 elsif ($switch == 1) { 18719 push @sections, lc $section; 18720 } 18721 elsif ($switch == 2) { 18722 push @sections, ucfirst $section; 18723 } 18724 else { 18725 push @sections, $section; 18726 } 18727 } 18728 } 18729 trace "returning", join "", @sections if main::DEBUG && $to_trace; 18730 return join "", @sections; 18731} 18732 18733sub randomize_loose_name($name, $want_error=0) { 18734 # This used only for making the test script 18735 18736 $name = randomize_stricter_name($name); 18737 18738 my @parts; 18739 push @parts, $good_loose_seps[rand(@good_loose_seps)]; 18740 18741 # Preserve trailing ones for the sake of not stripping the underscore from 18742 # 'L_' 18743 for my $part (split /[-\s_]+ (?= . )/, $name) { 18744 if (@parts) { 18745 if ($want_error and rand() < 0.3) { 18746 push @parts, $bad_loose_seps[rand(@bad_loose_seps)]; 18747 $want_error = 0; 18748 } 18749 else { 18750 push @parts, $good_loose_seps[rand(@good_loose_seps)]; 18751 } 18752 } 18753 push @parts, $part; 18754 } 18755 my $new = join("", @parts); 18756 trace "$name => $new" if main::DEBUG && $to_trace; 18757 18758 if ($want_error) { 18759 if (rand() >= 0.5) { 18760 $new .= $bad_loose_seps[rand(@bad_loose_seps)]; 18761 } 18762 else { 18763 $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new; 18764 } 18765 } 18766 return $new; 18767} 18768 18769# Used to make sure don't generate duplicate test cases. 18770my %test_generated; 18771 18772sub make_property_test_script() { 18773 # This used only for making the test script 18774 # this written directly -- it's huge. 18775 18776 print "Making test script\n" if $verbosity >= $PROGRESS; 18777 18778 # This uses randomness to test different possibilities without testing all 18779 # possibilities. To ensure repeatability, set the seed to 0. But if 18780 # tests are added, it will perturb all later ones in the .t file 18781 srand 0; 18782 18783 $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name 18784 18785 # Create a list of what the %f representation is for each rational number. 18786 # This will be used below. 18787 my @valid_base_floats = '0.0'; 18788 foreach my $e_representation (keys %nv_floating_to_rational) { 18789 push @valid_base_floats, 18790 eval $nv_floating_to_rational{$e_representation}; 18791 } 18792 18793 # It doesn't matter whether the elements of this array contain single lines 18794 # or multiple lines. main::write doesn't count the lines. 18795 my @output; 18796 18797 push @output, <<'EOF_CODE'; 18798Error('\p{Script=InGreek}'); # Bug #69018 18799Test_GCB("1100 $nobreak 1161"); # Bug #70940 18800Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722 18801Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722 18802Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726 18803Error('\p{InKana}'); # 'Kana' is not a block so InKana shouldn't compile 18804Expect(1, 0xB6, '\p{In=V1_1}', ""); # Didn't use to work 18805Expect(1, 0x3A2,'\p{In=NA}', ""); # Didn't use to work 18806 18807# Make sure this gets tested; it was not part of the official test suite at 18808# the time this was added. Note that this is as it would appear in the 18809# official suite, and gets modified to check for the perl tailoring by 18810# Test_WB() 18811Test_WB("$breakable 0020 $breakable 0020 $breakable 0308 $breakable"); 18812Test_LB("$nobreak 200B $nobreak 0020 $nobreak 0020 $breakable 2060 $breakable"); 18813Expect(1, ord(" "), '\p{gc=:(?aa)s:}', ""); # /aa is valid 18814Expect(1, ord(" "), '\p{gc=:(?-s)s:}', ""); # /-s is valid 18815EOF_CODE 18816 18817 # Sort these so get results in same order on different runs of this 18818 # program 18819 foreach my $property (sort { $a->has_dependency <=> $b->has_dependency 18820 or 18821 lc $a->name cmp lc $b->name 18822 } property_ref('*')) 18823 { 18824 # Non-binary properties should not match \p{}; Test all for that. 18825 if ($property->type != $BINARY && $property->type != $FORCED_BINARY) { 18826 my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } 18827 $property->aliases; 18828 foreach my $property_alias ($property->aliases) { 18829 my $name = standardize($property_alias->name); 18830 18831 # But some names are ambiguous, meaning a binary property with 18832 # the same name when used in \p{}, and a different 18833 # (non-binary) property in other contexts. 18834 next if grep { $name eq $_ } keys %ambiguous_names; 18835 18836 push @output, <<"EOF_CODE"; 18837Error('\\p{$name}'); 18838Error('\\P{$name}'); 18839EOF_CODE 18840 } 18841 } 18842 foreach my $table (sort { $a->has_dependency <=> $b->has_dependency 18843 or 18844 lc $a->name cmp lc $b->name 18845 } $property->tables) 18846 { 18847 18848 # Find code points that match, and don't match this table. 18849 my $valid = $table->get_valid_code_point; 18850 my $invalid = $table->get_invalid_code_point; 18851 my $warning = ($table->status eq $DEPRECATED) 18852 ? "'deprecated'" 18853 : '""'; 18854 18855 # Test each possible combination of the property's aliases with 18856 # the table's. If this gets to be too many, could do what is done 18857 # in the set_final_comment() for Tables 18858 my @table_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->aliases; 18859 next unless @table_aliases; 18860 my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->property->aliases; 18861 next unless @property_aliases; 18862 18863 # Every property can be optionally be prefixed by 'Is_', so test 18864 # that those work, by creating such a new alias for each 18865 # pre-existing one. 18866 push @property_aliases, map { Alias->new("Is_" . $_->name, 18867 $_->loose_match, 18868 $_->make_re_pod_entry, 18869 $_->ok_as_filename, 18870 $_->status, 18871 $_->ucd, 18872 ) 18873 } @property_aliases; 18874 my $max = max(scalar @table_aliases, scalar @property_aliases); 18875 for my $j (0 .. $max - 1) { 18876 18877 # The current alias for property is the next one on the list, 18878 # or if beyond the end, start over. Similarly for table 18879 my $property_name 18880 = $property_aliases[$j % @property_aliases]->name; 18881 18882 $property_name = "" if $table->property == $perl; 18883 my $table_alias = $table_aliases[$j % @table_aliases]; 18884 my $table_name = $table_alias->name; 18885 my $loose_match = $table_alias->loose_match; 18886 18887 # If the table doesn't have a file, any test for it is 18888 # already guaranteed to be in error 18889 my $already_error = ! $table->file_path; 18890 18891 # A table that begins with these could actually be a 18892 # user-defined property, so won't be compile time errors, as 18893 # the definitions of those can be deferred until runtime 18894 next if $already_error && $table_name =~ / ^ I[ns] /x; 18895 18896 # Generate error cases for this alias. 18897 push @output, generate_error($property_name, 18898 $table_name, 18899 $already_error); 18900 18901 # If the table is guaranteed to always generate an error, 18902 # quit now without generating success cases. 18903 next if $already_error; 18904 18905 # Now for the success cases. First, wildcard matching, as it 18906 # shouldn't have any randomization. 18907 if ($table_alias->status eq $NORMAL) { 18908 push @output, generate_wildcard_tests($property_name, 18909 $table_name, 18910 $valid, 18911 $invalid, 18912 $warning, 18913 ); 18914 } 18915 my $random; 18916 if ($loose_match) { 18917 18918 # For loose matching, create an extra test case for the 18919 # standard name. 18920 my $standard = standardize($table_name); 18921 18922 # $test_name should be a unique combination for each test 18923 # case; used just to avoid duplicate tests 18924 my $test_name = "$property_name=$standard"; 18925 18926 # Don't output duplicate test cases. 18927 if (! exists $test_generated{$test_name}) { 18928 $test_generated{$test_name} = 1; 18929 push @output, generate_tests($property_name, 18930 $standard, 18931 $valid, 18932 $invalid, 18933 $warning, 18934 ); 18935 if ($table_alias->status eq $NORMAL) { 18936 push @output, generate_wildcard_tests( 18937 $property_name, 18938 $standard, 18939 $valid, 18940 $invalid, 18941 $warning, 18942 ); 18943 } 18944 } 18945 $random = randomize_loose_name($table_name) 18946 } 18947 else { # Stricter match 18948 $random = randomize_stricter_name($table_name); 18949 } 18950 18951 # Now for the main test case for this alias. 18952 my $test_name = "$property_name=$random"; 18953 if (! exists $test_generated{$test_name}) { 18954 $test_generated{$test_name} = 1; 18955 push @output, generate_tests($property_name, 18956 $random, 18957 $valid, 18958 $invalid, 18959 $warning, 18960 ); 18961 18962 if ($property->name eq 'nv') { 18963 if ($table_name !~ qr{/}) { 18964 push @output, generate_tests($property_name, 18965 sprintf("%.15e", $table_name), 18966 $valid, 18967 $invalid, 18968 $warning, 18969 ); 18970 } 18971 else { 18972 # If the name is a rational number, add tests for a 18973 # non-reduced form, and for a floating point equivalent. 18974 18975 # 60 is a number divisible by a bunch of things 18976 my ($numerator, $denominator) = $table_name 18977 =~ m! (.+) / (.+) !x; 18978 $numerator *= 60; 18979 $denominator *= 60; 18980 push @output, generate_tests($property_name, 18981 "$numerator/$denominator", 18982 $valid, 18983 $invalid, 18984 $warning, 18985 ); 18986 18987 # Calculate the float, and the %e representation 18988 my $float = eval $table_name; 18989 my $e_representation = sprintf("%.*e", 18990 $E_FLOAT_PRECISION, $float); 18991 # Parse that 18992 my ($non_zeros, $zeros, $exponent_sign, $exponent) 18993 = $e_representation 18994 =~ / -? [1-9] \. (\d*?) (0*) e ([+-]) (\d+) /x; 18995 my $min_e_precision; 18996 my $min_f_precision; 18997 18998 if ($exponent_sign eq '+' && $exponent != 0) { 18999 Carp::my_carp_bug("Not yet equipped to handle" 19000 . " positive exponents"); 19001 return; 19002 } 19003 else { 19004 # We're trying to find the minimum precision that 19005 # is needed to indicate this particular rational 19006 # for the given $E_FLOAT_PRECISION. For %e, any 19007 # trailing zeros, like 1.500e-02 aren't needed, so 19008 # the correct value is how many non-trailing zeros 19009 # there are after the decimal point. 19010 $min_e_precision = length $non_zeros; 19011 19012 # For %f, like .01500, we want at least 19013 # $E_FLOAT_PRECISION digits, but any trailing 19014 # zeros aren't needed, so we can subtract the 19015 # length of those. But we also need to include 19016 # the zeros after the decimal point, but before 19017 # the first significant digit. 19018 $min_f_precision = $E_FLOAT_PRECISION 19019 + $exponent 19020 - length $zeros; 19021 } 19022 19023 # Make tests for each possible precision from 1 to 19024 # just past the worst case. 19025 my $upper_limit = ($min_e_precision > $min_f_precision) 19026 ? $min_e_precision 19027 : $min_f_precision; 19028 19029 for my $i (1 .. $upper_limit + 1) { 19030 for my $format ("e", "f") { 19031 my $this_table 19032 = sprintf("%.*$format", $i, $float); 19033 19034 # If we don't have enough precision digits, 19035 # make a fail test; otherwise a pass test. 19036 my $pass = ($format eq "e") 19037 ? $i >= $min_e_precision 19038 : $i >= $min_f_precision; 19039 if ($pass) { 19040 push @output, generate_tests($property_name, 19041 $this_table, 19042 $valid, 19043 $invalid, 19044 $warning, 19045 ); 19046 } 19047 elsif ( $format eq "e" 19048 19049 # Here we would fail, but in the %f 19050 # case, the representation at this 19051 # precision could actually be a 19052 # valid one for some other rational 19053 || ! grep { $this_table 19054 =~ / ^ $_ 0* $ /x } 19055 @valid_base_floats) 19056 { 19057 push @output, 19058 generate_error($property_name, 19059 $this_table, 19060 1 # 1 => already an 19061 # error 19062 ); 19063 } 19064 } 19065 } 19066 } 19067 } 19068 } 19069 } 19070 $table->DESTROY(); 19071 } 19072 $property->DESTROY(); 19073 } 19074 19075 # Make any test of the boundary (break) properties TODO if the code 19076 # doesn't match the version being compiled 19077 my $TODO_FAILING_BREAKS = ($version_of_mk_invlist_bounds ne $v_version) 19078 ? "\nsub TODO_FAILING_BREAKS { 1 }\n" 19079 : "\nsub TODO_FAILING_BREAKS { 0 }\n"; 19080 19081 @output= map { 19082 map s/^/ /mgr, 19083 map "$_;\n", 19084 split /;\n/, $_ 19085 } @output; 19086 19087 # Cause there to be 'if' statements to only execute a portion of this 19088 # long-running test each time, so that we can have a bunch of .t's running 19089 # in parallel 19090 my $chunks = 10 # Number of test files 19091 - 1 # For GCB & SB 19092 - 1 # For WB 19093 - 4; # LB split into this many files 19094 my @output_chunked; 19095 my $chunk_count=0; 19096 my $chunk_size= int(@output / $chunks) + 1; 19097 while (@output) { 19098 $chunk_count++; 19099 my @chunk= splice @output, 0, $chunk_size; 19100 push @output_chunked, 19101 "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n", 19102 @chunk, 19103 "}\n"; 19104 } 19105 19106 $chunk_count++; 19107 push @output_chunked, 19108 "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n", 19109 (map {" Test_GCB('$_');\n"} @backslash_X_tests), 19110 (map {" Test_SB('$_');\n"} @SB_tests), 19111 "}\n"; 19112 19113 19114 $chunk_size= int(@LB_tests / 4) + 1; 19115 @LB_tests = map {" Test_LB('$_');\n"} @LB_tests; 19116 while (@LB_tests) { 19117 $chunk_count++; 19118 my @chunk= splice @LB_tests, 0, $chunk_size; 19119 push @output_chunked, 19120 "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n", 19121 @chunk, 19122 "}\n"; 19123 } 19124 19125 $chunk_count++; 19126 push @output_chunked, 19127 "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n", 19128 (map {" Test_WB('$_');\n"} @WB_tests), 19129 "}\n"; 19130 19131 &write($t_path, 19132 0, # Not utf8; 19133 [$HEADER, 19134 $TODO_FAILING_BREAKS, 19135 <DATA>, 19136 @output_chunked, 19137 "Finished();\n", 19138 ]); 19139 19140 return; 19141} 19142 19143sub make_normalization_test_script() { 19144 print "Making normalization test script\n" if $verbosity >= $PROGRESS; 19145 19146 my $n_path = 'TestNorm.pl'; 19147 19148 unshift @normalization_tests, <<'END'; 19149use utf8; 19150use Test::More; 19151 19152sub ord_string { # Convert packed ords to printable string 19153 use charnames (); 19154 return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' } 19155 unpack "U*", shift) . "'"; 19156 #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) . "'"; 19157} 19158 19159sub Test_N { 19160 my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_; 19161 my $display_source = ord_string($source); 19162 my $display_nfc = ord_string($nfc); 19163 my $display_nfd = ord_string($nfd); 19164 my $display_nfkc = ord_string($nfkc); 19165 my $display_nfkd = ord_string($nfkd); 19166 19167 use Unicode::Normalize; 19168 # NFC 19169 # nfc == toNFC(source) == toNFC(nfc) == toNFC(nfd) 19170 # nfkc == toNFC(nfkc) == toNFC(nfkd) 19171 # 19172 # NFD 19173 # nfd == toNFD(source) == toNFD(nfc) == toNFD(nfd) 19174 # nfkd == toNFD(nfkc) == toNFD(nfkd) 19175 # 19176 # NFKC 19177 # nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) == 19178 # toNFKC(nfkc) == toNFKC(nfkd) 19179 # 19180 # NFKD 19181 # nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) == 19182 # toNFKD(nfkc) == toNFKD(nfkd) 19183 19184 is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc"); 19185 is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc"); 19186 is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc"); 19187 is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc"); 19188 is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc"); 19189 19190 is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd"); 19191 is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd"); 19192 is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd"); 19193 is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd"); 19194 is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd"); 19195 19196 is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc"); 19197 is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc"); 19198 is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc"); 19199 is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc"); 19200 is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc"); 19201 19202 is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd"); 19203 is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd"); 19204 is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd"); 19205 is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd"); 19206 is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd"); 19207} 19208END 19209 19210 &write($n_path, 19211 1, # Is utf8; 19212 [ 19213 @normalization_tests, 19214 'done_testing();' 19215 ]); 19216 return; 19217} 19218 19219# Skip reasons, so will be exact same text and hence the files with each 19220# reason will get grouped together in perluniprops. 19221my $Documentation = "Documentation"; 19222my $Indic_Skip 19223 = "Provisional; for the analysis and processing of Indic scripts"; 19224my $Validation = "Validation Tests"; 19225my $Validation_Documentation = "Documentation of validation Tests"; 19226my $Unused_Skip = "Currently unused by Perl"; 19227 19228# This is a list of the input files and how to handle them. The files are 19229# processed in their order in this list. Some reordering is possible if 19230# desired, but the PropertyAliases and PropValueAliases files should be first, 19231# and the extracted before the others (as data in an extracted file can be 19232# over-ridden by the non-extracted. Some other files depend on data derived 19233# from an earlier file, like UnicodeData requires data from Jamo, and the case 19234# changing and folding requires data from Unicode. Mostly, it is safest to 19235# order by first version releases in (except the Jamo). 19236# 19237# The version strings allow the program to know whether to expect a file or 19238# not, but if a file exists in the directory, it will be processed, even if it 19239# is in a version earlier than expected, so you can copy files from a later 19240# release into an earlier release's directory. 19241my @input_file_objects = ( 19242 Input_file->new('PropertyAliases.txt', v3.2, 19243 Handler => \&process_PropertyAliases, 19244 Early => [ \&substitute_PropertyAliases ], 19245 Required_Even_in_Debug_Skip => 1, 19246 ), 19247 Input_file->new(undef, v0, # No file associated with this 19248 Progress_Message => 'Finishing property setup', 19249 Handler => \&finish_property_setup, 19250 ), 19251 Input_file->new('PropValueAliases.txt', v3.2, 19252 Handler => \&process_PropValueAliases, 19253 Early => [ \&substitute_PropValueAliases ], 19254 Has_Missings_Defaults => $NOT_IGNORED, 19255 Required_Even_in_Debug_Skip => 1, 19256 ), 19257 Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0, 19258 Property => 'General_Category', 19259 ), 19260 Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0, 19261 Property => 'Canonical_Combining_Class', 19262 Has_Missings_Defaults => $NOT_IGNORED, 19263 ), 19264 Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0, 19265 Property => 'Numeric_Type', 19266 Has_Missings_Defaults => $NOT_IGNORED, 19267 ), 19268 Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0, 19269 Property => 'East_Asian_Width', 19270 Has_Missings_Defaults => $NOT_IGNORED, 19271 ), 19272 Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0, 19273 Property => 'Line_Break', 19274 Has_Missings_Defaults => $NOT_IGNORED, 19275 ), 19276 Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1, 19277 Property => 'Bidi_Class', 19278 Has_Missings_Defaults => $NOT_IGNORED, 19279 ), 19280 Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0, 19281 Property => 'Decomposition_Type', 19282 Has_Missings_Defaults => $NOT_IGNORED, 19283 ), 19284 Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0), 19285 Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0, 19286 Property => 'Numeric_Value', 19287 Each_Line_Handler => \&filter_numeric_value_line, 19288 Has_Missings_Defaults => $NOT_IGNORED, 19289 ), 19290 Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0, 19291 Property => 'Joining_Group', 19292 Has_Missings_Defaults => $NOT_IGNORED, 19293 ), 19294 19295 Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0, 19296 Property => 'Joining_Type', 19297 Has_Missings_Defaults => $NOT_IGNORED, 19298 ), 19299 Input_file->new("${EXTRACTED}DName.txt", v10.0.0, 19300 Skip => 'This file adds no new information not already' 19301 . ' present in other files', 19302 # And it's unnecessary programmer work to handle this new 19303 # format. Previous Derived files actually had bug fixes 19304 # in them that were useful, but that should not be the 19305 # case here. 19306 ), 19307 Input_file->new('Jamo.txt', v2.0.0, 19308 Property => 'Jamo_Short_Name', 19309 Each_Line_Handler => \&filter_jamo_line, 19310 ), 19311 Input_file->new('UnicodeData.txt', v1.1.5, 19312 Pre_Handler => \&setup_UnicodeData, 19313 19314 # We clean up this file for some early versions. 19315 Each_Line_Handler => [ (($v_version lt v2.0.0 ) 19316 ? \&filter_v1_ucd 19317 : ($v_version eq v2.1.5) 19318 ? \&filter_v2_1_5_ucd 19319 19320 # And for 5.14 Perls with 6.0, 19321 # have to also make changes 19322 : ($v_version ge v6.0.0 19323 && $^V lt v5.17.0) 19324 ? \&filter_v6_ucd 19325 : undef), 19326 19327 # Early versions did not have the 19328 # proper Unicode_1 names for the 19329 # controls 19330 (($v_version lt v3.0.0) 19331 ? \&filter_early_U1_names 19332 : undef), 19333 19334 # Early versions did not correctly 19335 # use the later method for giving 19336 # decimal digit values 19337 (($v_version le v3.2.0) 19338 ? \&filter_bad_Nd_ucd 19339 : undef), 19340 19341 # And the main filter 19342 \&filter_UnicodeData_line, 19343 ], 19344 EOF_Handler => \&EOF_UnicodeData, 19345 ), 19346 Input_file->new('CJKXREF.TXT', v1.1.5, 19347 Withdrawn => v2.0.0, 19348 Skip => 'Gives the mapping of CJK code points ' 19349 . 'between Unicode and various other standards', 19350 ), 19351 Input_file->new('ArabicShaping.txt', v2.0.0, 19352 Each_Line_Handler => 19353 ($v_version lt 4.1.0) 19354 ? \&filter_old_style_arabic_shaping 19355 : undef, 19356 # The first field after the range is a "schematic name" 19357 # not used by Perl 19358 Properties => [ '<ignored>', 'Joining_Type', 'Joining_Group' ], 19359 Has_Missings_Defaults => $NOT_IGNORED, 19360 ), 19361 Input_file->new('Blocks.txt', v2.0.0, 19362 Property => 'Block', 19363 Has_Missings_Defaults => $NOT_IGNORED, 19364 Each_Line_Handler => \&filter_blocks_lines 19365 ), 19366 Input_file->new('Index.txt', v2.0.0, 19367 Skip => 'Alphabetical index of Unicode characters', 19368 ), 19369 Input_file->new('NamesList.txt', v2.0.0, 19370 Skip => 'Annotated list of characters', 19371 ), 19372 Input_file->new('PropList.txt', v2.0.0, 19373 Each_Line_Handler => (($v_version lt v3.1.0) 19374 ? \&filter_old_style_proplist 19375 : undef), 19376 ), 19377 Input_file->new('Props.txt', v2.0.0, 19378 Withdrawn => v3.0.0, 19379 Skip => 'A subset of F<PropList.txt> (which is used instead)', 19380 ), 19381 Input_file->new('ReadMe.txt', v2.0.0, 19382 Skip => $Documentation, 19383 ), 19384 Input_file->new('Unihan.txt', v2.0.0, 19385 Withdrawn => v5.2.0, 19386 Construction_Time_Handler => \&construct_unihan, 19387 Pre_Handler => \&setup_unihan, 19388 Optional => [ "", 19389 'Unicode_Radical_Stroke' 19390 ], 19391 Each_Line_Handler => \&filter_unihan_line, 19392 ), 19393 Input_file->new('SpecialCasing.txt', v2.1.8, 19394 Each_Line_Handler => ($v_version eq 2.1.8) 19395 ? \&filter_2_1_8_special_casing_line 19396 : \&filter_special_casing_line, 19397 Pre_Handler => \&setup_special_casing, 19398 Has_Missings_Defaults => $IGNORED, 19399 ), 19400 Input_file->new( 19401 'LineBreak.txt', v3.0.0, 19402 Has_Missings_Defaults => $NOT_IGNORED, 19403 Property => 'Line_Break', 19404 # Early versions had problematic syntax 19405 Each_Line_Handler => ($v_version ge v3.1.0) 19406 ? undef 19407 : ($v_version lt v3.0.0) 19408 ? \&filter_substitute_lb 19409 : \&filter_early_ea_lb, 19410 # Must use long names for property values see comments at 19411 # sub filter_substitute_lb 19412 Early => [ "LBsubst.txt", '_Perl_LB', 'Alphabetic', 19413 'Alphabetic', # default to this because XX -> 19414 # AL 19415 19416 # Don't use _Perl_LB as a synonym for 19417 # Line_Break in later perls, as it is tailored 19418 # and isn't the same as Line_Break 19419 'ONLY_EARLY' ], 19420 ), 19421 Input_file->new('EastAsianWidth.txt', v3.0.0, 19422 Property => 'East_Asian_Width', 19423 Has_Missings_Defaults => $NOT_IGNORED, 19424 # Early versions had problematic syntax 19425 Each_Line_Handler => (($v_version lt v3.1.0) 19426 ? \&filter_early_ea_lb 19427 : undef), 19428 ), 19429 Input_file->new('CompositionExclusions.txt', v3.0.0, 19430 Property => 'Composition_Exclusion', 19431 ), 19432 Input_file->new('UnicodeData.html', v3.0.0, 19433 Withdrawn => v4.0.1, 19434 Skip => $Documentation, 19435 ), 19436 Input_file->new('BidiMirroring.txt', v3.0.1, 19437 Property => 'Bidi_Mirroring_Glyph', 19438 Has_Missings_Defaults => ($v_version lt v6.2.0) 19439 ? $NO_DEFAULTS 19440 # Is <none> which doesn't mean 19441 # anything to us, we will use the 19442 # null string 19443 : $IGNORED, 19444 ), 19445 Input_file->new('NamesList.html', v3.0.0, 19446 Skip => 'Describes the format and contents of ' 19447 . 'F<NamesList.txt>', 19448 ), 19449 Input_file->new('UnicodeCharacterDatabase.html', v3.0.0, 19450 Withdrawn => v5.1, 19451 Skip => $Documentation, 19452 ), 19453 Input_file->new('CaseFolding.txt', v3.0.1, 19454 Pre_Handler => \&setup_case_folding, 19455 Each_Line_Handler => 19456 [ ($v_version lt v3.1.0) 19457 ? \&filter_old_style_case_folding 19458 : undef, 19459 \&filter_case_folding_line 19460 ], 19461 Has_Missings_Defaults => $IGNORED, 19462 ), 19463 Input_file->new("NormTest.txt", v3.0.1, 19464 Handler => \&process_NormalizationsTest, 19465 Skip => ($make_norm_test_script) ? 0 : $Validation, 19466 ), 19467 Input_file->new('DCoreProperties.txt', v3.1.0, 19468 # 5.2 changed this file 19469 Has_Missings_Defaults => (($v_version ge v5.2.0) 19470 ? $NOT_IGNORED 19471 : $NO_DEFAULTS), 19472 ), 19473 Input_file->new('DProperties.html', v3.1.0, 19474 Withdrawn => v3.2.0, 19475 Skip => $Documentation, 19476 ), 19477 Input_file->new('PropList.html', v3.1.0, 19478 Withdrawn => v5.1, 19479 Skip => $Documentation, 19480 ), 19481 Input_file->new('Scripts.txt', v3.1.0, 19482 Property => 'Script', 19483 Each_Line_Handler => (($v_version le v4.0.0) 19484 ? \&filter_all_caps_script_names 19485 : undef), 19486 Has_Missings_Defaults => $NOT_IGNORED, 19487 ), 19488 Input_file->new('DNormalizationProps.txt', v3.1.0, 19489 Has_Missings_Defaults => $NOT_IGNORED, 19490 Each_Line_Handler => (($v_version lt v4.0.1) 19491 ? \&filter_old_style_normalization_lines 19492 : undef), 19493 ), 19494 Input_file->new('DerivedProperties.html', v3.1.1, 19495 Withdrawn => v5.1, 19496 Skip => $Documentation, 19497 ), 19498 Input_file->new('DAge.txt', v3.2.0, 19499 Has_Missings_Defaults => $NOT_IGNORED, 19500 Property => 'Age' 19501 ), 19502 Input_file->new('HangulSyllableType.txt', v4.0, 19503 Has_Missings_Defaults => $NOT_IGNORED, 19504 Early => [ \&generate_hst, 'Hangul_Syllable_Type' ], 19505 Property => 'Hangul_Syllable_Type' 19506 ), 19507 Input_file->new('NormalizationCorrections.txt', v3.2.0, 19508 # This documents the cumulative fixes to erroneous 19509 # normalizations in earlier Unicode versions. Its main 19510 # purpose is so that someone running on an earlier 19511 # version can use this file to override what got 19512 # published in that earlier release. It would be easy 19513 # for mktables to handle this file. But all the 19514 # corrections in it should already be in the other files 19515 # for the release it is. To get it to actually mean 19516 # something useful, someone would have to be using an 19517 # earlier Unicode release, and copy it into the directory 19518 # for that release and recompile. So far there has been 19519 # no demand to do that, so this hasn't been implemented. 19520 Skip => 'Documentation of corrections already ' 19521 . 'incorporated into the Unicode data base', 19522 ), 19523 Input_file->new('StandardizedVariants.html', v3.2.0, 19524 Skip => 'Obsoleted as of Unicode 9.0, but previously ' 19525 . 'provided a visual display of the standard ' 19526 . 'variant sequences derived from ' 19527 . 'F<StandardizedVariants.txt>.', 19528 # I don't know why the html came earlier than the 19529 # .txt, but both are skipped anyway, so it doesn't 19530 # matter. 19531 ), 19532 Input_file->new('StandardizedVariants.txt', v4.0.0, 19533 Skip => 'Certain glyph variations for character display ' 19534 . 'are standardized. This lists the non-Unihan ' 19535 . 'ones; the Unihan ones are also not used by ' 19536 . 'Perl, and are in a separate Unicode data base ' 19537 . 'L<http://www.unicode.org/ivd>', 19538 ), 19539 Input_file->new('UCD.html', v4.0.0, 19540 Withdrawn => v5.2, 19541 Skip => $Documentation, 19542 ), 19543 Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0, 19544 Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter' ], 19545 Property => 'Word_Break', 19546 Has_Missings_Defaults => $NOT_IGNORED, 19547 ), 19548 Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1, 19549 Early => [ \&generate_GCB, '_Perl_GCB' ], 19550 Property => 'Grapheme_Cluster_Break', 19551 Has_Missings_Defaults => $NOT_IGNORED, 19552 ), 19553 Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0, 19554 Handler => \&process_GCB_test, 19555 retain_trailing_comments => 1, 19556 ), 19557 Input_file->new("$AUXILIARY/GraphemeBreakTest.html", v4.1.0, 19558 Skip => $Validation_Documentation, 19559 ), 19560 Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0, 19561 Handler => \&process_SB_test, 19562 retain_trailing_comments => 1, 19563 ), 19564 Input_file->new("$AUXILIARY/SentenceBreakTest.html", v4.1.0, 19565 Skip => $Validation_Documentation, 19566 ), 19567 Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0, 19568 Handler => \&process_WB_test, 19569 retain_trailing_comments => 1, 19570 ), 19571 Input_file->new("$AUXILIARY/WordBreakTest.html", v4.1.0, 19572 Skip => $Validation_Documentation, 19573 ), 19574 Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0, 19575 Property => 'Sentence_Break', 19576 Early => [ "SBsubst.txt", '_Perl_SB', 'OLetter' ], 19577 Has_Missings_Defaults => $NOT_IGNORED, 19578 ), 19579 Input_file->new('NamedSequences.txt', v4.1.0, 19580 Handler => \&process_NamedSequences 19581 ), 19582 Input_file->new('Unihan.html', v4.1.0, 19583 Withdrawn => v5.2, 19584 Skip => $Documentation, 19585 ), 19586 Input_file->new('NameAliases.txt', v5.0, 19587 Property => 'Name_Alias', 19588 Each_Line_Handler => ($v_version le v6.0.0) 19589 ? \&filter_early_version_name_alias_line 19590 : \&filter_later_version_name_alias_line, 19591 ), 19592 # NameAliases.txt came along in v5.0. The above constructor handles 19593 # this. But until 6.1, it was lacking some information needed by core 19594 # perl. The constructor below handles that. It is either a kludge or 19595 # clever, depending on your point of view. The 'Withdrawn' parameter 19596 # indicates not to use it at all starting in 6.1 (so the above 19597 # constructor applies), and the 'v6.1' parameter indicates to use the 19598 # Early parameter before 6.1. Therefore 'Early" is always used, 19599 # yielding the internal-only property '_Perl_Name_Alias', which it 19600 # gets from a NameAliases.txt from 6.1 or later stored in 19601 # N_Asubst.txt. In combination with the above constructor, 19602 # 'Name_Alias' is publicly accessible starting with v5.0, and the 19603 # better 6.1 version is accessible to perl core in all releases. 19604 Input_file->new("NameAliases.txt", v6.1, 19605 Withdrawn => v6.1, 19606 Early => [ "N_Asubst.txt", '_Perl_Name_Alias', "" ], 19607 Property => 'Name_Alias', 19608 EOF_Handler => \&fixup_early_perl_name_alias, 19609 Each_Line_Handler => 19610 \&filter_later_version_name_alias_line, 19611 ), 19612 Input_file->new('NamedSqProv.txt', v5.0.0, 19613 Skip => 'Named sequences proposed for inclusion in a ' 19614 . 'later version of the Unicode Standard; if you ' 19615 . 'need them now, you can append this file to ' 19616 . 'F<NamedSequences.txt> and recompile perl', 19617 ), 19618 Input_file->new("$AUXILIARY/LBTest.txt", v5.1.0, 19619 Handler => \&process_LB_test, 19620 retain_trailing_comments => 1, 19621 ), 19622 Input_file->new("$AUXILIARY/LineBreakTest.html", v5.1.0, 19623 Skip => $Validation_Documentation, 19624 ), 19625 Input_file->new("BidiTest.txt", v5.2.0, 19626 Skip => $Validation, 19627 ), 19628 Input_file->new('UnihanIndicesDictionary.txt', v5.2.0, 19629 Optional => "", 19630 Each_Line_Handler => \&filter_unihan_line, 19631 ), 19632 Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0, 19633 Optional => "", 19634 Each_Line_Handler => \&filter_unihan_line, 19635 ), 19636 Input_file->new('UnihanIRGSources.txt', v5.2.0, 19637 Optional => [ "", 19638 'kCompatibilityVariant', 19639 'kIICore', 19640 'kIRG_GSource', 19641 'kIRG_HSource', 19642 'kIRG_JSource', 19643 'kIRG_KPSource', 19644 'kIRG_MSource', 19645 'kIRG_KSource', 19646 'kIRG_SSource', 19647 'kIRG_TSource', 19648 'kIRG_USource', 19649 'kIRG_UKSource', 19650 'kIRG_VSource', 19651 ], 19652 Pre_Handler => \&setup_unihan, 19653 Each_Line_Handler => \&filter_unihan_line, 19654 ), 19655 Input_file->new('UnihanNumericValues.txt', v5.2.0, 19656 Optional => [ "", 19657 'kAccountingNumeric', 19658 'kOtherNumeric', 19659 'kPrimaryNumeric', 19660 ], 19661 Each_Line_Handler => \&filter_unihan_line, 19662 ), 19663 Input_file->new('UnihanOtherMappings.txt', v5.2.0, 19664 Optional => "", 19665 Each_Line_Handler => \&filter_unihan_line, 19666 ), 19667 Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0, 19668 Optional => [ "", 19669 'Unicode_Radical_Stroke' 19670 ], 19671 Each_Line_Handler => \&filter_unihan_line, 19672 ), 19673 Input_file->new('UnihanReadings.txt', v5.2.0, 19674 Optional => "", 19675 Each_Line_Handler => \&filter_unihan_line, 19676 ), 19677 Input_file->new('UnihanVariants.txt', v5.2.0, 19678 Optional => "", 19679 Each_Line_Handler => \&filter_unihan_line, 19680 ), 19681 Input_file->new('CJKRadicals.txt', v5.2.0, 19682 Skip => 'Maps the kRSUnicode property values to ' 19683 . 'corresponding code points', 19684 ), 19685 Input_file->new('EmojiSources.txt', v6.0.0, 19686 Skip => 'Maps certain Unicode code points to their ' 19687 . 'legacy Japanese cell-phone values', 19688 ), 19689 # This file is actually not usable as-is until 6.1.0, because the property 19690 # is provisional, so its name is missing from PropertyAliases.txt until 19691 # that release, so that further work would have to be done to get it to 19692 # work properly 19693 Input_file->new('ScriptExtensions.txt', v6.0.0, 19694 Property => 'Script_Extensions', 19695 Early => [ sub {} ], # Doesn't do anything but ensures 19696 # that this isn't skipped for early 19697 # versions 19698 Pre_Handler => \&setup_script_extensions, 19699 Each_Line_Handler => \&filter_script_extensions_line, 19700 Has_Missings_Defaults => (($v_version le v6.0.0) 19701 ? $NO_DEFAULTS 19702 : $IGNORED), 19703 ), 19704 # These two Indic files are actually not usable as-is until 6.1.0, 19705 # because they are provisional, so their property values are missing from 19706 # PropValueAliases.txt until that release, so that further work would have 19707 # to be done to get them to work properly. 19708 Input_file->new('IndicMatraCategory.txt', v6.0.0, 19709 Withdrawn => v8.0.0, 19710 Property => 'Indic_Matra_Category', 19711 Has_Missings_Defaults => $NOT_IGNORED, 19712 Skip => $Indic_Skip, 19713 ), 19714 Input_file->new('IndicSyllabicCategory.txt', v6.0.0, 19715 Property => 'Indic_Syllabic_Category', 19716 Has_Missings_Defaults => $NOT_IGNORED, 19717 Skip => (($v_version lt v8.0.0) 19718 ? $Indic_Skip 19719 : 0), 19720 ), 19721 Input_file->new('USourceData.txt', v6.2.0, 19722 Skip => 'Documentation of status and cross reference of ' 19723 . 'proposals for encoding by Unicode of Unihan ' 19724 . 'characters', 19725 ), 19726 Input_file->new('USourceGlyphs.pdf', v6.2.0, 19727 Skip => 'Pictures of the characters in F<USourceData.txt>', 19728 ), 19729 Input_file->new('BidiBrackets.txt', v6.3.0, 19730 Properties => [ 'Bidi_Paired_Bracket', 19731 'Bidi_Paired_Bracket_Type' 19732 ], 19733 Has_Missings_Defaults => $NO_DEFAULTS, 19734 ), 19735 Input_file->new("BidiCharacterTest.txt", v6.3.0, 19736 Skip => $Validation, 19737 ), 19738 Input_file->new('IndicPositionalCategory.txt', v8.0.0, 19739 Property => 'Indic_Positional_Category', 19740 Has_Missings_Defaults => $NOT_IGNORED, 19741 ), 19742 Input_file->new('TangutSources.txt', v9.0.0, 19743 Skip => 'Specifies source mappings for Tangut ideographs' 19744 . ' and components. This data file also includes' 19745 . ' informative radical-stroke values that are used' 19746 . ' internally by Unicode', 19747 ), 19748 Input_file->new('VerticalOrientation.txt', v10.0.0, 19749 Property => 'Vertical_Orientation', 19750 Has_Missings_Defaults => $NOT_IGNORED, 19751 ), 19752 Input_file->new('NushuSources.txt', v10.0.0, 19753 Skip => 'Specifies source material for Nushu characters', 19754 ), 19755 Input_file->new('EquivalentUnifiedIdeograph.txt', v11.0.0, 19756 Property => 'Equivalent_Unified_Ideograph', 19757 Has_Missings_Defaults => $NOT_IGNORED, 19758 ), 19759 Input_file->new('EmojiData.txt', v11.0.0, 19760 # Is in UAX #51 and not the UCD, so must be updated 19761 # separately, and the first line edited to indicate the 19762 # UCD release we're pretending it to be in. The UTC says 19763 # this is a transitional state, and in fact was moved to 19764 # the UCD in 13.0 19765 Withdrawn => v13.0.0, 19766 Pre_Handler => \&setup_emojidata, 19767 Has_Missings_Defaults => $NOT_IGNORED, 19768 Each_Line_Handler => \&filter_emojidata_line, 19769 UCD => 0, 19770 ), 19771 Input_file->new("$EMOJI/emoji.txt", v13.0.0, 19772 Has_Missings_Defaults => $NOT_IGNORED, 19773 UCD => 0, 19774 ), 19775 Input_file->new("$EMOJI/ReadMe.txt", v13.0.0, 19776 Skip => $Documentation, 19777 UCD => 0, 19778 ), 19779 Input_file->new('IdStatus.txt', v13.0.0, 19780 Pre_Handler => \&setup_IdStatus, 19781 Property => 'Identifier_Status', 19782 UCD => 0, 19783 ), 19784 Input_file->new('IdType.txt', v13.0.0, 19785 Pre_Handler => \&setup_IdType, 19786 Each_Line_Handler => \&filter_IdType_line, 19787 Property => 'Identifier_Type', 19788 UCD => 0, 19789 ), 19790 Input_file->new('confusables.txt', v15.0.0, 19791 Skip => $Unused_Skip, 19792 UCD => 0, 19793 ), 19794 Input_file->new('confusablesSummary.txt', v15.0.0, 19795 Skip => $Unused_Skip, 19796 UCD => 0, 19797 ), 19798 Input_file->new('intentional.txt', v15.0.0, 19799 Skip => $Unused_Skip, 19800 UCD => 0, 19801 ), 19802); 19803 19804# End of all the preliminaries. 19805# Do it... 19806 19807if (@missing_early_files) { 19808 print simple_fold(join_lines(<<END 19809 19810The compilation cannot be completed because one or more required input files, 19811listed below, are missing. This is because you are compiling Unicode version 19812$unicode_version, which predates the existence of these file(s). To fully 19813function, perl needs the data that these files would have contained if they 19814had been in this release. To work around this, create copies of later 19815versions of the missing files in the directory containing '$0'. (Perl will 19816make the necessary adjustments to the data to compensate for it not being the 19817same version as is being compiled.) The files are available from unicode.org, 19818via either ftp or http. If using http, they will be under 19819www.unicode.org/versions/. Below are listed the source file name of each 19820missing file, the Unicode version to copy it from, and the name to store it 19821as. (Note that the listed source file name may not be exactly the one that 19822Unicode calls it. If you don't find it, you can look it up in 'README.perl' 19823to get the correct name.) 19824END 19825 )); 19826 print simple_fold(join_lines("\n$_")) for @missing_early_files; 19827 exit 2; 19828} 19829 19830if ($compare_versions) { 19831 Carp::my_carp(<<END 19832Warning. \$compare_versions is set. Output is not suitable for production 19833END 19834 ); 19835} 19836 19837# Put into %potential_files a list of all the files in the directory structure 19838# that could be inputs to this program 19839File::Find::find({ 19840 wanted=>sub { 19841 return unless / \. ( txt | htm l? ) $ /xi; # Some platforms change the 19842 # name's case 19843 my $full = lc(File::Spec->rel2abs($_)); 19844 $potential_files{$full} = 1; 19845 return; 19846 } 19847}, File::Spec->curdir()); 19848 19849my @mktables_list_output_files; 19850my $old_start_time = 0; 19851my $old_options = ""; 19852 19853if (! -e $file_list) { 19854 print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE; 19855 $write_unchanged_files = 1; 19856} elsif ($write_unchanged_files) { 19857 print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE; 19858} 19859else { 19860 print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE; 19861 my $file_handle; 19862 if (! open $file_handle, "<", $file_list) { 19863 Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!"); 19864 $glob_list = 1; 19865 } 19866 else { 19867 my @input; 19868 19869 # Read and parse mktables.lst, placing the results from the first part 19870 # into @input, and the second part into @mktables_list_output_files 19871 for my $list ( \@input, \@mktables_list_output_files ) { 19872 while (<$file_handle>) { 19873 s/^ \s+ | \s+ $//xg; 19874 if (/^ \s* \# \s* Autogenerated\ starting\ on\ (\d+)/x) { 19875 $old_start_time = $1; 19876 next; 19877 } 19878 if (/^ \s* \# \s* From\ options\ (.+) /x) { 19879 $old_options = $1; 19880 next; 19881 } 19882 next if /^ \s* (?: \# .* )? $/x; 19883 last if /^ =+ $/x; 19884 my ( $file ) = split /\t/; 19885 push @$list, $file; 19886 } 19887 @$list = uniques(@$list); 19888 next; 19889 } 19890 19891 # Look through all the input files 19892 foreach my $input (@input) { 19893 next if $input eq 'version'; # Already have checked this. 19894 19895 # Ignore if doesn't exist. The checking about whether we care or 19896 # not is done via the Input_file object. 19897 next if ! file_exists($input); 19898 19899 # The paths are stored with relative names, and with '/' as the 19900 # delimiter; convert to absolute on this machine 19901 my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input))); 19902 $potential_files{lc $full} = 1; 19903 } 19904 } 19905 19906 close $file_handle; 19907} 19908 19909if ($glob_list) { 19910 19911 # Here wants to process all .txt files in the directory structure. 19912 # Convert them to full path names. They are stored in the platform's 19913 # relative style 19914 my @known_files; 19915 foreach my $object (@input_file_objects) { 19916 my $file = $object->file; 19917 next unless defined $file; 19918 push @known_files, File::Spec->rel2abs($file); 19919 } 19920 19921 my @unknown_input_files; 19922 foreach my $file (keys %potential_files) { # The keys are stored in lc 19923 next if grep { $file eq lc($_) } @known_files; 19924 19925 # Here, the file is unknown to us. Get relative path name 19926 $file = File::Spec->abs2rel($file); 19927 push @unknown_input_files, $file; 19928 19929 # What will happen is we create a data structure for it, and add it to 19930 # the list of input files to process. First get the subdirectories 19931 # into an array 19932 my (undef, $directories, undef) = File::Spec->splitpath($file); 19933 $directories =~ s;/$;;; # Can have extraneous trailing '/' 19934 my @directories = File::Spec->splitdir($directories); 19935 19936 # If the file isn't extracted (meaning none of the directories is the 19937 # extracted one), just add it to the end of the list of inputs. 19938 if (! grep { $EXTRACTED_DIR eq $_ } @directories) { 19939 push @input_file_objects, Input_file->new($file, v0); 19940 } 19941 else { 19942 19943 # Here, the file is extracted. It needs to go ahead of most other 19944 # processing. Search for the first input file that isn't a 19945 # special required property (that is, find one whose first_release 19946 # is non-0), and isn't extracted. Also, the Age property file is 19947 # processed before the extracted ones, just in case 19948 # $compare_versions is set. 19949 for (my $i = 0; $i < @input_file_objects; $i++) { 19950 if ($input_file_objects[$i]->first_released ne v0 19951 && lc($input_file_objects[$i]->file) ne 'dage.txt' 19952 && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i) 19953 { 19954 splice @input_file_objects, $i, 0, 19955 Input_file->new($file, v0); 19956 last; 19957 } 19958 } 19959 19960 } 19961 } 19962 if (@unknown_input_files) { 19963 print STDERR simple_fold(join_lines(<<END 19964 19965The following files are unknown as to how to handle. Assuming they are 19966typical property files. You'll know by later error messages if it worked or 19967not: 19968END 19969 ) . " " . join(", ", @unknown_input_files) . "\n\n"); 19970 } 19971} # End of looking through directory structure for more .txt files. 19972 19973# Create the list of input files from the objects we have defined, plus 19974# version 19975my @input_files = qw(version Makefile); 19976foreach my $object (@input_file_objects) { 19977 my $file = $object->file; 19978 next if ! defined $file; # Not all objects have files 19979 next if defined $object->skip;; 19980 push @input_files, $file; 19981} 19982 19983if ( $verbosity >= $VERBOSE ) { 19984 print "Expecting ".scalar( @input_files )." input files. ", 19985 "Checking ".scalar( @mktables_list_output_files )." output files.\n"; 19986} 19987 19988# We set $most_recent to be the most recently changed input file, including 19989# this program itself (done much earlier in this file) 19990foreach my $in (@input_files) { 19991 next unless -e $in; # Keep going even if missing a file 19992 my $mod_time = (stat $in)[9]; 19993 $most_recent = $mod_time if $mod_time > $most_recent; 19994 19995 # See that the input files have distinct names, to warn someone if they 19996 # are adding a new one 19997 if ($make_list) { 19998 my ($volume, $directories, $file ) = File::Spec->splitpath($in); 19999 $directories =~ s;/$;;; # Can have extraneous trailing '/' 20000 my @directories = File::Spec->splitdir($directories); 20001 construct_filename($file, 'mutable', \@directories); 20002 } 20003} 20004 20005# We use 'Makefile' just to see if it has changed since the last time we 20006# rebuilt. Now discard it. 20007@input_files = grep { $_ ne 'Makefile' } @input_files; 20008 20009my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild 20010 || ! scalar @mktables_list_output_files # or if no outputs known 20011 || $old_start_time < $most_recent # or out-of-date 20012 || $old_options ne $command_line_arguments; # or with different 20013 # options 20014 20015# Now we check to see if any output files are older than youngest, if 20016# they are, we need to continue on, otherwise we can presumably bail. 20017if (! $rebuild) { 20018 foreach my $out (@mktables_list_output_files) { 20019 if ( ! file_exists($out)) { 20020 print "'$out' is missing.\n" if $verbosity >= $VERBOSE; 20021 $rebuild = 1; 20022 last; 20023 } 20024 #local $to_trace = 1 if main::DEBUG; 20025 trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace; 20026 if ( (stat $out)[9] <= $most_recent ) { 20027 #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace; 20028 print "'$out' is too old.\n" if $verbosity >= $VERBOSE; 20029 $rebuild = 1; 20030 last; 20031 } 20032 } 20033} 20034if (! $rebuild) { 20035 print "$0: Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n"; 20036 exit(0); 20037} 20038print "$0: Must rebuild tables.\n" if $verbosity >= $VERBOSE; 20039 20040# Ready to do the major processing. First create the perl pseudo-property. 20041$perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1); 20042 20043# Process each input file 20044foreach my $file (@input_file_objects) { 20045 $file->run; 20046} 20047 20048# Finish the table generation. 20049 20050print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS; 20051finish_Unicode(); 20052 20053# For the very specialized case of comparing two Unicode versions... 20054if (DEBUG && $compare_versions) { 20055 handle_compare_versions(); 20056} 20057 20058print "Compiling Perl properties\n" if $verbosity >= $PROGRESS; 20059compile_perl(); 20060 20061print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS; 20062add_perl_synonyms(); 20063 20064print "Writing tables\n" if $verbosity >= $PROGRESS; 20065write_all_tables(); 20066 20067# Write mktables.lst 20068if ( $file_list and $make_list ) { 20069 20070 print "Updating '$file_list'\n" if $verbosity >= $PROGRESS; 20071 foreach my $file (@input_files, @files_actually_output) { 20072 my (undef, $directories, $basefile) = File::Spec->splitpath($file); 20073 my @directories = grep length, File::Spec->splitdir($directories); 20074 $file = join '/', @directories, $basefile; 20075 } 20076 20077 my $ofh; 20078 if (! open $ofh,">",$file_list) { 20079 Carp::my_carp("Can't write to '$file_list'. Skipping: $!"); 20080 return 20081 } 20082 else { 20083 my $localtime = localtime $start_time; 20084 print $ofh <<"END"; 20085# 20086# $file_list -- File list for $0. 20087# 20088# Autogenerated starting on $start_time ($localtime) 20089# From options $command_line_arguments 20090# 20091# - First section is input files 20092# ($0 itself is not listed but is automatically considered an input) 20093# - Section separator is /^=+\$/ 20094# - Second section is a list of output files. 20095# - Lines matching /^\\s*#/ are treated as comments 20096# which along with blank lines are ignored. 20097# 20098 20099# Input files: 20100 20101END 20102 print $ofh "$_\n" for sort(@input_files); 20103 print $ofh "\n=================================\n# Output files:\n\n"; 20104 print $ofh "$_\n" for sort @files_actually_output; 20105 print $ofh "\n# ",scalar(@input_files)," input files\n", 20106 "# ",scalar(@files_actually_output)+1," output files\n\n", 20107 "# End list\n"; 20108 close $ofh 20109 or Carp::my_carp("Failed to close $ofh: $!"); 20110 20111 print "Filelist has ",scalar(@input_files)," input files and ", 20112 scalar(@files_actually_output)+1," output files\n" 20113 if $verbosity >= $VERBOSE; 20114 } 20115} 20116 20117# Output these warnings unless -q explicitly specified. 20118if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) { 20119 if (@unhandled_properties) { 20120 print "\nProperties and tables that unexpectedly have no code points\n"; 20121 foreach my $property (sort @unhandled_properties) { 20122 print $property, "\n"; 20123 } 20124 } 20125 20126 if (%potential_files) { 20127 print "\nInput files that are not considered:\n"; 20128 foreach my $file (sort keys %potential_files) { 20129 print File::Spec->abs2rel($file), "\n"; 20130 } 20131 } 20132 print "\nAll done\n" if $verbosity >= $VERBOSE; 20133} 20134 20135if ($version_of_mk_invlist_bounds lt $v_version) { 20136 Carp::my_carp("WARNING: \\b{} algorithms (regen/mk_invlist.pl) need" 20137 . " to be checked and possibly updated to Unicode" 20138 . " $string_version. Failing tests will be marked TODO"); 20139} 20140 20141exit(0); 20142 20143# TRAILING CODE IS USED BY make_property_test_script() 20144__DATA__ 20145 20146use strict; 20147use warnings; 20148 20149use feature 'signatures'; 20150 20151no warnings 'experimental::uniprop_wildcards'; 20152 20153# Test qr/\X/ and the \p{} regular expression constructs. This file is 20154# constructed by mktables from the tables it generates, so if mktables is 20155# buggy, this won't necessarily catch those bugs. Tests are generated for all 20156# feasible properties; a few aren't currently feasible; see 20157# is_code_point_usable() in mktables for details. 20158 20159# Standard test packages are not used because this manipulates SIG_WARN. It 20160# exits 0 if every non-skipped test succeeded; -1 if any failed. 20161 20162my $Tests = 0; 20163my $Fails = 0; 20164 20165# loc_tools.pl requires this function to be defined 20166sub ok($pass, @msg) { 20167 print "not " unless $pass; 20168 print "ok "; 20169 print ++$Tests; 20170 print " - ", join "", @msg if @msg; 20171 print "\n"; 20172} 20173 20174sub Expect($expected, $ord, $regex, $warning_type='') { 20175 my $line = (caller)[2]; 20176 20177 # Convert the code point to hex form 20178 my $string = sprintf "\"\\x{%04X}\"", $ord; 20179 20180 my @tests = ""; 20181 20182 # The first time through, use all warnings. If the input should generate 20183 # a warning, add another time through with them turned off 20184 push @tests, "no warnings '$warning_type';" if $warning_type; 20185 20186 foreach my $no_warnings (@tests) { 20187 20188 # Store any warning messages instead of outputting them 20189 local $SIG{__WARN__} = $SIG{__WARN__}; 20190 my $warning_message; 20191 $SIG{__WARN__} = sub { $warning_message = $_[0] }; 20192 20193 $Tests++; 20194 20195 # A string eval is needed because of the 'no warnings'. 20196 # Assumes no parentheses in the regular expression 20197 my $result = eval "$no_warnings 20198 my \$RegObj = qr($regex); 20199 $string =~ \$RegObj ? 1 : 0"; 20200 if (not defined $result) { 20201 print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n"; 20202 $Fails++; 20203 } 20204 elsif ($result ^ $expected) { 20205 print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n"; 20206 $Fails++; 20207 } 20208 elsif ($warning_message) { 20209 if (! $warning_type || ($warning_type && $no_warnings)) { 20210 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n"; 20211 $Fails++; 20212 } 20213 else { 20214 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n"; 20215 } 20216 } 20217 elsif ($warning_type && ! $no_warnings) { 20218 print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n"; 20219 $Fails++; 20220 } 20221 else { 20222 print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n"; 20223 } 20224 } 20225 return; 20226} 20227 20228sub Error($regex) { 20229 $Tests++; 20230 if (eval { 'x' =~ qr/$regex/; 1 }) { 20231 $Fails++; 20232 my $line = (caller)[2]; 20233 print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n"; 20234 } 20235 else { 20236 my $line = (caller)[2]; 20237 print "ok $Tests - got and expected error for qr/$regex/; line $line\n"; 20238 } 20239 return; 20240} 20241 20242# Break test files (e.g. GCBTest.txt) character that break allowed here 20243my $breakable_utf8 = my $breakable = chr(utf8::unicode_to_native(0xF7)); 20244utf8::upgrade($breakable_utf8); 20245 20246# Break test files (e.g. GCBTest.txt) character that indicates can't break 20247# here 20248my $nobreak_utf8 = my $nobreak = chr(utf8::unicode_to_native(0xD7)); 20249utf8::upgrade($nobreak_utf8); 20250 20251my $are_ctype_locales_available; 20252my $utf8_locale; 20253chdir 't' if -d 't'; 20254eval { require "./loc_tools.pl" }; 20255if (defined &locales_enabled) { 20256 $are_ctype_locales_available = locales_enabled('LC_CTYPE'); 20257 if ($are_ctype_locales_available) { 20258 $utf8_locale = &find_utf8_ctype_locale; 20259 } 20260} 20261 20262# Eval'd so can run on versions earlier than the property is available in 20263my $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}\p{WB=ZWJ}]/'; 20264if (! defined $WB_Extend_or_Format_re) { 20265 $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}]/'; 20266} 20267 20268sub _test_break($template, $break_type) { 20269 # Test various break property matches. The 2nd parameter gives the 20270 # property name. The input is a line from auxiliary/*Test.txt for the 20271 # given property. Each such line is a sequence of Unicode (not native) 20272 # code points given by their hex numbers, separated by the two characters 20273 # defined just before this subroutine that indicate that either there can 20274 # or cannot be a break between the adjacent code points. All these are 20275 # tested. 20276 # 20277 # For the gcb property extra tests are made. if there isn't a break, that 20278 # means the sequence forms an extended grapheme cluster, which means that 20279 # \X should match the whole thing. If there is a break, \X should stop 20280 # there. This is all converted by this routine into a match: $string =~ 20281 # /(\X)/, Each \X should match the next cluster; and that is what is 20282 # checked. 20283 20284 my $line = (caller 1)[2]; # Line number 20285 my $comment = ""; 20286 20287 if ($template =~ / ( .*? ) \s* \# (.*) /x) { 20288 $template = $1; 20289 $comment = $2; 20290 20291 # Replace leading spaces with a single one. 20292 $comment =~ s/ ^ \s* / # /x; 20293 } 20294 20295 # The line contains characters above the ASCII range, but in Latin1. It 20296 # may or may not be in utf8, and if it is, it may or may not know it. So, 20297 # convert these characters to 8 bits. If knows is in utf8, simply 20298 # downgrade. 20299 if (utf8::is_utf8($template)) { 20300 utf8::downgrade($template); 20301 } else { 20302 20303 # Otherwise, if it is in utf8, but doesn't know it, the next lines 20304 # convert the two problematic characters to their 8-bit equivalents. 20305 # If it isn't in utf8, they don't harm anything. 20306 use bytes; 20307 $template =~ s/$nobreak_utf8/$nobreak/g; 20308 $template =~ s/$breakable_utf8/$breakable/g; 20309 } 20310 20311 # Perl customizes wb. So change the official tests accordingly 20312 if ($break_type eq 'wb' && $WB_Extend_or_Format_re) { 20313 20314 # Split into elements that alternate between code point and 20315 # break/no-break 20316 my @line = split / +/, $template; 20317 20318 # Look at each code point and its following one 20319 for (my $i = 1; $i < @line - 1 - 1; $i+=2) { 20320 20321 # The customization only involves changing some breaks to 20322 # non-breaks. 20323 next if $line[$i+1] =~ /$nobreak/; 20324 20325 my $lhs = chr utf8::unicode_to_native(hex $line[$i]); 20326 my $rhs = chr utf8::unicode_to_native(hex $line[$i+2]); 20327 20328 # And it only affects adjacent space characters. 20329 next if $lhs !~ /\s/u; 20330 20331 # But, we want to make sure to test spaces followed by a Extend 20332 # or Format. 20333 next if $rhs !~ /\s|$WB_Extend_or_Format_re/; 20334 20335 # To test the customization, add some white-space before this to 20336 # create a span. The $lhs white space may or may not be bound to 20337 # that span, and also with the $rhs. If the $rhs is a binding 20338 # character, the $lhs is bound to it and not to the span, unless 20339 # $lhs is vertical space. In all other cases, the $lhs is bound 20340 # to the span. If the $rhs is white space, it is bound to the 20341 # $lhs 20342 my $bound; 20343 my $span; 20344 if ($rhs =~ /$WB_Extend_or_Format_re/) { 20345 if ($lhs =~ /\v/) { 20346 $bound = $breakable; 20347 $span = $nobreak; 20348 } 20349 else { 20350 $bound = $nobreak; 20351 $span = $breakable; 20352 } 20353 } 20354 else { 20355 $span = $nobreak; 20356 $bound = $nobreak; 20357 } 20358 20359 splice @line, $i, 0, ( '0020', $nobreak, '0020', $span); 20360 $i += 4; 20361 $line[$i+1] = $bound; 20362 } 20363 $template = join " ", @line; 20364 } 20365 20366 # The input is just the break/no-break symbols and sequences of Unicode 20367 # code points as hex digits separated by spaces for legibility. e.g.: 20368 # ÷ 0020 × 0308 ÷ 0020 ÷ 20369 # Convert to native \x format 20370 $template =~ s/ \s* ( [[:xdigit:]]+ ) \s* /sprintf("\\x{%02X}", utf8::unicode_to_native(hex $1))/gex; 20371 $template =~ s/ \s* //gx; # Probably the line above removed all spaces; 20372 # but be sure 20373 20374 # Make a copy of the input with the symbols replaced by \b{} and \B{} as 20375 # appropriate 20376 my $break_pattern = $template =~ s/ $breakable /\\b{$break_type}/grx; 20377 $break_pattern =~ s/ $nobreak /\\B{$break_type}/gx; 20378 20379 my $display_string = $template =~ s/[$breakable$nobreak]//gr; 20380 my $string = eval "\"$display_string\""; 20381 20382 # The remaining massaging of the input is for the \X tests. Get rid of 20383 # the leading and trailing breakables 20384 $template =~ s/^ \s* $breakable \s* //x; 20385 $template =~ s/ \s* $breakable \s* $ //x; 20386 20387 # Delete no-breaks 20388 $template =~ s/ \s* $nobreak \s* //xg; 20389 20390 # Split the input into segments that are breakable between them. 20391 my @should_display = split /\s*$breakable\s*/, $template; 20392 my @should_match = map { eval "\"$_\"" } @should_display; 20393 20394 # If a string can be represented in both non-ut8 and utf8, test both cases 20395 my $display_upgrade = ""; 20396 UPGRADE: 20397 for my $to_upgrade (0 .. 1) { 20398 20399 if ($to_upgrade) { 20400 20401 # If already in utf8, would just be a repeat 20402 next UPGRADE if utf8::is_utf8($string); 20403 20404 utf8::upgrade($string); 20405 $display_upgrade = " (utf8-upgraded)"; 20406 } 20407 20408 my @modifiers = qw(a aa d u i); 20409 if ($are_ctype_locales_available) { 20410 push @modifiers, "l$utf8_locale" if defined $utf8_locale; 20411 20412 # The /l modifier has C after it to indicate the locale to try 20413 push @modifiers, "lC"; 20414 } 20415 20416 # Test for each of the regex modifiers. 20417 for my $modifier (@modifiers) { 20418 my $display_locale = ""; 20419 20420 # For /l, set the locale to what it says to. 20421 if ($modifier =~ / ^ l (.*) /x) { 20422 my $locale = $1; 20423 $display_locale = "(locale = $locale)"; 20424 POSIX::setlocale(POSIX::LC_CTYPE(), $locale); 20425 $modifier = 'l'; 20426 } 20427 20428 no warnings qw(locale regexp surrogate); 20429 my $pattern = "(?$modifier:$break_pattern)"; 20430 20431 # Actually do the test 20432 my $matched_text; 20433 my $matched = $string =~ qr/$pattern/; 20434 if ($matched) { 20435 $matched_text = "matched"; 20436 } 20437 else { 20438 $matched_text = "failed to match"; 20439 print "not "; 20440 20441 if (TODO_FAILING_BREAKS) { 20442 $comment = " # $comment" unless $comment =~ / ^ \s* \# /x; 20443 $comment =~ s/#/# TODO/; 20444 } 20445 } 20446 print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$pattern/$display_upgrade; line $line $display_locale$comment\n"; 20447 20448 # Only print the comment on the first use of this line 20449 $comment = ""; 20450 20451 # Repeat with the first \B{} in the pattern. This makes sure the 20452 # code in regexec.c:find_byclass() for \B gets executed 20453 if ($pattern =~ / ( .*? : ) .* ( \\B\{ .* ) /x) { 20454 my $B_pattern = "$1$2"; 20455 $matched = $string =~ qr/$B_pattern/; 20456 print "not " unless $matched; 20457 $matched_text = ($matched) ? "matched" : "failed to match"; 20458 print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$B_pattern/$display_upgrade; line $line $display_locale"; 20459 print " # TODO" if TODO_FAILING_BREAKS && ! $matched; 20460 print "\n"; 20461 } 20462 } 20463 20464 next if $break_type ne 'gcb'; 20465 20466 # Finally, do the \X match. 20467 my @matches = $string =~ /(\X)/g; 20468 20469 # Look through each matched cluster to verify that it matches what we 20470 # expect. 20471 my $min = (@matches < @should_match) ? @matches : @should_match; 20472 for my $i (0 .. $min - 1) { 20473 $Tests++; 20474 if ($matches[$i] eq $should_match[$i]) { 20475 print "ok $Tests - "; 20476 if ($i == 0) { 20477 print "In \"$display_string\" =~ /(\\X)/g, \\X #1"; 20478 } else { 20479 print "And \\X #", $i + 1, 20480 } 20481 print " correctly matched $should_display[$i]; line $line\n"; 20482 } else { 20483 $matches[$i] = join("", map { sprintf "\\x{%04X}", ord $_ } 20484 split "", $matches[$i]); 20485 print "not ok $Tests -"; 20486 print " # TODO" if TODO_FAILING_BREAKS; 20487 print " In \"$display_string\" =~ /(\\X)/g, \\X #", 20488 $i + 1, 20489 " should have matched $should_display[$i]", 20490 " but instead matched $matches[$i]", 20491 ". Abandoning rest of line $line\n"; 20492 next UPGRADE; 20493 } 20494 } 20495 20496 # And the number of matches should equal the number of expected matches. 20497 $Tests++; 20498 if (@matches == @should_match) { 20499 print "ok $Tests - Nothing was left over; line $line\n"; 20500 } else { 20501 print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line"; 20502 print " # TODO" if TODO_FAILING_BREAKS; 20503 print "\n"; 20504 } 20505 } 20506 20507 return; 20508} 20509 20510sub Test_GCB($t) { 20511 _test_break($t, 'gcb'); 20512} 20513 20514sub Test_LB($t) { 20515 _test_break($t, 'lb'); 20516} 20517 20518sub Test_SB($t) { 20519 _test_break($t, 'sb'); 20520} 20521 20522sub Test_WB($t) { 20523 _test_break($t, 'wb'); 20524} 20525 20526sub Finished() { 20527 print "1..$Tests\n"; 20528 exit($Fails ? -1 : 0); 20529} 20530 20531