1#!/usr/bin/perl -w 2 3# !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!! 4# Any files created or read by this program should be listed in 'mktables.lst' 5# Use -makelist to regenerate it. 6 7# There was an attempt when this was first rewritten to make it 5.8 8# compatible, but that has now been abandoned, and newer constructs are used 9# as convenient. 10 11# NOTE: this script can run quite slowly in older/slower systems. 12# It can also consume a lot of memory (128 MB or more), you may need 13# to raise your process resource limits (e.g. in bash, "ulimit -a" 14# to inspect, and "ulimit -d ..." or "ulimit -m ..." to set) 15 16my $start_time; 17BEGIN { # Get the time the script started running; do it at compilation to 18 # get it as close as possible 19 $start_time= time; 20} 21 22require 5.010_001; 23use strict; 24use warnings; 25use Carp; 26use Config; 27use File::Find; 28use File::Path; 29use File::Spec; 30use Text::Tabs; 31use re "/aa"; 32use feature 'state'; 33 34sub DEBUG () { 0 } # Set to 0 for production; 1 for development 35my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/; 36 37sub NON_ASCII_PLATFORM { ord("A") != 65 } 38 39# When a new version of Unicode is published, unfortunately the algorithms for 40# dealing with various bounds, like \b{gcb}, \b{lb} may have to be updated 41# manually. The changes may or may not be backward compatible with older 42# releases. The code is in regen/mk_invlist.pl and regexec.c. Make the 43# changes, then come back here and set the variable below to what version the 44# code is expecting. If a newer version of Unicode is being compiled than 45# expected, a warning will be generated. If an older version is being 46# compiled, any bounds tests that fail in the generated test file (-maketest 47# option) will be marked as TODO. 48my $version_of_mk_invlist_bounds = v10.0.0; 49 50########################################################################## 51# 52# mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl), 53# from the Unicode database files (lib/unicore/.../*.txt), It also generates 54# a pod file and .t files, depending on option parameters. 55# 56# The structure of this file is: 57# First these introductory comments; then 58# code needed for everywhere, such as debugging stuff; then 59# code to handle input parameters; then 60# data structures likely to be of external interest (some of which depend on 61# the input parameters, so follows them; then 62# more data structures and subroutine and package (class) definitions; then 63# the small actual loop to process the input files and finish up; then 64# a __DATA__ section, for the .t tests 65# 66# This program works on all releases of Unicode so far. The outputs have been 67# scrutinized most intently for release 5.1. The others have been checked for 68# somewhat more than just sanity. It can handle all non-provisional Unicode 69# character properties in those releases. 70# 71# This program is mostly about Unicode character (or code point) properties. 72# A property describes some attribute or quality of a code point, like if it 73# is lowercase or not, its name, what version of Unicode it was first defined 74# in, or what its uppercase equivalent is. Unicode deals with these disparate 75# possibilities by making all properties into mappings from each code point 76# into some corresponding value. In the case of it being lowercase or not, 77# the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each 78# property maps each Unicode code point to a single value, called a "property 79# value". (Some more recently defined properties, map a code point to a set 80# of values.) 81# 82# When using a property in a regular expression, what is desired isn't the 83# mapping of the code point to its property's value, but the reverse (or the 84# mathematical "inverse relation"): starting with the property value, "Does a 85# code point map to it?" These are written in a "compound" form: 86# \p{property=value}, e.g., \p{category=punctuation}. This program generates 87# files containing the lists of code points that map to each such regular 88# expression property value, one file per list 89# 90# There is also a single form shortcut that Perl adds for many of the commonly 91# used properties. This happens for all binary properties, plus script, 92# general_category, and block properties. 93# 94# Thus the outputs of this program are files. There are map files, mostly in 95# the 'To' directory; and there are list files for use in regular expression 96# matching, all in subdirectories of the 'lib' directory, with each 97# subdirectory being named for the property that the lists in it are for. 98# Bookkeeping, test, and documentation files are also generated. 99 100my $matches_directory = 'lib'; # Where match (\p{}) files go. 101my $map_directory = 'To'; # Where map files go. 102 103# DATA STRUCTURES 104# 105# The major data structures of this program are Property, of course, but also 106# Table. There are two kinds of tables, very similar to each other. 107# "Match_Table" is the data structure giving the list of code points that have 108# a particular property value, mentioned above. There is also a "Map_Table" 109# data structure which gives the property's mapping from code point to value. 110# There are two structures because the match tables need to be combined in 111# various ways, such as constructing unions, intersections, complements, etc., 112# and the map ones don't. And there would be problems, perhaps subtle, if 113# a map table were inadvertently operated on in some of those ways. 114# The use of separate classes with operations defined on one but not the other 115# prevents accidentally confusing the two. 116# 117# At the heart of each table's data structure is a "Range_List", which is just 118# an ordered list of "Ranges", plus ancillary information, and methods to 119# operate on them. A Range is a compact way to store property information. 120# Each range has a starting code point, an ending code point, and a value that 121# is meant to apply to all the code points between the two end points, 122# inclusive. For a map table, this value is the property value for those 123# code points. Two such ranges could be written like this: 124# 0x41 .. 0x5A, 'Upper', 125# 0x61 .. 0x7A, 'Lower' 126# 127# Each range also has a type used as a convenience to classify the values. 128# Most ranges in this program will be Type 0, or normal, but there are some 129# ranges that have a non-zero type. These are used only in map tables, and 130# are for mappings that don't fit into the normal scheme of things. Mappings 131# that require a hash entry to communicate with utf8.c are one example; 132# another example is mappings for charnames.pm to use which indicate a name 133# that is algorithmically determinable from its code point (and the reverse). 134# These are used to significantly compact these tables, instead of listing 135# each one of the tens of thousands individually. 136# 137# In a match table, the value of a range is irrelevant (and hence the type as 138# well, which will always be 0), and arbitrarily set to the empty string. 139# Using the example above, there would be two match tables for those two 140# entries, one named Upper would contain the 0x41..0x5A range, and the other 141# named Lower would contain 0x61..0x7A. 142# 143# Actually, there are two types of range lists, "Range_Map" is the one 144# associated with map tables, and "Range_List" with match tables. 145# Again, this is so that methods can be defined on one and not the others so 146# as to prevent operating on them in incorrect ways. 147# 148# Eventually, most tables are written out to files to be read by utf8_heavy.pl 149# in the perl core. All tables could in theory be written, but some are 150# suppressed because there is no current practical use for them. It is easy 151# to change which get written by changing various lists that are near the top 152# of the actual code in this file. The table data structures contain enough 153# ancillary information to allow them to be treated as separate entities for 154# writing, such as the path to each one's file. There is a heading in each 155# map table that gives the format of its entries, and what the map is for all 156# the code points missing from it. (This allows tables to be more compact.) 157# 158# The Property data structure contains one or more tables. All properties 159# contain a map table (except the $perl property which is a 160# pseudo-property containing only match tables), and any properties that 161# are usable in regular expression matches also contain various matching 162# tables, one for each value the property can have. A binary property can 163# have two values, True and False (or Y and N, which are preferred by Unicode 164# terminology). Thus each of these properties will have a map table that 165# takes every code point and maps it to Y or N (but having ranges cuts the 166# number of entries in that table way down), and two match tables, one 167# which has a list of all the code points that map to Y, and one for all the 168# code points that map to N. (For each binary property, a third table is also 169# generated for the pseudo Perl property. It contains the identical code 170# points as the Y table, but can be written in regular expressions, not in the 171# compound form, but in a "single" form like \p{IsUppercase}.) Many 172# properties are binary, but some properties have several possible values, 173# some have many, and properties like Name have a different value for every 174# named code point. Those will not, unless the controlling lists are changed, 175# have their match tables written out. But all the ones which can be used in 176# regular expression \p{} and \P{} constructs will. Prior to 5.14, generally 177# a property would have either its map table or its match tables written but 178# not both. Again, what gets written is controlled by lists which can easily 179# be changed. Starting in 5.14, advantage was taken of this, and all the map 180# tables needed to reconstruct the Unicode db are now written out, while 181# suppressing the Unicode .txt files that contain the data. Our tables are 182# much more compact than the .txt files, so a significant space savings was 183# achieved. Also, tables are not written out that are trivially derivable 184# from tables that do get written. So, there typically is no file containing 185# the code points not matched by a binary property (the table for \P{} versus 186# lowercase \p{}), since you just need to invert the True table to get the 187# False table. 188 189# Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on 190# how many match tables there are and the content of the maps. This 'Type' is 191# different than a range 'Type', so don't get confused by the two concepts 192# having the same name. 193# 194# For information about the Unicode properties, see Unicode's UAX44 document: 195 196my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/'; 197 198# As stated earlier, this program will work on any release of Unicode so far. 199# Most obvious problems in earlier data have NOT been corrected except when 200# necessary to make Perl or this program work reasonably, and to keep out 201# potential security issues. For example, no folding information was given in 202# early releases, so this program substitutes lower case instead, just so that 203# a regular expression with the /i option will do something that actually 204# gives the right results in many cases. There are also a couple other 205# corrections for version 1.1.5, commented at the point they are made. As an 206# example of corrections that weren't made (but could be) is this statement 207# from DerivedAge.txt: "The supplementary private use code points and the 208# non-character code points were assigned in version 2.0, but not specifically 209# listed in the UCD until versions 3.0 and 3.1 respectively." (To be precise 210# it was 3.0.1 not 3.0.0) More information on Unicode version glitches is 211# further down in these introductory comments. 212# 213# This program works on all non-provisional properties as of the current 214# Unicode release, though the files for some are suppressed for various 215# reasons. You can change which are output by changing lists in this program. 216# 217# The old version of mktables emphasized the term "Fuzzy" to mean Unicode's 218# loose matchings rules (from Unicode TR18): 219# 220# The recommended names for UCD properties and property values are in 221# PropertyAliases.txt [Prop] and PropertyValueAliases.txt 222# [PropValue]. There are both abbreviated names and longer, more 223# descriptive names. It is strongly recommended that both names be 224# recognized, and that loose matching of property names be used, 225# whereby the case distinctions, whitespace, hyphens, and underbar 226# are ignored. 227# 228# The program still allows Fuzzy to override its determination of if loose 229# matching should be used, but it isn't currently used, as it is no longer 230# needed; the calculations it makes are good enough. 231# 232# SUMMARY OF HOW IT WORKS: 233# 234# Process arguments 235# 236# A list is constructed containing each input file that is to be processed 237# 238# Each file on the list is processed in a loop, using the associated handler 239# code for each: 240# The PropertyAliases.txt and PropValueAliases.txt files are processed 241# first. These files name the properties and property values. 242# Objects are created of all the property and property value names 243# that the rest of the input should expect, including all synonyms. 244# The other input files give mappings from properties to property 245# values. That is, they list code points and say what the mapping 246# is under the given property. Some files give the mappings for 247# just one property; and some for many. This program goes through 248# each file and populates the properties and their map tables from 249# them. Some properties are listed in more than one file, and 250# Unicode has set up a precedence as to which has priority if there 251# is a conflict. Thus the order of processing matters, and this 252# program handles the conflict possibility by processing the 253# overriding input files last, so that if necessary they replace 254# earlier values. 255# After this is all done, the program creates the property mappings not 256# furnished by Unicode, but derivable from what it does give. 257# The tables of code points that match each property value in each 258# property that is accessible by regular expressions are created. 259# The Perl-defined properties are created and populated. Many of these 260# require data determined from the earlier steps 261# Any Perl-defined synonyms are created, and name clashes between Perl 262# and Unicode are reconciled and warned about. 263# All the properties are written to files 264# Any other files are written, and final warnings issued. 265# 266# For clarity, a number of operators have been overloaded to work on tables: 267# ~ means invert (take all characters not in the set). The more 268# conventional '!' is not used because of the possibility of confusing 269# it with the actual boolean operation. 270# + means union 271# - means subtraction 272# & means intersection 273# The precedence of these is the order listed. Parentheses should be 274# copiously used. These are not a general scheme. The operations aren't 275# defined for a number of things, deliberately, to avoid getting into trouble. 276# Operations are done on references and affect the underlying structures, so 277# that the copy constructors for them have been overloaded to not return a new 278# clone, but the input object itself. 279# 280# The bool operator is deliberately not overloaded to avoid confusion with 281# "should it mean if the object merely exists, or also is non-empty?". 282# 283# WHY CERTAIN DESIGN DECISIONS WERE MADE 284# 285# This program needs to be able to run under miniperl. Therefore, it uses a 286# minimum of other modules, and hence implements some things itself that could 287# be gotten from CPAN 288# 289# This program uses inputs published by the Unicode Consortium. These can 290# change incompatibly between releases without the Perl maintainers realizing 291# it. Therefore this program is now designed to try to flag these. It looks 292# at the directories where the inputs are, and flags any unrecognized files. 293# It keeps track of all the properties in the files it handles, and flags any 294# that it doesn't know how to handle. It also flags any input lines that 295# don't match the expected syntax, among other checks. 296# 297# It is also designed so if a new input file matches one of the known 298# templates, one hopefully just needs to add it to a list to have it 299# processed. 300# 301# As mentioned earlier, some properties are given in more than one file. In 302# particular, the files in the extracted directory are supposedly just 303# reformattings of the others. But they contain information not easily 304# derivable from the other files, including results for Unihan (which isn't 305# usually available to this program) and for unassigned code points. They 306# also have historically had errors or been incomplete. In an attempt to 307# create the best possible data, this program thus processes them first to 308# glean information missing from the other files; then processes those other 309# files to override any errors in the extracted ones. Much of the design was 310# driven by this need to store things and then possibly override them. 311# 312# It tries to keep fatal errors to a minimum, to generate something usable for 313# testing purposes. It always looks for files that could be inputs, and will 314# warn about any that it doesn't know how to handle (the -q option suppresses 315# the warning). 316# 317# Why is there more than one type of range? 318# This simplified things. There are some very specialized code points that 319# have to be handled specially for output, such as Hangul syllable names. 320# By creating a range type (done late in the development process), it 321# allowed this to be stored with the range, and overridden by other input. 322# Originally these were stored in another data structure, and it became a 323# mess trying to decide if a second file that was for the same property was 324# overriding the earlier one or not. 325# 326# Why are there two kinds of tables, match and map? 327# (And there is a base class shared by the two as well.) As stated above, 328# they actually are for different things. Development proceeded much more 329# smoothly when I (khw) realized the distinction. Map tables are used to 330# give the property value for every code point (actually every code point 331# that doesn't map to a default value). Match tables are used for regular 332# expression matches, and are essentially the inverse mapping. Separating 333# the two allows more specialized methods, and error checks so that one 334# can't just take the intersection of two map tables, for example, as that 335# is nonsensical. 336# 337# What about 'fate' and 'status'. The concept of a table's fate was created 338# late when it became clear that something more was needed. The difference 339# between this and 'status' is unclean, and could be improved if someone 340# wanted to spend the effort. 341# 342# DEBUGGING 343# 344# This program is written so it will run under miniperl. Occasionally changes 345# will cause an error where the backtrace doesn't work well under miniperl. 346# To diagnose the problem, you can instead run it under regular perl, if you 347# have one compiled. 348# 349# There is a good trace facility. To enable it, first sub DEBUG must be set 350# to return true. Then a line like 351# 352# local $to_trace = 1 if main::DEBUG; 353# 354# can be added to enable tracing in its lexical scope (plus dynamic) or until 355# you insert another line: 356# 357# local $to_trace = 0 if main::DEBUG; 358# 359# To actually trace, use a line like "trace $a, @b, %c, ...; 360# 361# Some of the more complex subroutines already have trace statements in them. 362# Permanent trace statements should be like: 363# 364# trace ... if main::DEBUG && $to_trace; 365# 366# main::stack_trace() will display what its name implies 367# 368# If there is just one or a few files that you're debugging, you can easily 369# cause most everything else to be skipped. Change the line 370# 371# my $debug_skip = 0; 372# 373# to 1, and every file whose object is in @input_file_objects and doesn't have 374# a, 'non_skip => 1,' in its constructor will be skipped. However, skipping 375# Jamo.txt or UnicodeData.txt will likely cause fatal errors. 376# 377# To compare the output tables, it may be useful to specify the -annotate 378# flag. (As of this writing, this can't be done on a clean workspace, due to 379# requirements in Text::Tabs used in this option; so first run mktables 380# without this option.) This option adds comment lines to each table, one for 381# each non-algorithmically named character giving, currently its code point, 382# name, and graphic representation if printable (and you have a font that 383# knows about it). This makes it easier to see what the particular code 384# points are in each output table. Non-named code points are annotated with a 385# description of their status, and contiguous ones with the same description 386# will be output as a range rather than individually. Algorithmically named 387# characters are also output as ranges, except when there are just a few 388# contiguous ones. 389# 390# FUTURE ISSUES 391# 392# The program would break if Unicode were to change its names so that 393# interior white space, underscores, or dashes differences were significant 394# within property and property value names. 395# 396# It might be easier to use the xml versions of the UCD if this program ever 397# would need heavy revision, and the ability to handle old versions was not 398# required. 399# 400# There is the potential for name collisions, in that Perl has chosen names 401# that Unicode could decide it also likes. There have been such collisions in 402# the past, with mostly Perl deciding to adopt the Unicode definition of the 403# name. However in the 5.2 Unicode beta testing, there were a number of such 404# collisions, which were withdrawn before the final release, because of Perl's 405# and other's protests. These all involved new properties which began with 406# 'Is'. Based on the protests, Unicode is unlikely to try that again. Also, 407# many of the Perl-defined synonyms, like Any, Word, etc, are listed in a 408# Unicode document, so they are unlikely to be used by Unicode for another 409# purpose. However, they might try something beginning with 'In', or use any 410# of the other Perl-defined properties. This program will warn you of name 411# collisions, and refuse to generate tables with them, but manual intervention 412# will be required in this event. One scheme that could be implemented, if 413# necessary, would be to have this program generate another file, or add a 414# field to mktables.lst that gives the date of first definition of a property. 415# Each new release of Unicode would use that file as a basis for the next 416# iteration. And the Perl synonym addition code could sort based on the age 417# of the property, so older properties get priority, and newer ones that clash 418# would be refused; hence existing code would not be impacted, and some other 419# synonym would have to be used for the new property. This is ugly, and 420# manual intervention would certainly be easier to do in the short run; lets 421# hope it never comes to this. 422# 423# A NOTE ON UNIHAN 424# 425# This program can generate tables from the Unihan database. But that DB 426# isn't normally available, so it is marked as optional. Prior to version 427# 5.2, this database was in a single file, Unihan.txt. In 5.2 the database 428# was split into 8 different files, all beginning with the letters 'Unihan'. 429# If you plunk those files down into the directory mktables ($0) is in, this 430# program will read them and automatically create tables for the properties 431# from it that are listed in PropertyAliases.txt and PropValueAliases.txt, 432# plus any you add to the @cjk_properties array and the @cjk_property_values 433# array, being sure to add necessary '# @missings' lines to the latter. For 434# Unicode versions earlier than 5.2, most of the Unihan properties are not 435# listed at all in PropertyAliases nor PropValueAliases. This program assumes 436# for these early releases that you want the properties that are specified in 437# the 5.2 release. 438# 439# You may need to adjust the entries to suit your purposes. setup_unihan(), 440# and filter_unihan_line() are the functions where this is done. This program 441# already does some adjusting to make the lines look more like the rest of the 442# Unicode DB; You can see what that is in filter_unihan_line() 443# 444# There is a bug in the 3.2 data file in which some values for the 445# kPrimaryNumeric property have commas and an unexpected comment. A filter 446# could be added to correct these; or for a particular installation, the 447# Unihan.txt file could be edited to fix them. 448# 449# HOW TO ADD A FILE TO BE PROCESSED 450# 451# A new file from Unicode needs to have an object constructed for it in 452# @input_file_objects, probably at the end or at the end of the extracted 453# ones. The program should warn you if its name will clash with others on 454# restrictive file systems, like DOS. If so, figure out a better name, and 455# add lines to the README.perl file giving that. If the file is a character 456# property, it should be in the format that Unicode has implicitly 457# standardized for such files for the more recently introduced ones. 458# If so, the Input_file constructor for @input_file_objects can just be the 459# file name and release it first appeared in. If not, then it should be 460# possible to construct an each_line_handler() to massage the line into the 461# standardized form. 462# 463# For non-character properties, more code will be needed. You can look at 464# the existing entries for clues. 465# 466# UNICODE VERSIONS NOTES 467# 468# The Unicode UCD has had a number of errors in it over the versions. And 469# these remain, by policy, in the standard for that version. Therefore it is 470# risky to correct them, because code may be expecting the error. So this 471# program doesn't generally make changes, unless the error breaks the Perl 472# core. As an example, some versions of 2.1.x Jamo.txt have the wrong value 473# for U+1105, which causes real problems for the algorithms for Jamo 474# calculations, so it is changed here. 475# 476# But it isn't so clear cut as to what to do about concepts that are 477# introduced in a later release; should they extend back to earlier releases 478# where the concept just didn't exist? It was easier to do this than to not, 479# so that's what was done. For example, the default value for code points not 480# in the files for various properties was probably undefined until changed by 481# some version. No_Block for blocks is such an example. This program will 482# assign No_Block even in Unicode versions that didn't have it. This has the 483# benefit that code being written doesn't have to special case earlier 484# versions; and the detriment that it doesn't match the Standard precisely for 485# the affected versions. 486# 487# Here are some observations about some of the issues in early versions: 488# 489# Prior to version 3.0, there were 3 character decompositions. These are not 490# handled by Unicode::Normalize, nor will it compile when presented a version 491# that has them. However, you can trivially get it to compile by simply 492# ignoring those decompositions, by changing the croak to a carp. At the time 493# of this writing, the line (in dist/Unicode-Normalize/Normalize.pm or 494# dist/Unicode-Normalize/mkheader) reads 495# 496# croak("Weird Canonical Decomposition of U+$h"); 497# 498# Simply comment it out. It will compile, but will not know about any three 499# character decompositions. 500 501# The number of code points in \p{alpha=True} halved in 2.1.9. It turns out 502# that the reason is that the CJK block starting at 4E00 was removed from 503# PropList, and was not put back in until 3.1.0. The Perl extension (the 504# single property name \p{alpha}) has the correct values. But the compound 505# form is simply not generated until 3.1, as it can be argued that prior to 506# this release, this was not an official property. The comments for 507# filter_old_style_proplist() give more details. 508# 509# Unicode introduced the synonym Space for White_Space in 4.1. Perl has 510# always had a \p{Space}. In release 3.2 only, they are not synonymous. The 511# reason is that 3.2 introduced U+205F=medium math space, which was not 512# classed as white space, but Perl figured out that it should have been. 4.0 513# reclassified it correctly. 514# 515# Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2 516# this was erroneously a synonym for 202 (it should be 200). In 4.0, ATB 517# became 202, and ATBL was left with no code points, as all the ones that 518# mapped to 202 stayed mapped to 202. Thus if your program used the numeric 519# name for the class, it would not have been affected, but if it used the 520# mnemonic, it would have been. 521# 522# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that, code 523# points which eventually came to have this script property value, instead 524# mapped to "Unknown". But in the next release all these code points were 525# moved to \p{sc=common} instead. 526 527# The tests furnished by Unicode for testing WordBreak and SentenceBreak 528# generate errors in 5.0 and earlier. 529# 530# The default for missing code points for BidiClass is complicated. Starting 531# in 3.1.1, the derived file DBidiClass.txt handles this, but this program 532# tries to do the best it can for earlier releases. It is done in 533# process_PropertyAliases() 534# 535# In version 2.1.2, the entry in UnicodeData.txt: 536# 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F; 537# should instead be 538# 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F 539# Without this change, there are casing problems for this character. 540# 541# Search for $string_compare_versions to see how to compare changes to 542# properties between Unicode versions 543# 544############################################################################## 545 546my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing 547 # and errors 548my $MAX_LINE_WIDTH = 78; 549 550# Debugging aid to skip most files so as to not be distracted by them when 551# concentrating on the ones being debugged. Add 552# non_skip => 1, 553# to the constructor for those files you want processed when you set this. 554# Files with a first version number of 0 are special: they are always 555# processed regardless of the state of this flag. Generally, Jamo.txt and 556# UnicodeData.txt must not be skipped if you want this program to not die 557# before normal completion. 558my $debug_skip = 0; 559 560 561# Normally these are suppressed. 562my $write_Unicode_deprecated_tables = 0; 563 564# Set to 1 to enable tracing. 565our $to_trace = 0; 566 567{ # Closure for trace: debugging aid 568 my $print_caller = 1; # ? Include calling subroutine name 569 my $main_with_colon = 'main::'; 570 my $main_colon_length = length($main_with_colon); 571 572 sub trace { 573 return unless $to_trace; # Do nothing if global flag not set 574 575 my @input = @_; 576 577 local $DB::trace = 0; 578 $DB::trace = 0; # Quiet 'used only once' message 579 580 my $line_number; 581 582 # Loop looking up the stack to get the first non-trace caller 583 my $caller_line; 584 my $caller_name; 585 my $i = 0; 586 do { 587 $line_number = $caller_line; 588 (my $pkg, my $file, $caller_line, my $caller) = caller $i++; 589 $caller = $main_with_colon unless defined $caller; 590 591 $caller_name = $caller; 592 593 # get rid of pkg 594 $caller_name =~ s/.*:://; 595 if (substr($caller_name, 0, $main_colon_length) 596 eq $main_with_colon) 597 { 598 $caller_name = substr($caller_name, $main_colon_length); 599 } 600 601 } until ($caller_name ne 'trace'); 602 603 # If the stack was empty, we were called from the top level 604 $caller_name = 'main' if ($caller_name eq "" 605 || $caller_name eq 'trace'); 606 607 my $output = ""; 608 #print STDERR __LINE__, ": ", join ", ", @input, "\n"; 609 foreach my $string (@input) { 610 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') { 611 $output .= simple_dumper($string); 612 } 613 else { 614 $string = "$string" if ref $string; 615 $string = $UNDEF unless defined $string; 616 chomp $string; 617 $string = '""' if $string eq ""; 618 $output .= " " if $output ne "" 619 && $string ne "" 620 && substr($output, -1, 1) ne " " 621 && substr($string, 0, 1) ne " "; 622 $output .= $string; 623 } 624 } 625 626 print STDERR sprintf "%4d: ", $line_number if defined $line_number; 627 print STDERR "$caller_name: " if $print_caller; 628 print STDERR $output, "\n"; 629 return; 630 } 631} 632 633sub stack_trace() { 634 local $to_trace = 1 if main::DEBUG; 635 my $line = (caller(0))[2]; 636 my $i = 1; 637 638 # Accumulate the stack trace 639 while (1) { 640 my ($pkg, $file, $caller_line, $caller) = caller $i++; 641 642 last unless defined $caller; 643 644 trace "called from $caller() at line $line"; 645 $line = $caller_line; 646 } 647} 648 649# This is for a rarely used development feature that allows you to compare two 650# versions of the Unicode standard without having to deal with changes caused 651# by the code points introduced in the later version. You probably also want 652# to use the -annotate option when using this. Run this program on a unicore 653# containing the starting release you want to compare. Save that output 654# structure. Then, switching to a unicore with the ending release, change the 655# 0 in the $string_compare_versions definition just below to a string 656# containing a SINGLE dotted Unicode release number (e.g. "2.1") corresponding 657# to the starting release. This program will then compile, but throw away all 658# code points introduced after the starting release. Finally use a diff tool 659# to compare the two directory structures. They include only the code points 660# common to both releases, and you can see the changes caused just by the 661# underlying release semantic changes. For versions earlier than 3.2, you 662# must copy a version of DAge.txt into the directory. 663my $string_compare_versions = DEBUG && ""; # e.g., "2.1"; 664my $compare_versions = DEBUG 665 && $string_compare_versions 666 && pack "C*", split /\./, $string_compare_versions; 667 668sub uniques { 669 # Returns non-duplicated input values. From "Perl Best Practices: 670 # Encapsulated Cleverness". p. 455 in first edition. 671 672 my %seen; 673 # Arguably this breaks encapsulation, if the goal is to permit multiple 674 # distinct objects to stringify to the same value, and be interchangeable. 675 # However, for this program, no two objects stringify identically, and all 676 # lists passed to this function are either objects or strings. So this 677 # doesn't affect correctness, but it does give a couple of percent speedup. 678 no overloading; 679 return grep { ! $seen{$_}++ } @_; 680} 681 682$0 = File::Spec->canonpath($0); 683 684my $make_test_script = 0; # ? Should we output a test script 685my $make_norm_test_script = 0; # ? Should we output a normalization test script 686my $write_unchanged_files = 0; # ? Should we update the output files even if 687 # we don't think they have changed 688my $use_directory = ""; # ? Should we chdir somewhere. 689my $pod_directory; # input directory to store the pod file. 690my $pod_file = 'perluniprops'; 691my $t_path; # Path to the .t test file 692my $file_list = 'mktables.lst'; # File to store input and output file names. 693 # This is used to speed up the build, by not 694 # executing the main body of the program if 695 # nothing on the list has changed since the 696 # previous build 697my $make_list = 1; # ? Should we write $file_list. Set to always 698 # make a list so that when the pumpking is 699 # preparing a release, s/he won't have to do 700 # special things 701my $glob_list = 0; # ? Should we try to include unknown .txt files 702 # in the input. 703my $output_range_counts = $debugging_build; # ? Should we include the number 704 # of code points in ranges in 705 # the output 706my $annotate = 0; # ? Should character names be in the output 707 708# Verbosity levels; 0 is quiet 709my $NORMAL_VERBOSITY = 1; 710my $PROGRESS = 2; 711my $VERBOSE = 3; 712 713my $verbosity = $NORMAL_VERBOSITY; 714 715# Stored in mktables.lst so that if this program is called with different 716# options, will regenerate even if the files otherwise look like they're 717# up-to-date. 718my $command_line_arguments = join " ", @ARGV; 719 720# Process arguments 721while (@ARGV) { 722 my $arg = shift @ARGV; 723 if ($arg eq '-v') { 724 $verbosity = $VERBOSE; 725 } 726 elsif ($arg eq '-p') { 727 $verbosity = $PROGRESS; 728 $| = 1; # Flush buffers as we go. 729 } 730 elsif ($arg eq '-q') { 731 $verbosity = 0; 732 } 733 elsif ($arg eq '-w') { 734 # update the files even if they haven't changed 735 $write_unchanged_files = 1; 736 } 737 elsif ($arg eq '-check') { 738 my $this = shift @ARGV; 739 my $ok = shift @ARGV; 740 if ($this ne $ok) { 741 print "Skipping as check params are not the same.\n"; 742 exit(0); 743 } 744 } 745 elsif ($arg eq '-P' && defined ($pod_directory = shift)) { 746 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist"; 747 } 748 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift))) 749 { 750 $make_test_script = 1; 751 } 752 elsif ($arg eq '-makenormtest') 753 { 754 $make_norm_test_script = 1; 755 } 756 elsif ($arg eq '-makelist') { 757 $make_list = 1; 758 } 759 elsif ($arg eq '-C' && defined ($use_directory = shift)) { 760 -d $use_directory or croak "Unknown directory '$use_directory'"; 761 } 762 elsif ($arg eq '-L') { 763 764 # Existence not tested until have chdir'd 765 $file_list = shift; 766 } 767 elsif ($arg eq '-globlist') { 768 $glob_list = 1; 769 } 770 elsif ($arg eq '-c') { 771 $output_range_counts = ! $output_range_counts 772 } 773 elsif ($arg eq '-annotate') { 774 $annotate = 1; 775 $debugging_build = 1; 776 $output_range_counts = 1; 777 } 778 else { 779 my $with_c = 'with'; 780 $with_c .= 'out' if $output_range_counts; # Complements the state 781 croak <<END; 782usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ] 783 [ -T test_file_path ] [-globlist] [-makelist] [-maketest] 784 [-check A B ] 785 -c : Output comments $with_c number of code points in ranges 786 -q : Quiet Mode: Only output serious warnings. 787 -p : Set verbosity level to normal plus show progress. 788 -v : Set Verbosity level high: Show progress and non-serious 789 warnings 790 -w : Write files regardless 791 -C dir : Change to this directory before proceeding. All relative paths 792 except those specified by the -P and -T options will be done 793 with respect to this directory. 794 -P dir : Output $pod_file file to directory 'dir'. 795 -T path : Create a test script as 'path'; overrides -maketest 796 -L filelist : Use alternate 'filelist' instead of standard one 797 -globlist : Take as input all non-Test *.txt files in current and sub 798 directories 799 -maketest : Make test script 'TestProp.pl' in current (or -C directory), 800 overrides -T 801 -makelist : Rewrite the file list $file_list based on current setup 802 -annotate : Output an annotation for each character in the table files; 803 useful for debugging mktables, looking at diffs; but is slow 804 and memory intensive 805 -check A B : Executes $0 only if A and B are the same 806END 807 } 808} 809 810# Stores the most-recently changed file. If none have changed, can skip the 811# build 812my $most_recent = (stat $0)[9]; # Do this before the chdir! 813 814# Change directories now, because need to read 'version' early. 815if ($use_directory) { 816 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) { 817 $pod_directory = File::Spec->rel2abs($pod_directory); 818 } 819 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) { 820 $t_path = File::Spec->rel2abs($t_path); 821 } 822 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!"; 823 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) { 824 $pod_directory = File::Spec->abs2rel($pod_directory); 825 } 826 if ($t_path && File::Spec->file_name_is_absolute($t_path)) { 827 $t_path = File::Spec->abs2rel($t_path); 828 } 829} 830 831# Get Unicode version into regular and v-string. This is done now because 832# various tables below get populated based on it. These tables are populated 833# here to be near the top of the file, and so easily seeable by those needing 834# to modify things. 835open my $VERSION, "<", "version" 836 or croak "$0: can't open required file 'version': $!\n"; 837my $string_version = <$VERSION>; 838close $VERSION; 839chomp $string_version; 840my $v_version = pack "C*", split /\./, $string_version; # v string 841 842my $unicode_version = ($compare_versions) 843 ? ( "$string_compare_versions (using " 844 . "$string_version rules)") 845 : $string_version; 846 847# The following are the complete names of properties with property values that 848# are known to not match any code points in some versions of Unicode, but that 849# may change in the future so they should be matchable, hence an empty file is 850# generated for them. 851my @tables_that_may_be_empty; 852push @tables_that_may_be_empty, 'Joining_Type=Left_Joining' 853 if $v_version lt v6.3.0; 854push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1; 855push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0; 856push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana' 857 if $v_version ge v4.1.0; 858push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana' 859 if $v_version ge v6.0.0; 860push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend' 861 if $v_version ge v6.1.0; 862push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133' 863 if $v_version ge v6.2.0; 864 865# The lists below are hashes, so the key is the item in the list, and the 866# value is the reason why it is in the list. This makes generation of 867# documentation easier. 868 869my %why_suppressed; # No file generated for these. 870 871# Files aren't generated for empty extraneous properties. This is arguable. 872# Extraneous properties generally come about because a property is no longer 873# used in a newer version of Unicode. If we generated a file without code 874# points, programs that used to work on that property will still execute 875# without errors. It just won't ever match (or will always match, with \P{}). 876# This means that the logic is now likely wrong. I (khw) think its better to 877# find this out by getting an error message. Just move them to the table 878# above to change this behavior 879my %why_suppress_if_empty_warn_if_not = ( 880 881 # It is the only property that has ever officially been removed from the 882 # Standard. The database never contained any code points for it. 883 'Special_Case_Condition' => 'Obsolete', 884 885 # Apparently never official, but there were code points in some versions of 886 # old-style PropList.txt 887 'Non_Break' => 'Obsolete', 888); 889 890# These would normally go in the warn table just above, but they were changed 891# a long time before this program was written, so warnings about them are 892# moot. 893if ($v_version gt v3.2.0) { 894 push @tables_that_may_be_empty, 895 'Canonical_Combining_Class=Attached_Below_Left' 896} 897 898# Enum values for to_output_map() method in the Map_Table package. (0 is don't 899# output) 900my $EXTERNAL_MAP = 1; 901my $INTERNAL_MAP = 2; 902my $OUTPUT_ADJUSTED = 3; 903 904# To override computed values for writing the map tables for these properties. 905# The default for enum map tables is to write them out, so that the Unicode 906# .txt files can be removed, but all the data to compute any property value 907# for any code point is available in a more compact form. 908my %global_to_output_map = ( 909 # Needed by UCD.pm, but don't want to publicize that it exists, so won't 910 # get stuck supporting it if things change. Since it is a STRING 911 # property, it normally would be listed in the pod, but INTERNAL_MAP 912 # suppresses that. 913 Unicode_1_Name => $INTERNAL_MAP, 914 915 Present_In => 0, # Suppress, as easily computed from Age 916 Block => (NON_ASCII_PLATFORM) ? 1 : 0, # Suppress, as Blocks.txt is 917 # retained, but needed for 918 # non-ASCII 919 920 # Suppress, as mapping can be found instead from the 921 # Perl_Decomposition_Mapping file 922 Decomposition_Type => 0, 923); 924 925# There are several types of obsolete properties defined by Unicode. These 926# must be hand-edited for every new Unicode release. 927my %why_deprecated; # Generates a deprecated warning message if used. 928my %why_stabilized; # Documentation only 929my %why_obsolete; # Documentation only 930 931{ # Closure 932 my $simple = 'Perl uses the more complete version'; 933 my $unihan = 'Unihan properties are by default not enabled in the Perl core. Instead use CPAN: Unicode::Unihan'; 934 935 my $other_properties = 'other properties'; 936 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone"; 937 my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character."; 938 939 %why_deprecated = ( 940 'Grapheme_Link' => 'Deprecated by Unicode: Duplicates ccc=vr (Canonical_Combining_Class=Virama)', 941 'Jamo_Short_Name' => $contributory, 942 'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking', 943 'Other_Alphabetic' => $contributory, 944 'Other_Default_Ignorable_Code_Point' => $contributory, 945 'Other_Grapheme_Extend' => $contributory, 946 'Other_ID_Continue' => $contributory, 947 'Other_ID_Start' => $contributory, 948 'Other_Lowercase' => $contributory, 949 'Other_Math' => $contributory, 950 'Other_Uppercase' => $contributory, 951 'Expands_On_NFC' => $why_no_expand, 952 'Expands_On_NFD' => $why_no_expand, 953 'Expands_On_NFKC' => $why_no_expand, 954 'Expands_On_NFKD' => $why_no_expand, 955 ); 956 957 %why_suppressed = ( 958 # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which 959 # contains the same information, but without the algorithmically 960 # determinable Hangul syllables'. This file is not published, so it's 961 # existence is not noted in the comment. 962 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::', 963 964 # Don't suppress ISO_Comment, as otherwise special handling is needed 965 # to differentiate between it and gc=c, which can be written as 'isc', 966 # which is the same characters as ISO_Comment's short name. 967 968 'Name' => "Accessible via \\N{...} or 'use charnames;' or charprop() or prop_invmap() in Unicode::UCD::", 969 970 'Simple_Case_Folding' => "$simple. Can access this through casefold(), charprop(), or prop_invmap() in Unicode::UCD", 971 'Simple_Lowercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD", 972 'Simple_Titlecase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD", 973 'Simple_Uppercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD", 974 975 FC_NFKC_Closure => 'Deprecated by Unicode, and supplanted in usage by NFKC_Casefold; otherwise not useful', 976 ); 977 978 foreach my $property ( 979 980 # The following are suppressed because they were made contributory 981 # or deprecated by Unicode before Perl ever thought about 982 # supporting them. 983 'Jamo_Short_Name', 984 'Grapheme_Link', 985 'Expands_On_NFC', 986 'Expands_On_NFD', 987 'Expands_On_NFKC', 988 'Expands_On_NFKD', 989 990 # The following are suppressed because they have been marked 991 # as deprecated for a sufficient amount of time 992 'Other_Alphabetic', 993 'Other_Default_Ignorable_Code_Point', 994 'Other_Grapheme_Extend', 995 'Other_ID_Continue', 996 'Other_ID_Start', 997 'Other_Lowercase', 998 'Other_Math', 999 'Other_Uppercase', 1000 ) { 1001 $why_suppressed{$property} = $why_deprecated{$property}; 1002 } 1003 1004 # Customize the message for all the 'Other_' properties 1005 foreach my $property (keys %why_deprecated) { 1006 next if (my $main_property = $property) !~ s/^Other_//; 1007 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/; 1008 } 1009} 1010 1011if ($write_Unicode_deprecated_tables) { 1012 foreach my $property (keys %why_suppressed) { 1013 delete $why_suppressed{$property} if $property =~ 1014 / ^ Other | Grapheme /x; 1015 } 1016} 1017 1018if ($v_version ge 4.0.0) { 1019 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14'; 1020 if ($v_version ge 6.0.0) { 1021 $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14'; 1022 } 1023} 1024if ($v_version ge 5.2.0 && $v_version lt 6.0.0) { 1025 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed'; 1026 if ($v_version ge 6.0.0) { 1027 $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed'; 1028 } 1029} 1030 1031# Probably obsolete forever 1032if ($v_version ge v4.1.0) { 1033 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common".'; 1034} 1035if ($v_version ge v6.0.0) { 1036 $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana" (or both)'; 1037 $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"'; 1038} 1039 1040# This program can create files for enumerated-like properties, such as 1041# 'Numeric_Type'. This file would be the same format as for a string 1042# property, with a mapping from code point to its value, so you could look up, 1043# for example, the script a code point is in. But no one so far wants this 1044# mapping, or they have found another way to get it since this is a new 1045# feature. So no file is generated except if it is in this list. 1046my @output_mapped_properties = split "\n", <<END; 1047END 1048 1049# If you want more Unihan properties than the default, you need to add them to 1050# these arrays. Depending on the property type, @missing lines might have to 1051# be added to the second array. A sample entry would be (including the '#'): 1052# @missing: 0000..10FFFF; cjkAccountingNumeric; NaN 1053my @cjk_properties = split "\n", <<'END'; 1054END 1055my @cjk_property_values = split "\n", <<'END'; 1056END 1057 1058# The input files don't list every code point. Those not listed are to be 1059# defaulted to some value. Below are hard-coded what those values are for 1060# non-binary properties as of 5.1. Starting in 5.0, there are 1061# machine-parsable comment lines in the files that give the defaults; so this 1062# list shouldn't have to be extended. The claim is that all missing entries 1063# for binary properties will default to 'N'. Unicode tried to change that in 1064# 5.2, but the beta period produced enough protest that they backed off. 1065# 1066# The defaults for the fields that appear in UnicodeData.txt in this hash must 1067# be in the form that it expects. The others may be synonyms. 1068my $CODE_POINT = '<code point>'; 1069my %default_mapping = ( 1070 Age => "Unassigned", 1071 # Bidi_Class => Complicated; set in code 1072 Bidi_Mirroring_Glyph => "", 1073 Block => 'No_Block', 1074 Canonical_Combining_Class => 0, 1075 Case_Folding => $CODE_POINT, 1076 Decomposition_Mapping => $CODE_POINT, 1077 Decomposition_Type => 'None', 1078 East_Asian_Width => "Neutral", 1079 FC_NFKC_Closure => $CODE_POINT, 1080 General_Category => ($v_version le 6.3.0) ? 'Cn' : 'Unassigned', 1081 Grapheme_Cluster_Break => 'Other', 1082 Hangul_Syllable_Type => 'NA', 1083 ISO_Comment => "", 1084 Jamo_Short_Name => "", 1085 Joining_Group => "No_Joining_Group", 1086 # Joining_Type => Complicated; set in code 1087 kIICore => 'N', # Is converted to binary 1088 #Line_Break => Complicated; set in code 1089 Lowercase_Mapping => $CODE_POINT, 1090 Name => "", 1091 Name_Alias => "", 1092 NFC_QC => 'Yes', 1093 NFD_QC => 'Yes', 1094 NFKC_QC => 'Yes', 1095 NFKD_QC => 'Yes', 1096 Numeric_Type => 'None', 1097 Numeric_Value => 'NaN', 1098 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown', 1099 Sentence_Break => 'Other', 1100 Simple_Case_Folding => $CODE_POINT, 1101 Simple_Lowercase_Mapping => $CODE_POINT, 1102 Simple_Titlecase_Mapping => $CODE_POINT, 1103 Simple_Uppercase_Mapping => $CODE_POINT, 1104 Titlecase_Mapping => $CODE_POINT, 1105 Unicode_1_Name => "", 1106 Unicode_Radical_Stroke => "", 1107 Uppercase_Mapping => $CODE_POINT, 1108 Word_Break => 'Other', 1109); 1110 1111### End of externally interesting definitions, except for @input_file_objects 1112 1113my $HEADER=<<"EOF"; 1114# !!!!!!! DO NOT EDIT THIS FILE !!!!!!! 1115# This file is machine-generated by $0 from the Unicode 1116# database, Version $unicode_version. Any changes made here will be lost! 1117EOF 1118 1119my $INTERNAL_ONLY_HEADER = <<"EOF"; 1120 1121# !!!!!!! INTERNAL PERL USE ONLY !!!!!!! 1122# This file is for internal use by core Perl only. The format and even the 1123# name or existence of this file are subject to change without notice. Don't 1124# use it directly. Use Unicode::UCD to access the Unicode character data 1125# base. 1126EOF 1127 1128my $DEVELOPMENT_ONLY=<<"EOF"; 1129# !!!!!!! DEVELOPMENT USE ONLY !!!!!!! 1130# This file contains information artificially constrained to code points 1131# present in Unicode release $string_compare_versions. 1132# IT CANNOT BE RELIED ON. It is for use during development only and should 1133# not be used for production. 1134 1135EOF 1136 1137my $MAX_UNICODE_CODEPOINT_STRING = ($v_version ge v2.0.0) 1138 ? "10FFFF" 1139 : "FFFF"; 1140my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING; 1141my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1; 1142 1143# We work with above-Unicode code points, up to IV_MAX, but we may want to use 1144# sentinels above that number. Therefore for internal use, we use a much 1145# smaller number, translating it to IV_MAX only for output. The exact number 1146# is immaterial (all above-Unicode code points are treated exactly the same), 1147# but the algorithm requires it to be at least 1148# 2 * $MAX_UNICODE_CODEPOINTS + 1 1149my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8; 1150my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1; 1151my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT); 1152 1153my $MAX_PLATFORM_CODEPOINT = ~0 >> 1; 1154 1155# Matches legal code point. 4-6 hex numbers, If there are 6, the first 1156# two must be 10; if there are 5, the first must not be a 0. Written this way 1157# to decrease backtracking. The first regex allows the code point to be at 1158# the end of a word, but to work properly, the word shouldn't end with a valid 1159# hex character. The second one won't match a code point at the end of a 1160# word, and doesn't have the run-on issue 1161my $run_on_code_point_re = 1162 qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x; 1163my $code_point_re = qr/\b$run_on_code_point_re/; 1164 1165# This matches the beginning of the line in the Unicode DB files that give the 1166# defaults for code points not listed (i.e., missing) in the file. The code 1167# depends on this ending with a semi-colon, so it can assume it is a valid 1168# field when the line is split() by semi-colons 1169my $missing_defaults_prefix = qr/^#\s+\@missing:\s+0000\.\.10FFFF\s*;/; 1170 1171# Property types. Unicode has more types, but these are sufficient for our 1172# purposes. 1173my $UNKNOWN = -1; # initialized to illegal value 1174my $NON_STRING = 1; # Either binary or enum 1175my $BINARY = 2; 1176my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal 1177 # tables, additional true and false tables are 1178 # generated so that false is anything matching the 1179 # default value, and true is everything else. 1180my $ENUM = 4; # Include catalog 1181my $STRING = 5; # Anything else: string or misc 1182 1183# Some input files have lines that give default values for code points not 1184# contained in the file. Sometimes these should be ignored. 1185my $NO_DEFAULTS = 0; # Must evaluate to false 1186my $NOT_IGNORED = 1; 1187my $IGNORED = 2; 1188 1189# Range types. Each range has a type. Most ranges are type 0, for normal, 1190# and will appear in the main body of the tables in the output files, but 1191# there are other types of ranges as well, listed below, that are specially 1192# handled. There are pseudo-types as well that will never be stored as a 1193# type, but will affect the calculation of the type. 1194 1195# 0 is for normal, non-specials 1196my $MULTI_CP = 1; # Sequence of more than code point 1197my $HANGUL_SYLLABLE = 2; 1198my $CP_IN_NAME = 3; # The NAME contains the code point appended to it. 1199my $NULL = 4; # The map is to the null string; utf8.c can't 1200 # handle these, nor is there an accepted syntax 1201 # for them in \p{} constructs 1202my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would 1203 # otherwise be $MULTI_CP type are instead type 0 1204 1205# process_generic_property_file() can accept certain overrides in its input. 1206# Each of these must begin AND end with $CMD_DELIM. 1207my $CMD_DELIM = "\a"; 1208my $REPLACE_CMD = 'replace'; # Override the Replace 1209my $MAP_TYPE_CMD = 'map_type'; # Override the Type 1210 1211my $NO = 0; 1212my $YES = 1; 1213 1214# Values for the Replace argument to add_range. 1215# $NO # Don't replace; add only the code points not 1216 # already present. 1217my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in 1218 # the comments at the subroutine definition. 1219my $UNCONDITIONALLY = 2; # Replace without conditions. 1220my $MULTIPLE_BEFORE = 4; # Don't replace, but add a duplicate record if 1221 # already there 1222my $MULTIPLE_AFTER = 5; # Don't replace, but add a duplicate record if 1223 # already there 1224my $CROAK = 6; # Die with an error if is already there 1225 1226# Flags to give property statuses. The phrases are to remind maintainers that 1227# if the flag is changed, the indefinite article referring to it in the 1228# documentation may need to be as well. 1229my $NORMAL = ""; 1230my $DEPRECATED = 'D'; 1231my $a_bold_deprecated = "a 'B<$DEPRECATED>'"; 1232my $A_bold_deprecated = "A 'B<$DEPRECATED>'"; 1233my $DISCOURAGED = 'X'; 1234my $a_bold_discouraged = "an 'B<$DISCOURAGED>'"; 1235my $A_bold_discouraged = "An 'B<$DISCOURAGED>'"; 1236my $STRICTER = 'T'; 1237my $a_bold_stricter = "a 'B<$STRICTER>'"; 1238my $A_bold_stricter = "A 'B<$STRICTER>'"; 1239my $STABILIZED = 'S'; 1240my $a_bold_stabilized = "an 'B<$STABILIZED>'"; 1241my $A_bold_stabilized = "An 'B<$STABILIZED>'"; 1242my $OBSOLETE = 'O'; 1243my $a_bold_obsolete = "an 'B<$OBSOLETE>'"; 1244my $A_bold_obsolete = "An 'B<$OBSOLETE>'"; 1245 1246# Aliases can also have an extra status: 1247my $INTERNAL_ALIAS = 'P'; 1248 1249my %status_past_participles = ( 1250 $DISCOURAGED => 'discouraged', 1251 $STABILIZED => 'stabilized', 1252 $OBSOLETE => 'obsolete', 1253 $DEPRECATED => 'deprecated', 1254 $INTERNAL_ALIAS => 'reserved for Perl core internal use only', 1255); 1256 1257# Table fates. These are somewhat ordered, so that fates < $MAP_PROXIED should be 1258# externally documented. 1259my $ORDINARY = 0; # The normal fate. 1260my $MAP_PROXIED = 1; # The map table for the property isn't written out, 1261 # but there is a file written that can be used to 1262 # reconstruct this table 1263my $INTERNAL_ONLY = 2; # The file for this table is written out, but it is 1264 # for Perl's internal use only 1265my $LEGACY_ONLY = 3; # Like $INTERNAL_ONLY, but not actually used by Perl. 1266 # Is for backwards compatibility for applications that 1267 # read the file directly, so it's format is 1268 # unchangeable. 1269my $SUPPRESSED = 4; # The file for this table is not written out, and as a 1270 # result, we don't bother to do many computations on 1271 # it. 1272my $PLACEHOLDER = 5; # Like $SUPPRESSED, but we go through all the 1273 # computations anyway, as the values are needed for 1274 # things to work. This happens when we have Perl 1275 # extensions that depend on Unicode tables that 1276 # wouldn't normally be in a given Unicode version. 1277 1278# The format of the values of the tables: 1279my $EMPTY_FORMAT = ""; 1280my $BINARY_FORMAT = 'b'; 1281my $DECIMAL_FORMAT = 'd'; 1282my $FLOAT_FORMAT = 'f'; 1283my $INTEGER_FORMAT = 'i'; 1284my $HEX_FORMAT = 'x'; 1285my $RATIONAL_FORMAT = 'r'; 1286my $STRING_FORMAT = 's'; 1287my $ADJUST_FORMAT = 'a'; 1288my $HEX_ADJUST_FORMAT = 'ax'; 1289my $DECOMP_STRING_FORMAT = 'c'; 1290my $STRING_WHITE_SPACE_LIST = 'sw'; 1291 1292my %map_table_formats = ( 1293 $BINARY_FORMAT => 'binary', 1294 $DECIMAL_FORMAT => 'single decimal digit', 1295 $FLOAT_FORMAT => 'floating point number', 1296 $INTEGER_FORMAT => 'integer', 1297 $HEX_FORMAT => 'non-negative hex whole number; a code point', 1298 $RATIONAL_FORMAT => 'rational: an integer or a fraction', 1299 $STRING_FORMAT => 'string', 1300 $ADJUST_FORMAT => 'some entries need adjustment', 1301 $HEX_ADJUST_FORMAT => 'mapped value in hex; some entries need adjustment', 1302 $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping', 1303 $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators' 1304); 1305 1306# Unicode didn't put such derived files in a separate directory at first. 1307my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : ""; 1308my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : ""; 1309my $AUXILIARY = 'auxiliary'; 1310 1311# Hashes and arrays that will eventually go into Heavy.pl for the use of 1312# utf8_heavy.pl and into UCD.pl for the use of UCD.pm 1313my %loose_to_file_of; # loosely maps table names to their respective 1314 # files 1315my %stricter_to_file_of; # same; but for stricter mapping. 1316my %loose_property_to_file_of; # Maps a loose property name to its map file 1317my %strict_property_to_file_of; # Same, but strict 1318my @inline_definitions = "V0"; # Each element gives a definition of a unique 1319 # inversion list. When a definition is inlined, 1320 # its value in the hash it's in (one of the two 1321 # defined just above) will include an index into 1322 # this array. The 0th element is initialized to 1323 # the definition for a zero length inversion list 1324my %file_to_swash_name; # Maps the file name to its corresponding key name 1325 # in the hash %utf8::SwashInfo 1326my %nv_floating_to_rational; # maps numeric values floating point numbers to 1327 # their rational equivalent 1328my %loose_property_name_of; # Loosely maps (non_string) property names to 1329 # standard form 1330my %strict_property_name_of; # Strictly maps (non_string) property names to 1331 # standard form 1332my %string_property_loose_to_name; # Same, for string properties. 1333my %loose_defaults; # keys are of form "prop=value", where 'prop' is 1334 # the property name in standard loose form, and 1335 # 'value' is the default value for that property, 1336 # also in standard loose form. 1337my %loose_to_standard_value; # loosely maps table names to the canonical 1338 # alias for them 1339my %ambiguous_names; # keys are alias names (in standard form) that 1340 # have more than one possible meaning. 1341my %combination_property; # keys are alias names (in standard form) that 1342 # have both a map table, and a binary one that 1343 # yields true for all non-null maps. 1344my %prop_aliases; # Keys are standard property name; values are each 1345 # one's aliases 1346my %prop_value_aliases; # Keys of top level are standard property name; 1347 # values are keys to another hash, Each one is 1348 # one of the property's values, in standard form. 1349 # The values are that prop-val's aliases. 1350my %skipped_files; # List of files that we skip 1351my %ucd_pod; # Holds entries that will go into the UCD section of the pod 1352 1353# Most properties are immune to caseless matching, otherwise you would get 1354# nonsensical results, as properties are a function of a code point, not 1355# everything that is caselessly equivalent to that code point. For example, 1356# Changes_When_Case_Folded('s') should be false, whereas caselessly it would 1357# be true because 's' and 'S' are equivalent caselessly. However, 1358# traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we 1359# extend that concept to those very few properties that are like this. Each 1360# such property will match the full range caselessly. They are hard-coded in 1361# the program; it's not worth trying to make it general as it's extremely 1362# unlikely that they will ever change. 1363my %caseless_equivalent_to; 1364 1365# This is the range of characters that were in Release 1 of Unicode, and 1366# removed in Release 2 (replaced with the current Hangul syllables starting at 1367# U+AC00). The range was reused starting in Release 3 for other purposes. 1368my $FIRST_REMOVED_HANGUL_SYLLABLE = 0x3400; 1369my $FINAL_REMOVED_HANGUL_SYLLABLE = 0x4DFF; 1370 1371# These constants names and values were taken from the Unicode standard, 1372# version 5.1, section 3.12. They are used in conjunction with Hangul 1373# syllables. The '_string' versions are so generated tables can retain the 1374# hex format, which is the more familiar value 1375my $SBase_string = "0xAC00"; 1376my $SBase = CORE::hex $SBase_string; 1377my $LBase_string = "0x1100"; 1378my $LBase = CORE::hex $LBase_string; 1379my $VBase_string = "0x1161"; 1380my $VBase = CORE::hex $VBase_string; 1381my $TBase_string = "0x11A7"; 1382my $TBase = CORE::hex $TBase_string; 1383my $SCount = 11172; 1384my $LCount = 19; 1385my $VCount = 21; 1386my $TCount = 28; 1387my $NCount = $VCount * $TCount; 1388 1389# For Hangul syllables; These store the numbers from Jamo.txt in conjunction 1390# with the above published constants. 1391my %Jamo; 1392my %Jamo_L; # Leading consonants 1393my %Jamo_V; # Vowels 1394my %Jamo_T; # Trailing consonants 1395 1396# For code points whose name contains its ordinal as a '-ABCD' suffix. 1397# The key is the base name of the code point, and the value is an 1398# array giving all the ranges that use this base name. Each range 1399# is actually a hash giving the 'low' and 'high' values of it. 1400my %names_ending_in_code_point; 1401my %loose_names_ending_in_code_point; # Same as above, but has blanks, dashes 1402 # removed from the names 1403# Inverse mapping. The list of ranges that have these kinds of 1404# names. Each element contains the low, high, and base names in an 1405# anonymous hash. 1406my @code_points_ending_in_code_point; 1407 1408# To hold Unicode's normalization test suite 1409my @normalization_tests; 1410 1411# Boolean: does this Unicode version have the hangul syllables, and are we 1412# writing out a table for them? 1413my $has_hangul_syllables = 0; 1414 1415# Does this Unicode version have code points whose names end in their 1416# respective code points, and are we writing out a table for them? 0 for no; 1417# otherwise points to first property that a table is needed for them, so that 1418# if multiple tables are needed, we don't create duplicates 1419my $needing_code_points_ending_in_code_point = 0; 1420 1421my @backslash_X_tests; # List of tests read in for testing \X 1422my @LB_tests; # List of tests read in for testing \b{lb} 1423my @SB_tests; # List of tests read in for testing \b{sb} 1424my @WB_tests; # List of tests read in for testing \b{wb} 1425my @unhandled_properties; # Will contain a list of properties found in 1426 # the input that we didn't process. 1427my @match_properties; # Properties that have match tables, to be 1428 # listed in the pod 1429my @map_properties; # Properties that get map files written 1430my @named_sequences; # NamedSequences.txt contents. 1431my %potential_files; # Generated list of all .txt files in the directory 1432 # structure so we can warn if something is being 1433 # ignored. 1434my @missing_early_files; # Generated list of absent files that we need to 1435 # proceed in compiling this early Unicode version 1436my @files_actually_output; # List of files we generated. 1437my @more_Names; # Some code point names are compound; this is used 1438 # to store the extra components of them. 1439my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at 1440 # the minimum before we consider it equivalent to a 1441 # candidate rational 1442my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms 1443 1444# These store references to certain commonly used property objects 1445my $age; 1446my $ccc; 1447my $gc; 1448my $perl; 1449my $block; 1450my $perl_charname; 1451my $print; 1452my $All; 1453my $Assigned; # All assigned characters in this Unicode release 1454my $DI; # Default_Ignorable_Code_Point property 1455my $NChar; # Noncharacter_Code_Point property 1456my $script; 1457my $scx; # Script_Extensions property 1458 1459# Are there conflicting names because of beginning with 'In_', or 'Is_' 1460my $has_In_conflicts = 0; 1461my $has_Is_conflicts = 0; 1462 1463sub internal_file_to_platform ($) { 1464 # Convert our file paths which have '/' separators to those of the 1465 # platform. 1466 1467 my $file = shift; 1468 return undef unless defined $file; 1469 1470 return File::Spec->join(split '/', $file); 1471} 1472 1473sub file_exists ($) { # platform independent '-e'. This program internally 1474 # uses slash as a path separator. 1475 my $file = shift; 1476 return 0 if ! defined $file; 1477 return -e internal_file_to_platform($file); 1478} 1479 1480sub objaddr($) { 1481 # Returns the address of the blessed input object. 1482 # It doesn't check for blessedness because that would do a string eval 1483 # every call, and the program is structured so that this is never called 1484 # for a non-blessed object. 1485 1486 no overloading; # If overloaded, numifying below won't work. 1487 1488 # Numifying a ref gives its address. 1489 return pack 'J', $_[0]; 1490} 1491 1492# These are used only if $annotate is true. 1493# The entire range of Unicode characters is examined to populate these 1494# after all the input has been processed. But most can be skipped, as they 1495# have the same descriptive phrases, such as being unassigned 1496my @viacode; # Contains the 1 million character names 1497my @age; # And their ages ("" if none) 1498my @printable; # boolean: And are those characters printable? 1499my @annotate_char_type; # Contains a type of those characters, specifically 1500 # for the purposes of annotation. 1501my $annotate_ranges; # A map of ranges of code points that have the same 1502 # name for the purposes of annotation. They map to the 1503 # upper edge of the range, so that the end point can 1504 # be immediately found. This is used to skip ahead to 1505 # the end of a range, and avoid processing each 1506 # individual code point in it. 1507my $unassigned_sans_noncharacters; # A Range_List of the unassigned 1508 # characters, but excluding those which are 1509 # also noncharacter code points 1510 1511# The annotation types are an extension of the regular range types, though 1512# some of the latter are folded into one. Make the new types negative to 1513# avoid conflicting with the regular types 1514my $SURROGATE_TYPE = -1; 1515my $UNASSIGNED_TYPE = -2; 1516my $PRIVATE_USE_TYPE = -3; 1517my $NONCHARACTER_TYPE = -4; 1518my $CONTROL_TYPE = -5; 1519my $ABOVE_UNICODE_TYPE = -6; 1520my $UNKNOWN_TYPE = -7; # Used only if there is a bug in this program 1521 1522sub populate_char_info ($) { 1523 # Used only with the $annotate option. Populates the arrays with the 1524 # input code point's info that are needed for outputting more detailed 1525 # comments. If calling context wants a return, it is the end point of 1526 # any contiguous range of characters that share essentially the same info 1527 1528 my $i = shift; 1529 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 1530 1531 $viacode[$i] = $perl_charname->value_of($i) || ""; 1532 $age[$i] = (defined $age) 1533 ? (($age->value_of($i) =~ / ^ \d+ \. \d+ $ /x) 1534 ? $age->value_of($i) 1535 : "") 1536 : ""; 1537 1538 # A character is generally printable if Unicode says it is, 1539 # but below we make sure that most Unicode general category 'C' types 1540 # aren't. 1541 $printable[$i] = $print->contains($i); 1542 1543 # But the characters in this range were removed in v2.0 and replaced by 1544 # different ones later. Modern fonts will be for the replacement 1545 # characters, so suppress printing them. 1546 if (($v_version lt v2.0 1547 || ($compare_versions && $compare_versions lt v2.0)) 1548 && ( $i >= $FIRST_REMOVED_HANGUL_SYLLABLE 1549 && $i <= $FINAL_REMOVED_HANGUL_SYLLABLE)) 1550 { 1551 $printable[$i] = 0; 1552 } 1553 1554 $annotate_char_type[$i] = $perl_charname->type_of($i) || 0; 1555 1556 # Only these two regular types are treated specially for annotations 1557 # purposes 1558 $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME 1559 && $annotate_char_type[$i] != $HANGUL_SYLLABLE; 1560 1561 # Give a generic name to all code points that don't have a real name. 1562 # We output ranges, if applicable, for these. Also calculate the end 1563 # point of the range. 1564 my $end; 1565 if (! $viacode[$i]) { 1566 if ($i > $MAX_UNICODE_CODEPOINT) { 1567 $viacode[$i] = 'Above-Unicode'; 1568 $annotate_char_type[$i] = $ABOVE_UNICODE_TYPE; 1569 $printable[$i] = 0; 1570 $end = $MAX_WORKING_CODEPOINT; 1571 } 1572 elsif ($gc-> table('Private_use')->contains($i)) { 1573 $viacode[$i] = 'Private Use'; 1574 $annotate_char_type[$i] = $PRIVATE_USE_TYPE; 1575 $printable[$i] = 0; 1576 $end = $gc->table('Private_Use')->containing_range($i)->end; 1577 } 1578 elsif ($NChar->contains($i)) { 1579 $viacode[$i] = 'Noncharacter'; 1580 $annotate_char_type[$i] = $NONCHARACTER_TYPE; 1581 $printable[$i] = 0; 1582 $end = $NChar->containing_range($i)->end; 1583 } 1584 elsif ($gc-> table('Control')->contains($i)) { 1585 my $name_ref = property_ref('Name_Alias'); 1586 $name_ref = property_ref('Unicode_1_Name') if ! defined $name_ref; 1587 $viacode[$i] = (defined $name_ref) 1588 ? $name_ref->value_of($i) 1589 : 'Control'; 1590 $annotate_char_type[$i] = $CONTROL_TYPE; 1591 $printable[$i] = 0; 1592 } 1593 elsif ($gc-> table('Unassigned')->contains($i)) { 1594 $annotate_char_type[$i] = $UNASSIGNED_TYPE; 1595 $printable[$i] = 0; 1596 $viacode[$i] = 'Unassigned'; 1597 1598 if (defined $block) { # No blocks in earliest releases 1599 $viacode[$i] .= ', block=' . $block-> value_of($i); 1600 $end = $gc-> table('Unassigned')->containing_range($i)->end; 1601 1602 # Because we name the unassigned by the blocks they are in, it 1603 # can't go past the end of that block, and it also can't go 1604 # past the unassigned range it is in. The special table makes 1605 # sure that the non-characters, which are unassigned, are 1606 # separated out. 1607 $end = min($block->containing_range($i)->end, 1608 $unassigned_sans_noncharacters-> 1609 containing_range($i)->end); 1610 } 1611 else { 1612 $end = $i + 1; 1613 while ($unassigned_sans_noncharacters->contains($end)) { 1614 $end++; 1615 } 1616 $end--; 1617 } 1618 } 1619 elsif ($perl->table('_Perl_Surrogate')->contains($i)) { 1620 $viacode[$i] = 'Surrogate'; 1621 $annotate_char_type[$i] = $SURROGATE_TYPE; 1622 $printable[$i] = 0; 1623 $end = $gc->table('Surrogate')->containing_range($i)->end; 1624 } 1625 else { 1626 Carp::my_carp_bug("Can't figure out how to annotate " 1627 . sprintf("U+%04X", $i) 1628 . ". Proceeding anyway."); 1629 $viacode[$i] = 'UNKNOWN'; 1630 $annotate_char_type[$i] = $UNKNOWN_TYPE; 1631 $printable[$i] = 0; 1632 } 1633 } 1634 1635 # Here, has a name, but if it's one in which the code point number is 1636 # appended to the name, do that. 1637 elsif ($annotate_char_type[$i] == $CP_IN_NAME) { 1638 $viacode[$i] .= sprintf("-%04X", $i); 1639 1640 my $limit = $perl_charname->containing_range($i)->end; 1641 if (defined $age) { 1642 # Do all these as groups of the same age, instead of individually, 1643 # because their names are so meaningless, and there are typically 1644 # large quantities of them. 1645 $end = $i + 1; 1646 while ($end <= $limit && $age->value_of($end) == $age[$i]) { 1647 $end++; 1648 } 1649 $end--; 1650 } 1651 else { 1652 $end = $limit; 1653 } 1654 } 1655 1656 # And here, has a name, but if it's a hangul syllable one, replace it with 1657 # the correct name from the Unicode algorithm 1658 elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) { 1659 use integer; 1660 my $SIndex = $i - $SBase; 1661 my $L = $LBase + $SIndex / $NCount; 1662 my $V = $VBase + ($SIndex % $NCount) / $TCount; 1663 my $T = $TBase + $SIndex % $TCount; 1664 $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}"; 1665 $viacode[$i] .= $Jamo{$T} if $T != $TBase; 1666 $end = $perl_charname->containing_range($i)->end; 1667 } 1668 1669 return if ! defined wantarray; 1670 return $i if ! defined $end; # If not a range, return the input 1671 1672 # Save this whole range so can find the end point quickly 1673 $annotate_ranges->add_map($i, $end, $end); 1674 1675 return $end; 1676} 1677 1678# Commented code below should work on Perl 5.8. 1679## This 'require' doesn't necessarily work in miniperl, and even if it does, 1680## the native perl version of it (which is what would operate under miniperl) 1681## is extremely slow, as it does a string eval every call. 1682#my $has_fast_scalar_util = $^X !~ /miniperl/ 1683# && defined eval "require Scalar::Util"; 1684# 1685#sub objaddr($) { 1686# # Returns the address of the blessed input object. Uses the XS version if 1687# # available. It doesn't check for blessedness because that would do a 1688# # string eval every call, and the program is structured so that this is 1689# # never called for a non-blessed object. 1690# 1691# return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util; 1692# 1693# # Check at least that is a ref. 1694# my $pkg = ref($_[0]) or return undef; 1695# 1696# # Change to a fake package to defeat any overloaded stringify 1697# bless $_[0], 'main::Fake'; 1698# 1699# # Numifying a ref gives its address. 1700# my $addr = pack 'J', $_[0]; 1701# 1702# # Return to original class 1703# bless $_[0], $pkg; 1704# return $addr; 1705#} 1706 1707sub max ($$) { 1708 my $a = shift; 1709 my $b = shift; 1710 return $a if $a >= $b; 1711 return $b; 1712} 1713 1714sub min ($$) { 1715 my $a = shift; 1716 my $b = shift; 1717 return $a if $a <= $b; 1718 return $b; 1719} 1720 1721sub clarify_number ($) { 1722 # This returns the input number with underscores inserted every 3 digits 1723 # in large (5 digits or more) numbers. Input must be entirely digits, not 1724 # checked. 1725 1726 my $number = shift; 1727 my $pos = length($number) - 3; 1728 return $number if $pos <= 1; 1729 while ($pos > 0) { 1730 substr($number, $pos, 0) = '_'; 1731 $pos -= 3; 1732 } 1733 return $number; 1734} 1735 1736sub clarify_code_point_count ($) { 1737 # This is like clarify_number(), but the input is assumed to be a count of 1738 # code points, rather than a generic number. 1739 1740 my $append = ""; 1741 1742 my $number = shift; 1743 if ($number > $MAX_UNICODE_CODEPOINTS) { 1744 $number -= ($MAX_WORKING_CODEPOINTS - $MAX_UNICODE_CODEPOINTS); 1745 return "All above-Unicode code points" if $number == 0; 1746 $append = " + all above-Unicode code points"; 1747 } 1748 return clarify_number($number) . $append; 1749} 1750 1751package Carp; 1752 1753# These routines give a uniform treatment of messages in this program. They 1754# are placed in the Carp package to cause the stack trace to not include them, 1755# although an alternative would be to use another package and set @CARP_NOT 1756# for it. 1757 1758our $Verbose = 1 if main::DEBUG; # Useful info when debugging 1759 1760# This is a work-around suggested by Nicholas Clark to fix a problem with Carp 1761# and overload trying to load Scalar:Util under miniperl. See 1762# http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html 1763undef $overload::VERSION; 1764 1765sub my_carp { 1766 my $message = shift || ""; 1767 my $nofold = shift || 0; 1768 1769 if ($message) { 1770 $message = main::join_lines($message); 1771 $message =~ s/^$0: *//; # Remove initial program name 1772 $message =~ s/[.;,]+$//; # Remove certain ending punctuation 1773 $message = "\n$0: $message;"; 1774 1775 # Fold the message with program name, semi-colon end punctuation 1776 # (which looks good with the message that carp appends to it), and a 1777 # hanging indent for continuation lines. 1778 $message = main::simple_fold($message, "", 4) unless $nofold; 1779 $message =~ s/\n$//; # Remove the trailing nl so what carp 1780 # appends is to the same line 1781 } 1782 1783 return $message if defined wantarray; # If a caller just wants the msg 1784 1785 carp $message; 1786 return; 1787} 1788 1789sub my_carp_bug { 1790 # This is called when it is clear that the problem is caused by a bug in 1791 # this program. 1792 1793 my $message = shift; 1794 $message =~ s/^$0: *//; 1795 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message"); 1796 carp $message; 1797 return; 1798} 1799 1800sub carp_too_few_args { 1801 if (@_ != 2) { 1802 my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'. No action taken."); 1803 return; 1804 } 1805 1806 my $args_ref = shift; 1807 my $count = shift; 1808 1809 my_carp_bug("Need at least $count arguments to " 1810 . (caller 1)[3] 1811 . ". Instead got: '" 1812 . join ', ', @$args_ref 1813 . "'. No action taken."); 1814 return; 1815} 1816 1817sub carp_extra_args { 1818 my $args_ref = shift; 1819 my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . "); Extras ignored.") if @_; 1820 1821 unless (ref $args_ref) { 1822 my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments."); 1823 return; 1824 } 1825 my ($package, $file, $line) = caller; 1826 my $subroutine = (caller 1)[3]; 1827 1828 my $list; 1829 if (ref $args_ref eq 'HASH') { 1830 foreach my $key (keys %$args_ref) { 1831 $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key}; 1832 } 1833 $list = join ', ', each %{$args_ref}; 1834 } 1835 elsif (ref $args_ref eq 'ARRAY') { 1836 foreach my $arg (@$args_ref) { 1837 $arg = $UNDEF unless defined $arg; 1838 } 1839 $list = join ', ', @$args_ref; 1840 } 1841 else { 1842 my_carp_bug("Can't cope with ref " 1843 . ref($args_ref) 1844 . " . argument to 'carp_extra_args'. Not checking arguments."); 1845 return; 1846 } 1847 1848 my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped."); 1849 return; 1850} 1851 1852package main; 1853 1854{ # Closure 1855 1856 # This program uses the inside-out method for objects, as recommended in 1857 # "Perl Best Practices". (This is the best solution still, since this has 1858 # to run under miniperl.) This closure aids in generating those. There 1859 # are two routines. setup_package() is called once per package to set 1860 # things up, and then set_access() is called for each hash representing a 1861 # field in the object. These routines arrange for the object to be 1862 # properly destroyed when no longer used, and for standard accessor 1863 # functions to be generated. If you need more complex accessors, just 1864 # write your own and leave those accesses out of the call to set_access(). 1865 # More details below. 1866 1867 my %constructor_fields; # fields that are to be used in constructors; see 1868 # below 1869 1870 # The values of this hash will be the package names as keys to other 1871 # hashes containing the name of each field in the package as keys, and 1872 # references to their respective hashes as values. 1873 my %package_fields; 1874 1875 sub setup_package { 1876 # Sets up the package, creating standard DESTROY and dump methods 1877 # (unless already defined). The dump method is used in debugging by 1878 # simple_dumper(). 1879 # The optional parameters are: 1880 # a) a reference to a hash, that gets populated by later 1881 # set_access() calls with one of the accesses being 1882 # 'constructor'. The caller can then refer to this, but it is 1883 # not otherwise used by these two routines. 1884 # b) a reference to a callback routine to call during destruction 1885 # of the object, before any fields are actually destroyed 1886 1887 my %args = @_; 1888 my $constructor_ref = delete $args{'Constructor_Fields'}; 1889 my $destroy_callback = delete $args{'Destroy_Callback'}; 1890 Carp::carp_extra_args(\@_) if main::DEBUG && %args; 1891 1892 my %fields; 1893 my $package = (caller)[0]; 1894 1895 $package_fields{$package} = \%fields; 1896 $constructor_fields{$package} = $constructor_ref; 1897 1898 unless ($package->can('DESTROY')) { 1899 my $destroy_name = "${package}::DESTROY"; 1900 no strict "refs"; 1901 1902 # Use typeglob to give the anonymous subroutine the name we want 1903 *$destroy_name = sub { 1904 my $self = shift; 1905 my $addr = do { no overloading; pack 'J', $self; }; 1906 1907 $self->$destroy_callback if $destroy_callback; 1908 foreach my $field (keys %{$package_fields{$package}}) { 1909 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n"; 1910 delete $package_fields{$package}{$field}{$addr}; 1911 } 1912 return; 1913 } 1914 } 1915 1916 unless ($package->can('dump')) { 1917 my $dump_name = "${package}::dump"; 1918 no strict "refs"; 1919 *$dump_name = sub { 1920 my $self = shift; 1921 return dump_inside_out($self, $package_fields{$package}, @_); 1922 } 1923 } 1924 return; 1925 } 1926 1927 sub set_access { 1928 # Arrange for the input field to be garbage collected when no longer 1929 # needed. Also, creates standard accessor functions for the field 1930 # based on the optional parameters-- none if none of these parameters: 1931 # 'addable' creates an 'add_NAME()' accessor function. 1932 # 'readable' or 'readable_array' creates a 'NAME()' accessor 1933 # function. 1934 # 'settable' creates a 'set_NAME()' accessor function. 1935 # 'constructor' doesn't create an accessor function, but adds the 1936 # field to the hash that was previously passed to 1937 # setup_package(); 1938 # Any of the accesses can be abbreviated down, so that 'a', 'ad', 1939 # 'add' etc. all mean 'addable'. 1940 # The read accessor function will work on both array and scalar 1941 # values. If another accessor in the parameter list is 'a', the read 1942 # access assumes an array. You can also force it to be array access 1943 # by specifying 'readable_array' instead of 'readable' 1944 # 1945 # A sort-of 'protected' access can be set-up by preceding the addable, 1946 # readable or settable with some initial portion of 'protected_' (but, 1947 # the underscore is required), like 'p_a', 'pro_set', etc. The 1948 # "protection" is only by convention. All that happens is that the 1949 # accessor functions' names begin with an underscore. So instead of 1950 # calling set_foo, the call is _set_foo. (Real protection could be 1951 # accomplished by having a new subroutine, end_package, called at the 1952 # end of each package, and then storing the __LINE__ ranges and 1953 # checking them on every accessor. But that is way overkill.) 1954 1955 # We create anonymous subroutines as the accessors and then use 1956 # typeglobs to assign them to the proper package and name 1957 1958 my $name = shift; # Name of the field 1959 my $field = shift; # Reference to the inside-out hash containing the 1960 # field 1961 1962 my $package = (caller)[0]; 1963 1964 if (! exists $package_fields{$package}) { 1965 croak "$0: Must call 'setup_package' before 'set_access'"; 1966 } 1967 1968 # Stash the field so DESTROY can get it. 1969 $package_fields{$package}{$name} = $field; 1970 1971 # Remaining arguments are the accessors. For each... 1972 foreach my $access (@_) { 1973 my $access = lc $access; 1974 1975 my $protected = ""; 1976 1977 # Match the input as far as it goes. 1978 if ($access =~ /^(p[^_]*)_/) { 1979 $protected = $1; 1980 if (substr('protected_', 0, length $protected) 1981 eq $protected) 1982 { 1983 1984 # Add 1 for the underscore not included in $protected 1985 $access = substr($access, length($protected) + 1); 1986 $protected = '_'; 1987 } 1988 else { 1989 $protected = ""; 1990 } 1991 } 1992 1993 if (substr('addable', 0, length $access) eq $access) { 1994 my $subname = "${package}::${protected}add_$name"; 1995 no strict "refs"; 1996 1997 # add_ accessor. Don't add if already there, which we 1998 # determine using 'eq' for scalars and '==' otherwise. 1999 *$subname = sub { 2000 use strict "refs"; 2001 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2; 2002 my $self = shift; 2003 my $value = shift; 2004 my $addr = do { no overloading; pack 'J', $self; }; 2005 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 2006 if (ref $value) { 2007 return if grep { $value == $_ } @{$field->{$addr}}; 2008 } 2009 else { 2010 return if grep { $value eq $_ } @{$field->{$addr}}; 2011 } 2012 push @{$field->{$addr}}, $value; 2013 return; 2014 } 2015 } 2016 elsif (substr('constructor', 0, length $access) eq $access) { 2017 if ($protected) { 2018 Carp::my_carp_bug("Can't set-up 'protected' constructors") 2019 } 2020 else { 2021 $constructor_fields{$package}{$name} = $field; 2022 } 2023 } 2024 elsif (substr('readable_array', 0, length $access) eq $access) { 2025 2026 # Here has read access. If one of the other parameters for 2027 # access is array, or this one specifies array (by being more 2028 # than just 'readable_'), then create a subroutine that 2029 # assumes the data is an array. Otherwise just a scalar 2030 my $subname = "${package}::${protected}$name"; 2031 if (grep { /^a/i } @_ 2032 or length($access) > length('readable_')) 2033 { 2034 no strict "refs"; 2035 *$subname = sub { 2036 use strict "refs"; 2037 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1; 2038 my $addr = do { no overloading; pack 'J', $_[0]; }; 2039 if (ref $field->{$addr} ne 'ARRAY') { 2040 my $type = ref $field->{$addr}; 2041 $type = 'scalar' unless $type; 2042 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems."); 2043 return; 2044 } 2045 return scalar @{$field->{$addr}} unless wantarray; 2046 2047 # Make a copy; had problems with caller modifying the 2048 # original otherwise 2049 my @return = @{$field->{$addr}}; 2050 return @return; 2051 } 2052 } 2053 else { 2054 2055 # Here not an array value, a simpler function. 2056 no strict "refs"; 2057 *$subname = sub { 2058 use strict "refs"; 2059 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1; 2060 no overloading; 2061 return $field->{pack 'J', $_[0]}; 2062 } 2063 } 2064 } 2065 elsif (substr('settable', 0, length $access) eq $access) { 2066 my $subname = "${package}::${protected}set_$name"; 2067 no strict "refs"; 2068 *$subname = sub { 2069 use strict "refs"; 2070 if (main::DEBUG) { 2071 return Carp::carp_too_few_args(\@_, 2) if @_ < 2; 2072 Carp::carp_extra_args(\@_) if @_ > 2; 2073 } 2074 # $self is $_[0]; $value is $_[1] 2075 no overloading; 2076 $field->{pack 'J', $_[0]} = $_[1]; 2077 return; 2078 } 2079 } 2080 else { 2081 Carp::my_carp_bug("Unknown accessor type $access. No accessor set."); 2082 } 2083 } 2084 return; 2085 } 2086} 2087 2088package Input_file; 2089 2090# All input files use this object, which stores various attributes about them, 2091# and provides for convenient, uniform handling. The run method wraps the 2092# processing. It handles all the bookkeeping of opening, reading, and closing 2093# the file, returning only significant input lines. 2094# 2095# Each object gets a handler which processes the body of the file, and is 2096# called by run(). All character property files must use the generic, 2097# default handler, which has code scrubbed to handle things you might not 2098# expect, including automatic EBCDIC handling. For files that don't deal with 2099# mapping code points to a property value, such as test files, 2100# PropertyAliases, PropValueAliases, and named sequences, you can override the 2101# handler to be a custom one. Such a handler should basically be a 2102# while(next_line()) {...} loop. 2103# 2104# You can also set up handlers to 2105# 0) call during object construction time, after everything else is done 2106# 1) call before the first line is read, for pre processing 2107# 2) call to adjust each line of the input before the main handler gets 2108# them. This can be automatically generated, if appropriately simple 2109# enough, by specifying a Properties parameter in the constructor. 2110# 3) call upon EOF before the main handler exits its loop 2111# 4) call at the end, for post processing 2112# 2113# $_ is used to store the input line, and is to be filtered by the 2114# each_line_handler()s. So, if the format of the line is not in the desired 2115# format for the main handler, these are used to do that adjusting. They can 2116# be stacked (by enclosing them in an [ anonymous array ] in the constructor, 2117# so the $_ output of one is used as the input to the next. The EOF handler 2118# is also stackable, but none of the others are, but could easily be changed 2119# to be so. 2120# 2121# Some properties are used by the Perl core but aren't defined until later 2122# Unicode releases. The perl interpreter would have problems working when 2123# compiled with an earlier Unicode version that doesn't have them, so we need 2124# to define them somehow for those releases. The 'Early' constructor 2125# parameter can be used to automatically handle this. It is essentially 2126# ignored if the Unicode version being compiled has a data file for this 2127# property. Either code to execute or a file to read can be specified. 2128# Details are at the %early definition. 2129# 2130# Most of the handlers can call insert_lines() or insert_adjusted_lines() 2131# which insert the parameters as lines to be processed before the next input 2132# file line is read. This allows the EOF handler(s) to flush buffers, for 2133# example. The difference between the two routines is that the lines inserted 2134# by insert_lines() are subjected to the each_line_handler()s. (So if you 2135# called it from such a handler, you would get infinite recursion without some 2136# mechanism to prevent that.) Lines inserted by insert_adjusted_lines() go 2137# directly to the main handler without any adjustments. If the 2138# post-processing handler calls any of these, there will be no effect. Some 2139# error checking for these conditions could be added, but it hasn't been done. 2140# 2141# carp_bad_line() should be called to warn of bad input lines, which clears $_ 2142# to prevent further processing of the line. This routine will output the 2143# message as a warning once, and then keep a count of the lines that have the 2144# same message, and output that count at the end of the file's processing. 2145# This keeps the number of messages down to a manageable amount. 2146# 2147# get_missings() should be called to retrieve any @missing input lines. 2148# Messages will be raised if this isn't done if the options aren't to ignore 2149# missings. 2150 2151sub trace { return main::trace(@_); } 2152 2153{ # Closure 2154 # Keep track of fields that are to be put into the constructor. 2155 my %constructor_fields; 2156 2157 main::setup_package(Constructor_Fields => \%constructor_fields); 2158 2159 my %file; # Input file name, required 2160 main::set_access('file', \%file, qw{ c r }); 2161 2162 my %first_released; # Unicode version file was first released in, required 2163 main::set_access('first_released', \%first_released, qw{ c r }); 2164 2165 my %handler; # Subroutine to process the input file, defaults to 2166 # 'process_generic_property_file' 2167 main::set_access('handler', \%handler, qw{ c }); 2168 2169 my %property; 2170 # name of property this file is for. defaults to none, meaning not 2171 # applicable, or is otherwise determinable, for example, from each line. 2172 main::set_access('property', \%property, qw{ c r }); 2173 2174 my %optional; 2175 # This is either an unsigned number, or a list of property names. In the 2176 # former case, if it is non-zero, it means the file is optional, so if the 2177 # file is absent, no warning about that is output. In the latter case, it 2178 # is a list of properties that the file (exclusively) defines. If the 2179 # file is present, tables for those properties will be produced; if 2180 # absent, none will, even if they are listed elsewhere (namely 2181 # PropertyAliases.txt and PropValueAliases.txt) as being in this release, 2182 # and no warnings will be raised about them not being available. (And no 2183 # warning about the file itself will be raised.) 2184 main::set_access('optional', \%optional, qw{ c readable_array } ); 2185 2186 my %non_skip; 2187 # This is used for debugging, to skip processing of all but a few input 2188 # files. Add 'non_skip => 1' to the constructor for those files you want 2189 # processed when you set the $debug_skip global. 2190 main::set_access('non_skip', \%non_skip, 'c'); 2191 2192 my %skip; 2193 # This is used to skip processing of this input file (semi-) permanently. 2194 # The value should be the reason the file is being skipped. It is used 2195 # for files that we aren't planning to process anytime soon, but want to 2196 # allow to be in the directory and be checked for their names not 2197 # conflicting with any other files on a DOS 8.3 name filesystem, but to 2198 # not otherwise be processed, and to not raise a warning about not being 2199 # handled. In the constructor call, any value that evaluates to a numeric 2200 # 0 or undef means don't skip. Any other value is a string giving the 2201 # reason it is being skipped, and this will appear in generated pod. 2202 # However, an empty string reason will suppress the pod entry. 2203 # Internally, calls that evaluate to numeric 0 are changed into undef to 2204 # distinguish them from an empty string call. 2205 main::set_access('skip', \%skip, 'c', 'r'); 2206 2207 my %each_line_handler; 2208 # list of subroutines to look at and filter each non-comment line in the 2209 # file. defaults to none. The subroutines are called in order, each is 2210 # to adjust $_ for the next one, and the final one adjusts it for 2211 # 'handler' 2212 main::set_access('each_line_handler', \%each_line_handler, 'c'); 2213 2214 my %retain_trailing_comments; 2215 # This is used to not discard the comments that end data lines. This 2216 # would be used only for files with non-typical syntax, and most code here 2217 # assumes that comments have been stripped, so special handlers would have 2218 # to be written. It is assumed that the code will use these in 2219 # single-quoted contexts, and so any "'" marks in the comment will be 2220 # prefixed by a backslash. 2221 main::set_access('retain_trailing_comments', \%retain_trailing_comments, 'c'); 2222 2223 my %properties; # Optional ordered list of the properties that occur in each 2224 # meaningful line of the input file. If present, an appropriate 2225 # each_line_handler() is automatically generated and pushed onto the stack 2226 # of such handlers. This is useful when a file contains multiple 2227 # properties per line, but no other special considerations are necessary. 2228 # The special value "<ignored>" means to discard the corresponding input 2229 # field. 2230 # Any @missing lines in the file should also match this syntax; no such 2231 # files exist as of 6.3. But if it happens in a future release, the code 2232 # could be expanded to properly parse them. 2233 main::set_access('properties', \%properties, qw{ c r }); 2234 2235 my %has_missings_defaults; 2236 # ? Are there lines in the file giving default values for code points 2237 # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is 2238 # the norm, but IGNORED means it has such lines, but the handler doesn't 2239 # use them. Having these three states allows us to catch changes to the 2240 # UCD that this program should track. XXX This could be expanded to 2241 # specify the syntax for such lines, like %properties above. 2242 main::set_access('has_missings_defaults', 2243 \%has_missings_defaults, qw{ c r }); 2244 2245 my %construction_time_handler; 2246 # Subroutine to call at the end of the new method. If undef, no such 2247 # handler is called. 2248 main::set_access('construction_time_handler', 2249 \%construction_time_handler, qw{ c }); 2250 2251 my %pre_handler; 2252 # Subroutine to call before doing anything else in the file. If undef, no 2253 # such handler is called. 2254 main::set_access('pre_handler', \%pre_handler, qw{ c }); 2255 2256 my %eof_handler; 2257 # Subroutines to call upon getting an EOF on the input file, but before 2258 # that is returned to the main handler. This is to allow buffers to be 2259 # flushed. The handler is expected to call insert_lines() or 2260 # insert_adjusted() with the buffered material 2261 main::set_access('eof_handler', \%eof_handler, qw{ c }); 2262 2263 my %post_handler; 2264 # Subroutine to call after all the lines of the file are read in and 2265 # processed. If undef, no such handler is called. Note that this cannot 2266 # add lines to be processed; instead use eof_handler 2267 main::set_access('post_handler', \%post_handler, qw{ c }); 2268 2269 my %progress_message; 2270 # Message to print to display progress in lieu of the standard one 2271 main::set_access('progress_message', \%progress_message, qw{ c }); 2272 2273 my %handle; 2274 # cache open file handle, internal. Is undef if file hasn't been 2275 # processed at all, empty if has; 2276 main::set_access('handle', \%handle); 2277 2278 my %added_lines; 2279 # cache of lines added virtually to the file, internal 2280 main::set_access('added_lines', \%added_lines); 2281 2282 my %remapped_lines; 2283 # cache of lines added virtually to the file, internal 2284 main::set_access('remapped_lines', \%remapped_lines); 2285 2286 my %errors; 2287 # cache of errors found, internal 2288 main::set_access('errors', \%errors); 2289 2290 my %missings; 2291 # storage of '@missing' defaults lines 2292 main::set_access('missings', \%missings); 2293 2294 my %early; 2295 # Used for properties that must be defined (for Perl's purposes) on 2296 # versions of Unicode earlier than Unicode itself defines them. The 2297 # parameter is an array (it would be better to be a hash, but not worth 2298 # bothering about due to its rare use). 2299 # 2300 # The first element is either a code reference to call when in a release 2301 # earlier than the Unicode file is available in, or it is an alternate 2302 # file to use instead of the non-existent one. This file must have been 2303 # plunked down in the same directory as mktables. Should you be compiling 2304 # on a release that needs such a file, mktables will abort the 2305 # compilation, and tell you where to get the necessary file(s), and what 2306 # name(s) to use to store them as. 2307 # In the case of specifying an alternate file, the array must contain two 2308 # further elements: 2309 # 2310 # [1] is the name of the property that will be generated by this file. 2311 # The class automatically takes the input file and excludes any code 2312 # points in it that were not assigned in the Unicode version being 2313 # compiled. It then uses this result to define the property in the given 2314 # version. Since the property doesn't actually exist in the Unicode 2315 # version being compiled, this should be a name accessible only by core 2316 # perl. If it is the same name as the regular property, the constructor 2317 # will mark the output table as a $PLACEHOLDER so that it doesn't actually 2318 # get output, and so will be unusable by non-core code. Otherwise it gets 2319 # marked as $INTERNAL_ONLY. 2320 # 2321 # [2] is a property value to assign (only when compiling Unicode 1.1.5) to 2322 # the Hangul syllables in that release (which were ripped out in version 2323 # 2) for the given property . (Hence it is ignored except when compiling 2324 # version 1. You only get one value that applies to all of them, which 2325 # may not be the actual reality, but probably nobody cares anyway for 2326 # these obsolete characters.) 2327 # 2328 # [3] if present is the default value for the property to assign for code 2329 # points not given in the input. If not present, the default from the 2330 # normal property is used 2331 # 2332 # [-1] If there is an extra final element that is the string 'ONLY_EARLY'. 2333 # it means to not add the name in [1] as an alias to the property name 2334 # used for these. Normally, when compiling Unicode versions that don't 2335 # invoke the early handling, the name is added as a synonym. 2336 # 2337 # Not all files can be handled in the above way, and so the code ref 2338 # alternative is available. It can do whatever it needs to. The other 2339 # array elements are optional in this case, and the code is free to use or 2340 # ignore them if they are present. 2341 # 2342 # Internally, the constructor unshifts a 0 or 1 onto this array to 2343 # indicate if an early alternative is actually being used or not. This 2344 # makes for easier testing later on. 2345 main::set_access('early', \%early, 'c'); 2346 2347 my %only_early; 2348 main::set_access('only_early', \%only_early, 'c'); 2349 2350 my %required_even_in_debug_skip; 2351 # debug_skip is used to speed up compilation during debugging by skipping 2352 # processing files that are not needed for the task at hand. However, 2353 # some files pretty much can never be skipped, and this is used to specify 2354 # that this is one of them. In order to skip this file, the call to the 2355 # constructor must be edited to comment out this parameter. 2356 main::set_access('required_even_in_debug_skip', 2357 \%required_even_in_debug_skip, 'c'); 2358 2359 my %withdrawn; 2360 # Some files get removed from the Unicode DB. This is a version object 2361 # giving the first release without this file. 2362 main::set_access('withdrawn', \%withdrawn, 'c'); 2363 2364 my %in_this_release; 2365 # Calculated value from %first_released and %withdrawn. Are we compiling 2366 # a Unicode release which includes this file? 2367 main::set_access('in_this_release', \%in_this_release); 2368 2369 sub _next_line; 2370 sub _next_line_with_remapped_range; 2371 2372 sub new { 2373 my $class = shift; 2374 2375 my $self = bless \do{ my $anonymous_scalar }, $class; 2376 my $addr = do { no overloading; pack 'J', $self; }; 2377 2378 # Set defaults 2379 $handler{$addr} = \&main::process_generic_property_file; 2380 $retain_trailing_comments{$addr} = 0; 2381 $non_skip{$addr} = 0; 2382 $skip{$addr} = undef; 2383 $has_missings_defaults{$addr} = $NO_DEFAULTS; 2384 $handle{$addr} = undef; 2385 $added_lines{$addr} = [ ]; 2386 $remapped_lines{$addr} = [ ]; 2387 $each_line_handler{$addr} = [ ]; 2388 $eof_handler{$addr} = [ ]; 2389 $errors{$addr} = { }; 2390 $missings{$addr} = [ ]; 2391 $early{$addr} = [ ]; 2392 $optional{$addr} = [ ]; 2393 2394 # Two positional parameters. 2395 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2; 2396 $file{$addr} = main::internal_file_to_platform(shift); 2397 $first_released{$addr} = shift; 2398 2399 # The rest of the arguments are key => value pairs 2400 # %constructor_fields has been set up earlier to list all possible 2401 # ones. Either set or push, depending on how the default has been set 2402 # up just above. 2403 my %args = @_; 2404 foreach my $key (keys %args) { 2405 my $argument = $args{$key}; 2406 2407 # Note that the fields are the lower case of the constructor keys 2408 my $hash = $constructor_fields{lc $key}; 2409 if (! defined $hash) { 2410 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped"); 2411 next; 2412 } 2413 if (ref $hash->{$addr} eq 'ARRAY') { 2414 if (ref $argument eq 'ARRAY') { 2415 foreach my $argument (@{$argument}) { 2416 next if ! defined $argument; 2417 push @{$hash->{$addr}}, $argument; 2418 } 2419 } 2420 else { 2421 push @{$hash->{$addr}}, $argument if defined $argument; 2422 } 2423 } 2424 else { 2425 $hash->{$addr} = $argument; 2426 } 2427 delete $args{$key}; 2428 }; 2429 2430 $non_skip{$addr} = 1 if $required_even_in_debug_skip{$addr}; 2431 2432 # Convert 0 (meaning don't skip) to undef 2433 undef $skip{$addr} unless $skip{$addr}; 2434 2435 # Handle the case where this file is optional 2436 my $pod_message_for_non_existent_optional = ""; 2437 if ($optional{$addr}->@*) { 2438 2439 # First element is the pod message 2440 $pod_message_for_non_existent_optional 2441 = shift $optional{$addr}->@*; 2442 # Convert a 0 'Optional' argument to an empty list to make later 2443 # code more concise. 2444 if ( $optional{$addr}->@* 2445 && $optional{$addr}->@* == 1 2446 && $optional{$addr}[0] ne "" 2447 && $optional{$addr}[0] !~ /\D/ 2448 && $optional{$addr}[0] == 0) 2449 { 2450 $optional{$addr} = [ ]; 2451 } 2452 else { # But if the only element doesn't evaluate to 0, make sure 2453 # that this file is indeed considered optional below. 2454 unshift $optional{$addr}->@*, 1; 2455 } 2456 } 2457 2458 my $progress; 2459 my $function_instead_of_file = 0; 2460 2461 if ($early{$addr}->@* && $early{$addr}[-1] eq 'ONLY_EARLY') { 2462 $only_early{$addr} = 1; 2463 pop $early{$addr}->@*; 2464 } 2465 2466 # If we are compiling a Unicode release earlier than the file became 2467 # available, the constructor may have supplied a substitute 2468 if ($first_released{$addr} gt $v_version && $early{$addr}->@*) { 2469 2470 # Yes, we have a substitute, that we will use; mark it so 2471 unshift $early{$addr}->@*, 1; 2472 2473 # See the definition of %early for what the array elements mean. 2474 # Note that we have just unshifted onto the array, so the numbers 2475 # below are +1 of those in the %early description. 2476 # If we have a property this defines, create a table and default 2477 # map for it now (at essentially compile time), so that it will be 2478 # available for the whole of run time. (We will want to add this 2479 # name as an alias when we are using the official property name; 2480 # but this must be deferred until run(), because at construction 2481 # time the official names have yet to be defined.) 2482 if ($early{$addr}[2]) { 2483 my $fate = ($property{$addr} 2484 && $property{$addr} eq $early{$addr}[2]) 2485 ? $PLACEHOLDER 2486 : $INTERNAL_ONLY; 2487 my $prop_object = Property->new($early{$addr}[2], 2488 Fate => $fate, 2489 Perl_Extension => 1, 2490 ); 2491 2492 # If not specified by the constructor, use the default mapping 2493 # for the regular property for this substitute one. 2494 if ($early{$addr}[4]) { 2495 $prop_object->set_default_map($early{$addr}[4]); 2496 } 2497 elsif ( defined $property{$addr} 2498 && defined $default_mapping{$property{$addr}}) 2499 { 2500 $prop_object 2501 ->set_default_map($default_mapping{$property{$addr}}); 2502 } 2503 } 2504 2505 if (ref $early{$addr}[1] eq 'CODE') { 2506 $function_instead_of_file = 1; 2507 2508 # If the first element of the array is a code ref, the others 2509 # are optional. 2510 $handler{$addr} = $early{$addr}[1]; 2511 $property{$addr} = $early{$addr}[2] 2512 if defined $early{$addr}[2]; 2513 $progress = "substitute $file{$addr}"; 2514 2515 undef $file{$addr}; 2516 } 2517 else { # Specifying a substitute file 2518 2519 if (! main::file_exists($early{$addr}[1])) { 2520 2521 # If we don't see the substitute file, generate an error 2522 # message giving the needed things, and add it to the list 2523 # of such to output before actual processing happens 2524 # (hence the user finds out all of them in one run). 2525 # Instead of creating a general method for NameAliases, 2526 # hard-code it here, as there is unlikely to ever be a 2527 # second one which needs special handling. 2528 my $string_version = ($file{$addr} eq "NameAliases.txt") 2529 ? 'at least 6.1 (the later, the better)' 2530 : sprintf "%vd", $first_released{$addr}; 2531 push @missing_early_files, <<END; 2532'$file{$addr}' version $string_version should be copied to '$early{$addr}[1]'. 2533END 2534 ; 2535 return; 2536 } 2537 $progress = $early{$addr}[1]; 2538 $progress .= ", substituting for $file{$addr}" if $file{$addr}; 2539 $file{$addr} = $early{$addr}[1]; 2540 $property{$addr} = $early{$addr}[2]; 2541 2542 # Ignore code points not in the version being compiled 2543 push $each_line_handler{$addr}->@*, \&_exclude_unassigned; 2544 2545 if ( $v_version lt v2.0 # Hanguls in this release ... 2546 && defined $early{$addr}[3]) # ... need special treatment 2547 { 2548 push $eof_handler{$addr}->@*, \&_fixup_obsolete_hanguls; 2549 } 2550 } 2551 2552 # And this substitute is valid for all releases. 2553 $first_released{$addr} = v0; 2554 } 2555 else { # Normal behavior 2556 $progress = $file{$addr}; 2557 unshift $early{$addr}->@*, 0; # No substitute 2558 } 2559 2560 my $file = $file{$addr}; 2561 $progress_message{$addr} = "Processing $progress" 2562 unless $progress_message{$addr}; 2563 2564 # A file should be there if it is within the window of versions for 2565 # which Unicode supplies it 2566 if ($withdrawn{$addr} && $withdrawn{$addr} le $v_version) { 2567 $in_this_release{$addr} = 0; 2568 $skip{$addr} = ""; 2569 } 2570 else { 2571 $in_this_release{$addr} = $first_released{$addr} le $v_version; 2572 2573 # Check that the file for this object (possibly using a substitute 2574 # for early releases) exists or we have a function alternative 2575 if ( ! $function_instead_of_file 2576 && ! main::file_exists($file)) 2577 { 2578 # Here there is nothing available for this release. This is 2579 # fine if we aren't expecting anything in this release. 2580 if (! $in_this_release{$addr}) { 2581 $skip{$addr} = ""; # Don't remark since we expected 2582 # nothing and got nothing 2583 } 2584 elsif ($optional{$addr}->@*) { 2585 2586 # Here the file is optional in this release; Use the 2587 # passed in text to document this case in the pod. 2588 $skip{$addr} = $pod_message_for_non_existent_optional; 2589 } 2590 elsif ( $in_this_release{$addr} 2591 && ! defined $skip{$addr} 2592 && defined $file) 2593 { # Doesn't exist but should. 2594 $skip{$addr} = "'$file' not found. Possibly Big problems"; 2595 Carp::my_carp($skip{$addr}); 2596 } 2597 } 2598 elsif ($debug_skip && ! defined $skip{$addr} && ! $non_skip{$addr}) 2599 { 2600 2601 # The file exists; if not skipped for another reason, and we are 2602 # skipping most everything during debugging builds, use that as 2603 # the skip reason. 2604 $skip{$addr} = '$debug_skip is on' 2605 } 2606 } 2607 2608 if ( ! $debug_skip 2609 && $non_skip{$addr} 2610 && ! $required_even_in_debug_skip{$addr} 2611 && $verbosity) 2612 { 2613 print "Warning: " . __PACKAGE__ . " constructor for $file has useless 'non_skip' in it\n"; 2614 } 2615 2616 # Here, we have figured out if we will be skipping this file or not. 2617 # If so, we add any single property it defines to any passed in 2618 # optional property list. These will be dealt with at run time. 2619 if (defined $skip{$addr}) { 2620 if ($property{$addr}) { 2621 push $optional{$addr}->@*, $property{$addr}; 2622 } 2623 } # Otherwise, are going to process the file. 2624 elsif ($property{$addr}) { 2625 2626 # If the file has a property defined in the constructor for it, it 2627 # means that the property is not listed in the file's entries. So 2628 # add a handler (to the list of line handlers) to insert the 2629 # property name into the lines, to provide a uniform interface to 2630 # the final processing subroutine. 2631 push @{$each_line_handler{$addr}}, \&_insert_property_into_line; 2632 } 2633 elsif ($properties{$addr}) { 2634 2635 # Similarly, there may be more than one property represented on 2636 # each line, with no clue but the constructor input what those 2637 # might be. Add a handler for each line in the input so that it 2638 # creates a separate input line for each property in those input 2639 # lines, thus making them suitable to handle generically. 2640 2641 push @{$each_line_handler{$addr}}, 2642 sub { 2643 my $file = shift; 2644 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 2645 2646 my @fields = split /\s*;\s*/, $_, -1; 2647 2648 if (@fields - 1 > @{$properties{$addr}}) { 2649 $file->carp_bad_line('Extra fields'); 2650 $_ = ""; 2651 return; 2652 } 2653 my $range = shift @fields; # 0th element is always the 2654 # range 2655 2656 # The next fields in the input line correspond 2657 # respectively to the stored properties. 2658 for my $i (0 .. @{$properties{$addr}} - 1) { 2659 my $property_name = $properties{$addr}[$i]; 2660 next if $property_name eq '<ignored>'; 2661 $file->insert_adjusted_lines( 2662 "$range; $property_name; $fields[$i]"); 2663 } 2664 $_ = ""; 2665 2666 return; 2667 }; 2668 } 2669 2670 { # On non-ascii platforms, we use a special pre-handler 2671 no strict; 2672 no warnings 'once'; 2673 *next_line = (main::NON_ASCII_PLATFORM) 2674 ? *_next_line_with_remapped_range 2675 : *_next_line; 2676 } 2677 2678 &{$construction_time_handler{$addr}}($self) 2679 if $construction_time_handler{$addr}; 2680 2681 return $self; 2682 } 2683 2684 2685 use overload 2686 fallback => 0, 2687 qw("") => "_operator_stringify", 2688 "." => \&main::_operator_dot, 2689 ".=" => \&main::_operator_dot_equal, 2690 ; 2691 2692 sub _operator_stringify { 2693 my $self = shift; 2694 2695 return __PACKAGE__ . " object for " . $self->file; 2696 } 2697 2698 sub run { 2699 # Process the input object $self. This opens and closes the file and 2700 # calls all the handlers for it. Currently, this can only be called 2701 # once per file, as it destroy's the EOF handlers 2702 2703 # flag to make sure extracted files are processed early 2704 state $seen_non_extracted = 0; 2705 2706 my $self = shift; 2707 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 2708 2709 my $addr = do { no overloading; pack 'J', $self; }; 2710 2711 my $file = $file{$addr}; 2712 2713 if (! $file) { 2714 $handle{$addr} = 'pretend_is_open'; 2715 } 2716 else { 2717 if ($seen_non_extracted) { 2718 if ($file =~ /$EXTRACTED/i) # Some platforms may change the 2719 # case of the file's name 2720 { 2721 Carp::my_carp_bug(main::join_lines(<<END 2722$file should be processed just after the 'Prop...Alias' files, and before 2723anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may 2724have subtle problems 2725END 2726 )); 2727 } 2728 } 2729 elsif ($EXTRACTED_DIR 2730 2731 # We only do this check for generic property files 2732 && $handler{$addr} == \&main::process_generic_property_file 2733 2734 && $file !~ /$EXTRACTED/i) 2735 { 2736 # We don't set this (by the 'if' above) if we have no 2737 # extracted directory, so if running on an early version, 2738 # this test won't work. Not worth worrying about. 2739 $seen_non_extracted = 1; 2740 } 2741 2742 # Mark the file as having being processed, and warn if it 2743 # isn't a file we are expecting. As we process the files, 2744 # they are deleted from the hash, so any that remain at the 2745 # end of the program are files that we didn't process. 2746 my $fkey = File::Spec->rel2abs($file); 2747 my $exists = delete $potential_files{lc($fkey)}; 2748 2749 Carp::my_carp("Was not expecting '$file'.") 2750 if $exists && ! $in_this_release{$addr}; 2751 2752 # If there is special handling for compiling Unicode releases 2753 # earlier than the first one in which Unicode defines this 2754 # property ... 2755 if ($early{$addr}->@* > 1) { 2756 2757 # Mark as processed any substitute file that would be used in 2758 # such a release 2759 $fkey = File::Spec->rel2abs($early{$addr}[1]); 2760 delete $potential_files{lc($fkey)}; 2761 2762 # As commented in the constructor code, when using the 2763 # official property, we still have to allow the publicly 2764 # inaccessible early name so that the core code which uses it 2765 # will work regardless. 2766 if ( ! $only_early{$addr} 2767 && ! $early{$addr}[0] 2768 && $early{$addr}->@* > 2) 2769 { 2770 my $early_property_name = $early{$addr}[2]; 2771 if ($property{$addr} ne $early_property_name) { 2772 main::property_ref($property{$addr}) 2773 ->add_alias($early_property_name); 2774 } 2775 } 2776 } 2777 2778 # We may be skipping this file ... 2779 if (defined $skip{$addr}) { 2780 2781 # If the file isn't supposed to be in this release, there is 2782 # nothing to do 2783 if ($in_this_release{$addr}) { 2784 2785 # But otherwise, we may print a message 2786 if ($debug_skip) { 2787 print STDERR "Skipping input file '$file'", 2788 " because '$skip{$addr}'\n"; 2789 } 2790 2791 # And add it to the list of skipped files, which is later 2792 # used to make the pod 2793 $skipped_files{$file} = $skip{$addr}; 2794 2795 # The 'optional' list contains properties that are also to 2796 # be skipped along with the file. (There may also be 2797 # digits which are just placeholders to make sure it isn't 2798 # an empty list 2799 foreach my $property ($optional{$addr}->@*) { 2800 next unless $property =~ /\D/; 2801 my $prop_object = main::property_ref($property); 2802 next unless defined $prop_object; 2803 $prop_object->set_fate($SUPPRESSED, $skip{$addr}); 2804 } 2805 } 2806 2807 return; 2808 } 2809 2810 # Here, we are going to process the file. Open it, converting the 2811 # slashes used in this program into the proper form for the OS 2812 my $file_handle; 2813 if (not open $file_handle, "<", $file) { 2814 Carp::my_carp("Can't open $file. Skipping: $!"); 2815 return; 2816 } 2817 $handle{$addr} = $file_handle; # Cache the open file handle 2818 2819 # If possible, make sure that the file is the correct version. 2820 # (This data isn't available on early Unicode releases or in 2821 # UnicodeData.txt.) We don't do this check if we are using a 2822 # substitute file instead of the official one (though the code 2823 # could be extended to do so). 2824 if ($in_this_release{$addr} 2825 && ! $early{$addr}[0] 2826 && lc($file) ne 'unicodedata.txt') 2827 { 2828 if ($file !~ /^Unihan/i) { 2829 2830 # The non-Unihan files started getting version numbers in 2831 # 3.2, but some files in 4.0 are unchanged from 3.2, and 2832 # marked as 3.2. 4.0.1 is the first version where there 2833 # are no files marked as being from less than 4.0, though 2834 # some are marked as 4.0. In versions after that, the 2835 # numbers are correct. 2836 if ($v_version ge v4.0.1) { 2837 $_ = <$file_handle>; # The version number is in the 2838 # very first line 2839 if ($_ !~ / - $string_version \. /x) { 2840 chomp; 2841 $_ =~ s/^#\s*//; 2842 2843 # 4.0.1 had some valid files that weren't updated. 2844 if (! ($v_version eq v4.0.1 && $_ =~ /4\.0\.0/)) { 2845 die Carp::my_carp("File '$file' is version " 2846 . "'$_'. It should be " 2847 . "version $string_version"); 2848 } 2849 } 2850 } 2851 } 2852 elsif ($v_version ge v6.0.0) { # Unihan 2853 2854 # Unihan files didn't get accurate version numbers until 2855 # 6.0. The version is somewhere in the first comment 2856 # block 2857 while (<$file_handle>) { 2858 if ($_ !~ /^#/) { 2859 Carp::my_carp_bug("Could not find the expected " 2860 . "version info in file '$file'"); 2861 last; 2862 } 2863 chomp; 2864 $_ =~ s/^#\s*//; 2865 next if $_ !~ / version: /x; 2866 last if $_ =~ /$string_version/; 2867 die Carp::my_carp("File '$file' is version " 2868 . "'$_'. It should be " 2869 . "version $string_version"); 2870 } 2871 } 2872 } 2873 } 2874 2875 print "$progress_message{$addr}\n" if $verbosity >= $PROGRESS; 2876 2877 # Call any special handler for before the file. 2878 &{$pre_handler{$addr}}($self) if $pre_handler{$addr}; 2879 2880 # Then the main handler 2881 &{$handler{$addr}}($self); 2882 2883 # Then any special post-file handler. 2884 &{$post_handler{$addr}}($self) if $post_handler{$addr}; 2885 2886 # If any errors have been accumulated, output the counts (as the first 2887 # error message in each class was output when it was encountered). 2888 if ($errors{$addr}) { 2889 my $total = 0; 2890 my $types = 0; 2891 foreach my $error (keys %{$errors{$addr}}) { 2892 $total += $errors{$addr}->{$error}; 2893 delete $errors{$addr}->{$error}; 2894 $types++; 2895 } 2896 if ($total > 1) { 2897 my $message 2898 = "A total of $total lines had errors in $file. "; 2899 2900 $message .= ($types == 1) 2901 ? '(Only the first one was displayed.)' 2902 : '(Only the first of each type was displayed.)'; 2903 Carp::my_carp($message); 2904 } 2905 } 2906 2907 if (@{$missings{$addr}}) { 2908 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong"); 2909 } 2910 2911 # If a real file handle, close it. 2912 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if 2913 ref $handle{$addr}; 2914 $handle{$addr} = ""; # Uses empty to indicate that has already seen 2915 # the file, as opposed to undef 2916 return; 2917 } 2918 2919 sub _next_line { 2920 # Sets $_ to be the next logical input line, if any. Returns non-zero 2921 # if such a line exists. 'logical' means that any lines that have 2922 # been added via insert_lines() will be returned in $_ before the file 2923 # is read again. 2924 2925 my $self = shift; 2926 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 2927 2928 my $addr = do { no overloading; pack 'J', $self; }; 2929 2930 # Here the file is open (or if the handle is not a ref, is an open 2931 # 'virtual' file). Get the next line; any inserted lines get priority 2932 # over the file itself. 2933 my $adjusted; 2934 2935 LINE: 2936 while (1) { # Loop until find non-comment, non-empty line 2937 #local $to_trace = 1 if main::DEBUG; 2938 my $inserted_ref = shift @{$added_lines{$addr}}; 2939 if (defined $inserted_ref) { 2940 ($adjusted, $_) = @{$inserted_ref}; 2941 trace $adjusted, $_ if main::DEBUG && $to_trace; 2942 return 1 if $adjusted; 2943 } 2944 else { 2945 last if ! ref $handle{$addr}; # Don't read unless is real file 2946 last if ! defined ($_ = readline $handle{$addr}); 2947 } 2948 chomp; 2949 trace $_ if main::DEBUG && $to_trace; 2950 2951 # See if this line is the comment line that defines what property 2952 # value that code points that are not listed in the file should 2953 # have. The format or existence of these lines is not guaranteed 2954 # by Unicode since they are comments, but the documentation says 2955 # that this was added for machine-readability, so probably won't 2956 # change. This works starting in Unicode Version 5.0. They look 2957 # like: 2958 # 2959 # @missing: 0000..10FFFF; Not_Reordered 2960 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point> 2961 # @missing: 0000..10FFFF; ; NaN 2962 # 2963 # Save the line for a later get_missings() call. 2964 if (/$missing_defaults_prefix/) { 2965 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) { 2966 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries"); 2967 } 2968 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) { 2969 my @defaults = split /\s* ; \s*/x, $_; 2970 2971 # The first field is the @missing, which ends in a 2972 # semi-colon, so can safely shift. 2973 shift @defaults; 2974 2975 # Some of these lines may have empty field placeholders 2976 # which get in the way. An example is: 2977 # @missing: 0000..10FFFF; ; NaN 2978 # Remove them. Process starting from the top so the 2979 # splice doesn't affect things still to be looked at. 2980 for (my $i = @defaults - 1; $i >= 0; $i--) { 2981 next if $defaults[$i] ne ""; 2982 splice @defaults, $i, 1; 2983 } 2984 2985 # What's left should be just the property (maybe) and the 2986 # default. Having only one element means it doesn't have 2987 # the property. 2988 my $default; 2989 my $property; 2990 if (@defaults >= 1) { 2991 if (@defaults == 1) { 2992 $default = $defaults[0]; 2993 } 2994 else { 2995 $property = $defaults[0]; 2996 $default = $defaults[1]; 2997 } 2998 } 2999 3000 if (@defaults < 1 3001 || @defaults > 2 3002 || ($default =~ /^</ 3003 && $default !~ /^<code *point>$/i 3004 && $default !~ /^<none>$/i 3005 && $default !~ /^<script>$/i)) 3006 { 3007 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries"); 3008 } 3009 else { 3010 3011 # If the property is missing from the line, it should 3012 # be the one for the whole file 3013 $property = $property{$addr} if ! defined $property; 3014 3015 # Change <none> to the null string, which is what it 3016 # really means. If the default is the code point 3017 # itself, set it to <code point>, which is what 3018 # Unicode uses (but sometimes they've forgotten the 3019 # space) 3020 if ($default =~ /^<none>$/i) { 3021 $default = ""; 3022 } 3023 elsif ($default =~ /^<code *point>$/i) { 3024 $default = $CODE_POINT; 3025 } 3026 elsif ($default =~ /^<script>$/i) { 3027 3028 # Special case this one. Currently is from 3029 # ScriptExtensions.txt, and means for all unlisted 3030 # code points, use their Script property values. 3031 # For the code points not listed in that file, the 3032 # default value is 'Unknown'. 3033 $default = "Unknown"; 3034 } 3035 3036 # Store them as a sub-arrays with both components. 3037 push @{$missings{$addr}}, [ $default, $property ]; 3038 } 3039 } 3040 3041 # There is nothing for the caller to process on this comment 3042 # line. 3043 next; 3044 } 3045 3046 # Unless to keep, remove comments. If to keep, ignore 3047 # comment-only lines 3048 if ($retain_trailing_comments{$addr}) { 3049 next if / ^ \s* \# /x; 3050 3051 # But escape any single quotes (done in both the comment and 3052 # non-comment portion; this could be a bug someday, but not 3053 # likely) 3054 s/'/\\'/g; 3055 } 3056 else { 3057 s/#.*//; 3058 } 3059 3060 # Remove trailing space, and skip this line if the result is empty 3061 s/\s+$//; 3062 next if /^$/; 3063 3064 # Call any handlers for this line, and skip further processing of 3065 # the line if the handler sets the line to null. 3066 foreach my $sub_ref (@{$each_line_handler{$addr}}) { 3067 &{$sub_ref}($self); 3068 next LINE if /^$/; 3069 } 3070 3071 # Here the line is ok. return success. 3072 return 1; 3073 } # End of looping through lines. 3074 3075 # If there are EOF handlers, call each (only once) and if it generates 3076 # more lines to process go back in the loop to handle them. 3077 while ($eof_handler{$addr}->@*) { 3078 &{$eof_handler{$addr}[0]}($self); 3079 shift $eof_handler{$addr}->@*; # Currently only get one shot at it. 3080 goto LINE if $added_lines{$addr}; 3081 } 3082 3083 # Return failure -- no more lines. 3084 return 0; 3085 3086 } 3087 3088 sub _next_line_with_remapped_range { 3089 my $self = shift; 3090 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3091 3092 # like _next_line(), but for use on non-ASCII platforms. It sets $_ 3093 # to be the next logical input line, if any. Returns non-zero if such 3094 # a line exists. 'logical' means that any lines that have been added 3095 # via insert_lines() will be returned in $_ before the file is read 3096 # again. 3097 # 3098 # The difference from _next_line() is that this remaps the Unicode 3099 # code points in the input to those of the native platform. Each 3100 # input line contains a single code point, or a single contiguous 3101 # range of them This routine splits each range into its individual 3102 # code points and caches them. It returns the cached values, 3103 # translated into their native equivalents, one at a time, for each 3104 # call, before reading the next line. Since native values can only be 3105 # a single byte wide, no translation is needed for code points above 3106 # 0xFF, and ranges that are entirely above that number are not split. 3107 # If an input line contains the range 254-1000, it would be split into 3108 # three elements: 254, 255, and 256-1000. (The downstream table 3109 # insertion code will sort and coalesce the individual code points 3110 # into appropriate ranges.) 3111 3112 my $addr = do { no overloading; pack 'J', $self; }; 3113 3114 while (1) { 3115 3116 # Look in cache before reading the next line. Return any cached 3117 # value, translated 3118 my $inserted = shift @{$remapped_lines{$addr}}; 3119 if (defined $inserted) { 3120 trace $inserted if main::DEBUG && $to_trace; 3121 $_ = $inserted =~ s/^ ( \d+ ) /sprintf("%04X", utf8::unicode_to_native($1))/xer; 3122 trace $_ if main::DEBUG && $to_trace; 3123 return 1; 3124 } 3125 3126 # Get the next line. 3127 return 0 unless _next_line($self); 3128 3129 # If there is a special handler for it, return the line, 3130 # untranslated. This should happen only for files that are 3131 # special, not being code-point related, such as property names. 3132 return 1 if $handler{$addr} 3133 != \&main::process_generic_property_file; 3134 3135 my ($range, $property_name, $map, @remainder) 3136 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 3137 3138 if (@remainder 3139 || ! defined $property_name 3140 || $range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x) 3141 { 3142 Carp::my_carp_bug("Unrecognized input line '$_'. Ignored"); 3143 } 3144 3145 my $low = hex $1; 3146 my $high = (defined $2) ? hex $2 : $low; 3147 3148 # If the input maps the range to another code point, remap the 3149 # target if it is between 0 and 255. 3150 my $tail; 3151 if (defined $map) { 3152 $map =~ s/\b 00 ( [0-9A-F]{2} ) \b/sprintf("%04X", utf8::unicode_to_native(hex $1))/gxe; 3153 $tail = "$property_name; $map"; 3154 $_ = "$range; $tail"; 3155 } 3156 else { 3157 $tail = $property_name; 3158 } 3159 3160 # If entire range is above 255, just return it, unchanged (except 3161 # any mapped-to code point, already changed above) 3162 return 1 if $low > 255; 3163 3164 # Cache an entry for every code point < 255. For those in the 3165 # range above 255, return a dummy entry for just that portion of 3166 # the range. Note that this will be out-of-order, but that is not 3167 # a problem. 3168 foreach my $code_point ($low .. $high) { 3169 if ($code_point > 255) { 3170 $_ = sprintf "%04X..%04X; $tail", $code_point, $high; 3171 return 1; 3172 } 3173 push @{$remapped_lines{$addr}}, "$code_point; $tail"; 3174 } 3175 } # End of looping through lines. 3176 3177 # NOTREACHED 3178 } 3179 3180# Not currently used, not fully tested. 3181# sub peek { 3182# # Non-destructive lookahead one non-adjusted, non-comment, non-blank 3183# # record. Not callable from an each_line_handler(), nor does it call 3184# # an each_line_handler() on the line. 3185# 3186# my $self = shift; 3187# my $addr = do { no overloading; pack 'J', $self; }; 3188# 3189# foreach my $inserted_ref (@{$added_lines{$addr}}) { 3190# my ($adjusted, $line) = @{$inserted_ref}; 3191# next if $adjusted; 3192# 3193# # Remove comments and trailing space, and return a non-empty 3194# # resulting line 3195# $line =~ s/#.*//; 3196# $line =~ s/\s+$//; 3197# return $line if $line ne ""; 3198# } 3199# 3200# return if ! ref $handle{$addr}; # Don't read unless is real file 3201# while (1) { # Loop until find non-comment, non-empty line 3202# local $to_trace = 1 if main::DEBUG; 3203# trace $_ if main::DEBUG && $to_trace; 3204# return if ! defined (my $line = readline $handle{$addr}); 3205# chomp $line; 3206# push @{$added_lines{$addr}}, [ 0, $line ]; 3207# 3208# $line =~ s/#.*//; 3209# $line =~ s/\s+$//; 3210# return $line if $line ne ""; 3211# } 3212# 3213# return; 3214# } 3215 3216 3217 sub insert_lines { 3218 # Lines can be inserted so that it looks like they were in the input 3219 # file at the place it was when this routine is called. See also 3220 # insert_adjusted_lines(). Lines inserted via this routine go through 3221 # any each_line_handler() 3222 3223 my $self = shift; 3224 3225 # Each inserted line is an array, with the first element being 0 to 3226 # indicate that this line hasn't been adjusted, and needs to be 3227 # processed. 3228 no overloading; 3229 push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_; 3230 return; 3231 } 3232 3233 sub insert_adjusted_lines { 3234 # Lines can be inserted so that it looks like they were in the input 3235 # file at the place it was when this routine is called. See also 3236 # insert_lines(). Lines inserted via this routine are already fully 3237 # adjusted, ready to be processed; each_line_handler()s handlers will 3238 # not be called. This means this is not a completely general 3239 # facility, as only the last each_line_handler on the stack should 3240 # call this. It could be made more general, by passing to each of the 3241 # line_handlers their position on the stack, which they would pass on 3242 # to this routine, and that would replace the boolean first element in 3243 # the anonymous array pushed here, so that the next_line routine could 3244 # use that to call only those handlers whose index is after it on the 3245 # stack. But this is overkill for what is needed now. 3246 3247 my $self = shift; 3248 trace $_[0] if main::DEBUG && $to_trace; 3249 3250 # Each inserted line is an array, with the first element being 1 to 3251 # indicate that this line has been adjusted 3252 no overloading; 3253 push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_; 3254 return; 3255 } 3256 3257 sub get_missings { 3258 # Returns the stored up @missings lines' values, and clears the list. 3259 # The values are in an array, consisting of the default in the first 3260 # element, and the property in the 2nd. However, since these lines 3261 # can be stacked up, the return is an array of all these arrays. 3262 3263 my $self = shift; 3264 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3265 3266 my $addr = do { no overloading; pack 'J', $self; }; 3267 3268 # If not accepting a list return, just return the first one. 3269 return shift @{$missings{$addr}} unless wantarray; 3270 3271 my @return = @{$missings{$addr}}; 3272 undef @{$missings{$addr}}; 3273 return @return; 3274 } 3275 3276 sub _exclude_unassigned { 3277 3278 # Takes the range in $_ and excludes code points that aren't assigned 3279 # in this release 3280 3281 state $skip_inserted_count = 0; 3282 3283 # Ignore recursive calls. 3284 if ($skip_inserted_count) { 3285 $skip_inserted_count--; 3286 return; 3287 } 3288 3289 # Find what code points are assigned in this release 3290 main::calculate_Assigned() if ! defined $Assigned; 3291 3292 my $self = shift; 3293 my $addr = do { no overloading; pack 'J', $self; }; 3294 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3295 3296 my ($range, @remainder) 3297 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 3298 3299 # Examine the range. 3300 if ($range =~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x) 3301 { 3302 my $low = hex $1; 3303 my $high = (defined $2) ? hex $2 : $low; 3304 3305 # Split the range into subranges of just those code points in it 3306 # that are assigned. 3307 my @ranges = (Range_List->new(Initialize 3308 => Range->new($low, $high)) & $Assigned)->ranges; 3309 3310 # Do nothing if nothing in the original range is assigned in this 3311 # release; handle normally if everything is in this release. 3312 if (! @ranges) { 3313 $_ = ""; 3314 } 3315 elsif (@ranges != 1) { 3316 3317 # Here, some code points in the original range aren't in this 3318 # release; @ranges gives the ones that are. Create fake input 3319 # lines for each of the ranges, and set things up so that when 3320 # this routine is called on that fake input, it will do 3321 # nothing. 3322 $skip_inserted_count = @ranges; 3323 my $remainder = join ";", @remainder; 3324 for my $range (@ranges) { 3325 $self->insert_lines(sprintf("%04X..%04X;%s", 3326 $range->start, $range->end, $remainder)); 3327 } 3328 $_ = ""; # The original range is now defunct. 3329 } 3330 } 3331 3332 return; 3333 } 3334 3335 sub _fixup_obsolete_hanguls { 3336 3337 # This is called only when compiling Unicode version 1. All Unicode 3338 # data for subsequent releases assumes that the code points that were 3339 # Hangul syllables in this release only are something else, so if 3340 # using such data, we have to override it 3341 3342 my $self = shift; 3343 my $addr = do { no overloading; pack 'J', $self; }; 3344 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3345 3346 my $object = main::property_ref($property{$addr}); 3347 $object->add_map($FIRST_REMOVED_HANGUL_SYLLABLE, 3348 $FINAL_REMOVED_HANGUL_SYLLABLE, 3349 $early{$addr}[3], # Passed-in value for these 3350 Replace => $UNCONDITIONALLY); 3351 } 3352 3353 sub _insert_property_into_line { 3354 # Add a property field to $_, if this file requires it. 3355 3356 my $self = shift; 3357 my $addr = do { no overloading; pack 'J', $self; }; 3358 my $property = $property{$addr}; 3359 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3360 3361 $_ =~ s/(;|$)/; $property$1/; 3362 return; 3363 } 3364 3365 sub carp_bad_line { 3366 # Output consistent error messages, using either a generic one, or the 3367 # one given by the optional parameter. To avoid gazillions of the 3368 # same message in case the syntax of a file is way off, this routine 3369 # only outputs the first instance of each message, incrementing a 3370 # count so the totals can be output at the end of the file. 3371 3372 my $self = shift; 3373 my $message = shift; 3374 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3375 3376 my $addr = do { no overloading; pack 'J', $self; }; 3377 3378 $message = 'Unexpected line' unless $message; 3379 3380 # No trailing punctuation so as to fit with our addenda. 3381 $message =~ s/[.:;,]$//; 3382 3383 # If haven't seen this exact message before, output it now. Otherwise 3384 # increment the count of how many times it has occurred 3385 unless ($errors{$addr}->{$message}) { 3386 Carp::my_carp("$message in '$_' in " 3387 . $file{$addr} 3388 . " at line $.. Skipping this line;"); 3389 $errors{$addr}->{$message} = 1; 3390 } 3391 else { 3392 $errors{$addr}->{$message}++; 3393 } 3394 3395 # Clear the line to prevent any further (meaningful) processing of it. 3396 $_ = ""; 3397 3398 return; 3399 } 3400} # End closure 3401 3402package Multi_Default; 3403 3404# Certain properties in early versions of Unicode had more than one possible 3405# default for code points missing from the files. In these cases, one 3406# default applies to everything left over after all the others are applied, 3407# and for each of the others, there is a description of which class of code 3408# points applies to it. This object helps implement this by storing the 3409# defaults, and for all but that final default, an eval string that generates 3410# the class that it applies to. 3411 3412 3413{ # Closure 3414 3415 main::setup_package(); 3416 3417 my %class_defaults; 3418 # The defaults structure for the classes 3419 main::set_access('class_defaults', \%class_defaults); 3420 3421 my %other_default; 3422 # The default that applies to everything left over. 3423 main::set_access('other_default', \%other_default, 'r'); 3424 3425 3426 sub new { 3427 # The constructor is called with default => eval pairs, terminated by 3428 # the left-over default. e.g. 3429 # Multi_Default->new( 3430 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C 3431 # - 0x200D', 3432 # 'R' => 'some other expression that evaluates to code points', 3433 # . 3434 # . 3435 # . 3436 # 'U')); 3437 # It is best to leave the final value be the one that matches the 3438 # above-Unicode code points. 3439 3440 my $class = shift; 3441 3442 my $self = bless \do{my $anonymous_scalar}, $class; 3443 my $addr = do { no overloading; pack 'J', $self; }; 3444 3445 while (@_ > 1) { 3446 my $default = shift; 3447 my $eval = shift; 3448 $class_defaults{$addr}->{$default} = $eval; 3449 } 3450 3451 $other_default{$addr} = shift; 3452 3453 return $self; 3454 } 3455 3456 sub get_next_defaults { 3457 # Iterates and returns the next class of defaults. 3458 my $self = shift; 3459 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3460 3461 my $addr = do { no overloading; pack 'J', $self; }; 3462 3463 return each %{$class_defaults{$addr}}; 3464 } 3465} 3466 3467package Alias; 3468 3469# An alias is one of the names that a table goes by. This class defines them 3470# including some attributes. Everything is currently setup in the 3471# constructor. 3472 3473 3474{ # Closure 3475 3476 main::setup_package(); 3477 3478 my %name; 3479 main::set_access('name', \%name, 'r'); 3480 3481 my %loose_match; 3482 # Should this name match loosely or not. 3483 main::set_access('loose_match', \%loose_match, 'r'); 3484 3485 my %make_re_pod_entry; 3486 # Some aliases should not get their own entries in the re section of the 3487 # pod, because they are covered by a wild-card, and some we want to 3488 # discourage use of. Binary 3489 main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's'); 3490 3491 my %ucd; 3492 # Is this documented to be accessible via Unicode::UCD 3493 main::set_access('ucd', \%ucd, 'r', 's'); 3494 3495 my %status; 3496 # Aliases have a status, like deprecated, or even suppressed (which means 3497 # they don't appear in documentation). Enum 3498 main::set_access('status', \%status, 'r'); 3499 3500 my %ok_as_filename; 3501 # Similarly, some aliases should not be considered as usable ones for 3502 # external use, such as file names, or we don't want documentation to 3503 # recommend them. Boolean 3504 main::set_access('ok_as_filename', \%ok_as_filename, 'r'); 3505 3506 sub new { 3507 my $class = shift; 3508 3509 my $self = bless \do { my $anonymous_scalar }, $class; 3510 my $addr = do { no overloading; pack 'J', $self; }; 3511 3512 $name{$addr} = shift; 3513 $loose_match{$addr} = shift; 3514 $make_re_pod_entry{$addr} = shift; 3515 $ok_as_filename{$addr} = shift; 3516 $status{$addr} = shift; 3517 $ucd{$addr} = shift; 3518 3519 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3520 3521 # Null names are never ok externally 3522 $ok_as_filename{$addr} = 0 if $name{$addr} eq ""; 3523 3524 return $self; 3525 } 3526} 3527 3528package Range; 3529 3530# A range is the basic unit for storing code points, and is described in the 3531# comments at the beginning of the program. Each range has a starting code 3532# point; an ending code point (not less than the starting one); a value 3533# that applies to every code point in between the two end-points, inclusive; 3534# and an enum type that applies to the value. The type is for the user's 3535# convenience, and has no meaning here, except that a non-zero type is 3536# considered to not obey the normal Unicode rules for having standard forms. 3537# 3538# The same structure is used for both map and match tables, even though in the 3539# latter, the value (and hence type) is irrelevant and could be used as a 3540# comment. In map tables, the value is what all the code points in the range 3541# map to. Type 0 values have the standardized version of the value stored as 3542# well, so as to not have to recalculate it a lot. 3543 3544sub trace { return main::trace(@_); } 3545 3546{ # Closure 3547 3548 main::setup_package(); 3549 3550 my %start; 3551 main::set_access('start', \%start, 'r', 's'); 3552 3553 my %end; 3554 main::set_access('end', \%end, 'r', 's'); 3555 3556 my %value; 3557 main::set_access('value', \%value, 'r', 's'); 3558 3559 my %type; 3560 main::set_access('type', \%type, 'r'); 3561 3562 my %standard_form; 3563 # The value in internal standard form. Defined only if the type is 0. 3564 main::set_access('standard_form', \%standard_form); 3565 3566 # Note that if these fields change, the dump() method should as well 3567 3568 sub new { 3569 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3; 3570 my $class = shift; 3571 3572 my $self = bless \do { my $anonymous_scalar }, $class; 3573 my $addr = do { no overloading; pack 'J', $self; }; 3574 3575 $start{$addr} = shift; 3576 $end{$addr} = shift; 3577 3578 my %args = @_; 3579 3580 my $value = delete $args{'Value'}; # Can be 0 3581 $value = "" unless defined $value; 3582 $value{$addr} = $value; 3583 3584 $type{$addr} = delete $args{'Type'} || 0; 3585 3586 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 3587 3588 return $self; 3589 } 3590 3591 use overload 3592 fallback => 0, 3593 qw("") => "_operator_stringify", 3594 "." => \&main::_operator_dot, 3595 ".=" => \&main::_operator_dot_equal, 3596 ; 3597 3598 sub _operator_stringify { 3599 my $self = shift; 3600 my $addr = do { no overloading; pack 'J', $self; }; 3601 3602 # Output it like '0041..0065 (value)' 3603 my $return = sprintf("%04X", $start{$addr}) 3604 . '..' 3605 . sprintf("%04X", $end{$addr}); 3606 my $value = $value{$addr}; 3607 my $type = $type{$addr}; 3608 $return .= ' ('; 3609 $return .= "$value"; 3610 $return .= ", Type=$type" if $type != 0; 3611 $return .= ')'; 3612 3613 return $return; 3614 } 3615 3616 sub standard_form { 3617 # Calculate the standard form only if needed, and cache the result. 3618 # The standard form is the value itself if the type is special. 3619 # This represents a considerable CPU and memory saving - at the time 3620 # of writing there are 368676 non-special objects, but the standard 3621 # form is only requested for 22047 of them - ie about 6%. 3622 3623 my $self = shift; 3624 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3625 3626 my $addr = do { no overloading; pack 'J', $self; }; 3627 3628 return $standard_form{$addr} if defined $standard_form{$addr}; 3629 3630 my $value = $value{$addr}; 3631 return $value if $type{$addr}; 3632 return $standard_form{$addr} = main::standardize($value); 3633 } 3634 3635 sub dump { 3636 # Human, not machine readable. For machine readable, comment out this 3637 # entire routine and let the standard one take effect. 3638 my $self = shift; 3639 my $indent = shift; 3640 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3641 3642 my $addr = do { no overloading; pack 'J', $self; }; 3643 3644 my $return = $indent 3645 . sprintf("%04X", $start{$addr}) 3646 . '..' 3647 . sprintf("%04X", $end{$addr}) 3648 . " '$value{$addr}';"; 3649 if (! defined $standard_form{$addr}) { 3650 $return .= "(type=$type{$addr})"; 3651 } 3652 elsif ($standard_form{$addr} ne $value{$addr}) { 3653 $return .= "(standard '$standard_form{$addr}')"; 3654 } 3655 return $return; 3656 } 3657} # End closure 3658 3659package _Range_List_Base; 3660 3661# Base class for range lists. A range list is simply an ordered list of 3662# ranges, so that the ranges with the lowest starting numbers are first in it. 3663# 3664# When a new range is added that is adjacent to an existing range that has the 3665# same value and type, it merges with it to form a larger range. 3666# 3667# Ranges generally do not overlap, except that there can be multiple entries 3668# of single code point ranges. This is because of NameAliases.txt. 3669# 3670# In this program, there is a standard value such that if two different 3671# values, have the same standard value, they are considered equivalent. This 3672# value was chosen so that it gives correct results on Unicode data 3673 3674# There are a number of methods to manipulate range lists, and some operators 3675# are overloaded to handle them. 3676 3677sub trace { return main::trace(@_); } 3678 3679{ # Closure 3680 3681 our $addr; 3682 3683 # Max is initialized to a negative value that isn't adjacent to 0, for 3684 # simpler tests 3685 my $max_init = -2; 3686 3687 main::setup_package(); 3688 3689 my %ranges; 3690 # The list of ranges 3691 main::set_access('ranges', \%ranges, 'readable_array'); 3692 3693 my %max; 3694 # The highest code point in the list. This was originally a method, but 3695 # actual measurements said it was used a lot. 3696 main::set_access('max', \%max, 'r'); 3697 3698 my %each_range_iterator; 3699 # Iterator position for each_range() 3700 main::set_access('each_range_iterator', \%each_range_iterator); 3701 3702 my %owner_name_of; 3703 # Name of parent this is attached to, if any. Solely for better error 3704 # messages. 3705 main::set_access('owner_name_of', \%owner_name_of, 'p_r'); 3706 3707 my %_search_ranges_cache; 3708 # A cache of the previous result from _search_ranges(), for better 3709 # performance 3710 main::set_access('_search_ranges_cache', \%_search_ranges_cache); 3711 3712 sub new { 3713 my $class = shift; 3714 my %args = @_; 3715 3716 # Optional initialization data for the range list. 3717 my $initialize = delete $args{'Initialize'}; 3718 3719 my $self; 3720 3721 # Use _union() to initialize. _union() returns an object of this 3722 # class, which means that it will call this constructor recursively. 3723 # But it won't have this $initialize parameter so that it won't 3724 # infinitely loop on this. 3725 return _union($class, $initialize, %args) if defined $initialize; 3726 3727 $self = bless \do { my $anonymous_scalar }, $class; 3728 my $addr = do { no overloading; pack 'J', $self; }; 3729 3730 # Optional parent object, only for debug info. 3731 $owner_name_of{$addr} = delete $args{'Owner'}; 3732 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr}; 3733 3734 # Stringify, in case it is an object. 3735 $owner_name_of{$addr} = "$owner_name_of{$addr}"; 3736 3737 # This is used only for error messages, and so a colon is added 3738 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne ""; 3739 3740 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 3741 3742 $max{$addr} = $max_init; 3743 3744 $_search_ranges_cache{$addr} = 0; 3745 $ranges{$addr} = []; 3746 3747 return $self; 3748 } 3749 3750 use overload 3751 fallback => 0, 3752 qw("") => "_operator_stringify", 3753 "." => \&main::_operator_dot, 3754 ".=" => \&main::_operator_dot_equal, 3755 ; 3756 3757 sub _operator_stringify { 3758 my $self = shift; 3759 my $addr = do { no overloading; pack 'J', $self; }; 3760 3761 return "Range_List attached to '$owner_name_of{$addr}'" 3762 if $owner_name_of{$addr}; 3763 return "anonymous Range_List " . \$self; 3764 } 3765 3766 sub _union { 3767 # Returns the union of the input code points. It can be called as 3768 # either a constructor or a method. If called as a method, the result 3769 # will be a new() instance of the calling object, containing the union 3770 # of that object with the other parameter's code points; if called as 3771 # a constructor, the first parameter gives the class that the new object 3772 # should be, and the second parameter gives the code points to go into 3773 # it. 3774 # In either case, there are two parameters looked at by this routine; 3775 # any additional parameters are passed to the new() constructor. 3776 # 3777 # The code points can come in the form of some object that contains 3778 # ranges, and has a conventionally named method to access them; or 3779 # they can be an array of individual code points (as integers); or 3780 # just a single code point. 3781 # 3782 # If they are ranges, this routine doesn't make any effort to preserve 3783 # the range values and types of one input over the other. Therefore 3784 # this base class should not allow _union to be called from other than 3785 # initialization code, so as to prevent two tables from being added 3786 # together where the range values matter. The general form of this 3787 # routine therefore belongs in a derived class, but it was moved here 3788 # to avoid duplication of code. The failure to overload this in this 3789 # class keeps it safe. 3790 # 3791 # It does make the effort during initialization to accept tables with 3792 # multiple values for the same code point, and to preserve the order 3793 # of these. If there is only one input range or range set, it doesn't 3794 # sort (as it should already be sorted to the desired order), and will 3795 # accept multiple values per code point. Otherwise it will merge 3796 # multiple values into a single one. 3797 3798 my $self; 3799 my @args; # Arguments to pass to the constructor 3800 3801 my $class = shift; 3802 3803 # If a method call, will start the union with the object itself, and 3804 # the class of the new object will be the same as self. 3805 if (ref $class) { 3806 $self = $class; 3807 $class = ref $self; 3808 push @args, $self; 3809 } 3810 3811 # Add the other required parameter. 3812 push @args, shift; 3813 # Rest of parameters are passed on to the constructor 3814 3815 # Accumulate all records from both lists. 3816 my @records; 3817 my $input_count = 0; 3818 for my $arg (@args) { 3819 #local $to_trace = 0 if main::DEBUG; 3820 trace "argument = $arg" if main::DEBUG && $to_trace; 3821 if (! defined $arg) { 3822 my $message = ""; 3823 if (defined $self) { 3824 no overloading; 3825 $message .= $owner_name_of{pack 'J', $self}; 3826 } 3827 Carp::my_carp_bug($message . "Undefined argument to _union. No union done."); 3828 return; 3829 } 3830 3831 $arg = [ $arg ] if ! ref $arg; 3832 my $type = ref $arg; 3833 if ($type eq 'ARRAY') { 3834 foreach my $element (@$arg) { 3835 push @records, Range->new($element, $element); 3836 $input_count++; 3837 } 3838 } 3839 elsif ($arg->isa('Range')) { 3840 push @records, $arg; 3841 $input_count++; 3842 } 3843 elsif ($arg->can('ranges')) { 3844 push @records, $arg->ranges; 3845 $input_count++; 3846 } 3847 else { 3848 my $message = ""; 3849 if (defined $self) { 3850 no overloading; 3851 $message .= $owner_name_of{pack 'J', $self}; 3852 } 3853 Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done."); 3854 return; 3855 } 3856 } 3857 3858 # Sort with the range containing the lowest ordinal first, but if 3859 # two ranges start at the same code point, sort with the bigger range 3860 # of the two first, because it takes fewer cycles. 3861 if ($input_count > 1) { 3862 @records = sort { ($a->start <=> $b->start) 3863 or 3864 # if b is shorter than a, b->end will be 3865 # less than a->end, and we want to select 3866 # a, so want to return -1 3867 ($b->end <=> $a->end) 3868 } @records; 3869 } 3870 3871 my $new = $class->new(@_); 3872 3873 # Fold in records so long as they add new information. 3874 for my $set (@records) { 3875 my $start = $set->start; 3876 my $end = $set->end; 3877 my $value = $set->value; 3878 my $type = $set->type; 3879 if ($start > $new->max) { 3880 $new->_add_delete('+', $start, $end, $value, Type => $type); 3881 } 3882 elsif ($end > $new->max) { 3883 $new->_add_delete('+', $new->max +1, $end, $value, 3884 Type => $type); 3885 } 3886 elsif ($input_count == 1) { 3887 # Here, overlaps existing range, but is from a single input, 3888 # so preserve the multiple values from that input. 3889 $new->_add_delete('+', $start, $end, $value, Type => $type, 3890 Replace => $MULTIPLE_AFTER); 3891 } 3892 } 3893 3894 return $new; 3895 } 3896 3897 sub range_count { # Return the number of ranges in the range list 3898 my $self = shift; 3899 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3900 3901 no overloading; 3902 return scalar @{$ranges{pack 'J', $self}}; 3903 } 3904 3905 sub min { 3906 # Returns the minimum code point currently in the range list, or if 3907 # the range list is empty, 2 beyond the max possible. This is a 3908 # method because used so rarely, that not worth saving between calls, 3909 # and having to worry about changing it as ranges are added and 3910 # deleted. 3911 3912 my $self = shift; 3913 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3914 3915 my $addr = do { no overloading; pack 'J', $self; }; 3916 3917 # If the range list is empty, return a large value that isn't adjacent 3918 # to any that could be in the range list, for simpler tests 3919 return $MAX_WORKING_CODEPOINT + 2 unless scalar @{$ranges{$addr}}; 3920 return $ranges{$addr}->[0]->start; 3921 } 3922 3923 sub contains { 3924 # Boolean: Is argument in the range list? If so returns $i such that: 3925 # range[$i]->end < $codepoint <= range[$i+1]->end 3926 # which is one beyond what you want; this is so that the 0th range 3927 # doesn't return false 3928 my $self = shift; 3929 my $codepoint = shift; 3930 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3931 3932 my $i = $self->_search_ranges($codepoint); 3933 return 0 unless defined $i; 3934 3935 # The search returns $i, such that 3936 # range[$i-1]->end < $codepoint <= range[$i]->end 3937 # So is in the table if and only iff it is at least the start position 3938 # of range $i. 3939 no overloading; 3940 return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint; 3941 return $i + 1; 3942 } 3943 3944 sub containing_range { 3945 # Returns the range object that contains the code point, undef if none 3946 3947 my $self = shift; 3948 my $codepoint = shift; 3949 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3950 3951 my $i = $self->contains($codepoint); 3952 return unless $i; 3953 3954 # contains() returns 1 beyond where we should look 3955 no overloading; 3956 return $ranges{pack 'J', $self}->[$i-1]; 3957 } 3958 3959 sub value_of { 3960 # Returns the value associated with the code point, undef if none 3961 3962 my $self = shift; 3963 my $codepoint = shift; 3964 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3965 3966 my $range = $self->containing_range($codepoint); 3967 return unless defined $range; 3968 3969 return $range->value; 3970 } 3971 3972 sub type_of { 3973 # Returns the type of the range containing the code point, undef if 3974 # the code point is not in the table 3975 3976 my $self = shift; 3977 my $codepoint = shift; 3978 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3979 3980 my $range = $self->containing_range($codepoint); 3981 return unless defined $range; 3982 3983 return $range->type; 3984 } 3985 3986 sub _search_ranges { 3987 # Find the range in the list which contains a code point, or where it 3988 # should go if were to add it. That is, it returns $i, such that: 3989 # range[$i-1]->end < $codepoint <= range[$i]->end 3990 # Returns undef if no such $i is possible (e.g. at end of table), or 3991 # if there is an error. 3992 3993 my $self = shift; 3994 my $code_point = shift; 3995 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3996 3997 my $addr = do { no overloading; pack 'J', $self; }; 3998 3999 return if $code_point > $max{$addr}; 4000 my $r = $ranges{$addr}; # The current list of ranges 4001 my $range_list_size = scalar @$r; 4002 my $i; 4003 4004 use integer; # want integer division 4005 4006 # Use the cached result as the starting guess for this one, because, 4007 # an experiment on 5.1 showed that 90% of the time the cache was the 4008 # same as the result on the next call (and 7% it was one less). 4009 $i = $_search_ranges_cache{$addr}; 4010 $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob. 4011 # from an intervening deletion 4012 #local $to_trace = 1 if main::DEBUG; 4013 trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point); 4014 return $i if $code_point <= $r->[$i]->end 4015 && ($i == 0 || $r->[$i-1]->end < $code_point); 4016 4017 # Here the cache doesn't yield the correct $i. Try adding 1. 4018 if ($i < $range_list_size - 1 4019 && $r->[$i]->end < $code_point && 4020 $code_point <= $r->[$i+1]->end) 4021 { 4022 $i++; 4023 trace "next \$i is correct: $i" if main::DEBUG && $to_trace; 4024 $_search_ranges_cache{$addr} = $i; 4025 return $i; 4026 } 4027 4028 # Here, adding 1 also didn't work. We do a binary search to 4029 # find the correct position, starting with current $i 4030 my $lower = 0; 4031 my $upper = $range_list_size - 1; 4032 while (1) { 4033 trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace; 4034 4035 if ($code_point <= $r->[$i]->end) { 4036 4037 # Here we have met the upper constraint. We can quit if we 4038 # also meet the lower one. 4039 last if $i == 0 || $r->[$i-1]->end < $code_point; 4040 4041 $upper = $i; # Still too high. 4042 4043 } 4044 else { 4045 4046 # Here, $r[$i]->end < $code_point, so look higher up. 4047 $lower = $i; 4048 } 4049 4050 # Split search domain in half to try again. 4051 my $temp = ($upper + $lower) / 2; 4052 4053 # No point in continuing unless $i changes for next time 4054 # in the loop. 4055 if ($temp == $i) { 4056 4057 # We can't reach the highest element because of the averaging. 4058 # So if one below the upper edge, force it there and try one 4059 # more time. 4060 if ($i == $range_list_size - 2) { 4061 4062 trace "Forcing to upper edge" if main::DEBUG && $to_trace; 4063 $i = $range_list_size - 1; 4064 4065 # Change $lower as well so if fails next time through, 4066 # taking the average will yield the same $i, and we will 4067 # quit with the error message just below. 4068 $lower = $i; 4069 next; 4070 } 4071 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken."); 4072 return; 4073 } 4074 $i = $temp; 4075 } # End of while loop 4076 4077 if (main::DEBUG && $to_trace) { 4078 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i; 4079 trace "i= [ $i ]", $r->[$i]; 4080 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1; 4081 } 4082 4083 # Here we have found the offset. Cache it as a starting point for the 4084 # next call. 4085 $_search_ranges_cache{$addr} = $i; 4086 return $i; 4087 } 4088 4089 sub _add_delete { 4090 # Add, replace or delete ranges to or from a list. The $type 4091 # parameter gives which: 4092 # '+' => insert or replace a range, returning a list of any changed 4093 # ranges. 4094 # '-' => delete a range, returning a list of any deleted ranges. 4095 # 4096 # The next three parameters give respectively the start, end, and 4097 # value associated with the range. 'value' should be null unless the 4098 # operation is '+'; 4099 # 4100 # The range list is kept sorted so that the range with the lowest 4101 # starting position is first in the list, and generally, adjacent 4102 # ranges with the same values are merged into a single larger one (see 4103 # exceptions below). 4104 # 4105 # There are more parameters; all are key => value pairs: 4106 # Type gives the type of the value. It is only valid for '+'. 4107 # All ranges have types; if this parameter is omitted, 0 is 4108 # assumed. Ranges with type 0 are assumed to obey the 4109 # Unicode rules for casing, etc; ranges with other types are 4110 # not. Otherwise, the type is arbitrary, for the caller's 4111 # convenience, and looked at only by this routine to keep 4112 # adjacent ranges of different types from being merged into 4113 # a single larger range, and when Replace => 4114 # $IF_NOT_EQUIVALENT is specified (see just below). 4115 # Replace determines what to do if the range list already contains 4116 # ranges which coincide with all or portions of the input 4117 # range. It is only valid for '+': 4118 # => $NO means that the new value is not to replace 4119 # any existing ones, but any empty gaps of the 4120 # range list coinciding with the input range 4121 # will be filled in with the new value. 4122 # => $UNCONDITIONALLY means to replace the existing values with 4123 # this one unconditionally. However, if the 4124 # new and old values are identical, the 4125 # replacement is skipped to save cycles 4126 # => $IF_NOT_EQUIVALENT means to replace the existing values 4127 # (the default) with this one if they are not equivalent. 4128 # Ranges are equivalent if their types are the 4129 # same, and they are the same string; or if 4130 # both are type 0 ranges, if their Unicode 4131 # standard forms are identical. In this last 4132 # case, the routine chooses the more "modern" 4133 # one to use. This is because some of the 4134 # older files are formatted with values that 4135 # are, for example, ALL CAPs, whereas the 4136 # derived files have a more modern style, 4137 # which looks better. By looking for this 4138 # style when the pre-existing and replacement 4139 # standard forms are the same, we can move to 4140 # the modern style 4141 # => $MULTIPLE_BEFORE means that if this range duplicates an 4142 # existing one, but has a different value, 4143 # don't replace the existing one, but insert 4144 # this one so that the same range can occur 4145 # multiple times. They are stored LIFO, so 4146 # that the final one inserted is the first one 4147 # returned in an ordered search of the table. 4148 # If this is an exact duplicate, including the 4149 # value, the original will be moved to be 4150 # first, before any other duplicate ranges 4151 # with different values. 4152 # => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored 4153 # FIFO, so that this one is inserted after all 4154 # others that currently exist. If this is an 4155 # exact duplicate, including value, of an 4156 # existing range, this one is discarded 4157 # (leaving the existing one in its original, 4158 # higher priority position 4159 # => $CROAK Die with an error if is already there 4160 # => anything else is the same as => $IF_NOT_EQUIVALENT 4161 # 4162 # "same value" means identical for non-type-0 ranges, and it means 4163 # having the same standard forms for type-0 ranges. 4164 4165 return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5; 4166 4167 my $self = shift; 4168 my $operation = shift; # '+' for add/replace; '-' for delete; 4169 my $start = shift; 4170 my $end = shift; 4171 my $value = shift; 4172 4173 my %args = @_; 4174 4175 $value = "" if not defined $value; # warning: $value can be "0" 4176 4177 my $replace = delete $args{'Replace'}; 4178 $replace = $IF_NOT_EQUIVALENT unless defined $replace; 4179 4180 my $type = delete $args{'Type'}; 4181 $type = 0 unless defined $type; 4182 4183 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 4184 4185 my $addr = do { no overloading; pack 'J', $self; }; 4186 4187 if ($operation ne '+' && $operation ne '-') { 4188 Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken."); 4189 return; 4190 } 4191 unless (defined $start && defined $end) { 4192 Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken."); 4193 return; 4194 } 4195 unless ($end >= $start) { 4196 Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken."); 4197 return; 4198 } 4199 #local $to_trace = 1 if main::DEBUG; 4200 4201 if ($operation eq '-') { 4202 if ($replace != $IF_NOT_EQUIVALENT) { 4203 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT."); 4204 $replace = $IF_NOT_EQUIVALENT; 4205 } 4206 if ($type) { 4207 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0."); 4208 $type = 0; 4209 } 4210 if ($value ne "") { 4211 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\"."); 4212 $value = ""; 4213 } 4214 } 4215 4216 my $r = $ranges{$addr}; # The current list of ranges 4217 my $range_list_size = scalar @$r; # And its size 4218 my $max = $max{$addr}; # The current high code point in 4219 # the list of ranges 4220 4221 # Do a special case requiring fewer machine cycles when the new range 4222 # starts after the current highest point. The Unicode input data is 4223 # structured so this is common. 4224 if ($start > $max) { 4225 4226 trace "$owner_name_of{$addr} $operation", sprintf("%04X..%04X (%s) type=%d; prev max=%04X", $start, $end, $value, $type, $max) if main::DEBUG && $to_trace; 4227 return if $operation eq '-'; # Deleting a non-existing range is a 4228 # no-op 4229 4230 # If the new range doesn't logically extend the current final one 4231 # in the range list, create a new range at the end of the range 4232 # list. (max cleverly is initialized to a negative number not 4233 # adjacent to 0 if the range list is empty, so even adding a range 4234 # to an empty range list starting at 0 will have this 'if' 4235 # succeed.) 4236 if ($start > $max + 1 # non-adjacent means can't extend. 4237 || @{$r}[-1]->value ne $value # values differ, can't extend. 4238 || @{$r}[-1]->type != $type # types differ, can't extend. 4239 ) { 4240 push @$r, Range->new($start, $end, 4241 Value => $value, 4242 Type => $type); 4243 } 4244 else { 4245 4246 # Here, the new range starts just after the current highest in 4247 # the range list, and they have the same type and value. 4248 # Extend the existing range to incorporate the new one. 4249 @{$r}[-1]->set_end($end); 4250 } 4251 4252 # This becomes the new maximum. 4253 $max{$addr} = $end; 4254 4255 return; 4256 } 4257 #local $to_trace = 0 if main::DEBUG; 4258 4259 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace; 4260 4261 # Here, the input range isn't after the whole rest of the range list. 4262 # Most likely 'splice' will be needed. The rest of the routine finds 4263 # the needed splice parameters, and if necessary, does the splice. 4264 # First, find the offset parameter needed by the splice function for 4265 # the input range. Note that the input range may span multiple 4266 # existing ones, but we'll worry about that later. For now, just find 4267 # the beginning. If the input range is to be inserted starting in a 4268 # position not currently in the range list, it must (obviously) come 4269 # just after the range below it, and just before the range above it. 4270 # Slightly less obviously, it will occupy the position currently 4271 # occupied by the range that is to come after it. More formally, we 4272 # are looking for the position, $i, in the array of ranges, such that: 4273 # 4274 # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end 4275 # 4276 # (The ordered relationships within existing ranges are also shown in 4277 # the equation above). However, if the start of the input range is 4278 # within an existing range, the splice offset should point to that 4279 # existing range's position in the list; that is $i satisfies a 4280 # somewhat different equation, namely: 4281 # 4282 #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end 4283 # 4284 # More briefly, $start can come before or after r[$i]->start, and at 4285 # this point, we don't know which it will be. However, these 4286 # two equations share these constraints: 4287 # 4288 # r[$i-1]->end < $start <= r[$i]->end 4289 # 4290 # And that is good enough to find $i. 4291 4292 my $i = $self->_search_ranges($start); 4293 if (! defined $i) { 4294 Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed"); 4295 return; 4296 } 4297 4298 # The search function returns $i such that: 4299 # 4300 # r[$i-1]->end < $start <= r[$i]->end 4301 # 4302 # That means that $i points to the first range in the range list 4303 # that could possibly be affected by this operation. We still don't 4304 # know if the start of the input range is within r[$i], or if it 4305 # points to empty space between r[$i-1] and r[$i]. 4306 trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace; 4307 4308 # Special case the insertion of data that is not to replace any 4309 # existing data. 4310 if ($replace == $NO) { # If $NO, has to be operation '+' 4311 #local $to_trace = 1 if main::DEBUG; 4312 trace "Doesn't replace" if main::DEBUG && $to_trace; 4313 4314 # Here, the new range is to take effect only on those code points 4315 # that aren't already in an existing range. This can be done by 4316 # looking through the existing range list and finding the gaps in 4317 # the ranges that this new range affects, and then calling this 4318 # function recursively on each of those gaps, leaving untouched 4319 # anything already in the list. Gather up a list of the changed 4320 # gaps first so that changes to the internal state as new ranges 4321 # are added won't be a problem. 4322 my @gap_list; 4323 4324 # First, if the starting point of the input range is outside an 4325 # existing one, there is a gap from there to the beginning of the 4326 # existing range -- add a span to fill the part that this new 4327 # range occupies 4328 if ($start < $r->[$i]->start) { 4329 push @gap_list, Range->new($start, 4330 main::min($end, 4331 $r->[$i]->start - 1), 4332 Type => $type); 4333 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace; 4334 } 4335 4336 # Then look through the range list for other gaps until we reach 4337 # the highest range affected by the input one. 4338 my $j; 4339 for ($j = $i+1; $j < $range_list_size; $j++) { 4340 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace; 4341 last if $end < $r->[$j]->start; 4342 4343 # If there is a gap between when this range starts and the 4344 # previous one ends, add a span to fill it. Note that just 4345 # because there are two ranges doesn't mean there is a 4346 # non-zero gap between them. It could be that they have 4347 # different values or types 4348 if ($r->[$j-1]->end + 1 != $r->[$j]->start) { 4349 push @gap_list, 4350 Range->new($r->[$j-1]->end + 1, 4351 $r->[$j]->start - 1, 4352 Type => $type); 4353 trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace; 4354 } 4355 } 4356 4357 # Here, we have either found an existing range in the range list, 4358 # beyond the area affected by the input one, or we fell off the 4359 # end of the loop because the input range affects the whole rest 4360 # of the range list. In either case, $j is 1 higher than the 4361 # highest affected range. If $j == $i, it means that there are no 4362 # affected ranges, that the entire insertion is in the gap between 4363 # r[$i-1], and r[$i], which we already have taken care of before 4364 # the loop. 4365 # On the other hand, if there are affected ranges, it might be 4366 # that there is a gap that needs filling after the final such 4367 # range to the end of the input range 4368 if ($r->[$j-1]->end < $end) { 4369 push @gap_list, Range->new(main::max($start, 4370 $r->[$j-1]->end + 1), 4371 $end, 4372 Type => $type); 4373 trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace; 4374 } 4375 4376 # Call recursively to fill in all the gaps. 4377 foreach my $gap (@gap_list) { 4378 $self->_add_delete($operation, 4379 $gap->start, 4380 $gap->end, 4381 $value, 4382 Type => $type); 4383 } 4384 4385 return; 4386 } 4387 4388 # Here, we have taken care of the case where $replace is $NO. 4389 # Remember that here, r[$i-1]->end < $start <= r[$i]->end 4390 # If inserting a multiple record, this is where it goes, before the 4391 # first (if any) existing one if inserting LIFO. (If this is to go 4392 # afterwards, FIFO, we below move the pointer to there.) These imply 4393 # an insertion, and no change to any existing ranges. Note that $i 4394 # can be -1 if this new range doesn't actually duplicate any existing, 4395 # and comes at the beginning of the list. 4396 if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) { 4397 4398 if ($start != $end) { 4399 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point. No action taken."); 4400 return; 4401 } 4402 4403 # If the new code point is within a current range ... 4404 if ($end >= $r->[$i]->start) { 4405 4406 # Don't add an exact duplicate, as it isn't really a multiple 4407 my $existing_value = $r->[$i]->value; 4408 my $existing_type = $r->[$i]->type; 4409 return if $value eq $existing_value && $type eq $existing_type; 4410 4411 # If the multiple value is part of an existing range, we want 4412 # to split up that range, so that only the single code point 4413 # is affected. To do this, we first call ourselves 4414 # recursively to delete that code point from the table, having 4415 # preserved its current data above. Then we call ourselves 4416 # recursively again to add the new multiple, which we know by 4417 # the test just above is different than the current code 4418 # point's value, so it will become a range containing a single 4419 # code point: just itself. Finally, we add back in the 4420 # pre-existing code point, which will again be a single code 4421 # point range. Because 'i' likely will have changed as a 4422 # result of these operations, we can't just continue on, but 4423 # do this operation recursively as well. If we are inserting 4424 # LIFO, the pre-existing code point needs to go after the new 4425 # one, so use MULTIPLE_AFTER; and vice versa. 4426 if ($r->[$i]->start != $r->[$i]->end) { 4427 $self->_add_delete('-', $start, $end, ""); 4428 $self->_add_delete('+', $start, $end, $value, Type => $type); 4429 return $self->_add_delete('+', 4430 $start, $end, 4431 $existing_value, 4432 Type => $existing_type, 4433 Replace => ($replace == $MULTIPLE_BEFORE) 4434 ? $MULTIPLE_AFTER 4435 : $MULTIPLE_BEFORE); 4436 } 4437 } 4438 4439 # If to place this new record after, move to beyond all existing 4440 # ones; but don't add this one if identical to any of them, as it 4441 # isn't really a multiple. This leaves the original order, so 4442 # that the current request is ignored. The reasoning is that the 4443 # previous request that wanted this record to have high priority 4444 # should have precedence. 4445 if ($replace == $MULTIPLE_AFTER) { 4446 while ($i < @$r && $r->[$i]->start == $start) { 4447 return if $value eq $r->[$i]->value 4448 && $type eq $r->[$i]->type; 4449 $i++; 4450 } 4451 } 4452 else { 4453 # If instead we are to place this new record before any 4454 # existing ones, remove any identical ones that come after it. 4455 # This changes the existing order so that the new one is 4456 # first, as is being requested. 4457 for (my $j = $i + 1; 4458 $j < @$r && $r->[$j]->start == $start; 4459 $j++) 4460 { 4461 if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) { 4462 splice @$r, $j, 1; 4463 last; # There should only be one instance, so no 4464 # need to keep looking 4465 } 4466 } 4467 } 4468 4469 trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace; 4470 my @return = splice @$r, 4471 $i, 4472 0, 4473 Range->new($start, 4474 $end, 4475 Value => $value, 4476 Type => $type); 4477 if (main::DEBUG && $to_trace) { 4478 trace "After splice:"; 4479 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2; 4480 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1; 4481 trace "i =[", $i, "]", $r->[$i] if $i >= 0; 4482 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1; 4483 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2; 4484 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3; 4485 } 4486 return @return; 4487 } 4488 4489 # Here, we have taken care of $NO and $MULTIPLE_foo replaces. This 4490 # leaves delete, insert, and replace either unconditionally or if not 4491 # equivalent. $i still points to the first potential affected range. 4492 # Now find the highest range affected, which will determine the length 4493 # parameter to splice. (The input range can span multiple existing 4494 # ones.) If this isn't a deletion, while we are looking through the 4495 # range list, see also if this is a replacement rather than a clean 4496 # insertion; that is if it will change the values of at least one 4497 # existing range. Start off assuming it is an insert, until find it 4498 # isn't. 4499 my $clean_insert = $operation eq '+'; 4500 my $j; # This will point to the highest affected range 4501 4502 # For non-zero types, the standard form is the value itself; 4503 my $standard_form = ($type) ? $value : main::standardize($value); 4504 4505 for ($j = $i; $j < $range_list_size; $j++) { 4506 trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace; 4507 4508 # If find a range that it doesn't overlap into, we can stop 4509 # searching 4510 last if $end < $r->[$j]->start; 4511 4512 # Here, overlaps the range at $j. If the values don't match, 4513 # and so far we think this is a clean insertion, it becomes a 4514 # non-clean insertion, i.e., a 'change' or 'replace' instead. 4515 if ($clean_insert) { 4516 if ($r->[$j]->standard_form ne $standard_form) { 4517 $clean_insert = 0; 4518 if ($replace == $CROAK) { 4519 main::croak("The range to add " 4520 . sprintf("%04X", $start) 4521 . '-' 4522 . sprintf("%04X", $end) 4523 . " with value '$value' overlaps an existing range $r->[$j]"); 4524 } 4525 } 4526 else { 4527 4528 # Here, the two values are essentially the same. If the 4529 # two are actually identical, replacing wouldn't change 4530 # anything so skip it. 4531 my $pre_existing = $r->[$j]->value; 4532 if ($pre_existing ne $value) { 4533 4534 # Here the new and old standardized values are the 4535 # same, but the non-standardized values aren't. If 4536 # replacing unconditionally, then replace 4537 if( $replace == $UNCONDITIONALLY) { 4538 $clean_insert = 0; 4539 } 4540 else { 4541 4542 # Here, are replacing conditionally. Decide to 4543 # replace or not based on which appears to look 4544 # the "nicest". If one is mixed case and the 4545 # other isn't, choose the mixed case one. 4546 my $new_mixed = $value =~ /[A-Z]/ 4547 && $value =~ /[a-z]/; 4548 my $old_mixed = $pre_existing =~ /[A-Z]/ 4549 && $pre_existing =~ /[a-z]/; 4550 4551 if ($old_mixed != $new_mixed) { 4552 $clean_insert = 0 if $new_mixed; 4553 if (main::DEBUG && $to_trace) { 4554 if ($clean_insert) { 4555 trace "Retaining $pre_existing over $value"; 4556 } 4557 else { 4558 trace "Replacing $pre_existing with $value"; 4559 } 4560 } 4561 } 4562 else { 4563 4564 # Here casing wasn't different between the two. 4565 # If one has hyphens or underscores and the 4566 # other doesn't, choose the one with the 4567 # punctuation. 4568 my $new_punct = $value =~ /[-_]/; 4569 my $old_punct = $pre_existing =~ /[-_]/; 4570 4571 if ($old_punct != $new_punct) { 4572 $clean_insert = 0 if $new_punct; 4573 if (main::DEBUG && $to_trace) { 4574 if ($clean_insert) { 4575 trace "Retaining $pre_existing over $value"; 4576 } 4577 else { 4578 trace "Replacing $pre_existing with $value"; 4579 } 4580 } 4581 } # else existing one is just as "good"; 4582 # retain it to save cycles. 4583 } 4584 } 4585 } 4586 } 4587 } 4588 } # End of loop looking for highest affected range. 4589 4590 # Here, $j points to one beyond the highest range that this insertion 4591 # affects (hence to beyond the range list if that range is the final 4592 # one in the range list). 4593 4594 # The splice length is all the affected ranges. Get it before 4595 # subtracting, for efficiency, so we don't have to later add 1. 4596 my $length = $j - $i; 4597 4598 $j--; # $j now points to the highest affected range. 4599 trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace; 4600 4601 # Here, have taken care of $NO and $MULTIPLE_foo replaces. 4602 # $j points to the highest affected range. But it can be < $i or even 4603 # -1. These happen only if the insertion is entirely in the gap 4604 # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop 4605 # above exited first time through with $end < $r->[$i]->start. (And 4606 # then we subtracted one from j) This implies also that $start < 4607 # $r->[$i]->start, but we know from above that $r->[$i-1]->end < 4608 # $start, so the entire input range is in the gap. 4609 if ($j < $i) { 4610 4611 # Here the entire input range is in the gap before $i. 4612 4613 if (main::DEBUG && $to_trace) { 4614 if ($i) { 4615 trace "Entire range is between $r->[$i-1] and $r->[$i]"; 4616 } 4617 else { 4618 trace "Entire range is before $r->[$i]"; 4619 } 4620 } 4621 return if $operation ne '+'; # Deletion of a non-existent range is 4622 # a no-op 4623 } 4624 else { 4625 4626 # Here part of the input range is not in the gap before $i. Thus, 4627 # there is at least one affected one, and $j points to the highest 4628 # such one. 4629 4630 # At this point, here is the situation: 4631 # This is not an insertion of a multiple, nor of tentative ($NO) 4632 # data. 4633 # $i points to the first element in the current range list that 4634 # may be affected by this operation. In fact, we know 4635 # that the range at $i is affected because we are in 4636 # the else branch of this 'if' 4637 # $j points to the highest affected range. 4638 # In other words, 4639 # r[$i-1]->end < $start <= r[$i]->end 4640 # And: 4641 # r[$i-1]->end < $start <= $end < r[$j+1]->start 4642 # 4643 # Also: 4644 # $clean_insert is a boolean which is set true if and only if 4645 # this is a "clean insertion", i.e., not a change nor a 4646 # deletion (multiple was handled above). 4647 4648 # We now have enough information to decide if this call is a no-op 4649 # or not. It is a no-op if this is an insertion of already 4650 # existing data. To be so, it must be contained entirely in one 4651 # range. 4652 4653 if (main::DEBUG && $to_trace && $clean_insert 4654 && $start >= $r->[$i]->start 4655 && $end <= $r->[$i]->end) 4656 { 4657 trace "no-op"; 4658 } 4659 return if $clean_insert 4660 && $start >= $r->[$i]->start 4661 && $end <= $r->[$i]->end; 4662 } 4663 4664 # Here, we know that some action will have to be taken. We have 4665 # calculated the offset and length (though adjustments may be needed) 4666 # for the splice. Now start constructing the replacement list. 4667 my @replacement; 4668 my $splice_start = $i; 4669 4670 my $extends_below; 4671 my $extends_above; 4672 4673 # See if should extend any adjacent ranges. 4674 if ($operation eq '-') { # Don't extend deletions 4675 $extends_below = $extends_above = 0; 4676 } 4677 else { # Here, should extend any adjacent ranges. See if there are 4678 # any. 4679 $extends_below = ($i > 0 4680 # can't extend unless adjacent 4681 && $r->[$i-1]->end == $start -1 4682 # can't extend unless are same standard value 4683 && $r->[$i-1]->standard_form eq $standard_form 4684 # can't extend unless share type 4685 && $r->[$i-1]->type == $type); 4686 $extends_above = ($j+1 < $range_list_size 4687 && $r->[$j+1]->start == $end +1 4688 && $r->[$j+1]->standard_form eq $standard_form 4689 && $r->[$j+1]->type == $type); 4690 } 4691 if ($extends_below && $extends_above) { # Adds to both 4692 $splice_start--; # start replace at element below 4693 $length += 2; # will replace on both sides 4694 trace "Extends both below and above ranges" if main::DEBUG && $to_trace; 4695 4696 # The result will fill in any gap, replacing both sides, and 4697 # create one large range. 4698 @replacement = Range->new($r->[$i-1]->start, 4699 $r->[$j+1]->end, 4700 Value => $value, 4701 Type => $type); 4702 } 4703 else { 4704 4705 # Here we know that the result won't just be the conglomeration of 4706 # a new range with both its adjacent neighbors. But it could 4707 # extend one of them. 4708 4709 if ($extends_below) { 4710 4711 # Here the new element adds to the one below, but not to the 4712 # one above. If inserting, and only to that one range, can 4713 # just change its ending to include the new one. 4714 if ($length == 0 && $clean_insert) { 4715 $r->[$i-1]->set_end($end); 4716 trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace; 4717 return; 4718 } 4719 else { 4720 trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace; 4721 $splice_start--; # start replace at element below 4722 $length++; # will replace the element below 4723 $start = $r->[$i-1]->start; 4724 } 4725 } 4726 elsif ($extends_above) { 4727 4728 # Here the new element adds to the one above, but not below. 4729 # Mirror the code above 4730 if ($length == 0 && $clean_insert) { 4731 $r->[$j+1]->set_start($start); 4732 trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace; 4733 return; 4734 } 4735 else { 4736 trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace; 4737 $length++; # will replace the element above 4738 $end = $r->[$j+1]->end; 4739 } 4740 } 4741 4742 trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace; 4743 4744 # Finally, here we know there will have to be a splice. 4745 # If the change or delete affects only the highest portion of the 4746 # first affected range, the range will have to be split. The 4747 # splice will remove the whole range, but will replace it by a new 4748 # range containing just the unaffected part. So, in this case, 4749 # add to the replacement list just this unaffected portion. 4750 if (! $extends_below 4751 && $start > $r->[$i]->start && $start <= $r->[$i]->end) 4752 { 4753 push @replacement, 4754 Range->new($r->[$i]->start, 4755 $start - 1, 4756 Value => $r->[$i]->value, 4757 Type => $r->[$i]->type); 4758 } 4759 4760 # In the case of an insert or change, but not a delete, we have to 4761 # put in the new stuff; this comes next. 4762 if ($operation eq '+') { 4763 push @replacement, Range->new($start, 4764 $end, 4765 Value => $value, 4766 Type => $type); 4767 } 4768 4769 trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i; 4770 #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace; 4771 4772 # And finally, if we're changing or deleting only a portion of the 4773 # highest affected range, it must be split, as the lowest one was. 4774 if (! $extends_above 4775 && $j >= 0 # Remember that j can be -1 if before first 4776 # current element 4777 && $end >= $r->[$j]->start 4778 && $end < $r->[$j]->end) 4779 { 4780 push @replacement, 4781 Range->new($end + 1, 4782 $r->[$j]->end, 4783 Value => $r->[$j]->value, 4784 Type => $r->[$j]->type); 4785 } 4786 } 4787 4788 # And do the splice, as calculated above 4789 if (main::DEBUG && $to_trace) { 4790 trace "replacing $length element(s) at $i with "; 4791 foreach my $replacement (@replacement) { 4792 trace " $replacement"; 4793 } 4794 trace "Before splice:"; 4795 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2; 4796 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1; 4797 trace "i =[", $i, "]", $r->[$i]; 4798 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1; 4799 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2; 4800 } 4801 4802 my @return = splice @$r, $splice_start, $length, @replacement; 4803 4804 if (main::DEBUG && $to_trace) { 4805 trace "After splice:"; 4806 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2; 4807 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1; 4808 trace "i =[", $i, "]", $r->[$i]; 4809 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1; 4810 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2; 4811 trace "removed ", @return if @return; 4812 } 4813 4814 # An actual deletion could have changed the maximum in the list. 4815 # There was no deletion if the splice didn't return something, but 4816 # otherwise recalculate it. This is done too rarely to worry about 4817 # performance. 4818 if ($operation eq '-' && @return) { 4819 if (@$r) { 4820 $max{$addr} = $r->[-1]->end; 4821 } 4822 else { # Now empty 4823 $max{$addr} = $max_init; 4824 } 4825 } 4826 return @return; 4827 } 4828 4829 sub reset_each_range { # reset the iterator for each_range(); 4830 my $self = shift; 4831 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4832 4833 no overloading; 4834 undef $each_range_iterator{pack 'J', $self}; 4835 return; 4836 } 4837 4838 sub each_range { 4839 # Iterate over each range in a range list. Results are undefined if 4840 # the range list is changed during the iteration. 4841 4842 my $self = shift; 4843 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4844 4845 my $addr = do { no overloading; pack 'J', $self; }; 4846 4847 return if $self->is_empty; 4848 4849 $each_range_iterator{$addr} = -1 4850 if ! defined $each_range_iterator{$addr}; 4851 $each_range_iterator{$addr}++; 4852 return $ranges{$addr}->[$each_range_iterator{$addr}] 4853 if $each_range_iterator{$addr} < @{$ranges{$addr}}; 4854 undef $each_range_iterator{$addr}; 4855 return; 4856 } 4857 4858 sub count { # Returns count of code points in range list 4859 my $self = shift; 4860 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4861 4862 my $addr = do { no overloading; pack 'J', $self; }; 4863 4864 my $count = 0; 4865 foreach my $range (@{$ranges{$addr}}) { 4866 $count += $range->end - $range->start + 1; 4867 } 4868 return $count; 4869 } 4870 4871 sub delete_range { # Delete a range 4872 my $self = shift; 4873 my $start = shift; 4874 my $end = shift; 4875 4876 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4877 4878 return $self->_add_delete('-', $start, $end, ""); 4879 } 4880 4881 sub is_empty { # Returns boolean as to if a range list is empty 4882 my $self = shift; 4883 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4884 4885 no overloading; 4886 return scalar @{$ranges{pack 'J', $self}} == 0; 4887 } 4888 4889 sub hash { 4890 # Quickly returns a scalar suitable for separating tables into 4891 # buckets, i.e. it is a hash function of the contents of a table, so 4892 # there are relatively few conflicts. 4893 4894 my $self = shift; 4895 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 4896 4897 my $addr = do { no overloading; pack 'J', $self; }; 4898 4899 # These are quickly computable. Return looks like 'min..max;count' 4900 return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}}; 4901 } 4902} # End closure for _Range_List_Base 4903 4904package Range_List; 4905use parent '-norequire', '_Range_List_Base'; 4906 4907# A Range_List is a range list for match tables; i.e. the range values are 4908# not significant. Thus a number of operations can be safely added to it, 4909# such as inversion, intersection. Note that union is also an unsafe 4910# operation when range values are cared about, and that method is in the base 4911# class, not here. But things are set up so that that method is callable only 4912# during initialization. Only in this derived class, is there an operation 4913# that combines two tables. A Range_Map can thus be used to initialize a 4914# Range_List, and its mappings will be in the list, but are not significant to 4915# this class. 4916 4917sub trace { return main::trace(@_); } 4918 4919{ # Closure 4920 4921 use overload 4922 fallback => 0, 4923 '+' => sub { my $self = shift; 4924 my $other = shift; 4925 4926 return $self->_union($other) 4927 }, 4928 '+=' => sub { my $self = shift; 4929 my $other = shift; 4930 my $reversed = shift; 4931 4932 if ($reversed) { 4933 Carp::my_carp_bug("Bad news. Can't cope with '" 4934 . ref($other) 4935 . ' += ' 4936 . ref($self) 4937 . "'. undef returned."); 4938 return; 4939 } 4940 4941 return $self->_union($other) 4942 }, 4943 '&' => sub { my $self = shift; 4944 my $other = shift; 4945 4946 return $self->_intersect($other, 0); 4947 }, 4948 '&=' => sub { my $self = shift; 4949 my $other = shift; 4950 my $reversed = shift; 4951 4952 if ($reversed) { 4953 Carp::my_carp_bug("Bad news. Can't cope with '" 4954 . ref($other) 4955 . ' &= ' 4956 . ref($self) 4957 . "'. undef returned."); 4958 return; 4959 } 4960 4961 return $self->_intersect($other, 0); 4962 }, 4963 '~' => "_invert", 4964 '-' => "_subtract", 4965 ; 4966 4967 sub _invert { 4968 # Returns a new Range_List that gives all code points not in $self. 4969 4970 my $self = shift; 4971 4972 my $new = Range_List->new; 4973 4974 # Go through each range in the table, finding the gaps between them 4975 my $max = -1; # Set so no gap before range beginning at 0 4976 for my $range ($self->ranges) { 4977 my $start = $range->start; 4978 my $end = $range->end; 4979 4980 # If there is a gap before this range, the inverse will contain 4981 # that gap. 4982 if ($start > $max + 1) { 4983 $new->add_range($max + 1, $start - 1); 4984 } 4985 $max = $end; 4986 } 4987 4988 # And finally, add the gap from the end of the table to the max 4989 # possible code point 4990 if ($max < $MAX_WORKING_CODEPOINT) { 4991 $new->add_range($max + 1, $MAX_WORKING_CODEPOINT); 4992 } 4993 return $new; 4994 } 4995 4996 sub _subtract { 4997 # Returns a new Range_List with the argument deleted from it. The 4998 # argument can be a single code point, a range, or something that has 4999 # a range, with the _range_list() method on it returning them 5000 5001 my $self = shift; 5002 my $other = shift; 5003 my $reversed = shift; 5004 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5005 5006 if ($reversed) { 5007 Carp::my_carp_bug("Bad news. Can't cope with '" 5008 . ref($other) 5009 . ' - ' 5010 . ref($self) 5011 . "'. undef returned."); 5012 return; 5013 } 5014 5015 my $new = Range_List->new(Initialize => $self); 5016 5017 if (! ref $other) { # Single code point 5018 $new->delete_range($other, $other); 5019 } 5020 elsif ($other->isa('Range')) { 5021 $new->delete_range($other->start, $other->end); 5022 } 5023 elsif ($other->can('_range_list')) { 5024 foreach my $range ($other->_range_list->ranges) { 5025 $new->delete_range($range->start, $range->end); 5026 } 5027 } 5028 else { 5029 Carp::my_carp_bug("Can't cope with a " 5030 . ref($other) 5031 . " argument to '-'. Subtraction ignored." 5032 ); 5033 return $self; 5034 } 5035 5036 return $new; 5037 } 5038 5039 sub _intersect { 5040 # Returns either a boolean giving whether the two inputs' range lists 5041 # intersect (overlap), or a new Range_List containing the intersection 5042 # of the two lists. The optional final parameter being true indicates 5043 # to do the check instead of the intersection. 5044 5045 my $a_object = shift; 5046 my $b_object = shift; 5047 my $check_if_overlapping = shift; 5048 $check_if_overlapping = 0 unless defined $check_if_overlapping; 5049 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5050 5051 if (! defined $b_object) { 5052 my $message = ""; 5053 $message .= $a_object->_owner_name_of if defined $a_object; 5054 Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done."); 5055 return; 5056 } 5057 5058 # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b ) 5059 # Thus the intersection could be much more simply be written: 5060 # return ~(~$a_object + ~$b_object); 5061 # But, this is slower, and when taking the inverse of a large 5062 # range_size_1 table, back when such tables were always stored that 5063 # way, it became prohibitively slow, hence the code was changed to the 5064 # below 5065 5066 if ($b_object->isa('Range')) { 5067 $b_object = Range_List->new(Initialize => $b_object, 5068 Owner => $a_object->_owner_name_of); 5069 } 5070 $b_object = $b_object->_range_list if $b_object->can('_range_list'); 5071 5072 my @a_ranges = $a_object->ranges; 5073 my @b_ranges = $b_object->ranges; 5074 5075 #local $to_trace = 1 if main::DEBUG; 5076 trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace; 5077 5078 # Start with the first range in each list 5079 my $a_i = 0; 5080 my $range_a = $a_ranges[$a_i]; 5081 my $b_i = 0; 5082 my $range_b = $b_ranges[$b_i]; 5083 5084 my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of) 5085 if ! $check_if_overlapping; 5086 5087 # If either list is empty, there is no intersection and no overlap 5088 if (! defined $range_a || ! defined $range_b) { 5089 return $check_if_overlapping ? 0 : $new; 5090 } 5091 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace; 5092 5093 # Otherwise, must calculate the intersection/overlap. Start with the 5094 # very first code point in each list 5095 my $a = $range_a->start; 5096 my $b = $range_b->start; 5097 5098 # Loop through all the ranges of each list; in each iteration, $a and 5099 # $b are the current code points in their respective lists 5100 while (1) { 5101 5102 # If $a and $b are the same code point, ... 5103 if ($a == $b) { 5104 5105 # it means the lists overlap. If just checking for overlap 5106 # know the answer now, 5107 return 1 if $check_if_overlapping; 5108 5109 # The intersection includes this code point plus anything else 5110 # common to both current ranges. 5111 my $start = $a; 5112 my $end = main::min($range_a->end, $range_b->end); 5113 if (! $check_if_overlapping) { 5114 trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace; 5115 $new->add_range($start, $end); 5116 } 5117 5118 # Skip ahead to the end of the current intersect 5119 $a = $b = $end; 5120 5121 # If the current intersect ends at the end of either range (as 5122 # it must for at least one of them), the next possible one 5123 # will be the beginning code point in it's list's next range. 5124 if ($a == $range_a->end) { 5125 $range_a = $a_ranges[++$a_i]; 5126 last unless defined $range_a; 5127 $a = $range_a->start; 5128 } 5129 if ($b == $range_b->end) { 5130 $range_b = $b_ranges[++$b_i]; 5131 last unless defined $range_b; 5132 $b = $range_b->start; 5133 } 5134 5135 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace; 5136 } 5137 elsif ($a < $b) { 5138 5139 # Not equal, but if the range containing $a encompasses $b, 5140 # change $a to be the middle of the range where it does equal 5141 # $b, so the next iteration will get the intersection 5142 if ($range_a->end >= $b) { 5143 $a = $b; 5144 } 5145 else { 5146 5147 # Here, the current range containing $a is entirely below 5148 # $b. Go try to find a range that could contain $b. 5149 $a_i = $a_object->_search_ranges($b); 5150 5151 # If no range found, quit. 5152 last unless defined $a_i; 5153 5154 # The search returns $a_i, such that 5155 # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end 5156 # Set $a to the beginning of this new range, and repeat. 5157 $range_a = $a_ranges[$a_i]; 5158 $a = $range_a->start; 5159 } 5160 } 5161 else { # Here, $b < $a. 5162 5163 # Mirror image code to the leg just above 5164 if ($range_b->end >= $a) { 5165 $b = $a; 5166 } 5167 else { 5168 $b_i = $b_object->_search_ranges($a); 5169 last unless defined $b_i; 5170 $range_b = $b_ranges[$b_i]; 5171 $b = $range_b->start; 5172 } 5173 } 5174 } # End of looping through ranges. 5175 5176 # Intersection fully computed, or now know that there is no overlap 5177 return $check_if_overlapping ? 0 : $new; 5178 } 5179 5180 sub overlaps { 5181 # Returns boolean giving whether the two arguments overlap somewhere 5182 5183 my $self = shift; 5184 my $other = shift; 5185 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5186 5187 return $self->_intersect($other, 1); 5188 } 5189 5190 sub add_range { 5191 # Add a range to the list. 5192 5193 my $self = shift; 5194 my $start = shift; 5195 my $end = shift; 5196 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5197 5198 return $self->_add_delete('+', $start, $end, ""); 5199 } 5200 5201 sub matches_identically_to { 5202 # Return a boolean as to whether or not two Range_Lists match identical 5203 # sets of code points. 5204 5205 my $self = shift; 5206 my $other = shift; 5207 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5208 5209 # These are ordered in increasing real time to figure out (at least 5210 # until a patch changes that and doesn't change this) 5211 return 0 if $self->max != $other->max; 5212 return 0 if $self->min != $other->min; 5213 return 0 if $self->range_count != $other->range_count; 5214 return 0 if $self->count != $other->count; 5215 5216 # Here they could be identical because all the tests above passed. 5217 # The loop below is somewhat simpler since we know they have the same 5218 # number of elements. Compare range by range, until reach the end or 5219 # find something that differs. 5220 my @a_ranges = $self->ranges; 5221 my @b_ranges = $other->ranges; 5222 for my $i (0 .. @a_ranges - 1) { 5223 my $a = $a_ranges[$i]; 5224 my $b = $b_ranges[$i]; 5225 trace "self $a; other $b" if main::DEBUG && $to_trace; 5226 return 0 if ! defined $b 5227 || $a->start != $b->start 5228 || $a->end != $b->end; 5229 } 5230 return 1; 5231 } 5232 5233 sub is_code_point_usable { 5234 # This used only for making the test script. See if the input 5235 # proposed trial code point is one that Perl will handle. If second 5236 # parameter is 0, it won't select some code points for various 5237 # reasons, noted below. 5238 5239 my $code = shift; 5240 my $try_hard = shift; 5241 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5242 5243 return 0 if $code < 0; # Never use a negative 5244 5245 # shun null. I'm (khw) not sure why this was done, but NULL would be 5246 # the character very frequently used. 5247 return $try_hard if $code == 0x0000; 5248 5249 # shun non-character code points. 5250 return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF; 5251 return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF 5252 5253 return $try_hard if $code > $MAX_UNICODE_CODEPOINT; # keep in range 5254 return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate 5255 5256 return 1; 5257 } 5258 5259 sub get_valid_code_point { 5260 # Return a code point that's part of the range list. Returns nothing 5261 # if the table is empty or we can't find a suitable code point. This 5262 # used only for making the test script. 5263 5264 my $self = shift; 5265 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5266 5267 my $addr = do { no overloading; pack 'J', $self; }; 5268 5269 # On first pass, don't choose less desirable code points; if no good 5270 # one is found, repeat, allowing a less desirable one to be selected. 5271 for my $try_hard (0, 1) { 5272 5273 # Look through all the ranges for a usable code point. 5274 for my $set (reverse $self->ranges) { 5275 5276 # Try the edge cases first, starting with the end point of the 5277 # range. 5278 my $end = $set->end; 5279 return $end if is_code_point_usable($end, $try_hard); 5280 $end = $MAX_UNICODE_CODEPOINT + 1 if $end > $MAX_UNICODE_CODEPOINT; 5281 5282 # End point didn't, work. Start at the beginning and try 5283 # every one until find one that does work. 5284 for my $trial ($set->start .. $end - 1) { 5285 return $trial if is_code_point_usable($trial, $try_hard); 5286 } 5287 } 5288 } 5289 return (); # If none found, give up. 5290 } 5291 5292 sub get_invalid_code_point { 5293 # Return a code point that's not part of the table. Returns nothing 5294 # if the table covers all code points or a suitable code point can't 5295 # be found. This used only for making the test script. 5296 5297 my $self = shift; 5298 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5299 5300 # Just find a valid code point of the inverse, if any. 5301 return Range_List->new(Initialize => ~ $self)->get_valid_code_point; 5302 } 5303} # end closure for Range_List 5304 5305package Range_Map; 5306use parent '-norequire', '_Range_List_Base'; 5307 5308# A Range_Map is a range list in which the range values (called maps) are 5309# significant, and hence shouldn't be manipulated by our other code, which 5310# could be ambiguous or lose things. For example, in taking the union of two 5311# lists, which share code points, but which have differing values, which one 5312# has precedence in the union? 5313# It turns out that these operations aren't really necessary for map tables, 5314# and so this class was created to make sure they aren't accidentally 5315# applied to them. 5316 5317{ # Closure 5318 5319 sub add_map { 5320 # Add a range containing a mapping value to the list 5321 5322 my $self = shift; 5323 # Rest of parameters passed on 5324 5325 return $self->_add_delete('+', @_); 5326 } 5327 5328 sub replace_map { 5329 # Replace a range 5330 5331 my $self = shift; 5332 5333 return $self->_add_delete('+', @_, Replace => $UNCONDITIONALLY); 5334 } 5335 5336 sub add_duplicate { 5337 # Adds entry to a range list which can duplicate an existing entry 5338 5339 my $self = shift; 5340 my $code_point = shift; 5341 my $value = shift; 5342 my %args = @_; 5343 my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE; 5344 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 5345 5346 return $self->add_map($code_point, $code_point, 5347 $value, Replace => $replace); 5348 } 5349} # End of closure for package Range_Map 5350 5351package _Base_Table; 5352 5353# A table is the basic data structure that gets written out into a file for 5354# use by the Perl core. This is the abstract base class implementing the 5355# common elements from the derived ones. A list of the methods to be 5356# furnished by an implementing class is just after the constructor. 5357 5358sub standardize { return main::standardize($_[0]); } 5359sub trace { return main::trace(@_); } 5360 5361{ # Closure 5362 5363 main::setup_package(); 5364 5365 my %range_list; 5366 # Object containing the ranges of the table. 5367 main::set_access('range_list', \%range_list, 'p_r', 'p_s'); 5368 5369 my %full_name; 5370 # The full table name. 5371 main::set_access('full_name', \%full_name, 'r'); 5372 5373 my %name; 5374 # The table name, almost always shorter 5375 main::set_access('name', \%name, 'r'); 5376 5377 my %short_name; 5378 # The shortest of all the aliases for this table, with underscores removed 5379 main::set_access('short_name', \%short_name); 5380 5381 my %nominal_short_name_length; 5382 # The length of short_name before removing underscores 5383 main::set_access('nominal_short_name_length', 5384 \%nominal_short_name_length); 5385 5386 my %complete_name; 5387 # The complete name, including property. 5388 main::set_access('complete_name', \%complete_name, 'r'); 5389 5390 my %property; 5391 # Parent property this table is attached to. 5392 main::set_access('property', \%property, 'r'); 5393 5394 my %aliases; 5395 # Ordered list of alias objects of the table's name. The first ones in 5396 # the list are output first in comments 5397 main::set_access('aliases', \%aliases, 'readable_array'); 5398 5399 my %comment; 5400 # A comment associated with the table for human readers of the files 5401 main::set_access('comment', \%comment, 's'); 5402 5403 my %description; 5404 # A comment giving a short description of the table's meaning for human 5405 # readers of the files. 5406 main::set_access('description', \%description, 'readable_array'); 5407 5408 my %note; 5409 # A comment giving a short note about the table for human readers of the 5410 # files. 5411 main::set_access('note', \%note, 'readable_array'); 5412 5413 my %fate; 5414 # Enum; there are a number of possibilities for what happens to this 5415 # table: it could be normal, or suppressed, or not for external use. See 5416 # values at definition for $SUPPRESSED. 5417 main::set_access('fate', \%fate, 'r'); 5418 5419 my %find_table_from_alias; 5420 # The parent property passes this pointer to a hash which this class adds 5421 # all its aliases to, so that the parent can quickly take an alias and 5422 # find this table. 5423 main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r'); 5424 5425 my %locked; 5426 # After this table is made equivalent to another one; we shouldn't go 5427 # changing the contents because that could mean it's no longer equivalent 5428 main::set_access('locked', \%locked, 'r'); 5429 5430 my %file_path; 5431 # This gives the final path to the file containing the table. Each 5432 # directory in the path is an element in the array 5433 main::set_access('file_path', \%file_path, 'readable_array'); 5434 5435 my %status; 5436 # What is the table's status, normal, $OBSOLETE, etc. Enum 5437 main::set_access('status', \%status, 'r'); 5438 5439 my %status_info; 5440 # A comment about its being obsolete, or whatever non normal status it has 5441 main::set_access('status_info', \%status_info, 'r'); 5442 5443 my %caseless_equivalent; 5444 # The table this is equivalent to under /i matching, if any. 5445 main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's'); 5446 5447 my %range_size_1; 5448 # Is the table to be output with each range only a single code point? 5449 # This is done to avoid breaking existing code that may have come to rely 5450 # on this behavior in previous versions of this program.) 5451 main::set_access('range_size_1', \%range_size_1, 'r', 's'); 5452 5453 my %perl_extension; 5454 # A boolean set iff this table is a Perl extension to the Unicode 5455 # standard. 5456 main::set_access('perl_extension', \%perl_extension, 'r'); 5457 5458 my %output_range_counts; 5459 # A boolean set iff this table is to have comments written in the 5460 # output file that contain the number of code points in the range. 5461 # The constructor can override the global flag of the same name. 5462 main::set_access('output_range_counts', \%output_range_counts, 'r'); 5463 5464 my %write_as_invlist; 5465 # A boolean set iff the output file for this table is to be in the form of 5466 # an inversion list/map. 5467 main::set_access('write_as_invlist', \%write_as_invlist, 'r'); 5468 5469 my %format; 5470 # The format of the entries of the table. This is calculated from the 5471 # data in the table (or passed in the constructor). This is an enum e.g., 5472 # $STRING_FORMAT. It is marked protected as it should not be generally 5473 # used to override calculations. 5474 main::set_access('format', \%format, 'r', 'p_s'); 5475 5476 my %has_dependency; 5477 # A boolean that gives whether some other table in this property is 5478 # defined as the complement of this table. This is a crude, but currently 5479 # sufficient, mechanism to make this table not get destroyed before what 5480 # is dependent on it is. Other dependencies could be added, so the name 5481 # was chosen to reflect a more general situation than actually is 5482 # currently the case. 5483 main::set_access('has_dependency', \%has_dependency, 'r', 's'); 5484 5485 sub new { 5486 # All arguments are key => value pairs, which you can see below, most 5487 # of which match fields documented above. Otherwise: Re_Pod_Entry, 5488 # OK_as_Filename, and Fuzzy apply to the names of the table, and are 5489 # documented in the Alias package 5490 5491 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2; 5492 5493 my $class = shift; 5494 5495 my $self = bless \do { my $anonymous_scalar }, $class; 5496 my $addr = do { no overloading; pack 'J', $self; }; 5497 5498 my %args = @_; 5499 5500 $name{$addr} = delete $args{'Name'}; 5501 $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'}; 5502 $full_name{$addr} = delete $args{'Full_Name'}; 5503 my $complete_name = $complete_name{$addr} 5504 = delete $args{'Complete_Name'}; 5505 $format{$addr} = delete $args{'Format'}; 5506 $output_range_counts{$addr} = delete $args{'Output_Range_Counts'}; 5507 $property{$addr} = delete $args{'_Property'}; 5508 $range_list{$addr} = delete $args{'_Range_List'}; 5509 $status{$addr} = delete $args{'Status'} || $NORMAL; 5510 $status_info{$addr} = delete $args{'_Status_Info'} || ""; 5511 $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0; 5512 $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0; 5513 $fate{$addr} = delete $args{'Fate'} || $ORDINARY; 5514 $write_as_invlist{$addr} = delete $args{'Write_As_Invlist'};# No default 5515 my $ucd = delete $args{'UCD'}; 5516 5517 my $description = delete $args{'Description'}; 5518 my $ok_as_filename = delete $args{'OK_as_Filename'}; 5519 my $loose_match = delete $args{'Fuzzy'}; 5520 my $note = delete $args{'Note'}; 5521 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'}; 5522 my $perl_extension = delete $args{'Perl_Extension'}; 5523 my $suppression_reason = delete $args{'Suppression_Reason'}; 5524 5525 # Shouldn't have any left over 5526 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 5527 5528 # Can't use || above because conceivably the name could be 0, and 5529 # can't use // operator in case this program gets used in Perl 5.8 5530 $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr}; 5531 $output_range_counts{$addr} = $output_range_counts if 5532 ! defined $output_range_counts{$addr}; 5533 5534 $aliases{$addr} = [ ]; 5535 $comment{$addr} = [ ]; 5536 $description{$addr} = [ ]; 5537 $note{$addr} = [ ]; 5538 $file_path{$addr} = [ ]; 5539 $locked{$addr} = ""; 5540 $has_dependency{$addr} = 0; 5541 5542 push @{$description{$addr}}, $description if $description; 5543 push @{$note{$addr}}, $note if $note; 5544 5545 if ($fate{$addr} == $PLACEHOLDER) { 5546 5547 # A placeholder table doesn't get documented, is a perl extension, 5548 # and quite likely will be empty 5549 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry; 5550 $perl_extension = 1 if ! defined $perl_extension; 5551 $ucd = 0 if ! defined $ucd; 5552 push @tables_that_may_be_empty, $complete_name{$addr}; 5553 $self->add_comment(<<END); 5554This is a placeholder because it is not in Version $string_version of Unicode, 5555but is needed by the Perl core to work gracefully. Because it is not in this 5556version of Unicode, it will not be listed in $pod_file.pod 5557END 5558 } 5559 elsif (exists $why_suppressed{$complete_name} 5560 # Don't suppress if overridden 5561 && ! grep { $_ eq $complete_name{$addr} } 5562 @output_mapped_properties) 5563 { 5564 $fate{$addr} = $SUPPRESSED; 5565 } 5566 elsif ($fate{$addr} == $SUPPRESSED) { 5567 Carp::my_carp_bug("Need reason for suppressing") unless $suppression_reason; 5568 # Though currently unused 5569 } 5570 elsif ($suppression_reason) { 5571 Carp::my_carp_bug("A reason was given for suppressing, but not suppressed"); 5572 } 5573 5574 # If hasn't set its status already, see if it is on one of the 5575 # lists of properties or tables that have particular statuses; if 5576 # not, is normal. The lists are prioritized so the most serious 5577 # ones are checked first 5578 if (! $status{$addr}) { 5579 if (exists $why_deprecated{$complete_name}) { 5580 $status{$addr} = $DEPRECATED; 5581 } 5582 elsif (exists $why_stabilized{$complete_name}) { 5583 $status{$addr} = $STABILIZED; 5584 } 5585 elsif (exists $why_obsolete{$complete_name}) { 5586 $status{$addr} = $OBSOLETE; 5587 } 5588 5589 # Existence above doesn't necessarily mean there is a message 5590 # associated with it. Use the most serious message. 5591 if ($status{$addr}) { 5592 if ($why_deprecated{$complete_name}) { 5593 $status_info{$addr} 5594 = $why_deprecated{$complete_name}; 5595 } 5596 elsif ($why_stabilized{$complete_name}) { 5597 $status_info{$addr} 5598 = $why_stabilized{$complete_name}; 5599 } 5600 elsif ($why_obsolete{$complete_name}) { 5601 $status_info{$addr} 5602 = $why_obsolete{$complete_name}; 5603 } 5604 } 5605 } 5606 5607 $perl_extension{$addr} = $perl_extension || 0; 5608 5609 # Don't list a property by default that is internal only 5610 if ($fate{$addr} > $MAP_PROXIED) { 5611 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry; 5612 $ucd = 0 if ! defined $ucd; 5613 } 5614 else { 5615 $ucd = 1 if ! defined $ucd; 5616 } 5617 5618 # By convention what typically gets printed only or first is what's 5619 # first in the list, so put the full name there for good output 5620 # clarity. Other routines rely on the full name being first on the 5621 # list 5622 $self->add_alias($full_name{$addr}, 5623 OK_as_Filename => $ok_as_filename, 5624 Fuzzy => $loose_match, 5625 Re_Pod_Entry => $make_re_pod_entry, 5626 Status => $status{$addr}, 5627 UCD => $ucd, 5628 ); 5629 5630 # Then comes the other name, if meaningfully different. 5631 if (standardize($full_name{$addr}) ne standardize($name{$addr})) { 5632 $self->add_alias($name{$addr}, 5633 OK_as_Filename => $ok_as_filename, 5634 Fuzzy => $loose_match, 5635 Re_Pod_Entry => $make_re_pod_entry, 5636 Status => $status{$addr}, 5637 UCD => $ucd, 5638 ); 5639 } 5640 5641 return $self; 5642 } 5643 5644 # Here are the methods that are required to be defined by any derived 5645 # class 5646 for my $sub (qw( 5647 handle_special_range 5648 append_to_body 5649 pre_body 5650 )) 5651 # write() knows how to write out normal ranges, but it calls 5652 # handle_special_range() when it encounters a non-normal one. 5653 # append_to_body() is called by it after it has handled all 5654 # ranges to add anything after the main portion of the table. 5655 # And finally, pre_body() is called after all this to build up 5656 # anything that should appear before the main portion of the 5657 # table. Doing it this way allows things in the middle to 5658 # affect what should appear before the main portion of the 5659 # table. 5660 { 5661 no strict "refs"; 5662 *$sub = sub { 5663 Carp::my_carp_bug( __LINE__ 5664 . ": Must create method '$sub()' for " 5665 . ref shift); 5666 return; 5667 } 5668 } 5669 5670 use overload 5671 fallback => 0, 5672 "." => \&main::_operator_dot, 5673 ".=" => \&main::_operator_dot_equal, 5674 '!=' => \&main::_operator_not_equal, 5675 '==' => \&main::_operator_equal, 5676 ; 5677 5678 sub ranges { 5679 # Returns the array of ranges associated with this table. 5680 5681 no overloading; 5682 return $range_list{pack 'J', shift}->ranges; 5683 } 5684 5685 sub add_alias { 5686 # Add a synonym for this table. 5687 5688 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3; 5689 5690 my $self = shift; 5691 my $name = shift; # The name to add. 5692 my $pointer = shift; # What the alias hash should point to. For 5693 # map tables, this is the parent property; 5694 # for match tables, it is the table itself. 5695 5696 my %args = @_; 5697 my $loose_match = delete $args{'Fuzzy'}; 5698 5699 my $ok_as_filename = delete $args{'OK_as_Filename'}; 5700 $ok_as_filename = 1 unless defined $ok_as_filename; 5701 5702 # An internal name does not get documented, unless overridden by the 5703 # input; same for making tests for it. 5704 my $status = delete $args{'Status'} || (($name =~ /^_/) 5705 ? $INTERNAL_ALIAS 5706 : $NORMAL); 5707 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'} 5708 // (($status ne $INTERNAL_ALIAS) 5709 ? (($name =~ /^_/) ? $NO : $YES) 5710 : $NO); 5711 my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1); 5712 5713 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 5714 5715 # Capitalize the first letter of the alias unless it is one of the CJK 5716 # ones which specifically begins with a lower 'k'. Do this because 5717 # Unicode has varied whether they capitalize first letters or not, and 5718 # have later changed their minds and capitalized them, but not the 5719 # other way around. So do it always and avoid changes from release to 5720 # release 5721 $name = ucfirst($name) unless $name =~ /^k[A-Z]/; 5722 5723 my $addr = do { no overloading; pack 'J', $self; }; 5724 5725 # Figure out if should be loosely matched if not already specified. 5726 if (! defined $loose_match) { 5727 5728 # Is a loose_match if isn't null, and doesn't begin with an 5729 # underscore and isn't just a number 5730 if ($name ne "" 5731 && substr($name, 0, 1) ne '_' 5732 && $name !~ qr{^[0-9_.+-/]+$}) 5733 { 5734 $loose_match = 1; 5735 } 5736 else { 5737 $loose_match = 0; 5738 } 5739 } 5740 5741 # If this alias has already been defined, do nothing. 5742 return if defined $find_table_from_alias{$addr}->{$name}; 5743 5744 # That includes if it is standardly equivalent to an existing alias, 5745 # in which case, add this name to the list, so won't have to search 5746 # for it again. 5747 my $standard_name = main::standardize($name); 5748 if (defined $find_table_from_alias{$addr}->{$standard_name}) { 5749 $find_table_from_alias{$addr}->{$name} 5750 = $find_table_from_alias{$addr}->{$standard_name}; 5751 return; 5752 } 5753 5754 # Set the index hash for this alias for future quick reference. 5755 $find_table_from_alias{$addr}->{$name} = $pointer; 5756 $find_table_from_alias{$addr}->{$standard_name} = $pointer; 5757 local $to_trace = 0 if main::DEBUG; 5758 trace "adding alias $name to $pointer" if main::DEBUG && $to_trace; 5759 trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace; 5760 5761 5762 # Put the new alias at the end of the list of aliases unless the final 5763 # element begins with an underscore (meaning it is for internal perl 5764 # use) or is all numeric, in which case, put the new one before that 5765 # one. This floats any all-numeric or underscore-beginning aliases to 5766 # the end. This is done so that they are listed last in output lists, 5767 # to encourage the user to use a better name (either more descriptive 5768 # or not an internal-only one) instead. This ordering is relied on 5769 # implicitly elsewhere in this program, like in short_name() 5770 my $list = $aliases{$addr}; 5771 my $insert_position = (@$list == 0 5772 || (substr($list->[-1]->name, 0, 1) ne '_' 5773 && $list->[-1]->name =~ /\D/)) 5774 ? @$list 5775 : @$list - 1; 5776 splice @$list, 5777 $insert_position, 5778 0, 5779 Alias->new($name, $loose_match, $make_re_pod_entry, 5780 $ok_as_filename, $status, $ucd); 5781 5782 # This name may be shorter than any existing ones, so clear the cache 5783 # of the shortest, so will have to be recalculated. 5784 no overloading; 5785 undef $short_name{pack 'J', $self}; 5786 return; 5787 } 5788 5789 sub short_name { 5790 # Returns a name suitable for use as the base part of a file name. 5791 # That is, shorter wins. It can return undef if there is no suitable 5792 # name. The name has all non-essential underscores removed. 5793 5794 # The optional second parameter is a reference to a scalar in which 5795 # this routine will store the length the returned name had before the 5796 # underscores were removed, or undef if the return is undef. 5797 5798 # The shortest name can change if new aliases are added. So using 5799 # this should be deferred until after all these are added. The code 5800 # that does that should clear this one's cache. 5801 # Any name with alphabetics is preferred over an all numeric one, even 5802 # if longer. 5803 5804 my $self = shift; 5805 my $nominal_length_ptr = shift; 5806 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5807 5808 my $addr = do { no overloading; pack 'J', $self; }; 5809 5810 # For efficiency, don't recalculate, but this means that adding new 5811 # aliases could change what the shortest is, so the code that does 5812 # that needs to undef this. 5813 if (defined $short_name{$addr}) { 5814 if ($nominal_length_ptr) { 5815 $$nominal_length_ptr = $nominal_short_name_length{$addr}; 5816 } 5817 return $short_name{$addr}; 5818 } 5819 5820 # Look at each alias 5821 my $is_last_resort = 0; 5822 my $deprecated_or_discouraged 5823 = qr/ ^ (?: $DEPRECATED | $DISCOURAGED ) $/x; 5824 foreach my $alias ($self->aliases()) { 5825 5826 # Don't use an alias that isn't ok to use for an external name. 5827 next if ! $alias->ok_as_filename; 5828 5829 my $name = main::Standardize($alias->name); 5830 trace $self, $name if main::DEBUG && $to_trace; 5831 5832 # Take the first one, or any non-deprecated non-discouraged one 5833 # over one that is, or a shorter one that isn't numeric. This 5834 # relies on numeric aliases always being last in the array 5835 # returned by aliases(). Any alpha one will have precedence. 5836 if ( ! defined $short_name{$addr} 5837 || ( $is_last_resort 5838 && $alias->status !~ $deprecated_or_discouraged) 5839 || ($name =~ /\D/ 5840 && length($name) < length($short_name{$addr}))) 5841 { 5842 # Remove interior underscores. 5843 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg; 5844 5845 $nominal_short_name_length{$addr} = length $name; 5846 $is_last_resort = $alias->status =~ $deprecated_or_discouraged; 5847 } 5848 } 5849 5850 # If the short name isn't a nice one, perhaps an equivalent table has 5851 # a better one. 5852 if ( $self->can('children') 5853 && ( ! defined $short_name{$addr} 5854 || $short_name{$addr} eq "" 5855 || $short_name{$addr} eq "_")) 5856 { 5857 my $return; 5858 foreach my $follower ($self->children) { # All equivalents 5859 my $follower_name = $follower->short_name; 5860 next unless defined $follower_name; 5861 5862 # Anything (except undefined) is better than underscore or 5863 # empty 5864 if (! defined $return || $return eq "_") { 5865 $return = $follower_name; 5866 next; 5867 } 5868 5869 # If the new follower name isn't "_" and is shorter than the 5870 # current best one, prefer the new one. 5871 next if $follower_name eq "_"; 5872 next if length $follower_name > length $return; 5873 $return = $follower_name; 5874 } 5875 $short_name{$addr} = $return if defined $return; 5876 } 5877 5878 # If no suitable external name return undef 5879 if (! defined $short_name{$addr}) { 5880 $$nominal_length_ptr = undef if $nominal_length_ptr; 5881 return; 5882 } 5883 5884 # Don't allow a null short name. 5885 if ($short_name{$addr} eq "") { 5886 $short_name{$addr} = '_'; 5887 $nominal_short_name_length{$addr} = 1; 5888 } 5889 5890 trace $self, $short_name{$addr} if main::DEBUG && $to_trace; 5891 5892 if ($nominal_length_ptr) { 5893 $$nominal_length_ptr = $nominal_short_name_length{$addr}; 5894 } 5895 return $short_name{$addr}; 5896 } 5897 5898 sub external_name { 5899 # Returns the external name that this table should be known by. This 5900 # is usually the short_name, but not if the short_name is undefined, 5901 # in which case the external_name is arbitrarily set to the 5902 # underscore. 5903 5904 my $self = shift; 5905 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5906 5907 my $short = $self->short_name; 5908 return $short if defined $short; 5909 5910 return '_'; 5911 } 5912 5913 sub add_description { # Adds the parameter as a short description. 5914 5915 my $self = shift; 5916 my $description = shift; 5917 chomp $description; 5918 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5919 5920 no overloading; 5921 push @{$description{pack 'J', $self}}, $description; 5922 5923 return; 5924 } 5925 5926 sub add_note { # Adds the parameter as a short note. 5927 5928 my $self = shift; 5929 my $note = shift; 5930 chomp $note; 5931 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5932 5933 no overloading; 5934 push @{$note{pack 'J', $self}}, $note; 5935 5936 return; 5937 } 5938 5939 sub add_comment { # Adds the parameter as a comment. 5940 5941 return unless $debugging_build; 5942 5943 my $self = shift; 5944 my $comment = shift; 5945 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5946 5947 chomp $comment; 5948 5949 no overloading; 5950 push @{$comment{pack 'J', $self}}, $comment; 5951 5952 return; 5953 } 5954 5955 sub comment { 5956 # Return the current comment for this table. If called in list 5957 # context, returns the array of comments. In scalar, returns a string 5958 # of each element joined together with a period ending each. 5959 5960 my $self = shift; 5961 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5962 5963 my $addr = do { no overloading; pack 'J', $self; }; 5964 my @list = @{$comment{$addr}}; 5965 return @list if wantarray; 5966 my $return = ""; 5967 foreach my $sentence (@list) { 5968 $return .= '. ' if $return; 5969 $return .= $sentence; 5970 $return =~ s/\.$//; 5971 } 5972 $return .= '.' if $return; 5973 return $return; 5974 } 5975 5976 sub initialize { 5977 # Initialize the table with the argument which is any valid 5978 # initialization for range lists. 5979 5980 my $self = shift; 5981 my $addr = do { no overloading; pack 'J', $self; }; 5982 my $initialization = shift; 5983 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 5984 5985 # Replace the current range list with a new one of the same exact 5986 # type. 5987 my $class = ref $range_list{$addr}; 5988 $range_list{$addr} = $class->new(Owner => $self, 5989 Initialize => $initialization); 5990 return; 5991 5992 } 5993 5994 sub header { 5995 # The header that is output for the table in the file it is written 5996 # in. 5997 5998 my $self = shift; 5999 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 6000 6001 my $return = ""; 6002 $return .= $DEVELOPMENT_ONLY if $compare_versions; 6003 $return .= $HEADER; 6004 return $return; 6005 } 6006 6007 sub merge_single_annotation_line ($$$) { 6008 my ($output, $annotation, $annotation_column) = @_; 6009 6010 # This appends an annotation comment, $annotation, to $output, 6011 # starting in or after column $annotation_column, removing any 6012 # pre-existing comment from $output. 6013 6014 $annotation =~ s/^ \s* \# \ //x; 6015 $output =~ s/ \s* ( \# \N* )? \n //x; 6016 $output = Text::Tabs::expand($output); 6017 6018 my $spaces = $annotation_column - length $output; 6019 $spaces = 2 if $spaces < 0; # Have 2 blanks before the comment 6020 6021 $output = sprintf "%s%*s# %s", 6022 $output, 6023 $spaces, 6024 " ", 6025 $annotation; 6026 return Text::Tabs::unexpand $output; 6027 } 6028 6029 sub write { 6030 # Write a representation of the table to its file. It calls several 6031 # functions furnished by sub-classes of this abstract base class to 6032 # handle non-normal ranges, to add stuff before the table, and at its 6033 # end. If the table is to be written so that adjustments are 6034 # required, this does that conversion. 6035 6036 my $self = shift; 6037 my $use_adjustments = shift; # ? output in adjusted format or not 6038 my $suppress_value = shift; # Optional, if the value associated with 6039 # a range equals this one, don't write 6040 # the range 6041 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 6042 6043 my $addr = do { no overloading; pack 'J', $self; }; 6044 my $write_as_invlist = $write_as_invlist{$addr}; 6045 6046 # Start with the header 6047 my @HEADER = $self->header; 6048 6049 # Then the comments 6050 push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n" 6051 if $comment{$addr}; 6052 6053 # Things discovered processing the main body of the document may 6054 # affect what gets output before it, therefore pre_body() isn't called 6055 # until after all other processing of the table is done. 6056 6057 # The main body looks like a 'here' document. If there are comments, 6058 # get rid of them when processing it. 6059 my @OUT; 6060 if ($annotate || $output_range_counts) { 6061 # Use the line below in Perls that don't have /r 6062 #push @OUT, 'return join "\n", map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n"; 6063 push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n"; 6064 } else { 6065 push @OUT, "return <<'END';\n"; 6066 } 6067 6068 if ($range_list{$addr}->is_empty) { 6069 6070 # This is a kludge for empty tables to silence a warning in 6071 # utf8.c, which can't really deal with empty tables, but it can 6072 # deal with a table that matches nothing, as the inverse of 'All' 6073 # does. 6074 push @OUT, "!utf8::All\n"; 6075 } 6076 elsif ($self->name eq 'N' 6077 6078 # To save disk space and table cache space, avoid putting out 6079 # binary N tables, but instead create a file which just inverts 6080 # the Y table. Since the file will still exist and occupy a 6081 # certain number of blocks, might as well output the whole 6082 # thing if it all will fit in one block. The number of 6083 # ranges below is an approximate number for that. 6084 && ($self->property->type == $BINARY 6085 || $self->property->type == $FORCED_BINARY) 6086 # && $self->property->tables == 2 Can't do this because the 6087 # non-binary properties, like NFDQC aren't specifiable 6088 # by the notation 6089 && $range_list{$addr}->ranges > 15 6090 && ! $annotate) # Under --annotate, want to see everything 6091 { 6092 push @OUT, "!utf8::" . $self->property->name . "\n"; 6093 } 6094 else { 6095 my $range_size_1 = $range_size_1{$addr}; 6096 6097 # To make it more readable, use a minimum indentation 6098 my $comment_indent; 6099 6100 # These are used only in $annotate option 6101 my $format; # e.g. $HEX_ADJUST_FORMAT 6102 my $include_name; # ? Include the character's name in the 6103 # annotation? 6104 my $include_cp; # ? Include its code point 6105 6106 if (! $annotate) { 6107 $comment_indent = ($self->isa('Map_Table')) 6108 ? 24 6109 : ($write_as_invlist) 6110 ? 8 6111 : 16; 6112 } 6113 else { 6114 $format = $self->format; 6115 6116 # The name of the character is output only for tables that 6117 # don't already include the name in the output. 6118 my $property = $self->property; 6119 $include_name = 6120 ! ($property == $perl_charname 6121 || $property == main::property_ref('Unicode_1_Name') 6122 || $property == main::property_ref('Name') 6123 || $property == main::property_ref('Name_Alias') 6124 ); 6125 6126 # Don't include the code point in the annotation where all 6127 # lines are a single code point, so it can be easily found in 6128 # the first column 6129 $include_cp = ! $range_size_1; 6130 6131 if (! $self->isa('Map_Table')) { 6132 $comment_indent = ($write_as_invlist) ? 8 : 16; 6133 } 6134 else { 6135 $comment_indent = 16; 6136 6137 # There are just a few short ranges in this table, so no 6138 # need to include the code point in the annotation. 6139 $include_cp = 0 if $format eq $DECOMP_STRING_FORMAT; 6140 6141 # We're trying to get this to look good, as the whole 6142 # point is to make human-readable tables. It is easier to 6143 # read if almost all the annotation comments begin in the 6144 # same column. Map tables have varying width maps, so can 6145 # create a jagged comment appearance. This code does a 6146 # preliminary pass through these tables looking for the 6147 # maximum width map in each, and causing the comments to 6148 # begin just to the right of that. However, if the 6149 # comments begin too far to the right of most lines, it's 6150 # hard to line them up horizontally with their real data. 6151 # Therefore we ignore the longest outliers 6152 my $ignore_longest_X_percent = 2; # Discard longest X% 6153 6154 # Each key in this hash is a width of at least one of the 6155 # maps in the table. Its value is how many lines have 6156 # that width. 6157 my %widths; 6158 6159 # We won't space things further left than one tab stop 6160 # after the rest of the line; initializing it to that 6161 # number saves some work. 6162 my $max_map_width = 8; 6163 6164 # Fill in the %widths hash 6165 my $total = 0; 6166 for my $set ($range_list{$addr}->ranges) { 6167 my $value = $set->value; 6168 6169 # These range types don't appear in the main table 6170 next if $set->type == 0 6171 && defined $suppress_value 6172 && $value eq $suppress_value; 6173 next if $set->type == $MULTI_CP 6174 || $set->type == $NULL; 6175 6176 # Include 2 spaces before the beginning of the 6177 # comment 6178 my $this_width = length($value) + 2; 6179 6180 # Ranges of the remaining non-zero types usually 6181 # occupy just one line (maybe occasionally two, but 6182 # this doesn't have to be dead accurate). This is 6183 # because these ranges are like "unassigned code 6184 # points" 6185 my $count = ($set->type != 0) 6186 ? 1 6187 : $set->end - $set->start + 1; 6188 $widths{$this_width} += $count; 6189 $total += $count; 6190 $max_map_width = $this_width 6191 if $max_map_width < $this_width; 6192 } 6193 6194 # If the widest map gives us less than two tab stops 6195 # worth, just take it as-is. 6196 if ($max_map_width > 16) { 6197 6198 # Otherwise go through %widths until we have included 6199 # the desired percentage of lines in the whole table. 6200 my $running_total = 0; 6201 foreach my $width (sort { $a <=> $b } keys %widths) 6202 { 6203 $running_total += $widths{$width}; 6204 use integer; 6205 if ($running_total * 100 / $total 6206 >= 100 - $ignore_longest_X_percent) 6207 { 6208 $max_map_width = $width; 6209 last; 6210 } 6211 } 6212 } 6213 $comment_indent += $max_map_width; 6214 } 6215 } 6216 6217 # Values for previous time through the loop. Initialize to 6218 # something that won't be adjacent to the first iteration; 6219 # only $previous_end matters for that. 6220 my $previous_start; 6221 my $previous_end = -2; 6222 my $previous_value; 6223 6224 # Values for next time through the portion of the loop that splits 6225 # the range. 0 in $next_start means there is no remaining portion 6226 # to deal with. 6227 my $next_start = 0; 6228 my $next_end; 6229 my $next_value; 6230 my $offset = 0; 6231 my $invlist_count = 0; 6232 6233 my $output_value_in_hex = $self->isa('Map_Table') 6234 && ($self->format eq $HEX_ADJUST_FORMAT 6235 || $self->to_output_map == $EXTERNAL_MAP); 6236 # Use leading zeroes just for files whose format should not be 6237 # changed from what it has been. Otherwise, they just take up 6238 # space and time to process. 6239 my $hex_format = ($self->isa('Map_Table') 6240 && $self->to_output_map == $EXTERNAL_MAP) 6241 ? "%04X" 6242 : "%X"; 6243 6244 # The values for some of these tables are stored in mktables as 6245 # hex strings. Normally, these are just output as strings without 6246 # change, but when we are doing adjustments, we have to operate on 6247 # these numerically, so we convert those to decimal to do that, 6248 # and back to hex for output 6249 my $convert_map_to_from_hex = 0; 6250 my $output_map_in_hex = 0; 6251 if ($self->isa('Map_Table')) { 6252 $convert_map_to_from_hex 6253 = ($use_adjustments && $self->format eq $HEX_ADJUST_FORMAT) 6254 || ($annotate && $self->format eq $HEX_FORMAT); 6255 $output_map_in_hex = $convert_map_to_from_hex 6256 || $self->format eq $HEX_FORMAT; 6257 } 6258 6259 # To store any annotations about the characters. 6260 my @annotation; 6261 6262 # Output each range as part of the here document. 6263 RANGE: 6264 for my $set ($range_list{$addr}->ranges) { 6265 if ($set->type != 0) { 6266 $self->handle_special_range($set); 6267 next RANGE; 6268 } 6269 my $start = $set->start; 6270 my $end = $set->end; 6271 my $value = $set->value; 6272 6273 # Don't output ranges whose value is the one to suppress 6274 next RANGE if defined $suppress_value 6275 && $value eq $suppress_value; 6276 6277 $value = CORE::hex $value if $convert_map_to_from_hex; 6278 6279 6280 { # This bare block encloses the scope where we may need to 6281 # 'redo' to. Consider a table that is to be written out 6282 # using single item ranges. This is given in the 6283 # $range_size_1 boolean. To accomplish this, we split the 6284 # range each time through the loop into two portions, the 6285 # first item, and the rest. We handle that first item 6286 # this time in the loop, and 'redo' to repeat the process 6287 # for the rest of the range. 6288 # 6289 # We may also have to do it, with other special handling, 6290 # if the table has adjustments. Consider the table that 6291 # contains the lowercasing maps. mktables stores the 6292 # ASCII range ones as 26 ranges: 6293 # ord('A') => ord('a'), .. ord('Z') => ord('z') 6294 # For compactness, the table that gets written has this as 6295 # just one range 6296 # ( ord('A') .. ord('Z') ) => ord('a') 6297 # and the software that reads the tables is smart enough 6298 # to "connect the dots". This change is accomplished in 6299 # this loop by looking to see if the current iteration 6300 # fits the paradigm of the previous iteration, and if so, 6301 # we merge them by replacing the final output item with 6302 # the merged data. Repeated 25 times, this gets A-Z. But 6303 # we also have to make sure we don't screw up cases where 6304 # we have internally stored 6305 # ( 0x1C4 .. 0x1C6 ) => 0x1C5 6306 # This single internal range has to be output as 3 ranges, 6307 # which is done by splitting, like we do for $range_size_1 6308 # tables. (There are very few of such ranges that need to 6309 # be split, so the gain of doing the combining of other 6310 # ranges far outweighs the splitting of these.) The 6311 # values to use for the redo at the end of this block are 6312 # set up just below in the scalars whose names begin with 6313 # '$next_'. 6314 6315 if (($use_adjustments || $range_size_1) && $end != $start) 6316 { 6317 $next_start = $start + 1; 6318 $next_end = $end; 6319 $next_value = $value; 6320 $end = $start; 6321 } 6322 6323 if ($use_adjustments && ! $range_size_1) { 6324 6325 # If this range is adjacent to the previous one, and 6326 # the values in each are integers that are also 6327 # adjacent (differ by 1), then this range really 6328 # extends the previous one that is already in element 6329 # $OUT[-1]. So we pop that element, and pretend that 6330 # the range starts with whatever it started with. 6331 # $offset is incremented by 1 each time so that it 6332 # gives the current offset from the first element in 6333 # the accumulating range, and we keep in $value the 6334 # value of that first element. 6335 if ($start == $previous_end + 1 6336 && $value =~ /^ -? \d+ $/xa 6337 && $previous_value =~ /^ -? \d+ $/xa 6338 && ($value == ($previous_value + ++$offset))) 6339 { 6340 pop @OUT; 6341 $start = $previous_start; 6342 $value = $previous_value; 6343 } 6344 else { 6345 $offset = 0; 6346 if (@annotation == 1) { 6347 $OUT[-1] = merge_single_annotation_line( 6348 $OUT[-1], $annotation[0], $comment_indent); 6349 } 6350 else { 6351 push @OUT, @annotation; 6352 } 6353 } 6354 undef @annotation; 6355 6356 # Save the current values for the next time through 6357 # the loop. 6358 $previous_start = $start; 6359 $previous_end = $end; 6360 $previous_value = $value; 6361 } 6362 6363 if ($write_as_invlist) { 6364 if ( $previous_end > 0 6365 && $output_range_counts{$addr}) 6366 { 6367 my $complement_count = $start - $previous_end - 1; 6368 if ($complement_count > 1) { 6369 $OUT[-1] = merge_single_annotation_line( 6370 $OUT[-1], 6371 "#" 6372 . (" " x 17) 6373 . "[" 6374 . main::clarify_code_point_count( 6375 $complement_count) 6376 . "] in complement\n", 6377 $comment_indent); 6378 } 6379 } 6380 6381 # Inversion list format has a single number per line, 6382 # the starting code point of a range that matches the 6383 # property 6384 push @OUT, $start, "\n"; 6385 $invlist_count++; 6386 6387 # Add a comment with the size of the range, if 6388 # requested. 6389 if ($output_range_counts{$addr}) { 6390 $OUT[-1] = merge_single_annotation_line( 6391 $OUT[-1], 6392 "# [" 6393 . main::clarify_code_point_count($end - $start + 1) 6394 . "]\n", 6395 $comment_indent); 6396 } 6397 } 6398 elsif ($start != $end) { # If there is a range 6399 if ($end == $MAX_WORKING_CODEPOINT) { 6400 push @OUT, sprintf "$hex_format\t$hex_format", 6401 $start, 6402 $MAX_PLATFORM_CODEPOINT; 6403 } 6404 else { 6405 push @OUT, sprintf "$hex_format\t$hex_format", 6406 $start, $end; 6407 } 6408 if (length $value) { 6409 if ($convert_map_to_from_hex) { 6410 $OUT[-1] .= sprintf "\t$hex_format\n", $value; 6411 } 6412 else { 6413 $OUT[-1] .= "\t$value\n"; 6414 } 6415 } 6416 6417 # Add a comment with the size of the range, if 6418 # requested. 6419 if ($output_range_counts{$addr}) { 6420 $OUT[-1] = merge_single_annotation_line( 6421 $OUT[-1], 6422 "# [" 6423 . main::clarify_code_point_count($end - $start + 1) 6424 . "]\n", 6425 $comment_indent); 6426 } 6427 } 6428 else { # Here to output a single code point per line. 6429 6430 # Use any passed in subroutine to output. 6431 if (ref $range_size_1 eq 'CODE') { 6432 for my $i ($start .. $end) { 6433 push @OUT, &{$range_size_1}($i, $value); 6434 } 6435 } 6436 else { 6437 6438 # Here, caller is ok with default output. 6439 for (my $i = $start; $i <= $end; $i++) { 6440 if ($convert_map_to_from_hex) { 6441 push @OUT, 6442 sprintf "$hex_format\t\t$hex_format\n", 6443 $i, $value; 6444 } 6445 else { 6446 push @OUT, sprintf $hex_format, $i; 6447 $OUT[-1] .= "\t\t$value" if $value ne ""; 6448 $OUT[-1] .= "\n"; 6449 } 6450 } 6451 } 6452 } 6453 6454 if ($annotate) { 6455 for (my $i = $start; $i <= $end; $i++) { 6456 my $annotation = ""; 6457 6458 # Get character information if don't have it already 6459 main::populate_char_info($i) 6460 if ! defined $viacode[$i]; 6461 my $type = $annotate_char_type[$i]; 6462 6463 # Figure out if should output the next code points 6464 # as part of a range or not. If this is not in an 6465 # annotation range, then won't output as a range, 6466 # so returns $i. Otherwise use the end of the 6467 # annotation range, but no further than the 6468 # maximum possible end point of the loop. 6469 my $range_end = 6470 $range_size_1 6471 ? $start 6472 : main::min( 6473 $annotate_ranges->value_of($i) || $i, 6474 $end); 6475 6476 # Use a range if it is a range, and either is one 6477 # of the special annotation ranges, or the range 6478 # is at most 3 long. This last case causes the 6479 # algorithmically named code points to be output 6480 # individually in spans of at most 3, as they are 6481 # the ones whose $type is > 0. 6482 if ($range_end != $i 6483 && ( $type < 0 || $range_end - $i > 2)) 6484 { 6485 # Here is to output a range. We don't allow a 6486 # caller-specified output format--just use the 6487 # standard one. 6488 my $range_name = $viacode[$i]; 6489 6490 # For the code points which end in their hex 6491 # value, we eliminate that from the output 6492 # annotation, and capitalize only the first 6493 # letter of each word. 6494 if ($type == $CP_IN_NAME) { 6495 my $hex = sprintf $hex_format, $i; 6496 $range_name =~ s/-$hex$//; 6497 my @words = split " ", $range_name; 6498 for my $word (@words) { 6499 $word = 6500 ucfirst(lc($word)) if $word ne 'CJK'; 6501 } 6502 $range_name = join " ", @words; 6503 } 6504 elsif ($type == $HANGUL_SYLLABLE) { 6505 $range_name = "Hangul Syllable"; 6506 } 6507 6508 # If the annotation would just repeat what's 6509 # already being output as the range, skip it. 6510 # (When an inversion list is being written, it 6511 # isn't a repeat, as that always is in 6512 # decimal) 6513 if ( $write_as_invlist 6514 || $i != $start 6515 || $range_end < $end) 6516 { 6517 if ($range_end < $MAX_WORKING_CODEPOINT) 6518 { 6519 $annotation = sprintf "%04X..%04X", 6520 $i, $range_end; 6521 } 6522 else { 6523 $annotation = sprintf "%04X..INFINITY", 6524 $i; 6525 } 6526 } 6527 else { # Indent if not displaying code points 6528 $annotation = " " x 4; 6529 } 6530 6531 if ($range_name) { 6532 $annotation .= " $age[$i]" if $age[$i]; 6533 $annotation .= " $range_name"; 6534 } 6535 6536 # Include the number of code points in the 6537 # range 6538 my $count = 6539 main::clarify_code_point_count($range_end - $i + 1); 6540 $annotation .= " [$count]\n"; 6541 6542 # Skip to the end of the range 6543 $i = $range_end; 6544 } 6545 else { # Not in a range. 6546 my $comment = ""; 6547 6548 # When outputting the names of each character, 6549 # use the character itself if printable 6550 $comment .= "'" . main::display_chr($i) . "' " 6551 if $printable[$i]; 6552 6553 my $output_value = $value; 6554 6555 # Determine the annotation 6556 if ($format eq $DECOMP_STRING_FORMAT) { 6557 6558 # This is very specialized, with the type 6559 # of decomposition beginning the line 6560 # enclosed in <...>, and the code points 6561 # that the code point decomposes to 6562 # separated by blanks. Create two 6563 # strings, one of the printable 6564 # characters, and one of their official 6565 # names. 6566 (my $map = $output_value) 6567 =~ s/ \ * < .*? > \ +//x; 6568 my $tostr = ""; 6569 my $to_name = ""; 6570 my $to_chr = ""; 6571 foreach my $to (split " ", $map) { 6572 $to = CORE::hex $to; 6573 $to_name .= " + " if $to_name; 6574 $to_chr .= main::display_chr($to); 6575 main::populate_char_info($to) 6576 if ! defined $viacode[$to]; 6577 $to_name .= $viacode[$to]; 6578 } 6579 6580 $comment .= 6581 "=> '$to_chr'; $viacode[$i] => $to_name"; 6582 } 6583 else { 6584 $output_value += $i - $start 6585 if $use_adjustments 6586 # Don't try to adjust a 6587 # non-integer 6588 && $output_value !~ /[-\D]/; 6589 6590 if ($output_map_in_hex) { 6591 main::populate_char_info($output_value) 6592 if ! defined $viacode[$output_value]; 6593 $comment .= " => '" 6594 . main::display_chr($output_value) 6595 . "'; " if $printable[$output_value]; 6596 } 6597 if ($include_name && $viacode[$i]) { 6598 $comment .= " " if $comment; 6599 $comment .= $viacode[$i]; 6600 } 6601 if ($output_map_in_hex) { 6602 $comment .= 6603 " => $viacode[$output_value]" 6604 if $viacode[$output_value]; 6605 $output_value = sprintf($hex_format, 6606 $output_value); 6607 } 6608 } 6609 6610 if ($include_cp) { 6611 $annotation = sprintf "%04X %s", $i, $age[$i]; 6612 if ($use_adjustments) { 6613 $annotation .= " => $output_value"; 6614 } 6615 } 6616 6617 if ($comment ne "") { 6618 $annotation .= " " if $annotation ne ""; 6619 $annotation .= $comment; 6620 } 6621 $annotation .= "\n" if $annotation ne ""; 6622 } 6623 6624 if ($annotation ne "") { 6625 push @annotation, (" " x $comment_indent) 6626 . "# $annotation"; 6627 } 6628 } 6629 6630 # If not adjusting, we don't have to go through the 6631 # loop again to know that the annotation comes next 6632 # in the output. 6633 if (! $use_adjustments) { 6634 if (@annotation == 1) { 6635 $OUT[-1] = merge_single_annotation_line( 6636 $OUT[-1], $annotation[0], $comment_indent); 6637 } 6638 else { 6639 push @OUT, map { Text::Tabs::unexpand $_ } 6640 @annotation; 6641 } 6642 undef @annotation; 6643 } 6644 } 6645 6646 # Add the beginning of the range that doesn't match the 6647 # property, except if the just added match range extends 6648 # to infinity. We do this after any annotations for the 6649 # match range. 6650 if ($write_as_invlist && $end < $MAX_WORKING_CODEPOINT) { 6651 push @OUT, $end + 1, "\n"; 6652 $invlist_count++; 6653 } 6654 6655 # If we split the range, set up so the next time through 6656 # we get the remainder, and redo. 6657 if ($next_start) { 6658 $start = $next_start; 6659 $end = $next_end; 6660 $value = $next_value; 6661 $next_start = 0; 6662 redo; 6663 } 6664 } 6665 } # End of loop through all the table's ranges 6666 6667 push @OUT, @annotation; # Add orphaned annotation, if any 6668 6669 splice @OUT, 1, 0, "V$invlist_count\n" if $invlist_count; 6670 } 6671 6672 # Add anything that goes after the main body, but within the here 6673 # document, 6674 my $append_to_body = $self->append_to_body; 6675 push @OUT, $append_to_body if $append_to_body; 6676 6677 # And finish the here document. 6678 push @OUT, "END\n"; 6679 6680 # Done with the main portion of the body. Can now figure out what 6681 # should appear before it in the file. 6682 my $pre_body = $self->pre_body; 6683 push @HEADER, $pre_body, "\n" if $pre_body; 6684 6685 # All these files should have a .pl suffix added to them. 6686 my @file_with_pl = @{$file_path{$addr}}; 6687 $file_with_pl[-1] .= '.pl'; 6688 6689 main::write(\@file_with_pl, 6690 $annotate, # utf8 iff annotating 6691 \@HEADER, 6692 \@OUT); 6693 return; 6694 } 6695 6696 sub set_status { # Set the table's status 6697 my $self = shift; 6698 my $status = shift; # The status enum value 6699 my $info = shift; # Any message associated with it. 6700 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 6701 6702 my $addr = do { no overloading; pack 'J', $self; }; 6703 6704 $status{$addr} = $status; 6705 $status_info{$addr} = $info; 6706 return; 6707 } 6708 6709 sub set_fate { # Set the fate of a table 6710 my $self = shift; 6711 my $fate = shift; 6712 my $reason = shift; 6713 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 6714 6715 my $addr = do { no overloading; pack 'J', $self; }; 6716 6717 return if $fate{$addr} == $fate; # If no-op 6718 6719 # Can only change the ordinary fate, except if going to $MAP_PROXIED 6720 return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED; 6721 6722 $fate{$addr} = $fate; 6723 6724 # Don't document anything to do with a non-normal fated table 6725 if ($fate != $ORDINARY) { 6726 my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0; 6727 foreach my $alias ($self->aliases) { 6728 $alias->set_ucd($put_in_pod); 6729 6730 # MAP_PROXIED doesn't affect the match tables 6731 next if $fate == $MAP_PROXIED; 6732 $alias->set_make_re_pod_entry($put_in_pod); 6733 } 6734 } 6735 6736 # Save the reason for suppression for output 6737 if ($fate >= $SUPPRESSED) { 6738 $reason = "" unless defined $reason; 6739 $why_suppressed{$complete_name{$addr}} = $reason; 6740 } 6741 6742 return; 6743 } 6744 6745 sub lock { 6746 # Don't allow changes to the table from now on. This stores a stack 6747 # trace of where it was called, so that later attempts to modify it 6748 # can immediately show where it got locked. 6749 6750 my $self = shift; 6751 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 6752 6753 my $addr = do { no overloading; pack 'J', $self; }; 6754 6755 $locked{$addr} = ""; 6756 6757 my $line = (caller(0))[2]; 6758 my $i = 1; 6759 6760 # Accumulate the stack trace 6761 while (1) { 6762 my ($pkg, $file, $caller_line, $caller) = caller $i++; 6763 6764 last unless defined $caller; 6765 6766 $locked{$addr} .= " called from $caller() at line $line\n"; 6767 $line = $caller_line; 6768 } 6769 $locked{$addr} .= " called from main at line $line\n"; 6770 6771 return; 6772 } 6773 6774 sub carp_if_locked { 6775 # Return whether a table is locked or not, and, by the way, complain 6776 # if is locked 6777 6778 my $self = shift; 6779 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 6780 6781 my $addr = do { no overloading; pack 'J', $self; }; 6782 6783 return 0 if ! $locked{$addr}; 6784 Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n"); 6785 return 1; 6786 } 6787 6788 sub set_file_path { # Set the final directory path for this table 6789 my $self = shift; 6790 # Rest of parameters passed on 6791 6792 no overloading; 6793 @{$file_path{pack 'J', $self}} = @_; 6794 return 6795 } 6796 6797 # Accessors for the range list stored in this table. First for 6798 # unconditional 6799 for my $sub (qw( 6800 containing_range 6801 contains 6802 count 6803 each_range 6804 hash 6805 is_empty 6806 matches_identically_to 6807 max 6808 min 6809 range_count 6810 reset_each_range 6811 type_of 6812 value_of 6813 )) 6814 { 6815 no strict "refs"; 6816 *$sub = sub { 6817 use strict "refs"; 6818 my $self = shift; 6819 return $self->_range_list->$sub(@_); 6820 } 6821 } 6822 6823 # Then for ones that should fail if locked 6824 for my $sub (qw( 6825 delete_range 6826 )) 6827 { 6828 no strict "refs"; 6829 *$sub = sub { 6830 use strict "refs"; 6831 my $self = shift; 6832 6833 return if $self->carp_if_locked; 6834 no overloading; 6835 return $self->_range_list->$sub(@_); 6836 } 6837 } 6838 6839} # End closure 6840 6841package Map_Table; 6842use parent '-norequire', '_Base_Table'; 6843 6844# A Map Table is a table that contains the mappings from code points to 6845# values. There are two weird cases: 6846# 1) Anomalous entries are ones that aren't maps of ranges of code points, but 6847# are written in the table's file at the end of the table nonetheless. It 6848# requires specially constructed code to handle these; utf8.c can not read 6849# these in, so they should not go in $map_directory. As of this writing, 6850# the only case that these happen is for named sequences used in 6851# charnames.pm. But this code doesn't enforce any syntax on these, so 6852# something else could come along that uses it. 6853# 2) Specials are anything that doesn't fit syntactically into the body of the 6854# table. The ranges for these have a map type of non-zero. The code below 6855# knows about and handles each possible type. In most cases, these are 6856# written as part of the header. 6857# 6858# A map table deliberately can't be manipulated at will unlike match tables. 6859# This is because of the ambiguities having to do with what to do with 6860# overlapping code points. And there just isn't a need for those things; 6861# what one wants to do is just query, add, replace, or delete mappings, plus 6862# write the final result. 6863# However, there is a method to get the list of possible ranges that aren't in 6864# this table to use for defaulting missing code point mappings. And, 6865# map_add_or_replace_non_nulls() does allow one to add another table to this 6866# one, but it is clearly very specialized, and defined that the other's 6867# non-null values replace this one's if there is any overlap. 6868 6869sub trace { return main::trace(@_); } 6870 6871{ # Closure 6872 6873 main::setup_package(); 6874 6875 my %default_map; 6876 # Many input files omit some entries; this gives what the mapping for the 6877 # missing entries should be 6878 main::set_access('default_map', \%default_map, 'r'); 6879 6880 my %anomalous_entries; 6881 # Things that go in the body of the table which don't fit the normal 6882 # scheme of things, like having a range. Not much can be done with these 6883 # once there except to output them. This was created to handle named 6884 # sequences. 6885 main::set_access('anomalous_entry', \%anomalous_entries, 'a'); 6886 main::set_access('anomalous_entries', # Append singular, read plural 6887 \%anomalous_entries, 6888 'readable_array'); 6889 6890 my %replacement_property; 6891 # Certain files are unused by Perl itself, and are kept only for backwards 6892 # compatibility for programs that used them before Unicode::UCD existed. 6893 # These are termed legacy properties. At some point they may be removed, 6894 # but for now mark them as legacy. If non empty, this is the name of the 6895 # property to use instead (i.e., the modern equivalent). 6896 main::set_access('replacement_property', \%replacement_property, 'r'); 6897 6898 my %to_output_map; 6899 # Enum as to whether or not to write out this map table, and how: 6900 # 0 don't output 6901 # $EXTERNAL_MAP means its existence is noted in the documentation, and 6902 # it should not be removed nor its format changed. This 6903 # is done for those files that have traditionally been 6904 # output. Maps of legacy-only properties default to 6905 # this. 6906 # $INTERNAL_MAP means Perl reserves the right to do anything it wants 6907 # with this file 6908 # $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of 6909 # outputting the actual mappings as-is, we adjust things 6910 # to create a much more compact table. Only those few 6911 # tables where the mapping is convertible at least to an 6912 # integer and compacting makes a big difference should 6913 # have this. Hence, the default is to not do this 6914 # unless the table's default mapping is to $CODE_POINT, 6915 # and the range size is not 1. 6916 main::set_access('to_output_map', \%to_output_map, 's'); 6917 6918 sub new { 6919 my $class = shift; 6920 my $name = shift; 6921 6922 my %args = @_; 6923 6924 # Optional initialization data for the table. 6925 my $initialize = delete $args{'Initialize'}; 6926 6927 my $default_map = delete $args{'Default_Map'}; 6928 my $property = delete $args{'_Property'}; 6929 my $full_name = delete $args{'Full_Name'}; 6930 my $replacement_property = delete $args{'Replacement_Property'} // ""; 6931 my $to_output_map = delete $args{'To_Output_Map'}; 6932 6933 # Rest of parameters passed on; legacy properties have several common 6934 # other attributes 6935 if ($replacement_property) { 6936 $args{"Fate"} = $LEGACY_ONLY; 6937 $args{"Range_Size_1"} = 1; 6938 $args{"Perl_Extension"} = 1; 6939 $args{"UCD"} = 0; 6940 } 6941 6942 my $range_list = Range_Map->new(Owner => $property); 6943 6944 my $self = $class->SUPER::new( 6945 Name => $name, 6946 Complete_Name => $full_name, 6947 Full_Name => $full_name, 6948 _Property => $property, 6949 _Range_List => $range_list, 6950 Write_As_Invlist => 0, 6951 %args); 6952 6953 my $addr = do { no overloading; pack 'J', $self; }; 6954 6955 $anomalous_entries{$addr} = []; 6956 $default_map{$addr} = $default_map; 6957 $replacement_property{$addr} = $replacement_property; 6958 $to_output_map = $EXTERNAL_MAP if ! defined $to_output_map 6959 && $replacement_property; 6960 $to_output_map{$addr} = $to_output_map; 6961 6962 $self->initialize($initialize) if defined $initialize; 6963 6964 return $self; 6965 } 6966 6967 use overload 6968 fallback => 0, 6969 qw("") => "_operator_stringify", 6970 ; 6971 6972 sub _operator_stringify { 6973 my $self = shift; 6974 6975 my $name = $self->property->full_name; 6976 $name = '""' if $name eq ""; 6977 return "Map table for Property '$name'"; 6978 } 6979 6980 sub add_alias { 6981 # Add a synonym for this table (which means the property itself) 6982 my $self = shift; 6983 my $name = shift; 6984 # Rest of parameters passed on. 6985 6986 $self->SUPER::add_alias($name, $self->property, @_); 6987 return; 6988 } 6989 6990 sub add_map { 6991 # Add a range of code points to the list of specially-handled code 6992 # points. $MULTI_CP is assumed if the type of special is not passed 6993 # in. 6994 6995 my $self = shift; 6996 my $lower = shift; 6997 my $upper = shift; 6998 my $string = shift; 6999 my %args = @_; 7000 7001 my $type = delete $args{'Type'} || 0; 7002 # Rest of parameters passed on 7003 7004 # Can't change the table if locked. 7005 return if $self->carp_if_locked; 7006 7007 my $addr = do { no overloading; pack 'J', $self; }; 7008 7009 $self->_range_list->add_map($lower, $upper, 7010 $string, 7011 @_, 7012 Type => $type); 7013 return; 7014 } 7015 7016 sub append_to_body { 7017 # Adds to the written HERE document of the table's body any anomalous 7018 # entries in the table.. 7019 7020 my $self = shift; 7021 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7022 7023 my $addr = do { no overloading; pack 'J', $self; }; 7024 7025 return "" unless @{$anomalous_entries{$addr}}; 7026 return join("\n", @{$anomalous_entries{$addr}}) . "\n"; 7027 } 7028 7029 sub map_add_or_replace_non_nulls { 7030 # This adds the mappings in the table $other to $self. Non-null 7031 # mappings from $other override those in $self. It essentially merges 7032 # the two tables, with the second having priority except for null 7033 # mappings. 7034 7035 my $self = shift; 7036 my $other = shift; 7037 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7038 7039 return if $self->carp_if_locked; 7040 7041 if (! $other->isa(__PACKAGE__)) { 7042 Carp::my_carp_bug("$other should be a " 7043 . __PACKAGE__ 7044 . ". Not a '" 7045 . ref($other) 7046 . "'. Not added;"); 7047 return; 7048 } 7049 7050 my $addr = do { no overloading; pack 'J', $self; }; 7051 my $other_addr = do { no overloading; pack 'J', $other; }; 7052 7053 local $to_trace = 0 if main::DEBUG; 7054 7055 my $self_range_list = $self->_range_list; 7056 my $other_range_list = $other->_range_list; 7057 foreach my $range ($other_range_list->ranges) { 7058 my $value = $range->value; 7059 next if $value eq ""; 7060 $self_range_list->_add_delete('+', 7061 $range->start, 7062 $range->end, 7063 $value, 7064 Type => $range->type, 7065 Replace => $UNCONDITIONALLY); 7066 } 7067 7068 return; 7069 } 7070 7071 sub set_default_map { 7072 # Define what code points that are missing from the input files should 7073 # map to. The optional second parameter 'full_name' indicates to 7074 # force using the full name of the map instead of its standard name. 7075 7076 my $self = shift; 7077 my $map = shift; 7078 my $use_full_name = shift // 0; 7079 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7080 7081 if ($use_full_name && $use_full_name ne 'full_name') { 7082 Carp::my_carp_bug("Second parameter to set_default_map() if" 7083 . " present, must be 'full_name'"); 7084 } 7085 7086 my $addr = do { no overloading; pack 'J', $self; }; 7087 7088 # Convert the input to the standard equivalent, if any (won't have any 7089 # for $STRING properties) 7090 my $standard = $self->property->table($map); 7091 if (defined $standard) { 7092 $map = ($use_full_name) 7093 ? $standard->full_name 7094 : $standard->name; 7095 } 7096 7097 # Warn if there already is a non-equivalent default map for this 7098 # property. Note that a default map can be a ref, which means that 7099 # what it actually means is delayed until later in the program, and it 7100 # IS permissible to override it here without a message. 7101 my $default_map = $default_map{$addr}; 7102 if (defined $default_map 7103 && ! ref($default_map) 7104 && $default_map ne $map 7105 && main::Standardize($map) ne $default_map) 7106 { 7107 my $property = $self->property; 7108 my $map_table = $property->table($map); 7109 my $default_table = $property->table($default_map); 7110 if (defined $map_table 7111 && defined $default_table 7112 && $map_table != $default_table) 7113 { 7114 Carp::my_carp("Changing the default mapping for " 7115 . $property 7116 . " from $default_map to $map'"); 7117 } 7118 } 7119 7120 $default_map{$addr} = $map; 7121 7122 # Don't also create any missing table for this map at this point, 7123 # because if we did, it could get done before the main table add is 7124 # done for PropValueAliases.txt; instead the caller will have to make 7125 # sure it exists, if desired. 7126 return; 7127 } 7128 7129 sub to_output_map { 7130 # Returns boolean: should we write this map table? 7131 7132 my $self = shift; 7133 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7134 7135 my $addr = do { no overloading; pack 'J', $self; }; 7136 7137 # If overridden, use that 7138 return $to_output_map{$addr} if defined $to_output_map{$addr}; 7139 7140 my $full_name = $self->full_name; 7141 return $global_to_output_map{$full_name} 7142 if defined $global_to_output_map{$full_name}; 7143 7144 # If table says to output, do so; if says to suppress it, do so. 7145 my $fate = $self->fate; 7146 return $INTERNAL_MAP if $fate == $INTERNAL_ONLY; 7147 return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties; 7148 return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED; 7149 7150 my $type = $self->property->type; 7151 7152 # Don't want to output binary map tables even for debugging. 7153 return 0 if $type == $BINARY; 7154 7155 # But do want to output string ones. All the ones that remain to 7156 # be dealt with (i.e. which haven't explicitly been set to external) 7157 # are for internal Perl use only. The default for those that map to 7158 # $CODE_POINT and haven't been restricted to a single element range 7159 # is to use the adjusted form. 7160 if ($type == $STRING) { 7161 return $INTERNAL_MAP if $self->range_size_1 7162 || $default_map{$addr} ne $CODE_POINT; 7163 return $OUTPUT_ADJUSTED; 7164 } 7165 7166 # Otherwise is an $ENUM, do output it, for Perl's purposes 7167 return $INTERNAL_MAP; 7168 } 7169 7170 sub inverse_list { 7171 # Returns a Range_List that is gaps of the current table. That is, 7172 # the inversion 7173 7174 my $self = shift; 7175 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7176 7177 my $current = Range_List->new(Initialize => $self->_range_list, 7178 Owner => $self->property); 7179 return ~ $current; 7180 } 7181 7182 sub header { 7183 my $self = shift; 7184 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7185 7186 my $return = $self->SUPER::header(); 7187 7188 if ($self->to_output_map >= $INTERNAL_MAP) { 7189 $return .= $INTERNAL_ONLY_HEADER; 7190 } 7191 else { 7192 my $property_name = $self->property->replacement_property; 7193 7194 # The legacy-only properties were gotten above; but there are some 7195 # other properties whose files are in current use that have fixed 7196 # formats. 7197 $property_name = $self->property->full_name unless $property_name; 7198 7199 $return .= <<END; 7200 7201# !!!!!!! IT IS DEPRECATED TO USE THIS FILE !!!!!!! 7202 7203# This file is for internal use by core Perl only. It is retained for 7204# backwards compatibility with applications that may have come to rely on it, 7205# but its format and even its name or existence are subject to change without 7206# notice in a future Perl version. Don't use it directly. Instead, its 7207# contents are now retrievable through a stable API in the Unicode::UCD 7208# module: Unicode::UCD::prop_invmap('$property_name') (Values for individual 7209# code points can be retrieved via Unicode::UCD::charprop()); 7210END 7211 } 7212 return $return; 7213 } 7214 7215 sub set_final_comment { 7216 # Just before output, create the comment that heads the file 7217 # containing this table. 7218 7219 return unless $debugging_build; 7220 7221 my $self = shift; 7222 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7223 7224 # No sense generating a comment if aren't going to write it out. 7225 return if ! $self->to_output_map; 7226 7227 my $addr = do { no overloading; pack 'J', $self; }; 7228 7229 my $property = $self->property; 7230 7231 # Get all the possible names for this property. Don't use any that 7232 # aren't ok for use in a file name, etc. This is perhaps causing that 7233 # flag to do double duty, and may have to be changed in the future to 7234 # have our own flag for just this purpose; but it works now to exclude 7235 # Perl generated synonyms from the lists for properties, where the 7236 # name is always the proper Unicode one. 7237 my @property_aliases = grep { $_->ok_as_filename } $self->aliases; 7238 7239 my $count = $self->count; 7240 my $default_map = $default_map{$addr}; 7241 7242 # The ranges that map to the default aren't output, so subtract that 7243 # to get those actually output. A property with matching tables 7244 # already has the information calculated. 7245 if ($property->type != $STRING && $property->type != $FORCED_BINARY) { 7246 $count -= $property->table($default_map)->count; 7247 } 7248 elsif (defined $default_map) { 7249 7250 # But for $STRING properties, must calculate now. Subtract the 7251 # count from each range that maps to the default. 7252 foreach my $range ($self->_range_list->ranges) { 7253 if ($range->value eq $default_map) { 7254 $count -= $range->end +1 - $range->start; 7255 } 7256 } 7257 7258 } 7259 7260 # Get a string version of $count with underscores in large numbers, 7261 # for clarity. 7262 my $string_count = main::clarify_code_point_count($count); 7263 7264 my $code_points = ($count == 1) 7265 ? 'single code point' 7266 : "$string_count code points"; 7267 7268 my $mapping; 7269 my $these_mappings; 7270 my $are; 7271 if (@property_aliases <= 1) { 7272 $mapping = 'mapping'; 7273 $these_mappings = 'this mapping'; 7274 $are = 'is' 7275 } 7276 else { 7277 $mapping = 'synonymous mappings'; 7278 $these_mappings = 'these mappings'; 7279 $are = 'are' 7280 } 7281 my $cp; 7282 if ($count >= $MAX_UNICODE_CODEPOINTS) { 7283 $cp = "any code point in Unicode Version $string_version"; 7284 } 7285 else { 7286 my $map_to; 7287 if ($default_map eq "") { 7288 $map_to = 'the null string'; 7289 } 7290 elsif ($default_map eq $CODE_POINT) { 7291 $map_to = "itself"; 7292 } 7293 else { 7294 $map_to = "'$default_map'"; 7295 } 7296 if ($count == 1) { 7297 $cp = "the single code point"; 7298 } 7299 else { 7300 $cp = "one of the $code_points"; 7301 } 7302 $cp .= " in Unicode Version $unicode_version for which the mapping is not to $map_to"; 7303 } 7304 7305 my $comment = ""; 7306 7307 my $status = $self->status; 7308 if ($status ne $NORMAL) { 7309 my $warn = uc $status_past_participles{$status}; 7310 $comment .= <<END; 7311 7312!!!!!!! $warn !!!!!!!!!!!!!!!!!!! 7313 All property or property=value combinations contained in this file are $warn. 7314 See $unicode_reference_url for what this means. 7315 7316END 7317 } 7318 $comment .= "This file returns the $mapping:\n"; 7319 7320 my $ucd_accessible_name = ""; 7321 my $has_underscore_name = 0; 7322 my $full_name = $self->property->full_name; 7323 for my $i (0 .. @property_aliases - 1) { 7324 my $name = $property_aliases[$i]->name; 7325 $has_underscore_name = 1 if $name =~ /^_/; 7326 $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)'); 7327 if ($property_aliases[$i]->ucd) { 7328 if ($name eq $full_name) { 7329 $ucd_accessible_name = $full_name; 7330 } 7331 elsif (! $ucd_accessible_name) { 7332 $ucd_accessible_name = $name; 7333 } 7334 } 7335 } 7336 $comment .= "\nwhere 'cp' is $cp."; 7337 if ($ucd_accessible_name) { 7338 $comment .= " Note that $these_mappings"; 7339 if ($has_underscore_name) { 7340 $comment .= " (except for the one(s) that begin with an underscore)"; 7341 } 7342 $comment .= " $are accessible via the functions prop_invmap('$full_name') or charprop() in Unicode::UCD"; 7343 7344 } 7345 7346 # And append any commentary already set from the actual property. 7347 $comment .= "\n\n" . $self->comment if $self->comment; 7348 if ($self->description) { 7349 $comment .= "\n\n" . join " ", $self->description; 7350 } 7351 if ($self->note) { 7352 $comment .= "\n\n" . join " ", $self->note; 7353 } 7354 $comment .= "\n"; 7355 7356 if (! $self->perl_extension) { 7357 $comment .= <<END; 7358 7359For information about what this property really means, see: 7360$unicode_reference_url 7361END 7362 } 7363 7364 if ($count) { # Format differs for empty table 7365 $comment.= "\nThe format of the "; 7366 if ($self->range_size_1) { 7367 $comment.= <<END; 7368main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT 7369is in hex; MAPPING is what CODE_POINT maps to. 7370END 7371 } 7372 else { 7373 7374 # There are tables which end up only having one element per 7375 # range, but it is not worth keeping track of for making just 7376 # this comment a little better. 7377 $comment .= <<END; 7378non-comment portions of the main body of lines of this file is: 7379START\\tSTOP\\tMAPPING where START is the starting code point of the 7380range, in hex; STOP is the ending point, or if omitted, the range has just one 7381code point; MAPPING is what each code point between START and STOP maps to. 7382END 7383 if ($self->output_range_counts) { 7384 $comment .= <<END; 7385Numbers in comments in [brackets] indicate how many code points are in the 7386range (omitted when the range is a single code point or if the mapping is to 7387the null string). 7388END 7389 } 7390 } 7391 } 7392 $self->set_comment(main::join_lines($comment)); 7393 return; 7394 } 7395 7396 my %swash_keys; # Makes sure don't duplicate swash names. 7397 7398 # The remaining variables are temporaries used while writing each table, 7399 # to output special ranges. 7400 my @multi_code_point_maps; # Map is to more than one code point. 7401 7402 sub handle_special_range { 7403 # Called in the middle of write when it finds a range it doesn't know 7404 # how to handle. 7405 7406 my $self = shift; 7407 my $range = shift; 7408 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7409 7410 my $addr = do { no overloading; pack 'J', $self; }; 7411 7412 my $type = $range->type; 7413 7414 my $low = $range->start; 7415 my $high = $range->end; 7416 my $map = $range->value; 7417 7418 # No need to output the range if it maps to the default. 7419 return if $map eq $default_map{$addr}; 7420 7421 my $property = $self->property; 7422 7423 # Switch based on the map type... 7424 if ($type == $HANGUL_SYLLABLE) { 7425 7426 # These are entirely algorithmically determinable based on 7427 # some constants furnished by Unicode; for now, just set a 7428 # flag to indicate that have them. After everything is figured 7429 # out, we will output the code that does the algorithm. (Don't 7430 # output them if not needed because we are suppressing this 7431 # property.) 7432 $has_hangul_syllables = 1 if $property->to_output_map; 7433 } 7434 elsif ($type == $CP_IN_NAME) { 7435 7436 # Code points whose name ends in their code point are also 7437 # algorithmically determinable, but need information about the map 7438 # to do so. Both the map and its inverse are stored in data 7439 # structures output in the file. They are stored in the mean time 7440 # in global lists The lists will be written out later into Name.pm, 7441 # which is created only if needed. In order to prevent duplicates 7442 # in the list, only add to them for one property, should multiple 7443 # ones need them. 7444 if ($needing_code_points_ending_in_code_point == 0) { 7445 $needing_code_points_ending_in_code_point = $property; 7446 } 7447 if ($property == $needing_code_points_ending_in_code_point) { 7448 push @{$names_ending_in_code_point{$map}->{'low'}}, $low; 7449 push @{$names_ending_in_code_point{$map}->{'high'}}, $high; 7450 7451 my $squeezed = $map =~ s/[-\s]+//gr; 7452 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}}, 7453 $low; 7454 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}}, 7455 $high; 7456 7457 push @code_points_ending_in_code_point, { low => $low, 7458 high => $high, 7459 name => $map 7460 }; 7461 } 7462 } 7463 elsif ($range->type == $MULTI_CP || $range->type == $NULL) { 7464 7465 # Multi-code point maps and null string maps have an entry 7466 # for each code point in the range. They use the same 7467 # output format. 7468 for my $code_point ($low .. $high) { 7469 7470 # The pack() below can't cope with surrogates. XXX This may 7471 # no longer be true 7472 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) { 7473 Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self. No map created"); 7474 next; 7475 } 7476 7477 # Generate the hash entries for these in the form that 7478 # utf8.c understands. 7479 my $tostr = ""; 7480 my $to_name = ""; 7481 my $to_chr = ""; 7482 foreach my $to (split " ", $map) { 7483 if ($to !~ /^$code_point_re$/) { 7484 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created"); 7485 next; 7486 } 7487 $tostr .= sprintf "\\x{%s}", $to; 7488 $to = CORE::hex $to; 7489 if ($annotate) { 7490 $to_name .= " + " if $to_name; 7491 $to_chr .= main::display_chr($to); 7492 main::populate_char_info($to) 7493 if ! defined $viacode[$to]; 7494 $to_name .= $viacode[$to]; 7495 } 7496 } 7497 7498 # The unpack yields a list of the bytes that comprise the 7499 # UTF-8 of $code_point, which are each placed in \xZZ format 7500 # and output in the %s to map to $tostr, so the result looks 7501 # like: 7502 # "\xC4\xB0" => "\x{0069}\x{0307}", 7503 my $utf8 = sprintf(qq["%s" => "$tostr",], 7504 join("", map { sprintf "\\x%02X", $_ } 7505 unpack("U0C*", chr $code_point))); 7506 7507 # Add a comment so that a human reader can more easily 7508 # see what's going on. 7509 push @multi_code_point_maps, 7510 sprintf("%-45s # U+%04X", $utf8, $code_point); 7511 if (! $annotate) { 7512 $multi_code_point_maps[-1] .= " => $map"; 7513 } 7514 else { 7515 main::populate_char_info($code_point) 7516 if ! defined $viacode[$code_point]; 7517 $multi_code_point_maps[-1] .= " '" 7518 . main::display_chr($code_point) 7519 . "' => '$to_chr'; $viacode[$code_point] => $to_name"; 7520 } 7521 } 7522 } 7523 else { 7524 Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Not written"); 7525 } 7526 7527 return; 7528 } 7529 7530 sub pre_body { 7531 # Returns the string that should be output in the file before the main 7532 # body of this table. It isn't called until the main body is 7533 # calculated, saving a pass. The string includes some hash entries 7534 # identifying the format of the body, and what the single value should 7535 # be for all ranges missing from it. It also includes any code points 7536 # which have map_types that don't go in the main table. 7537 7538 my $self = shift; 7539 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7540 7541 my $addr = do { no overloading; pack 'J', $self; }; 7542 7543 my $name = $self->property->swash_name; 7544 7545 # Currently there is nothing in the pre_body unless a swash is being 7546 # generated. 7547 return unless defined $name; 7548 7549 if (defined $swash_keys{$name}) { 7550 Carp::my_carp(main::join_lines(<<END 7551Already created a swash name '$name' for $swash_keys{$name}. This means that 7552the same name desired for $self shouldn't be used. Bad News. This must be 7553fixed before production use, but proceeding anyway 7554END 7555 )); 7556 } 7557 $swash_keys{$name} = "$self"; 7558 7559 my $pre_body = ""; 7560 7561 # Here we assume we were called after have gone through the whole 7562 # file. If we actually generated anything for each map type, add its 7563 # respective header and trailer 7564 my $specials_name = ""; 7565 if (@multi_code_point_maps) { 7566 $specials_name = "utf8::ToSpec$name"; 7567 $pre_body .= <<END; 7568 7569# Some code points require special handling because their mappings are each to 7570# multiple code points. These do not appear in the main body, but are defined 7571# in the hash below. 7572 7573# Each key is the string of N bytes that together make up the UTF-8 encoding 7574# for the code point. (i.e. the same as looking at the code point's UTF-8 7575# under "use bytes"). Each value is the UTF-8 of the translation, for speed. 7576\%$specials_name = ( 7577END 7578 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n"; 7579 } 7580 7581 my $format = $self->format; 7582 7583 my $return = ""; 7584 7585 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED); 7586 if ($output_adjusted) { 7587 if ($specials_name) { 7588 $return .= <<END; 7589# The mappings in the non-hash portion of this file must be modified to get the 7590# correct values by adding the code point ordinal number to each one that is 7591# numeric. 7592END 7593 } 7594 else { 7595 $return .= <<END; 7596# The mappings must be modified to get the correct values by adding the code 7597# point ordinal number to each one that is numeric. 7598END 7599 } 7600 } 7601 7602 $return .= <<END; 7603 7604# The name this swash is to be known by, with the format of the mappings in 7605# the main body of the table, and what all code points missing from this file 7606# map to. 7607\$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format} 7608END 7609 if ($specials_name) { 7610 $return .= <<END; 7611\$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings 7612END 7613 } 7614 my $default_map = $default_map{$addr}; 7615 7616 # For $CODE_POINT default maps and using adjustments, instead the default 7617 # becomes zero. 7618 $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '" 7619 . (($output_adjusted && $default_map eq $CODE_POINT) 7620 ? "0" 7621 : $default_map) 7622 . "';"; 7623 7624 if ($default_map eq $CODE_POINT) { 7625 $return .= ' # code point maps to itself'; 7626 } 7627 elsif ($default_map eq "") { 7628 $return .= ' # code point maps to the null string'; 7629 } 7630 $return .= "\n"; 7631 7632 $return .= $pre_body; 7633 7634 return $return; 7635 } 7636 7637 sub write { 7638 # Write the table to the file. 7639 7640 my $self = shift; 7641 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7642 7643 my $addr = do { no overloading; pack 'J', $self; }; 7644 7645 # Clear the temporaries 7646 undef @multi_code_point_maps; 7647 7648 # Calculate the format of the table if not already done. 7649 my $format = $self->format; 7650 my $type = $self->property->type; 7651 my $default_map = $self->default_map; 7652 if (! defined $format) { 7653 if ($type == $BINARY) { 7654 7655 # Don't bother checking the values, because we elsewhere 7656 # verify that a binary table has only 2 values. 7657 $format = $BINARY_FORMAT; 7658 } 7659 else { 7660 my @ranges = $self->_range_list->ranges; 7661 7662 # default an empty table based on its type and default map 7663 if (! @ranges) { 7664 7665 # But it turns out that the only one we can say is a 7666 # non-string (besides binary, handled above) is when the 7667 # table is a string and the default map is to a code point 7668 if ($type == $STRING && $default_map eq $CODE_POINT) { 7669 $format = $HEX_FORMAT; 7670 } 7671 else { 7672 $format = $STRING_FORMAT; 7673 } 7674 } 7675 else { 7676 7677 # Start with the most restrictive format, and as we find 7678 # something that doesn't fit with that, change to the next 7679 # most restrictive, and so on. 7680 $format = $DECIMAL_FORMAT; 7681 foreach my $range (@ranges) { 7682 next if $range->type != 0; # Non-normal ranges don't 7683 # affect the main body 7684 my $map = $range->value; 7685 if ($map ne $default_map) { 7686 last if $format eq $STRING_FORMAT; # already at 7687 # least 7688 # restrictive 7689 $format = $INTEGER_FORMAT 7690 if $format eq $DECIMAL_FORMAT 7691 && $map !~ / ^ [0-9] $ /x; 7692 $format = $FLOAT_FORMAT 7693 if $format eq $INTEGER_FORMAT 7694 && $map !~ / ^ -? [0-9]+ $ /x; 7695 $format = $RATIONAL_FORMAT 7696 if $format eq $FLOAT_FORMAT 7697 && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x; 7698 $format = $HEX_FORMAT 7699 if ($format eq $RATIONAL_FORMAT 7700 && $map !~ 7701 m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x) 7702 # Assume a leading zero means hex, 7703 # even if all digits are 0-9 7704 || ($format eq $INTEGER_FORMAT 7705 && $map =~ /^0[0-9A-F]/); 7706 $format = $STRING_FORMAT if $format eq $HEX_FORMAT 7707 && $map =~ /[^0-9A-F]/; 7708 } 7709 } 7710 } 7711 } 7712 } # end of calculating format 7713 7714 if ($default_map eq $CODE_POINT 7715 && $format ne $HEX_FORMAT 7716 && ! defined $self->format) # manual settings are always 7717 # considered ok 7718 { 7719 Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'") 7720 } 7721 7722 # If the output is to be adjusted, the format of the table that gets 7723 # output is actually 'a' or 'ax' instead of whatever it is stored 7724 # internally as. 7725 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED); 7726 if ($output_adjusted) { 7727 if ($default_map eq $CODE_POINT) { 7728 $format = $HEX_ADJUST_FORMAT; 7729 } 7730 else { 7731 $format = $ADJUST_FORMAT; 7732 } 7733 } 7734 7735 $self->_set_format($format); 7736 7737 return $self->SUPER::write( 7738 $output_adjusted, 7739 $default_map); # don't write defaulteds 7740 } 7741 7742 # Accessors for the underlying list that should fail if locked. 7743 for my $sub (qw( 7744 add_duplicate 7745 replace_map 7746 )) 7747 { 7748 no strict "refs"; 7749 *$sub = sub { 7750 use strict "refs"; 7751 my $self = shift; 7752 7753 return if $self->carp_if_locked; 7754 return $self->_range_list->$sub(@_); 7755 } 7756 } 7757} # End closure for Map_Table 7758 7759package Match_Table; 7760use parent '-norequire', '_Base_Table'; 7761 7762# A Match table is one which is a list of all the code points that have 7763# the same property and property value, for use in \p{property=value} 7764# constructs in regular expressions. It adds very little data to the base 7765# structure, but many methods, as these lists can be combined in many ways to 7766# form new ones. 7767# There are only a few concepts added: 7768# 1) Equivalents and Relatedness. 7769# Two tables can match the identical code points, but have different names. 7770# This always happens when there is a perl single form extension 7771# \p{IsProperty} for the Unicode compound form \P{Property=True}. The two 7772# tables are set to be related, with the Perl extension being a child, and 7773# the Unicode property being the parent. 7774# 7775# It may be that two tables match the identical code points and we don't 7776# know if they are related or not. This happens most frequently when the 7777# Block and Script properties have the exact range. But note that a 7778# revision to Unicode could add new code points to the script, which would 7779# now have to be in a different block (as the block was filled, or there 7780# would have been 'Unknown' script code points in it and they wouldn't have 7781# been identical). So we can't rely on any two properties from Unicode 7782# always matching the same code points from release to release, and thus 7783# these tables are considered coincidentally equivalent--not related. When 7784# two tables are unrelated but equivalent, one is arbitrarily chosen as the 7785# 'leader', and the others are 'equivalents'. This concept is useful 7786# to minimize the number of tables written out. Only one file is used for 7787# any identical set of code points, with entries in Heavy.pl mapping all 7788# the involved tables to it. 7789# 7790# Related tables will always be identical; we set them up to be so. Thus 7791# if the Unicode one is deprecated, the Perl one will be too. Not so for 7792# unrelated tables. Relatedness makes generating the documentation easier. 7793# 7794# 2) Complement. 7795# Like equivalents, two tables may be the inverses of each other, the 7796# intersection between them is null, and the union is every Unicode code 7797# point. The two tables that occupy a binary property are necessarily like 7798# this. By specifying one table as the complement of another, we can avoid 7799# storing it on disk (using the other table and performing a fast 7800# transform), and some memory and calculations. 7801# 7802# 3) Conflicting. It may be that there will eventually be name clashes, with 7803# the same name meaning different things. For a while, there actually were 7804# conflicts, but they have so far been resolved by changing Perl's or 7805# Unicode's definitions to match the other, but when this code was written, 7806# it wasn't clear that that was what was going to happen. (Unicode changed 7807# because of protests during their beta period.) Name clashes are warned 7808# about during compilation, and the documentation. The generated tables 7809# are sane, free of name clashes, because the code suppresses the Perl 7810# version. But manual intervention to decide what the actual behavior 7811# should be may be required should this happen. The introductory comments 7812# have more to say about this. 7813# 7814# 4) Definition. This is a string for human consumption that specifies the 7815# code points that this table matches. This is used only for the generated 7816# pod file. It may be specified explicitly, or automatically computed. 7817# Only the first portion of complicated definitions is computed and 7818# displayed. 7819 7820sub standardize { return main::standardize($_[0]); } 7821sub trace { return main::trace(@_); } 7822 7823 7824{ # Closure 7825 7826 main::setup_package(); 7827 7828 my %leader; 7829 # The leader table of this one; initially $self. 7830 main::set_access('leader', \%leader, 'r'); 7831 7832 my %equivalents; 7833 # An array of any tables that have this one as their leader 7834 main::set_access('equivalents', \%equivalents, 'readable_array'); 7835 7836 my %parent; 7837 # The parent table to this one, initially $self. This allows us to 7838 # distinguish between equivalent tables that are related (for which this 7839 # is set to), and those which may not be, but share the same output file 7840 # because they match the exact same set of code points in the current 7841 # Unicode release. 7842 main::set_access('parent', \%parent, 'r'); 7843 7844 my %children; 7845 # An array of any tables that have this one as their parent 7846 main::set_access('children', \%children, 'readable_array'); 7847 7848 my %conflicting; 7849 # Array of any tables that would have the same name as this one with 7850 # a different meaning. This is used for the generated documentation. 7851 main::set_access('conflicting', \%conflicting, 'readable_array'); 7852 7853 my %matches_all; 7854 # Set in the constructor for tables that are expected to match all code 7855 # points. 7856 main::set_access('matches_all', \%matches_all, 'r'); 7857 7858 my %complement; 7859 # Points to the complement that this table is expressed in terms of; 0 if 7860 # none. 7861 main::set_access('complement', \%complement, 'r'); 7862 7863 my %definition; 7864 # Human readable string of the first few ranges of code points matched by 7865 # this table 7866 main::set_access('definition', \%definition, 'r', 's'); 7867 7868 sub new { 7869 my $class = shift; 7870 7871 my %args = @_; 7872 7873 # The property for which this table is a listing of property values. 7874 my $property = delete $args{'_Property'}; 7875 7876 my $name = delete $args{'Name'}; 7877 my $full_name = delete $args{'Full_Name'}; 7878 $full_name = $name if ! defined $full_name; 7879 7880 # Optional 7881 my $initialize = delete $args{'Initialize'}; 7882 my $matches_all = delete $args{'Matches_All'} || 0; 7883 my $format = delete $args{'Format'}; 7884 my $definition = delete $args{'Definition'} // ""; 7885 # Rest of parameters passed on. 7886 7887 my $range_list = Range_List->new(Initialize => $initialize, 7888 Owner => $property); 7889 7890 my $complete = $full_name; 7891 $complete = '""' if $complete eq ""; # A null name shouldn't happen, 7892 # but this helps debug if it 7893 # does 7894 # The complete name for a match table includes it's property in a 7895 # compound form 'property=table', except if the property is the 7896 # pseudo-property, perl, in which case it is just the single form, 7897 # 'table' (If you change the '=' must also change the ':' in lots of 7898 # places in this program that assume an equal sign) 7899 $complete = $property->full_name . "=$complete" if $property != $perl; 7900 7901 my $self = $class->SUPER::new(%args, 7902 Name => $name, 7903 Complete_Name => $complete, 7904 Full_Name => $full_name, 7905 _Property => $property, 7906 _Range_List => $range_list, 7907 Format => $EMPTY_FORMAT, 7908 Write_As_Invlist => 1, 7909 ); 7910 my $addr = do { no overloading; pack 'J', $self; }; 7911 7912 $conflicting{$addr} = [ ]; 7913 $equivalents{$addr} = [ ]; 7914 $children{$addr} = [ ]; 7915 $matches_all{$addr} = $matches_all; 7916 $leader{$addr} = $self; 7917 $parent{$addr} = $self; 7918 $complement{$addr} = 0; 7919 $definition{$addr} = $definition; 7920 7921 if (defined $format && $format ne $EMPTY_FORMAT) { 7922 Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'. Using '$EMPTY_FORMAT'"); 7923 } 7924 7925 return $self; 7926 } 7927 7928 # See this program's beginning comment block about overloading these. 7929 use overload 7930 fallback => 0, 7931 qw("") => "_operator_stringify", 7932 '=' => sub { 7933 my $self = shift; 7934 7935 return if $self->carp_if_locked; 7936 return $self; 7937 }, 7938 7939 '+' => sub { 7940 my $self = shift; 7941 my $other = shift; 7942 7943 return $self->_range_list + $other; 7944 }, 7945 '&' => sub { 7946 my $self = shift; 7947 my $other = shift; 7948 7949 return $self->_range_list & $other; 7950 }, 7951 '+=' => sub { 7952 my $self = shift; 7953 my $other = shift; 7954 my $reversed = shift; 7955 7956 if ($reversed) { 7957 Carp::my_carp_bug("Bad news. Can't cope with '" 7958 . ref($other) 7959 . ' += ' 7960 . ref($self) 7961 . "'. undef returned."); 7962 return; 7963 } 7964 7965 return if $self->carp_if_locked; 7966 7967 my $addr = do { no overloading; pack 'J', $self; }; 7968 7969 if (ref $other) { 7970 7971 # Change the range list of this table to be the 7972 # union of the two. 7973 $self->_set_range_list($self->_range_list 7974 + $other); 7975 } 7976 else { # $other is just a simple value 7977 $self->add_range($other, $other); 7978 } 7979 return $self; 7980 }, 7981 '&=' => sub { 7982 my $self = shift; 7983 my $other = shift; 7984 my $reversed = shift; 7985 7986 if ($reversed) { 7987 Carp::my_carp_bug("Bad news. Can't cope with '" 7988 . ref($other) 7989 . ' &= ' 7990 . ref($self) 7991 . "'. undef returned."); 7992 return; 7993 } 7994 7995 return if $self->carp_if_locked; 7996 $self->_set_range_list($self->_range_list & $other); 7997 return $self; 7998 }, 7999 '-' => sub { my $self = shift; 8000 my $other = shift; 8001 my $reversed = shift; 8002 if ($reversed) { 8003 Carp::my_carp_bug("Bad news. Can't cope with '" 8004 . ref($other) 8005 . ' - ' 8006 . ref($self) 8007 . "'. undef returned."); 8008 return; 8009 } 8010 8011 return $self->_range_list - $other; 8012 }, 8013 '~' => sub { my $self = shift; 8014 return ~ $self->_range_list; 8015 }, 8016 ; 8017 8018 sub _operator_stringify { 8019 my $self = shift; 8020 8021 my $name = $self->complete_name; 8022 return "Table '$name'"; 8023 } 8024 8025 sub _range_list { 8026 # Returns the range list associated with this table, which will be the 8027 # complement's if it has one. 8028 8029 my $self = shift; 8030 my $complement = $self->complement; 8031 8032 # In order to avoid re-complementing on each access, only do the 8033 # complement the first time, and store the result in this table's 8034 # range list to use henceforth. However, this wouldn't work if the 8035 # controlling (complement) table changed after we do this, so lock it. 8036 # Currently, the value of the complement isn't needed until after it 8037 # is fully constructed, so this works. If this were to change, the 8038 # each_range iteration functionality would no longer work on this 8039 # complement. 8040 if ($complement != 0 && $self->SUPER::_range_list->count == 0) { 8041 $self->_set_range_list($self->SUPER::_range_list 8042 + ~ $complement->_range_list); 8043 $complement->lock; 8044 } 8045 8046 return $self->SUPER::_range_list; 8047 } 8048 8049 sub add_alias { 8050 # Add a synonym for this table. See the comments in the base class 8051 8052 my $self = shift; 8053 my $name = shift; 8054 # Rest of parameters passed on. 8055 8056 $self->SUPER::add_alias($name, $self, @_); 8057 return; 8058 } 8059 8060 sub add_conflicting { 8061 # Add the name of some other object to the list of ones that name 8062 # clash with this match table. 8063 8064 my $self = shift; 8065 my $conflicting_name = shift; # The name of the conflicting object 8066 my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ? 8067 my $conflicting_object = shift; # Optional, the conflicting object 8068 # itself. This is used to 8069 # disambiguate the text if the input 8070 # name is identical to any of the 8071 # aliases $self is known by. 8072 # Sometimes the conflicting object is 8073 # merely hypothetical, so this has to 8074 # be an optional parameter. 8075 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 8076 8077 my $addr = do { no overloading; pack 'J', $self; }; 8078 8079 # Check if the conflicting name is exactly the same as any existing 8080 # alias in this table (as long as there is a real object there to 8081 # disambiguate with). 8082 if (defined $conflicting_object) { 8083 foreach my $alias ($self->aliases) { 8084 if (standardize($alias->name) eq standardize($conflicting_name)) { 8085 8086 # Here, there is an exact match. This results in 8087 # ambiguous comments, so disambiguate by changing the 8088 # conflicting name to its object's complete equivalent. 8089 $conflicting_name = $conflicting_object->complete_name; 8090 last; 8091 } 8092 } 8093 } 8094 8095 # Convert to the \p{...} final name 8096 $conflicting_name = "\\$p" . "{$conflicting_name}"; 8097 8098 # Only add once 8099 return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}}; 8100 8101 push @{$conflicting{$addr}}, $conflicting_name; 8102 8103 return; 8104 } 8105 8106 sub is_set_equivalent_to { 8107 # Return boolean of whether or not the other object is a table of this 8108 # type and has been marked equivalent to this one. 8109 8110 my $self = shift; 8111 my $other = shift; 8112 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 8113 8114 return 0 if ! defined $other; # Can happen for incomplete early 8115 # releases 8116 unless ($other->isa(__PACKAGE__)) { 8117 my $ref_other = ref $other; 8118 my $ref_self = ref $self; 8119 Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self."); 8120 return 0; 8121 } 8122 8123 # Two tables are equivalent if they have the same leader. 8124 no overloading; 8125 return $leader{pack 'J', $self} == $leader{pack 'J', $other}; 8126 return; 8127 } 8128 8129 sub set_equivalent_to { 8130 # Set $self equivalent to the parameter table. 8131 # The required Related => 'x' parameter is a boolean indicating 8132 # whether these tables are related or not. If related, $other becomes 8133 # the 'parent' of $self; if unrelated it becomes the 'leader' 8134 # 8135 # Related tables share all characteristics except names; equivalents 8136 # not quite so many. 8137 # If they are related, one must be a perl extension. This is because 8138 # we can't guarantee that Unicode won't change one or the other in a 8139 # later release even if they are identical now. 8140 8141 my $self = shift; 8142 my $other = shift; 8143 8144 my %args = @_; 8145 my $related = delete $args{'Related'}; 8146 8147 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 8148 8149 return if ! defined $other; # Keep on going; happens in some early 8150 # Unicode releases. 8151 8152 if (! defined $related) { 8153 Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other"); 8154 $related = 0; 8155 } 8156 8157 # If already are equivalent, no need to re-do it; if subroutine 8158 # returns null, it found an error, also do nothing 8159 my $are_equivalent = $self->is_set_equivalent_to($other); 8160 return if ! defined $are_equivalent || $are_equivalent; 8161 8162 my $addr = do { no overloading; pack 'J', $self; }; 8163 my $current_leader = ($related) ? $parent{$addr} : $leader{$addr}; 8164 8165 if ($related) { 8166 if ($current_leader->perl_extension) { 8167 if ($other->perl_extension) { 8168 Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent."); 8169 return; 8170 } 8171 } elsif ($self->property != $other->property # Depending on 8172 # situation, might 8173 # be better to use 8174 # add_alias() 8175 # instead for same 8176 # property 8177 && ! $other->perl_extension 8178 8179 # We allow the sc and scx properties to be marked as 8180 # related. They are in fact related, and this allows 8181 # the pod to show that better. This test isn't valid 8182 # if this is an early Unicode release without the scx 8183 # property (having that also implies the sc property 8184 # exists, so don't have to test for no 'sc') 8185 && ( ! defined $scx 8186 && ! ( ( $self->property == $script 8187 || $self->property == $scx) 8188 && ( $self->property == $script 8189 || $self->property == $scx)))) 8190 { 8191 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other"); 8192 $related = 0; 8193 } 8194 } 8195 8196 if (! $self->is_empty && ! $self->matches_identically_to($other)) { 8197 Carp::my_carp_bug("$self should be empty or match identically to $other. Not setting equivalent"); 8198 return; 8199 } 8200 8201 my $leader = do { no overloading; pack 'J', $current_leader; }; 8202 my $other_addr = do { no overloading; pack 'J', $other; }; 8203 8204 # Any tables that are equivalent to or children of this table must now 8205 # instead be equivalent to or (children) to the new leader (parent), 8206 # still equivalent. The equivalency includes their matches_all info, 8207 # and for related tables, their fate and status. 8208 # All related tables are of necessity equivalent, but the converse 8209 # isn't necessarily true 8210 my $status = $other->status; 8211 my $status_info = $other->status_info; 8212 my $fate = $other->fate; 8213 my $matches_all = $matches_all{other_addr}; 8214 my $caseless_equivalent = $other->caseless_equivalent; 8215 foreach my $table ($current_leader, @{$equivalents{$leader}}) { 8216 next if $table == $other; 8217 trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace; 8218 8219 my $table_addr = do { no overloading; pack 'J', $table; }; 8220 $leader{$table_addr} = $other; 8221 $matches_all{$table_addr} = $matches_all; 8222 $self->_set_range_list($other->_range_list); 8223 push @{$equivalents{$other_addr}}, $table; 8224 if ($related) { 8225 $parent{$table_addr} = $other; 8226 push @{$children{$other_addr}}, $table; 8227 $table->set_status($status, $status_info); 8228 8229 # This reason currently doesn't get exposed outside; otherwise 8230 # would have to look up the parent's reason and use it instead. 8231 $table->set_fate($fate, "Parent's fate"); 8232 8233 $self->set_caseless_equivalent($caseless_equivalent); 8234 } 8235 } 8236 8237 # Now that we've declared these to be equivalent, any changes to one 8238 # of the tables would invalidate that equivalency. 8239 $self->lock; 8240 $other->lock; 8241 return; 8242 } 8243 8244 sub set_complement { 8245 # Set $self to be the complement of the parameter table. $self is 8246 # locked, as what it contains should all come from the other table. 8247 8248 my $self = shift; 8249 my $other = shift; 8250 8251 my %args = @_; 8252 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 8253 8254 if ($other->complement != 0) { 8255 Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement); 8256 return; 8257 } 8258 my $addr = do { no overloading; pack 'J', $self; }; 8259 $complement{$addr} = $other; 8260 8261 # Be sure the other property knows we are depending on them; or the 8262 # other table if it is one in the current property. 8263 if ($self->property != $other->property) { 8264 $other->property->set_has_dependency(1); 8265 } 8266 else { 8267 $other->set_has_dependency(1); 8268 } 8269 $self->lock; 8270 return; 8271 } 8272 8273 sub add_range { # Add a range to the list for this table. 8274 my $self = shift; 8275 # Rest of parameters passed on 8276 8277 return if $self->carp_if_locked; 8278 return $self->_range_list->add_range(@_); 8279 } 8280 8281 sub header { 8282 my $self = shift; 8283 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 8284 8285 # All match tables are to be used only by the Perl core. 8286 return $self->SUPER::header() . $INTERNAL_ONLY_HEADER; 8287 } 8288 8289 sub pre_body { # Does nothing for match tables. 8290 return 8291 } 8292 8293 sub append_to_body { # Does nothing for match tables. 8294 return 8295 } 8296 8297 sub set_fate { 8298 my $self = shift; 8299 my $fate = shift; 8300 my $reason = shift; 8301 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 8302 8303 $self->SUPER::set_fate($fate, $reason); 8304 8305 # All children share this fate 8306 foreach my $child ($self->children) { 8307 $child->set_fate($fate, $reason); 8308 } 8309 return; 8310 } 8311 8312 sub calculate_table_definition 8313 { 8314 # Returns a human-readable string showing some or all of the code 8315 # points matched by this table. The string will include a 8316 # bracketed-character class for all characters matched in the 00-FF 8317 # range, and the first few ranges matched beyond that. 8318 my $max_ranges = 6; 8319 8320 my $self = shift; 8321 my $definition = $self->definition || ""; 8322 8323 # Skip this if already have a definition. 8324 return $definition if $definition; 8325 8326 my $lows_string = ""; # The string representation of the 0-FF 8327 # characters 8328 my $string_range = ""; # The string rep. of the above FF ranges 8329 my $range_count = 0; # How many ranges in $string_rage 8330 8331 my @lows_invlist; # The inversion list of the 0-FF code points 8332 my $first_non_control = ord(" "); # Everything below this is a 8333 # control, on ASCII or EBCDIC 8334 my $max_table_code_point = $self->max; 8335 8336 # On ASCII platforms, the range 80-FF contains no printables. 8337 my $highest_printable = ((main::NON_ASCII_PLATFORM) ? 255 : 126); 8338 8339 8340 # Look through the first few ranges matched by this table. 8341 $self->reset_each_range; # Defensive programming 8342 while (defined (my $range = $self->each_range())) { 8343 my $start = $range->start; 8344 my $end = $range->end; 8345 8346 # Accumulate an inversion list of the 00-FF code points 8347 if ($start < 256 && ($start > 0 || $end < 256)) { 8348 push @lows_invlist, $start; 8349 push @lows_invlist, 1 + (($end < 256) ? $end : 255); 8350 8351 # Get next range if there are more ranges below 256 8352 next if $end < 256 && $end < $max_table_code_point; 8353 8354 # If the range straddles the 255/256 boundary, we split it 8355 # there. We already added above the low portion to the 8356 # inversion list 8357 $start = 256 if $end > 256; 8358 } 8359 8360 # Here, @lows_invlist contains the code points below 256, and 8361 # there is no other range, or the current one starts at or above 8362 # 256. Generate the [char class] for the 0-255 ones. 8363 while (@lows_invlist) { 8364 8365 # If this range (necessarily the first one, by the way) starts 8366 # at 0 ... 8367 if ($lows_invlist[0] == 0) { 8368 8369 # If it ends within the block of controls, that means that 8370 # some controls are in it and some aren't. Since Unicode 8371 # properties pretty much only know about a few of the 8372 # controls, like \n, \t, this means that its one of them 8373 # that isn't in the range. Complement the inversion list 8374 # which will likely cause these to be output using their 8375 # mnemonics, hence being clearer. 8376 if ($lows_invlist[1] < $first_non_control) { 8377 $lows_string .= '^'; 8378 shift @lows_invlist; 8379 push @lows_invlist, 256; 8380 } 8381 elsif ($lows_invlist[1] <= $highest_printable) { 8382 8383 # Here, it extends into the printables block. Split 8384 # into two ranges so that the controls are separate. 8385 $lows_string .= sprintf "\\x00-\\x%02x", 8386 $first_non_control - 1; 8387 $lows_invlist[0] = $first_non_control; 8388 } 8389 } 8390 8391 # If the range completely contains the printables, don't 8392 # individually spell out the printables. 8393 if ( $lows_invlist[0] <= $first_non_control 8394 && $lows_invlist[1] > $highest_printable) 8395 { 8396 $lows_string .= sprintf "\\x%02x-\\x%02x", 8397 $lows_invlist[0], $lows_invlist[1] - 1; 8398 shift @lows_invlist; 8399 shift @lows_invlist; 8400 next; 8401 } 8402 8403 # Here, the range may include some but not all printables. 8404 # Look at each one individually 8405 foreach my $ord (shift @lows_invlist .. shift(@lows_invlist) - 1) { 8406 my $char = chr $ord; 8407 8408 # If there is already something in the list, an 8409 # alphanumeric char could be the next in sequence. If so, 8410 # we start or extend a range. That is, we could have so 8411 # far something like 'a-c', and the next char is a 'd', so 8412 # we change it to 'a-d'. We use native_to_unicode() 8413 # because a-z on EBCDIC means 26 chars, and excludes the 8414 # gap ones. 8415 if ($lows_string ne "" && $char =~ /[[:alnum:]]/) { 8416 my $prev = substr($lows_string, -1); 8417 if ( $prev !~ /[[:alnum:]]/ 8418 || utf8::native_to_unicode(ord $prev) + 1 8419 != utf8::native_to_unicode(ord $char)) 8420 { 8421 # Not extending the range 8422 $lows_string .= $char; 8423 } 8424 elsif ( length $lows_string > 1 8425 && substr($lows_string, -2, 1) eq '-') 8426 { 8427 # We had a sequence like '-c' and the current 8428 # character is 'd'. Extend the range. 8429 substr($lows_string, -1, 1) = $char; 8430 } 8431 else { 8432 # We had something like 'd' and this is 'e'. 8433 # Start a range. 8434 $lows_string .= "-$char"; 8435 } 8436 } 8437 elsif ($char =~ /[[:graph:]]/) { 8438 8439 # We output a graphic char as-is, preceded by a 8440 # backslash if it is a metacharacter 8441 $lows_string .= '\\' 8442 if $char =~ /[\\\^\$\@\%\|()\[\]\{\}\-\/"']/; 8443 $lows_string .= $char; 8444 } # Otherwise use mnemonic for any that have them 8445 elsif ($char =~ /[\a]/) { 8446 $lows_string .= '\a'; 8447 } 8448 elsif ($char =~ /[\b]/) { 8449 $lows_string .= '\b'; 8450 } 8451 elsif ($char eq "\e") { 8452 $lows_string .= '\e'; 8453 } 8454 elsif ($char eq "\f") { 8455 $lows_string .= '\f'; 8456 } 8457 elsif ($char eq "\cK") { 8458 $lows_string .= '\cK'; 8459 } 8460 elsif ($char eq "\n") { 8461 $lows_string .= '\n'; 8462 } 8463 elsif ($char eq "\r") { 8464 $lows_string .= '\r'; 8465 } 8466 elsif ($char eq "\t") { 8467 $lows_string .= '\t'; 8468 } 8469 else { 8470 8471 # Here is a non-graphic without a mnemonic. We use \x 8472 # notation. But if the ordinal of this is one above 8473 # the previous, create or extend the range 8474 my $hex_representation = sprintf("%02x", ord $char); 8475 if ( length $lows_string >= 4 8476 && substr($lows_string, -4, 2) eq '\\x' 8477 && hex(substr($lows_string, -2)) + 1 == ord $char) 8478 { 8479 if ( length $lows_string >= 5 8480 && substr($lows_string, -5, 1) eq '-' 8481 && ( length $lows_string == 5 8482 || substr($lows_string, -6, 1) ne '\\')) 8483 { 8484 substr($lows_string, -2) = $hex_representation; 8485 } 8486 else { 8487 $lows_string .= '-\\x' . $hex_representation; 8488 } 8489 } 8490 else { 8491 $lows_string .= '\\x' . $hex_representation; 8492 } 8493 } 8494 } 8495 } 8496 8497 # Done with assembling the string of all lows. If there are only 8498 # lows in the property, are completely done. 8499 if ($max_table_code_point < 256) { 8500 $self->reset_each_range; 8501 last; 8502 } 8503 8504 # Otherwise, quit if reached max number of non-lows ranges. If 8505 # there are lows, count them as one unit towards the maximum. 8506 $range_count++; 8507 if ($range_count > (($lows_string eq "") ? $max_ranges : $max_ranges - 1)) { 8508 $string_range .= " ..."; 8509 $self->reset_each_range; 8510 last; 8511 } 8512 8513 # Otherwise add this range. 8514 $string_range .= ", " if $string_range ne ""; 8515 if ($start == $end) { 8516 $string_range .= sprintf("U+%04X", $start); 8517 } 8518 elsif ($end >= $MAX_WORKING_CODEPOINT) { 8519 $string_range .= sprintf("U+%04X..infinity", $start); 8520 } 8521 else { 8522 $string_range .= sprintf("U+%04X..%04X", 8523 $start, $end); 8524 } 8525 } 8526 8527 # Done with all the ranges we're going to look at. Assemble the 8528 # definition from the lows + non-lows. 8529 8530 if ($lows_string ne "" || $string_range ne "") { 8531 if ($lows_string ne "") { 8532 $definition .= "[$lows_string]"; 8533 $definition .= ", " if $string_range; 8534 } 8535 $definition .= $string_range; 8536 } 8537 8538 return $definition; 8539 } 8540 8541 sub write { 8542 my $self = shift; 8543 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 8544 8545 return $self->SUPER::write(0); # No adjustments 8546 } 8547 8548 sub set_final_comment { 8549 # This creates a comment for the file that is to hold the match table 8550 # $self. It is somewhat convoluted to make the English read nicely, 8551 # but, heh, it's just a comment. 8552 # This should be called only with the leader match table of all the 8553 # ones that share the same file. It lists all such tables, ordered so 8554 # that related ones are together. 8555 8556 return unless $debugging_build; 8557 8558 my $leader = shift; # Should only be called on the leader table of 8559 # an equivalent group 8560 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 8561 8562 my $addr = do { no overloading; pack 'J', $leader; }; 8563 8564 if ($leader{$addr} != $leader) { 8565 Carp::my_carp_bug(<<END 8566set_final_comment() must be called on a leader table, which $leader is not. 8567It is equivalent to $leader{$addr}. No comment created 8568END 8569 ); 8570 return; 8571 } 8572 8573 # Get the number of code points matched by each of the tables in this 8574 # file, and add underscores for clarity. 8575 my $count = $leader->count; 8576 my $unicode_count; 8577 my $non_unicode_string; 8578 if ($count > $MAX_UNICODE_CODEPOINTS) { 8579 $unicode_count = $count - ($MAX_WORKING_CODEPOINT 8580 - $MAX_UNICODE_CODEPOINT); 8581 $non_unicode_string = "All above-Unicode code points match as well, and are also returned"; 8582 } 8583 else { 8584 $unicode_count = $count; 8585 $non_unicode_string = ""; 8586 } 8587 my $string_count = main::clarify_code_point_count($unicode_count); 8588 8589 my $loose_count = 0; # how many aliases loosely matched 8590 my $compound_name = ""; # ? Are any names compound?, and if so, an 8591 # example 8592 my $properties_with_compound_names = 0; # count of these 8593 8594 8595 my %flags; # The status flags used in the file 8596 my $total_entries = 0; # number of entries written in the comment 8597 my $matches_comment = ""; # The portion of the comment about the 8598 # \p{}'s 8599 my @global_comments; # List of all the tables' comments that are 8600 # there before this routine was called. 8601 my $has_ucd_alias = 0; # If there is an alias that is accessible via 8602 # Unicode::UCD. If not, then don't say it is 8603 # in the comment 8604 8605 # Get list of all the parent tables that are equivalent to this one 8606 # (including itself). 8607 my @parents = grep { $parent{main::objaddr $_} == $_ } 8608 main::uniques($leader, @{$equivalents{$addr}}); 8609 my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated 8610 # tables 8611 for my $parent (@parents) { 8612 8613 my $property = $parent->property; 8614 8615 # Special case 'N' tables in properties with two match tables when 8616 # the other is a 'Y' one. These are likely to be binary tables, 8617 # but not necessarily. In either case, \P{} will match the 8618 # complement of \p{}, and so if something is a synonym of \p, the 8619 # complement of that something will be the synonym of \P. This 8620 # would be true of any property with just two match tables, not 8621 # just those whose values are Y and N; but that would require a 8622 # little extra work, and there are none such so far in Unicode. 8623 my $perl_p = 'p'; # which is it? \p{} or \P{} 8624 my @yes_perl_synonyms; # list of any synonyms for the 'Y' table 8625 8626 if (scalar $property->tables == 2 8627 && $parent == $property->table('N') 8628 && defined (my $yes = $property->table('Y'))) 8629 { 8630 my $yes_addr = do { no overloading; pack 'J', $yes; }; 8631 @yes_perl_synonyms 8632 = grep { $_->property == $perl } 8633 main::uniques($yes, 8634 $parent{$yes_addr}, 8635 $parent{$yes_addr}->children); 8636 8637 # But these synonyms are \P{} ,not \p{} 8638 $perl_p = 'P'; 8639 } 8640 8641 my @description; # Will hold the table description 8642 my @note; # Will hold the table notes. 8643 my @conflicting; # Will hold the table conflicts. 8644 8645 # Look at the parent, any yes synonyms, and all the children 8646 my $parent_addr = do { no overloading; pack 'J', $parent; }; 8647 for my $table ($parent, 8648 @yes_perl_synonyms, 8649 @{$children{$parent_addr}}) 8650 { 8651 my $table_addr = do { no overloading; pack 'J', $table; }; 8652 my $table_property = $table->property; 8653 8654 # Tables are separated by a blank line to create a grouping. 8655 $matches_comment .= "\n" if $matches_comment; 8656 8657 # The table is named based on the property and value 8658 # combination it is for, like script=greek. But there may be 8659 # a number of synonyms for each side, like 'sc' for 'script', 8660 # and 'grek' for 'greek'. Any combination of these is a valid 8661 # name for this table. In this case, there are three more, 8662 # 'sc=grek', 'sc=greek', and 'script='grek'. Rather than 8663 # listing all possible combinations in the comment, we make 8664 # sure that each synonym occurs at least once, and add 8665 # commentary that the other combinations are possible. 8666 # Because regular expressions don't recognize things like 8667 # \p{jsn=}, only look at non-null right-hand-sides 8668 my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table_property->aliases; 8669 my @table_aliases = grep { $_->name ne "" } $table->aliases; 8670 8671 # The alias lists above are already ordered in the order we 8672 # want to output them. To ensure that each synonym is listed, 8673 # we must use the max of the two numbers. But if there are no 8674 # legal synonyms (nothing in @table_aliases), then we don't 8675 # list anything. 8676 my $listed_combos = (@table_aliases) 8677 ? main::max(scalar @table_aliases, 8678 scalar @property_aliases) 8679 : 0; 8680 trace "$listed_combos, tables=", scalar @table_aliases, "; property names=", scalar @property_aliases if main::DEBUG; 8681 8682 my $property_had_compound_name = 0; 8683 8684 for my $i (0 .. $listed_combos - 1) { 8685 $total_entries++; 8686 8687 # The current alias for the property is the next one on 8688 # the list, or if beyond the end, start over. Similarly 8689 # for the table (\p{prop=table}) 8690 my $property_alias = $property_aliases 8691 [$i % @property_aliases]->name; 8692 my $table_alias_object = $table_aliases 8693 [$i % @table_aliases]; 8694 my $table_alias = $table_alias_object->name; 8695 my $loose_match = $table_alias_object->loose_match; 8696 $has_ucd_alias |= $table_alias_object->ucd; 8697 8698 if ($table_alias !~ /\D/) { # Clarify large numbers. 8699 $table_alias = main::clarify_number($table_alias) 8700 } 8701 8702 # Add a comment for this alias combination 8703 my $current_match_comment; 8704 if ($table_property == $perl) { 8705 $current_match_comment = "\\$perl_p" 8706 . "{$table_alias}"; 8707 } 8708 else { 8709 $current_match_comment 8710 = "\\p{$property_alias=$table_alias}"; 8711 $property_had_compound_name = 1; 8712 } 8713 8714 # Flag any abnormal status for this table. 8715 my $flag = $property->status 8716 || $table->status 8717 || $table_alias_object->status; 8718 if ($flag && $flag ne $PLACEHOLDER) { 8719 $flags{$flag} = $status_past_participles{$flag}; 8720 } 8721 8722 $loose_count++; 8723 8724 # Pretty up the comment. Note the \b; it says don't make 8725 # this line a continuation. 8726 $matches_comment .= sprintf("\b%-1s%-s%s\n", 8727 $flag, 8728 " " x 7, 8729 $current_match_comment); 8730 } # End of generating the entries for this table. 8731 8732 # Save these for output after this group of related tables. 8733 push @description, $table->description; 8734 push @note, $table->note; 8735 push @conflicting, $table->conflicting; 8736 8737 # And this for output after all the tables. 8738 push @global_comments, $table->comment; 8739 8740 # Compute an alternate compound name using the final property 8741 # synonym and the first table synonym with a colon instead of 8742 # the equal sign used elsewhere. 8743 if ($property_had_compound_name) { 8744 $properties_with_compound_names ++; 8745 if (! $compound_name || @property_aliases > 1) { 8746 $compound_name = $property_aliases[-1]->name 8747 . ': ' 8748 . $table_aliases[0]->name; 8749 } 8750 } 8751 } # End of looping through all children of this table 8752 8753 # Here have assembled in $matches_comment all the related tables 8754 # to the current parent (preceded by the same info for all the 8755 # previous parents). Put out information that applies to all of 8756 # the current family. 8757 if (@conflicting) { 8758 8759 # But output the conflicting information now, as it applies to 8760 # just this table. 8761 my $conflicting = join ", ", @conflicting; 8762 if ($conflicting) { 8763 $matches_comment .= <<END; 8764 8765 Note that contrary to what you might expect, the above is NOT the same as 8766END 8767 $matches_comment .= "any of: " if @conflicting > 1; 8768 $matches_comment .= "$conflicting\n"; 8769 } 8770 } 8771 if (@description) { 8772 $matches_comment .= "\n Meaning: " 8773 . join('; ', @description) 8774 . "\n"; 8775 } 8776 if (@note) { 8777 $matches_comment .= "\n Note: " 8778 . join("\n ", @note) 8779 . "\n"; 8780 } 8781 } # End of looping through all tables 8782 8783 $matches_comment .= "\n$non_unicode_string\n" if $non_unicode_string; 8784 8785 8786 my $code_points; 8787 my $match; 8788 my $any_of_these; 8789 if ($unicode_count == 1) { 8790 $match = 'matches'; 8791 $code_points = 'single code point'; 8792 } 8793 else { 8794 $match = 'match'; 8795 $code_points = "$string_count code points"; 8796 } 8797 8798 my $synonyms; 8799 my $entries; 8800 if ($total_entries == 1) { 8801 $synonyms = ""; 8802 $entries = 'entry'; 8803 $any_of_these = 'this' 8804 } 8805 else { 8806 $synonyms = " any of the following regular expression constructs"; 8807 $entries = 'entries'; 8808 $any_of_these = 'any of these' 8809 } 8810 8811 my $comment = ""; 8812 if ($has_ucd_alias) { 8813 $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n"; 8814 } 8815 if ($has_unrelated) { 8816 $comment .= <<END; 8817This file is for tables that are not necessarily related: To conserve 8818resources, every table that matches the identical set of code points in this 8819version of Unicode uses this file. Each one is listed in a separate group 8820below. It could be that the tables will match the same set of code points in 8821other Unicode releases, or it could be purely coincidence that they happen to 8822be the same in Unicode $unicode_version, and hence may not in other versions. 8823 8824END 8825 } 8826 8827 if (%flags) { 8828 foreach my $flag (sort keys %flags) { 8829 $comment .= <<END; 8830'$flag' below means that this form is $flags{$flag}. 8831END 8832 if ($flag eq $INTERNAL_ALIAS) { 8833 $comment .= "DO NOT USE!!!"; 8834 } 8835 else { 8836 $comment .= "Consult $pod_file.pod"; 8837 } 8838 $comment .= "\n"; 8839 } 8840 $comment .= "\n"; 8841 } 8842 8843 if ($total_entries == 0) { 8844 Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string. Creating file anyway."); 8845 $comment .= <<END; 8846This file returns the $code_points in Unicode Version 8847$unicode_version for 8848$leader, but it is inaccessible through Perl regular expressions, as 8849"\\p{prop=}" is not recognized. 8850END 8851 8852 } else { 8853 $comment .= <<END; 8854This file returns the $code_points in Unicode Version 8855$unicode_version that 8856$match$synonyms: 8857 8858$matches_comment 8859$pod_file.pod should be consulted for the syntax rules for $any_of_these, 8860including if adding or subtracting white space, underscore, and hyphen 8861characters matters or doesn't matter, and other permissible syntactic 8862variants. Upper/lower case distinctions never matter. 8863END 8864 8865 } 8866 if ($compound_name) { 8867 $comment .= <<END; 8868 8869A colon can be substituted for the equals sign, and 8870END 8871 if ($properties_with_compound_names > 1) { 8872 $comment .= <<END; 8873within each group above, 8874END 8875 } 8876 $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name); 8877 8878 # Note the \b below, it says don't make that line a continuation. 8879 $comment .= <<END; 8880anything to the left of the equals (or colon) can be combined with anything to 8881the right. Thus, for example, 8882$compound_name 8883\bis also valid. 8884END 8885 } 8886 8887 # And append any comment(s) from the actual tables. They are all 8888 # gathered here, so may not read all that well. 8889 if (@global_comments) { 8890 $comment .= "\n" . join("\n\n", @global_comments) . "\n"; 8891 } 8892 8893 if ($count) { # The format differs if no code points, and needs no 8894 # explanation in that case 8895 if ($leader->write_as_invlist) { 8896 $comment.= <<END; 8897 8898The first data line of this file begins with the letter V to indicate it is in 8899inversion list format. The number following the V gives the number of lines 8900remaining. Each of those remaining lines is a single number representing the 8901starting code point of a range which goes up to but not including the number 8902on the next line; The 0th, 2nd, 4th... ranges are for code points that match 8903the property; the 1st, 3rd, 5th... are ranges of code points that don't match 8904the property. The final line's range extends to the platform's infinity. 8905END 8906 } 8907 else { 8908 $comment.= <<END; 8909The format of the lines of this file is: 8910START\\tSTOP\\twhere START is the starting code point of the range, in hex; 8911STOP is the ending point, or if omitted, the range has just one code point. 8912END 8913 } 8914 if ($leader->output_range_counts) { 8915 $comment .= <<END; 8916Numbers in comments in [brackets] indicate how many code points are in the 8917range. 8918END 8919 } 8920 } 8921 8922 $leader->set_comment(main::join_lines($comment)); 8923 return; 8924 } 8925 8926 # Accessors for the underlying list 8927 for my $sub (qw( 8928 get_valid_code_point 8929 get_invalid_code_point 8930 )) 8931 { 8932 no strict "refs"; 8933 *$sub = sub { 8934 use strict "refs"; 8935 my $self = shift; 8936 8937 return $self->_range_list->$sub(@_); 8938 } 8939 } 8940} # End closure for Match_Table 8941 8942package Property; 8943 8944# The Property class represents a Unicode property, or the $perl 8945# pseudo-property. It contains a map table initialized empty at construction 8946# time, and for properties accessible through regular expressions, various 8947# match tables, created through the add_match_table() method, and referenced 8948# by the table('NAME') or tables() methods, the latter returning a list of all 8949# of the match tables. Otherwise table operations implicitly are for the map 8950# table. 8951# 8952# Most of the data in the property is actually about its map table, so it 8953# mostly just uses that table's accessors for most methods. The two could 8954# have been combined into one object, but for clarity because of their 8955# differing semantics, they have been kept separate. It could be argued that 8956# the 'file' and 'directory' fields should be kept with the map table. 8957# 8958# Each property has a type. This can be set in the constructor, or in the 8959# set_type accessor, but mostly it is figured out by the data. Every property 8960# starts with unknown type, overridden by a parameter to the constructor, or 8961# as match tables are added, or ranges added to the map table, the data is 8962# inspected, and the type changed. After the table is mostly or entirely 8963# filled, compute_type() should be called to finalize they analysis. 8964# 8965# There are very few operations defined. One can safely remove a range from 8966# the map table, and property_add_or_replace_non_nulls() adds the maps from another 8967# table to this one, replacing any in the intersection of the two. 8968 8969sub standardize { return main::standardize($_[0]); } 8970sub trace { return main::trace(@_) if main::DEBUG && $to_trace } 8971 8972{ # Closure 8973 8974 # This hash will contain as keys, all the aliases of all properties, and 8975 # as values, pointers to their respective property objects. This allows 8976 # quick look-up of a property from any of its names. 8977 my %alias_to_property_of; 8978 8979 sub dump_alias_to_property_of { 8980 # For debugging 8981 8982 print "\n", main::simple_dumper (\%alias_to_property_of), "\n"; 8983 return; 8984 } 8985 8986 sub property_ref { 8987 # This is a package subroutine, not called as a method. 8988 # If the single parameter is a literal '*' it returns a list of all 8989 # defined properties. 8990 # Otherwise, the single parameter is a name, and it returns a pointer 8991 # to the corresponding property object, or undef if none. 8992 # 8993 # Properties can have several different names. The 'standard' form of 8994 # each of them is stored in %alias_to_property_of as they are defined. 8995 # But it's possible that this subroutine will be called with some 8996 # variant, so if the initial lookup fails, it is repeated with the 8997 # standardized form of the input name. If found, besides returning the 8998 # result, the input name is added to the list so future calls won't 8999 # have to do the conversion again. 9000 9001 my $name = shift; 9002 9003 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 9004 9005 if (! defined $name) { 9006 Carp::my_carp_bug("Undefined input property. No action taken."); 9007 return; 9008 } 9009 9010 return main::uniques(values %alias_to_property_of) if $name eq '*'; 9011 9012 # Return cached result if have it. 9013 my $result = $alias_to_property_of{$name}; 9014 return $result if defined $result; 9015 9016 # Convert the input to standard form. 9017 my $standard_name = standardize($name); 9018 9019 $result = $alias_to_property_of{$standard_name}; 9020 return unless defined $result; # Don't cache undefs 9021 9022 # Cache the result before returning it. 9023 $alias_to_property_of{$name} = $result; 9024 return $result; 9025 } 9026 9027 9028 main::setup_package(); 9029 9030 my %map; 9031 # A pointer to the map table object for this property 9032 main::set_access('map', \%map); 9033 9034 my %full_name; 9035 # The property's full name. This is a duplicate of the copy kept in the 9036 # map table, but is needed because stringify needs it during 9037 # construction of the map table, and then would have a chicken before egg 9038 # problem. 9039 main::set_access('full_name', \%full_name, 'r'); 9040 9041 my %table_ref; 9042 # This hash will contain as keys, all the aliases of any match tables 9043 # attached to this property, and as values, the pointers to their 9044 # respective tables. This allows quick look-up of a table from any of its 9045 # names. 9046 main::set_access('table_ref', \%table_ref); 9047 9048 my %type; 9049 # The type of the property, $ENUM, $BINARY, etc 9050 main::set_access('type', \%type, 'r'); 9051 9052 my %file; 9053 # The filename where the map table will go (if actually written). 9054 # Normally defaulted, but can be overridden. 9055 main::set_access('file', \%file, 'r', 's'); 9056 9057 my %directory; 9058 # The directory where the map table will go (if actually written). 9059 # Normally defaulted, but can be overridden. 9060 main::set_access('directory', \%directory, 's'); 9061 9062 my %pseudo_map_type; 9063 # This is used to affect the calculation of the map types for all the 9064 # ranges in the table. It should be set to one of the values that signify 9065 # to alter the calculation. 9066 main::set_access('pseudo_map_type', \%pseudo_map_type, 'r'); 9067 9068 my %has_only_code_point_maps; 9069 # A boolean used to help in computing the type of data in the map table. 9070 main::set_access('has_only_code_point_maps', \%has_only_code_point_maps); 9071 9072 my %unique_maps; 9073 # A list of the first few distinct mappings this property has. This is 9074 # used to disambiguate between binary and enum property types, so don't 9075 # have to keep more than three. 9076 main::set_access('unique_maps', \%unique_maps); 9077 9078 my %pre_declared_maps; 9079 # A boolean that gives whether the input data should declare all the 9080 # tables used, or not. If the former, unknown ones raise a warning. 9081 main::set_access('pre_declared_maps', 9082 \%pre_declared_maps, 'r', 's'); 9083 9084 my %has_dependency; 9085 # A boolean that gives whether some table somewhere is defined as the 9086 # complement of a table in this property. This is a crude, but currently 9087 # sufficient, mechanism to make this property not get destroyed before 9088 # what is dependent on it is. Other dependencies could be added, so the 9089 # name was chosen to reflect a more general situation than actually is 9090 # currently the case. 9091 main::set_access('has_dependency', \%has_dependency, 'r', 's'); 9092 9093 sub new { 9094 # The only required parameter is the positionally first, name. All 9095 # other parameters are key => value pairs. See the documentation just 9096 # above for the meanings of the ones not passed directly on to the map 9097 # table constructor. 9098 9099 my $class = shift; 9100 my $name = shift || ""; 9101 9102 my $self = property_ref($name); 9103 if (defined $self) { 9104 my $options_string = join ", ", @_; 9105 $options_string = ". Ignoring options $options_string" if $options_string; 9106 Carp::my_carp("$self is already in use. Using existing one$options_string;"); 9107 return $self; 9108 } 9109 9110 my %args = @_; 9111 9112 $self = bless \do { my $anonymous_scalar }, $class; 9113 my $addr = do { no overloading; pack 'J', $self; }; 9114 9115 $directory{$addr} = delete $args{'Directory'}; 9116 $file{$addr} = delete $args{'File'}; 9117 $full_name{$addr} = delete $args{'Full_Name'} || $name; 9118 $type{$addr} = delete $args{'Type'} || $UNKNOWN; 9119 $pseudo_map_type{$addr} = delete $args{'Map_Type'}; 9120 $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'} 9121 # Starting in this release, property 9122 # values should be defined for all 9123 # properties, except those overriding this 9124 // $v_version ge v5.1.0; 9125 9126 # Rest of parameters passed on. 9127 9128 $has_only_code_point_maps{$addr} = 1; 9129 $table_ref{$addr} = { }; 9130 $unique_maps{$addr} = { }; 9131 $has_dependency{$addr} = 0; 9132 9133 $map{$addr} = Map_Table->new($name, 9134 Full_Name => $full_name{$addr}, 9135 _Alias_Hash => \%alias_to_property_of, 9136 _Property => $self, 9137 %args); 9138 return $self; 9139 } 9140 9141 # See this program's beginning comment block about overloading the copy 9142 # constructor. Few operations are defined on properties, but a couple are 9143 # useful. It is safe to take the inverse of a property, and to remove a 9144 # single code point from it. 9145 use overload 9146 fallback => 0, 9147 qw("") => "_operator_stringify", 9148 "." => \&main::_operator_dot, 9149 ".=" => \&main::_operator_dot_equal, 9150 '==' => \&main::_operator_equal, 9151 '!=' => \&main::_operator_not_equal, 9152 '=' => sub { return shift }, 9153 '-=' => "_minus_and_equal", 9154 ; 9155 9156 sub _operator_stringify { 9157 return "Property '" . shift->full_name . "'"; 9158 } 9159 9160 sub _minus_and_equal { 9161 # Remove a single code point from the map table of a property. 9162 9163 my $self = shift; 9164 my $other = shift; 9165 my $reversed = shift; 9166 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 9167 9168 if (ref $other) { 9169 Carp::my_carp_bug("Bad news. Can't cope with a " 9170 . ref($other) 9171 . " argument to '-='. Subtraction ignored."); 9172 return $self; 9173 } 9174 elsif ($reversed) { # Shouldn't happen in a -=, but just in case 9175 Carp::my_carp_bug("Bad news. Can't cope with subtracting a " 9176 . ref $self 9177 . " from a non-object. undef returned."); 9178 return; 9179 } 9180 else { 9181 no overloading; 9182 $map{pack 'J', $self}->delete_range($other, $other); 9183 } 9184 return $self; 9185 } 9186 9187 sub add_match_table { 9188 # Add a new match table for this property, with name given by the 9189 # parameter. It returns a pointer to the table. 9190 9191 my $self = shift; 9192 my $name = shift; 9193 my %args = @_; 9194 9195 my $addr = do { no overloading; pack 'J', $self; }; 9196 9197 my $table = $table_ref{$addr}{$name}; 9198 my $standard_name = main::standardize($name); 9199 if (defined $table 9200 || (defined ($table = $table_ref{$addr}{$standard_name}))) 9201 { 9202 Carp::my_carp("Table '$name' in $self is already in use. Using existing one"); 9203 $table_ref{$addr}{$name} = $table; 9204 return $table; 9205 } 9206 else { 9207 9208 # See if this is a perl extension, if not passed in. 9209 my $perl_extension = delete $args{'Perl_Extension'}; 9210 $perl_extension 9211 = $self->perl_extension if ! defined $perl_extension; 9212 9213 my $fate; 9214 my $suppression_reason = ""; 9215 if ($self->name =~ /^_/) { 9216 $fate = $SUPPRESSED; 9217 $suppression_reason = "Parent property is internal only"; 9218 } 9219 elsif ($self->fate >= $SUPPRESSED) { 9220 $fate = $self->fate; 9221 $suppression_reason = $why_suppressed{$self->complete_name}; 9222 9223 } 9224 elsif ($name =~ /^_/) { 9225 $fate = $INTERNAL_ONLY; 9226 } 9227 $table = Match_Table->new( 9228 Name => $name, 9229 Perl_Extension => $perl_extension, 9230 _Alias_Hash => $table_ref{$addr}, 9231 _Property => $self, 9232 Fate => $fate, 9233 Suppression_Reason => $suppression_reason, 9234 Status => $self->status, 9235 _Status_Info => $self->status_info, 9236 %args); 9237 return unless defined $table; 9238 } 9239 9240 # Save the names for quick look up 9241 $table_ref{$addr}{$standard_name} = $table; 9242 $table_ref{$addr}{$name} = $table; 9243 9244 # Perhaps we can figure out the type of this property based on the 9245 # fact of adding this match table. First, string properties don't 9246 # have match tables; second, a binary property can't have 3 match 9247 # tables 9248 if ($type{$addr} == $UNKNOWN) { 9249 $type{$addr} = $NON_STRING; 9250 } 9251 elsif ($type{$addr} == $STRING) { 9252 Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News."); 9253 $type{$addr} = $NON_STRING; 9254 } 9255 elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) { 9256 if (scalar main::uniques(values %{$table_ref{$addr}}) > 2) { 9257 if ($type{$addr} == $BINARY) { 9258 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News."); 9259 } 9260 $type{$addr} = $ENUM; 9261 } 9262 } 9263 9264 return $table; 9265 } 9266 9267 sub delete_match_table { 9268 # Delete the table referred to by $2 from the property $1. 9269 9270 my $self = shift; 9271 my $table_to_remove = shift; 9272 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 9273 9274 my $addr = do { no overloading; pack 'J', $self; }; 9275 9276 # Remove all names that refer to it. 9277 foreach my $key (keys %{$table_ref{$addr}}) { 9278 delete $table_ref{$addr}{$key} 9279 if $table_ref{$addr}{$key} == $table_to_remove; 9280 } 9281 9282 $table_to_remove->DESTROY; 9283 return; 9284 } 9285 9286 sub table { 9287 # Return a pointer to the match table (with name given by the 9288 # parameter) associated with this property; undef if none. 9289 9290 my $self = shift; 9291 my $name = shift; 9292 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 9293 9294 my $addr = do { no overloading; pack 'J', $self; }; 9295 9296 return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name}; 9297 9298 # If quick look-up failed, try again using the standard form of the 9299 # input name. If that succeeds, cache the result before returning so 9300 # won't have to standardize this input name again. 9301 my $standard_name = main::standardize($name); 9302 return unless defined $table_ref{$addr}{$standard_name}; 9303 9304 $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name}; 9305 return $table_ref{$addr}{$name}; 9306 } 9307 9308 sub tables { 9309 # Return a list of pointers to all the match tables attached to this 9310 # property 9311 9312 no overloading; 9313 return main::uniques(values %{$table_ref{pack 'J', shift}}); 9314 } 9315 9316 sub directory { 9317 # Returns the directory the map table for this property should be 9318 # output in. If a specific directory has been specified, that has 9319 # priority; 'undef' is returned if the type isn't defined; 9320 # or $map_directory for everything else. 9321 9322 my $addr = do { no overloading; pack 'J', shift; }; 9323 9324 return $directory{$addr} if defined $directory{$addr}; 9325 return undef if $type{$addr} == $UNKNOWN; 9326 return $map_directory; 9327 } 9328 9329 sub swash_name { 9330 # Return the name that is used to both: 9331 # 1) Name the file that the map table is written to. 9332 # 2) The name of swash related stuff inside that file. 9333 # The reason for this is that the Perl core historically has used 9334 # certain names that aren't the same as the Unicode property names. 9335 # To continue using these, $file is hard-coded in this file for those, 9336 # but otherwise the standard name is used. This is different from the 9337 # external_name, so that the rest of the files, like in lib can use 9338 # the standard name always, without regard to historical precedent. 9339 9340 my $self = shift; 9341 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 9342 9343 my $addr = do { no overloading; pack 'J', $self; }; 9344 9345 # Swash names are used only on either 9346 # 1) legacy-only properties, because the formats for these are 9347 # unchangeable, and they have had these lines in them; or 9348 # 2) regular or internal-only map tables 9349 # 3) otherwise there should be no access to the 9350 # property map table from other parts of Perl. 9351 return if $map{$addr}->fate != $ORDINARY 9352 && $map{$addr}->fate != $LEGACY_ONLY 9353 && ! ($map{$addr}->name =~ /^_/ 9354 && $map{$addr}->fate == $INTERNAL_ONLY); 9355 9356 return $file{$addr} if defined $file{$addr}; 9357 return $map{$addr}->external_name; 9358 } 9359 9360 sub to_create_match_tables { 9361 # Returns a boolean as to whether or not match tables should be 9362 # created for this property. 9363 9364 my $self = shift; 9365 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 9366 9367 # The whole point of this pseudo property is match tables. 9368 return 1 if $self == $perl; 9369 9370 my $addr = do { no overloading; pack 'J', $self; }; 9371 9372 # Don't generate tables of code points that match the property values 9373 # of a string property. Such a list would most likely have many 9374 # property values, each with just one or very few code points mapping 9375 # to it. 9376 return 0 if $type{$addr} == $STRING; 9377 9378 # Otherwise, do. 9379 return 1; 9380 } 9381 9382 sub property_add_or_replace_non_nulls { 9383 # This adds the mappings in the property $other to $self. Non-null 9384 # mappings from $other override those in $self. It essentially merges 9385 # the two properties, with the second having priority except for null 9386 # mappings. 9387 9388 my $self = shift; 9389 my $other = shift; 9390 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 9391 9392 if (! $other->isa(__PACKAGE__)) { 9393 Carp::my_carp_bug("$other should be a " 9394 . __PACKAGE__ 9395 . ". Not a '" 9396 . ref($other) 9397 . "'. Not added;"); 9398 return; 9399 } 9400 9401 no overloading; 9402 return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other}); 9403 } 9404 9405 sub set_proxy_for { 9406 # Certain tables are not generally written out to files, but 9407 # Unicode::UCD has the intelligence to know that the file for $self 9408 # can be used to reconstruct those tables. This routine just changes 9409 # things so that UCD pod entries for those suppressed tables are 9410 # generated, so the fact that a proxy is used is invisible to the 9411 # user. 9412 9413 my $self = shift; 9414 9415 foreach my $property_name (@_) { 9416 my $ref = property_ref($property_name); 9417 next if $ref->to_output_map; 9418 $ref->set_fate($MAP_PROXIED); 9419 } 9420 } 9421 9422 sub set_type { 9423 # Set the type of the property. Mostly this is figured out by the 9424 # data in the table. But this is used to set it explicitly. The 9425 # reason it is not a standard accessor is that when setting a binary 9426 # property, we need to make sure that all the true/false aliases are 9427 # present, as they were omitted in early Unicode releases. 9428 9429 my $self = shift; 9430 my $type = shift; 9431 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 9432 9433 if ($type != $ENUM 9434 && $type != $BINARY 9435 && $type != $FORCED_BINARY 9436 && $type != $STRING) 9437 { 9438 Carp::my_carp("Unrecognized type '$type'. Type not set"); 9439 return; 9440 } 9441 9442 { no overloading; $type{pack 'J', $self} = $type; } 9443 return if $type != $BINARY && $type != $FORCED_BINARY; 9444 9445 my $yes = $self->table('Y'); 9446 $yes = $self->table('Yes') if ! defined $yes; 9447 $yes = $self->add_match_table('Y', Full_Name => 'Yes') 9448 if ! defined $yes; 9449 9450 # Add aliases in order wanted, duplicates will be ignored. We use a 9451 # binary property present in all releases for its ordered lists of 9452 # true/false aliases. Note, that could run into problems in 9453 # outputting things in that we don't distinguish between the name and 9454 # full name of these. Hopefully, if the table was already created 9455 # before this code is executed, it was done with these set properly. 9456 my $bm = property_ref("Bidi_Mirrored"); 9457 foreach my $alias ($bm->table("Y")->aliases) { 9458 $yes->add_alias($alias->name); 9459 } 9460 my $no = $self->table('N'); 9461 $no = $self->table('No') if ! defined $no; 9462 $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no; 9463 foreach my $alias ($bm->table("N")->aliases) { 9464 $no->add_alias($alias->name); 9465 } 9466 9467 return; 9468 } 9469 9470 sub add_map { 9471 # Add a map to the property's map table. This also keeps 9472 # track of the maps so that the property type can be determined from 9473 # its data. 9474 9475 my $self = shift; 9476 my $start = shift; # First code point in range 9477 my $end = shift; # Final code point in range 9478 my $map = shift; # What the range maps to. 9479 # Rest of parameters passed on. 9480 9481 my $addr = do { no overloading; pack 'J', $self; }; 9482 9483 # If haven't the type of the property, gather information to figure it 9484 # out. 9485 if ($type{$addr} == $UNKNOWN) { 9486 9487 # If the map contains an interior blank or dash, or most other 9488 # nonword characters, it will be a string property. This 9489 # heuristic may actually miss some string properties. If so, they 9490 # may need to have explicit set_types called for them. This 9491 # happens in the Unihan properties. 9492 if ($map =~ / (?<= . ) [ -] (?= . ) /x 9493 || $map =~ / [^\w.\/\ -] /x) 9494 { 9495 $self->set_type($STRING); 9496 9497 # $unique_maps is used for disambiguating between ENUM and 9498 # BINARY later; since we know the property is not going to be 9499 # one of those, no point in keeping the data around 9500 undef $unique_maps{$addr}; 9501 } 9502 else { 9503 9504 # Not necessarily a string. The final decision has to be 9505 # deferred until all the data are in. We keep track of if all 9506 # the values are code points for that eventual decision. 9507 $has_only_code_point_maps{$addr} &= 9508 $map =~ / ^ $code_point_re $/x; 9509 9510 # For the purposes of disambiguating between binary and other 9511 # enumerations at the end, we keep track of the first three 9512 # distinct property values. Once we get to three, we know 9513 # it's not going to be binary, so no need to track more. 9514 if (scalar keys %{$unique_maps{$addr}} < 3) { 9515 $unique_maps{$addr}{main::standardize($map)} = 1; 9516 } 9517 } 9518 } 9519 9520 # Add the mapping by calling our map table's method 9521 return $map{$addr}->add_map($start, $end, $map, @_); 9522 } 9523 9524 sub compute_type { 9525 # Compute the type of the property: $ENUM, $STRING, or $BINARY. This 9526 # should be called after the property is mostly filled with its maps. 9527 # We have been keeping track of what the property values have been, 9528 # and now have the necessary information to figure out the type. 9529 9530 my $self = shift; 9531 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 9532 9533 my $addr = do { no overloading; pack 'J', $self; }; 9534 9535 my $type = $type{$addr}; 9536 9537 # If already have figured these out, no need to do so again, but we do 9538 # a double check on ENUMS to make sure that a string property hasn't 9539 # improperly been classified as an ENUM, so continue on with those. 9540 return if $type == $STRING 9541 || $type == $BINARY 9542 || $type == $FORCED_BINARY; 9543 9544 # If every map is to a code point, is a string property. 9545 if ($type == $UNKNOWN 9546 && ($has_only_code_point_maps{$addr} 9547 || (defined $map{$addr}->default_map 9548 && $map{$addr}->default_map eq ""))) 9549 { 9550 $self->set_type($STRING); 9551 } 9552 else { 9553 9554 # Otherwise, it is to some sort of enumeration. (The case where 9555 # it is a Unicode miscellaneous property, and treated like a 9556 # string in this program is handled in add_map()). Distinguish 9557 # between binary and some other enumeration type. Of course, if 9558 # there are more than two values, it's not binary. But more 9559 # subtle is the test that the default mapping is defined means it 9560 # isn't binary. This in fact may change in the future if Unicode 9561 # changes the way its data is structured. But so far, no binary 9562 # properties ever have @missing lines for them, so the default map 9563 # isn't defined for them. The few properties that are two-valued 9564 # and aren't considered binary have the default map defined 9565 # starting in Unicode 5.0, when the @missing lines appeared; and 9566 # this program has special code to put in a default map for them 9567 # for earlier than 5.0 releases. 9568 if ($type == $ENUM 9569 || scalar keys %{$unique_maps{$addr}} > 2 9570 || defined $self->default_map) 9571 { 9572 my $tables = $self->tables; 9573 my $count = $self->count; 9574 if ($verbosity && $tables > 500 && $tables/$count > .1) { 9575 Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $tables\n"); 9576 } 9577 $self->set_type($ENUM); 9578 } 9579 else { 9580 $self->set_type($BINARY); 9581 } 9582 } 9583 undef $unique_maps{$addr}; # Garbage collect 9584 return; 9585 } 9586 9587 sub set_fate { 9588 my $self = shift; 9589 my $fate = shift; 9590 my $reason = shift; # Ignored unless suppressing 9591 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 9592 9593 my $addr = do { no overloading; pack 'J', $self; }; 9594 if ($fate >= $SUPPRESSED) { 9595 $why_suppressed{$self->complete_name} = $reason; 9596 } 9597 9598 # Each table shares the property's fate, except that MAP_PROXIED 9599 # doesn't affect match tables 9600 $map{$addr}->set_fate($fate, $reason); 9601 if ($fate != $MAP_PROXIED) { 9602 foreach my $table ($map{$addr}, $self->tables) { 9603 $table->set_fate($fate, $reason); 9604 } 9605 } 9606 return; 9607 } 9608 9609 9610 # Most of the accessors for a property actually apply to its map table. 9611 # Setup up accessor functions for those, referring to %map 9612 for my $sub (qw( 9613 add_alias 9614 add_anomalous_entry 9615 add_comment 9616 add_conflicting 9617 add_description 9618 add_duplicate 9619 add_note 9620 aliases 9621 comment 9622 complete_name 9623 containing_range 9624 count 9625 default_map 9626 definition 9627 delete_range 9628 description 9629 each_range 9630 external_name 9631 fate 9632 file_path 9633 format 9634 initialize 9635 inverse_list 9636 is_empty 9637 replacement_property 9638 name 9639 note 9640 perl_extension 9641 property 9642 range_count 9643 ranges 9644 range_size_1 9645 replace_map 9646 reset_each_range 9647 set_comment 9648 set_default_map 9649 set_file_path 9650 set_final_comment 9651 _set_format 9652 set_range_size_1 9653 set_status 9654 set_to_output_map 9655 short_name 9656 status 9657 status_info 9658 to_output_map 9659 type_of 9660 value_of 9661 write 9662 )) 9663 # 'property' above is for symmetry, so that one can take 9664 # the property of a property and get itself, and so don't 9665 # have to distinguish between properties and tables in 9666 # calling code 9667 { 9668 no strict "refs"; 9669 *$sub = sub { 9670 use strict "refs"; 9671 my $self = shift; 9672 no overloading; 9673 return $map{pack 'J', $self}->$sub(@_); 9674 } 9675 } 9676 9677 9678} # End closure 9679 9680package main; 9681 9682sub display_chr { 9683 # Converts an ordinal printable character value to a displayable string, 9684 # using a dotted circle to hold combining characters. 9685 9686 my $ord = shift; 9687 my $chr = chr $ord; 9688 return $chr if $ccc->table(0)->contains($ord); 9689 return "\x{25CC}$chr"; 9690} 9691 9692sub join_lines($) { 9693 # Returns lines of the input joined together, so that they can be folded 9694 # properly. 9695 # This causes continuation lines to be joined together into one long line 9696 # for folding. A continuation line is any line that doesn't begin with a 9697 # space or "\b" (the latter is stripped from the output). This is so 9698 # lines can be be in a HERE document so as to fit nicely in the terminal 9699 # width, but be joined together in one long line, and then folded with 9700 # indents, '#' prefixes, etc, properly handled. 9701 # A blank separates the joined lines except if there is a break; an extra 9702 # blank is inserted after a period ending a line. 9703 9704 # Initialize the return with the first line. 9705 my ($return, @lines) = split "\n", shift; 9706 9707 # If the first line is null, it was an empty line, add the \n back in 9708 $return = "\n" if $return eq ""; 9709 9710 # Now join the remainder of the physical lines. 9711 for my $line (@lines) { 9712 9713 # An empty line means wanted a blank line, so add two \n's to get that 9714 # effect, and go to the next line. 9715 if (length $line == 0) { 9716 $return .= "\n\n"; 9717 next; 9718 } 9719 9720 # Look at the last character of what we have so far. 9721 my $previous_char = substr($return, -1, 1); 9722 9723 # And at the next char to be output. 9724 my $next_char = substr($line, 0, 1); 9725 9726 if ($previous_char ne "\n") { 9727 9728 # Here didn't end wth a nl. If the next char a blank or \b, it 9729 # means that here there is a break anyway. So add a nl to the 9730 # output. 9731 if ($next_char eq " " || $next_char eq "\b") { 9732 $previous_char = "\n"; 9733 $return .= $previous_char; 9734 } 9735 9736 # Add an extra space after periods. 9737 $return .= " " if $previous_char eq '.'; 9738 } 9739 9740 # Here $previous_char is still the latest character to be output. If 9741 # it isn't a nl, it means that the next line is to be a continuation 9742 # line, with a blank inserted between them. 9743 $return .= " " if $previous_char ne "\n"; 9744 9745 # Get rid of any \b 9746 substr($line, 0, 1) = "" if $next_char eq "\b"; 9747 9748 # And append this next line. 9749 $return .= $line; 9750 } 9751 9752 return $return; 9753} 9754 9755sub simple_fold($;$$$) { 9756 # Returns a string of the input (string or an array of strings) folded 9757 # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus 9758 # a \n 9759 # This is tailored for the kind of text written by this program, 9760 # especially the pod file, which can have very long names with 9761 # underscores in the middle, or words like AbcDefgHij.... We allow 9762 # breaking in the middle of such constructs if the line won't fit 9763 # otherwise. The break in such cases will come either just after an 9764 # underscore, or just before one of the Capital letters. 9765 9766 local $to_trace = 0 if main::DEBUG; 9767 9768 my $line = shift; 9769 my $prefix = shift; # Optional string to prepend to each output 9770 # line 9771 $prefix = "" unless defined $prefix; 9772 9773 my $hanging_indent = shift; # Optional number of spaces to indent 9774 # continuation lines 9775 $hanging_indent = 0 unless $hanging_indent; 9776 9777 my $right_margin = shift; # Optional number of spaces to narrow the 9778 # total width by. 9779 $right_margin = 0 unless defined $right_margin; 9780 9781 # Call carp with the 'nofold' option to avoid it from trying to call us 9782 # recursively 9783 Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_; 9784 9785 # The space available doesn't include what's automatically prepended 9786 # to each line, or what's reserved on the right. 9787 my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin; 9788 # XXX Instead of using the 'nofold' perhaps better to look up the stack 9789 9790 if (DEBUG && $hanging_indent >= $max) { 9791 Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold'); 9792 $hanging_indent = 0; 9793 } 9794 9795 # First, split into the current physical lines. 9796 my @line; 9797 if (ref $line) { # Better be an array, because not bothering to 9798 # test 9799 foreach my $line (@{$line}) { 9800 push @line, split /\n/, $line; 9801 } 9802 } 9803 else { 9804 @line = split /\n/, $line; 9805 } 9806 9807 #local $to_trace = 1 if main::DEBUG; 9808 trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace; 9809 9810 # Look at each current physical line. 9811 for (my $i = 0; $i < @line; $i++) { 9812 Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/; 9813 #local $to_trace = 1 if main::DEBUG; 9814 trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace; 9815 9816 # Remove prefix, because will be added back anyway, don't want 9817 # doubled prefix 9818 $line[$i] =~ s/^$prefix//; 9819 9820 # Remove trailing space 9821 $line[$i] =~ s/\s+\Z//; 9822 9823 # If the line is too long, fold it. 9824 if (length $line[$i] > $max) { 9825 my $remainder; 9826 9827 # Here needs to fold. Save the leading space in the line for 9828 # later. 9829 $line[$i] =~ /^ ( \s* )/x; 9830 my $leading_space = $1; 9831 trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace; 9832 9833 # If character at final permissible position is white space, 9834 # fold there, which will delete that white space 9835 if (substr($line[$i], $max - 1, 1) =~ /\s/) { 9836 $remainder = substr($line[$i], $max); 9837 $line[$i] = substr($line[$i], 0, $max - 1); 9838 } 9839 else { 9840 9841 # Otherwise fold at an acceptable break char closest to 9842 # the max length. Look at just the maximal initial 9843 # segment of the line 9844 my $segment = substr($line[$i], 0, $max - 1); 9845 if ($segment =~ 9846 /^ ( .{$hanging_indent} # Don't look before the 9847 # indent. 9848 \ * # Don't look in leading 9849 # blanks past the indent 9850 [^ ] .* # Find the right-most 9851 (?: # acceptable break: 9852 [ \s = ] # space or equal 9853 | - (?! [.0-9] ) # or non-unary minus. 9854 ) # $1 includes the character 9855 )/x) 9856 { 9857 # Split into the initial part that fits, and remaining 9858 # part of the input 9859 $remainder = substr($line[$i], length $1); 9860 $line[$i] = $1; 9861 trace $line[$i] if DEBUG && $to_trace; 9862 trace $remainder if DEBUG && $to_trace; 9863 } 9864 9865 # If didn't find a good breaking spot, see if there is a 9866 # not-so-good breaking spot. These are just after 9867 # underscores or where the case changes from lower to 9868 # upper. Use \a as a soft hyphen, but give up 9869 # and don't break the line if there is actually a \a 9870 # already in the input. We use an ascii character for the 9871 # soft-hyphen to avoid any attempt by miniperl to try to 9872 # access the files that this program is creating. 9873 elsif ($segment !~ /\a/ 9874 && ($segment =~ s/_/_\a/g 9875 || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg)) 9876 { 9877 # Here were able to find at least one place to insert 9878 # our substitute soft hyphen. Find the right-most one 9879 # and replace it by a real hyphen. 9880 trace $segment if DEBUG && $to_trace; 9881 substr($segment, 9882 rindex($segment, "\a"), 9883 1) = '-'; 9884 9885 # Then remove the soft hyphen substitutes. 9886 $segment =~ s/\a//g; 9887 trace $segment if DEBUG && $to_trace; 9888 9889 # And split into the initial part that fits, and 9890 # remainder of the line 9891 my $pos = rindex($segment, '-'); 9892 $remainder = substr($line[$i], $pos); 9893 trace $remainder if DEBUG && $to_trace; 9894 $line[$i] = substr($segment, 0, $pos + 1); 9895 } 9896 } 9897 9898 # Here we know if we can fold or not. If we can, $remainder 9899 # is what remains to be processed in the next iteration. 9900 if (defined $remainder) { 9901 trace "folded='$line[$i]'" if main::DEBUG && $to_trace; 9902 9903 # Insert the folded remainder of the line as a new element 9904 # of the array. (It may still be too long, but we will 9905 # deal with that next time through the loop.) Omit any 9906 # leading space in the remainder. 9907 $remainder =~ s/^\s+//; 9908 trace "remainder='$remainder'" if main::DEBUG && $to_trace; 9909 9910 # But then indent by whichever is larger of: 9911 # 1) the leading space on the input line; 9912 # 2) the hanging indent. 9913 # This preserves indentation in the original line. 9914 my $lead = ($leading_space) 9915 ? length $leading_space 9916 : $hanging_indent; 9917 $lead = max($lead, $hanging_indent); 9918 splice @line, $i+1, 0, (" " x $lead) . $remainder; 9919 } 9920 } 9921 9922 # Ready to output the line. Get rid of any trailing space 9923 # And prefix by the required $prefix passed in. 9924 $line[$i] =~ s/\s+$//; 9925 $line[$i] = "$prefix$line[$i]\n"; 9926 } # End of looping through all the lines. 9927 9928 return join "", @line; 9929} 9930 9931sub property_ref { # Returns a reference to a property object. 9932 return Property::property_ref(@_); 9933} 9934 9935sub force_unlink ($) { 9936 my $filename = shift; 9937 return unless file_exists($filename); 9938 return if CORE::unlink($filename); 9939 9940 # We might need write permission 9941 chmod 0777, $filename; 9942 CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!"); 9943 return; 9944} 9945 9946sub write ($$@) { 9947 # Given a filename and references to arrays of lines, write the lines of 9948 # each array to the file 9949 # Filename can be given as an arrayref of directory names 9950 9951 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3; 9952 9953 my $file = shift; 9954 my $use_utf8 = shift; 9955 9956 # Get into a single string if an array, and get rid of, in Unix terms, any 9957 # leading '.' 9958 $file= File::Spec->join(@$file) if ref $file eq 'ARRAY'; 9959 $file = File::Spec->canonpath($file); 9960 9961 # If has directories, make sure that they all exist 9962 (undef, my $directories, undef) = File::Spec->splitpath($file); 9963 File::Path::mkpath($directories) if $directories && ! -d $directories; 9964 9965 push @files_actually_output, $file; 9966 9967 force_unlink ($file); 9968 9969 my $OUT; 9970 if (not open $OUT, ">", $file) { 9971 Carp::my_carp("can't open $file for output. Skipping this file: $!"); 9972 return; 9973 } 9974 9975 binmode $OUT, ":utf8" if $use_utf8; 9976 9977 while (defined (my $lines_ref = shift)) { 9978 unless (@$lines_ref) { 9979 Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;"); 9980 } 9981 9982 print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!"); 9983 } 9984 close $OUT or die Carp::my_carp("close '$file' failed: $!"); 9985 9986 print "$file written.\n" if $verbosity >= $VERBOSE; 9987 9988 return; 9989} 9990 9991 9992sub Standardize($) { 9993 # This converts the input name string into a standardized equivalent to 9994 # use internally. 9995 9996 my $name = shift; 9997 unless (defined $name) { 9998 Carp::my_carp_bug("Standardize() called with undef. Returning undef."); 9999 return; 10000 } 10001 10002 # Remove any leading or trailing white space 10003 $name =~ s/^\s+//g; 10004 $name =~ s/\s+$//g; 10005 10006 # Convert interior white space and hyphens into underscores. 10007 $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg; 10008 10009 # Capitalize the letter following an underscore, and convert a sequence of 10010 # multiple underscores to a single one 10011 $name =~ s/ (?<= .) _+ (.) /_\u$1/xg; 10012 10013 # And capitalize the first letter, but not for the special cjk ones. 10014 $name = ucfirst($name) unless $name =~ /^k[A-Z]/; 10015 return $name; 10016} 10017 10018sub standardize ($) { 10019 # Returns a lower-cased standardized name, without underscores. This form 10020 # is chosen so that it can distinguish between any real versus superficial 10021 # Unicode name differences. It relies on the fact that Unicode doesn't 10022 # have interior underscores, white space, nor dashes in any 10023 # stricter-matched name. It should not be used on Unicode code point 10024 # names (the Name property), as they mostly, but not always follow these 10025 # rules. 10026 10027 my $name = Standardize(shift); 10028 return if !defined $name; 10029 10030 $name =~ s/ (?<= .) _ (?= . ) //xg; 10031 return lc $name; 10032} 10033 10034sub utf8_heavy_name ($$) { 10035 # Returns the name that utf8_heavy.pl will use to find a table. XXX 10036 # perhaps this function should be placed somewhere, like Heavy.pl so that 10037 # utf8_heavy can use it directly without duplicating code that can get 10038 # out-of sync. 10039 10040 my $table = shift; 10041 my $alias = shift; 10042 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 10043 10044 my $property = $table->property; 10045 $property = ($property == $perl) 10046 ? "" # 'perl' is never explicitly stated 10047 : standardize($property->name) . '='; 10048 if ($alias->loose_match) { 10049 return $property . standardize($alias->name); 10050 } 10051 else { 10052 return lc ($property . $alias->name); 10053 } 10054 10055 return; 10056} 10057 10058{ # Closure 10059 10060 my $indent_increment = " " x (($debugging_build) ? 2 : 0); 10061 %main::already_output = (); 10062 10063 $main::simple_dumper_nesting = 0; 10064 10065 sub simple_dumper { 10066 # Like Simple Data::Dumper. Good enough for our needs. We can't use 10067 # the real thing as we have to run under miniperl. 10068 10069 # It is designed so that on input it is at the beginning of a line, 10070 # and the final thing output in any call is a trailing ",\n". 10071 10072 my $item = shift; 10073 my $indent = shift; 10074 $indent = "" if ! $debugging_build || ! defined $indent; 10075 10076 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 10077 10078 # nesting level is localized, so that as the call stack pops, it goes 10079 # back to the prior value. 10080 local $main::simple_dumper_nesting = $main::simple_dumper_nesting; 10081 local %main::already_output = %main::already_output; 10082 $main::simple_dumper_nesting++; 10083 #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n"; 10084 10085 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 10086 10087 # Determine the indent for recursive calls. 10088 my $next_indent = $indent . $indent_increment; 10089 10090 my $output; 10091 if (! ref $item) { 10092 10093 # Dump of scalar: just output it in quotes if not a number. To do 10094 # so we must escape certain characters, and therefore need to 10095 # operate on a copy to avoid changing the original 10096 my $copy = $item; 10097 $copy = $UNDEF unless defined $copy; 10098 10099 # Quote non-integers (integers also have optional leading '-') 10100 if ($copy eq "" || $copy !~ /^ -? \d+ $/x) { 10101 10102 # Escape apostrophe and backslash 10103 $copy =~ s/ ( ['\\] ) /\\$1/xg; 10104 $copy = "'$copy'"; 10105 } 10106 $output = "$indent$copy,\n"; 10107 } 10108 else { 10109 10110 # Keep track of cycles in the input, and refuse to infinitely loop 10111 my $addr = do { no overloading; pack 'J', $item; }; 10112 if (defined $main::already_output{$addr}) { 10113 return "${indent}ALREADY OUTPUT: $item\n"; 10114 } 10115 $main::already_output{$addr} = $item; 10116 10117 if (ref $item eq 'ARRAY') { 10118 my $using_brackets; 10119 $output = $indent; 10120 if ($main::simple_dumper_nesting > 1) { 10121 $output .= '['; 10122 $using_brackets = 1; 10123 } 10124 else { 10125 $using_brackets = 0; 10126 } 10127 10128 # If the array is empty, put the closing bracket on the same 10129 # line. Otherwise, recursively add each array element 10130 if (@$item == 0) { 10131 $output .= " "; 10132 } 10133 else { 10134 $output .= "\n"; 10135 for (my $i = 0; $i < @$item; $i++) { 10136 10137 # Indent array elements one level 10138 $output .= &simple_dumper($item->[$i], $next_indent); 10139 next if ! $debugging_build; 10140 $output =~ s/\n$//; # Remove any trailing nl so 10141 $output .= " # [$i]\n"; # as to add a comment giving 10142 # the array index 10143 } 10144 $output .= $indent; # Indent closing ']' to orig level 10145 } 10146 $output .= ']' if $using_brackets; 10147 $output .= ",\n"; 10148 } 10149 elsif (ref $item eq 'HASH') { 10150 my $is_first_line; 10151 my $using_braces; 10152 my $body_indent; 10153 10154 # No surrounding braces at top level 10155 $output .= $indent; 10156 if ($main::simple_dumper_nesting > 1) { 10157 $output .= "{\n"; 10158 $is_first_line = 0; 10159 $body_indent = $next_indent; 10160 $next_indent .= $indent_increment; 10161 $using_braces = 1; 10162 } 10163 else { 10164 $is_first_line = 1; 10165 $body_indent = $indent; 10166 $using_braces = 0; 10167 } 10168 10169 # Output hashes sorted alphabetically instead of apparently 10170 # random. Use caseless alphabetic sort 10171 foreach my $key (sort { lc $a cmp lc $b } keys %$item) 10172 { 10173 if ($is_first_line) { 10174 $is_first_line = 0; 10175 } 10176 else { 10177 $output .= "$body_indent"; 10178 } 10179 10180 # The key must be a scalar, but this recursive call quotes 10181 # it 10182 $output .= &simple_dumper($key); 10183 10184 # And change the trailing comma and nl to the hash fat 10185 # comma for clarity, and so the value can be on the same 10186 # line 10187 $output =~ s/,\n$/ => /; 10188 10189 # Recursively call to get the value's dump. 10190 my $next = &simple_dumper($item->{$key}, $next_indent); 10191 10192 # If the value is all on one line, remove its indent, so 10193 # will follow the => immediately. If it takes more than 10194 # one line, start it on a new line. 10195 if ($next !~ /\n.*\n/) { 10196 $next =~ s/^ *//; 10197 } 10198 else { 10199 $output .= "\n"; 10200 } 10201 $output .= $next; 10202 } 10203 10204 $output .= "$indent},\n" if $using_braces; 10205 } 10206 elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') { 10207 $output = $indent . ref($item) . "\n"; 10208 # XXX see if blessed 10209 } 10210 elsif ($item->can('dump')) { 10211 10212 # By convention in this program, objects furnish a 'dump' 10213 # method. Since not doing any output at this level, just pass 10214 # on the input indent 10215 $output = $item->dump($indent); 10216 } 10217 else { 10218 Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping."); 10219 } 10220 } 10221 return $output; 10222 } 10223} 10224 10225sub dump_inside_out { 10226 # Dump inside-out hashes in an object's state by converting them to a 10227 # regular hash and then calling simple_dumper on that. 10228 10229 my $object = shift; 10230 my $fields_ref = shift; 10231 10232 my $addr = do { no overloading; pack 'J', $object; }; 10233 10234 my %hash; 10235 foreach my $key (keys %$fields_ref) { 10236 $hash{$key} = $fields_ref->{$key}{$addr}; 10237 } 10238 10239 return simple_dumper(\%hash, @_); 10240} 10241 10242sub _operator_dot { 10243 # Overloaded '.' method that is common to all packages. It uses the 10244 # package's stringify method. 10245 10246 my $self = shift; 10247 my $other = shift; 10248 my $reversed = shift; 10249 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 10250 10251 $other = "" unless defined $other; 10252 10253 foreach my $which (\$self, \$other) { 10254 next unless ref $$which; 10255 if ($$which->can('_operator_stringify')) { 10256 $$which = $$which->_operator_stringify; 10257 } 10258 else { 10259 my $ref = ref $$which; 10260 my $addr = do { no overloading; pack 'J', $$which; }; 10261 $$which = "$ref ($addr)"; 10262 } 10263 } 10264 return ($reversed) 10265 ? "$other$self" 10266 : "$self$other"; 10267} 10268 10269sub _operator_dot_equal { 10270 # Overloaded '.=' method that is common to all packages. 10271 10272 my $self = shift; 10273 my $other = shift; 10274 my $reversed = shift; 10275 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 10276 10277 $other = "" unless defined $other; 10278 10279 if ($reversed) { 10280 return $other .= "$self"; 10281 } 10282 else { 10283 return "$self" . "$other"; 10284 } 10285} 10286 10287sub _operator_equal { 10288 # Generic overloaded '==' routine. To be equal, they must be the exact 10289 # same object 10290 10291 my $self = shift; 10292 my $other = shift; 10293 10294 return 0 unless defined $other; 10295 return 0 unless ref $other; 10296 no overloading; 10297 return $self == $other; 10298} 10299 10300sub _operator_not_equal { 10301 my $self = shift; 10302 my $other = shift; 10303 10304 return ! _operator_equal($self, $other); 10305} 10306 10307sub substitute_PropertyAliases($) { 10308 # Deal with early releases that don't have the crucial PropertyAliases.txt 10309 # file. 10310 10311 my $file_object = shift; 10312 $file_object->insert_lines(get_old_property_aliases()); 10313 10314 process_PropertyAliases($file_object); 10315} 10316 10317 10318sub process_PropertyAliases($) { 10319 # This reads in the PropertyAliases.txt file, which contains almost all 10320 # the character properties in Unicode and their equivalent aliases: 10321 # scf ; Simple_Case_Folding ; sfc 10322 # 10323 # Field 0 is the preferred short name for the property. 10324 # Field 1 is the full name. 10325 # Any succeeding ones are other accepted names. 10326 10327 my $file= shift; 10328 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 10329 10330 # Add any cjk properties that may have been defined. 10331 $file->insert_lines(@cjk_properties); 10332 10333 while ($file->next_line) { 10334 10335 my @data = split /\s*;\s*/; 10336 10337 my $full = $data[1]; 10338 10339 # This line is defective in early Perls. The property in Unihan.txt 10340 # is kRSUnicode. 10341 if ($full eq 'Unicode_Radical_Stroke' && @data < 3) { 10342 push @data, qw(cjkRSUnicode kRSUnicode); 10343 } 10344 10345 my $this = Property->new($data[0], Full_Name => $full); 10346 10347 $this->set_fate($SUPPRESSED, $why_suppressed{$full}) 10348 if $why_suppressed{$full}; 10349 10350 # Start looking for more aliases after these two. 10351 for my $i (2 .. @data - 1) { 10352 $this->add_alias($data[$i]); 10353 } 10354 10355 } 10356 10357 my $scf = property_ref("Simple_Case_Folding"); 10358 $scf->add_alias("scf"); 10359 $scf->add_alias("sfc"); 10360 10361 return; 10362} 10363 10364sub finish_property_setup { 10365 # Finishes setting up after PropertyAliases. 10366 10367 my $file = shift; 10368 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 10369 10370 # This entry was missing from this file in earlier Unicode versions 10371 if (-e 'Jamo.txt' && ! defined property_ref('JSN')) { 10372 Property->new('JSN', Full_Name => 'Jamo_Short_Name'); 10373 } 10374 10375 # These are used so much, that we set globals for them. 10376 $gc = property_ref('General_Category'); 10377 $block = property_ref('Block'); 10378 $script = property_ref('Script'); 10379 $age = property_ref('Age'); 10380 10381 # Perl adds this alias. 10382 $gc->add_alias('Category'); 10383 10384 # Unicode::Normalize expects this file with this name and directory. 10385 $ccc = property_ref('Canonical_Combining_Class'); 10386 if (defined $ccc) { 10387 $ccc->set_file('CombiningClass'); 10388 $ccc->set_directory(File::Spec->curdir()); 10389 } 10390 10391 # These two properties aren't actually used in the core, but unfortunately 10392 # the names just above that are in the core interfere with these, so 10393 # choose different names. These aren't a problem unless the map tables 10394 # for these files get written out. 10395 my $lowercase = property_ref('Lowercase'); 10396 $lowercase->set_file('IsLower') if defined $lowercase; 10397 my $uppercase = property_ref('Uppercase'); 10398 $uppercase->set_file('IsUpper') if defined $uppercase; 10399 10400 # Set up the hard-coded default mappings, but only on properties defined 10401 # for this release 10402 foreach my $property (keys %default_mapping) { 10403 my $property_object = property_ref($property); 10404 next if ! defined $property_object; 10405 my $default_map = $default_mapping{$property}; 10406 $property_object->set_default_map($default_map); 10407 10408 # A map of <code point> implies the property is string. 10409 if ($property_object->type == $UNKNOWN 10410 && $default_map eq $CODE_POINT) 10411 { 10412 $property_object->set_type($STRING); 10413 } 10414 } 10415 10416 # The following use the Multi_Default class to create objects for 10417 # defaults. 10418 10419 # Bidi class has a complicated default, but the derived file takes care of 10420 # the complications, leaving just 'L'. 10421 if (file_exists("${EXTRACTED}DBidiClass.txt")) { 10422 property_ref('Bidi_Class')->set_default_map('L'); 10423 } 10424 else { 10425 my $default; 10426 10427 # The derived file was introduced in 3.1.1. The values below are 10428 # taken from table 3-8, TUS 3.0 10429 my $default_R = 10430 'my $default = Range_List->new; 10431 $default->add_range(0x0590, 0x05FF); 10432 $default->add_range(0xFB1D, 0xFB4F);' 10433 ; 10434 10435 # The defaults apply only to unassigned characters 10436 $default_R .= '$gc->table("Unassigned") & $default;'; 10437 10438 if ($v_version lt v3.0.0) { 10439 $default = Multi_Default->new(R => $default_R, 'L'); 10440 } 10441 else { 10442 10443 # AL apparently not introduced until 3.0: TUS 2.x references are 10444 # not on-line to check it out 10445 my $default_AL = 10446 'my $default = Range_List->new; 10447 $default->add_range(0x0600, 0x07BF); 10448 $default->add_range(0xFB50, 0xFDFF); 10449 $default->add_range(0xFE70, 0xFEFF);' 10450 ; 10451 10452 # Non-character code points introduced in this release; aren't AL 10453 if ($v_version ge 3.1.0) { 10454 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);'; 10455 } 10456 $default_AL .= '$gc->table("Unassigned") & $default'; 10457 $default = Multi_Default->new(AL => $default_AL, 10458 R => $default_R, 10459 'L'); 10460 } 10461 property_ref('Bidi_Class')->set_default_map($default); 10462 } 10463 10464 # Joining type has a complicated default, but the derived file takes care 10465 # of the complications, leaving just 'U' (or Non_Joining), except the file 10466 # is bad in 3.1.0 10467 if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') { 10468 if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) { 10469 property_ref('Joining_Type')->set_default_map('Non_Joining'); 10470 } 10471 else { 10472 10473 # Otherwise, there are not one, but two possibilities for the 10474 # missing defaults: T and U. 10475 # The missing defaults that evaluate to T are given by: 10476 # T = Mn + Cf - ZWNJ - ZWJ 10477 # where Mn and Cf are the general category values. In other words, 10478 # any non-spacing mark or any format control character, except 10479 # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO 10480 # WIDTH JOINER (joining type C). 10481 my $default = Multi_Default->new( 10482 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D', 10483 'Non_Joining'); 10484 property_ref('Joining_Type')->set_default_map($default); 10485 } 10486 } 10487 10488 # Line break has a complicated default in early releases. It is 'Unknown' 10489 # for non-assigned code points; 'AL' for assigned. 10490 if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') { 10491 my $lb = property_ref('Line_Break'); 10492 if (file_exists("${EXTRACTED}DLineBreak.txt")) { 10493 $lb->set_default_map('Unknown'); 10494 } 10495 else { 10496 my $default = Multi_Default->new('AL' => '~ $gc->table("Cn")', 10497 'Unknown', 10498 ); 10499 $lb->set_default_map($default); 10500 } 10501 } 10502 10503 # For backwards compatibility with applications that may read the mapping 10504 # file directly (it was documented in 5.12 and 5.14 as being thusly 10505 # usable), keep it from being adjusted. (range_size_1 is 10506 # used to force the traditional format.) 10507 if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) { 10508 $nfkc_cf->set_to_output_map($EXTERNAL_MAP); 10509 $nfkc_cf->set_range_size_1(1); 10510 } 10511 if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) { 10512 $bmg->set_to_output_map($EXTERNAL_MAP); 10513 $bmg->set_range_size_1(1); 10514 } 10515 10516 property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED); 10517 10518 return; 10519} 10520 10521sub get_old_property_aliases() { 10522 # Returns what would be in PropertyAliases.txt if it existed in very old 10523 # versions of Unicode. It was derived from the one in 3.2, and pared 10524 # down based on the data that was actually in the older releases. 10525 # An attempt was made to use the existence of files to mean inclusion or 10526 # not of various aliases, but if this was not sufficient, using version 10527 # numbers was resorted to. 10528 10529 my @return; 10530 10531 # These are to be used in all versions (though some are constructed by 10532 # this program if missing) 10533 push @return, split /\n/, <<'END'; 10534bc ; Bidi_Class 10535Bidi_M ; Bidi_Mirrored 10536cf ; Case_Folding 10537ccc ; Canonical_Combining_Class 10538dm ; Decomposition_Mapping 10539dt ; Decomposition_Type 10540gc ; General_Category 10541isc ; ISO_Comment 10542lc ; Lowercase_Mapping 10543na ; Name 10544na1 ; Unicode_1_Name 10545nt ; Numeric_Type 10546nv ; Numeric_Value 10547scf ; Simple_Case_Folding 10548slc ; Simple_Lowercase_Mapping 10549stc ; Simple_Titlecase_Mapping 10550suc ; Simple_Uppercase_Mapping 10551tc ; Titlecase_Mapping 10552uc ; Uppercase_Mapping 10553END 10554 10555 if (-e 'Blocks.txt') { 10556 push @return, "blk ; Block\n"; 10557 } 10558 if (-e 'ArabicShaping.txt') { 10559 push @return, split /\n/, <<'END'; 10560jg ; Joining_Group 10561jt ; Joining_Type 10562END 10563 } 10564 if (-e 'PropList.txt') { 10565 10566 # This first set is in the original old-style proplist. 10567 push @return, split /\n/, <<'END'; 10568Bidi_C ; Bidi_Control 10569Dash ; Dash 10570Dia ; Diacritic 10571Ext ; Extender 10572Hex ; Hex_Digit 10573Hyphen ; Hyphen 10574IDC ; ID_Continue 10575Ideo ; Ideographic 10576Join_C ; Join_Control 10577Math ; Math 10578QMark ; Quotation_Mark 10579Term ; Terminal_Punctuation 10580WSpace ; White_Space 10581END 10582 # The next sets were added later 10583 if ($v_version ge v3.0.0) { 10584 push @return, split /\n/, <<'END'; 10585Upper ; Uppercase 10586Lower ; Lowercase 10587END 10588 } 10589 if ($v_version ge v3.0.1) { 10590 push @return, split /\n/, <<'END'; 10591NChar ; Noncharacter_Code_Point 10592END 10593 } 10594 # The next sets were added in the new-style 10595 if ($v_version ge v3.1.0) { 10596 push @return, split /\n/, <<'END'; 10597OAlpha ; Other_Alphabetic 10598OLower ; Other_Lowercase 10599OMath ; Other_Math 10600OUpper ; Other_Uppercase 10601END 10602 } 10603 if ($v_version ge v3.1.1) { 10604 push @return, "AHex ; ASCII_Hex_Digit\n"; 10605 } 10606 } 10607 if (-e 'EastAsianWidth.txt') { 10608 push @return, "ea ; East_Asian_Width\n"; 10609 } 10610 if (-e 'CompositionExclusions.txt') { 10611 push @return, "CE ; Composition_Exclusion\n"; 10612 } 10613 if (-e 'LineBreak.txt') { 10614 push @return, "lb ; Line_Break\n"; 10615 } 10616 if (-e 'BidiMirroring.txt') { 10617 push @return, "bmg ; Bidi_Mirroring_Glyph\n"; 10618 } 10619 if (-e 'Scripts.txt') { 10620 push @return, "sc ; Script\n"; 10621 } 10622 if (-e 'DNormalizationProps.txt') { 10623 push @return, split /\n/, <<'END'; 10624Comp_Ex ; Full_Composition_Exclusion 10625FC_NFKC ; FC_NFKC_Closure 10626NFC_QC ; NFC_Quick_Check 10627NFD_QC ; NFD_Quick_Check 10628NFKC_QC ; NFKC_Quick_Check 10629NFKD_QC ; NFKD_Quick_Check 10630XO_NFC ; Expands_On_NFC 10631XO_NFD ; Expands_On_NFD 10632XO_NFKC ; Expands_On_NFKC 10633XO_NFKD ; Expands_On_NFKD 10634END 10635 } 10636 if (-e 'DCoreProperties.txt') { 10637 push @return, split /\n/, <<'END'; 10638Alpha ; Alphabetic 10639IDS ; ID_Start 10640XIDC ; XID_Continue 10641XIDS ; XID_Start 10642END 10643 # These can also appear in some versions of PropList.txt 10644 push @return, "Lower ; Lowercase\n" 10645 unless grep { $_ =~ /^Lower\b/} @return; 10646 push @return, "Upper ; Uppercase\n" 10647 unless grep { $_ =~ /^Upper\b/} @return; 10648 } 10649 10650 # This flag requires the DAge.txt file to be copied into the directory. 10651 if (DEBUG && $compare_versions) { 10652 push @return, 'age ; Age'; 10653 } 10654 10655 return @return; 10656} 10657 10658sub substitute_PropValueAliases($) { 10659 # Deal with early releases that don't have the crucial 10660 # PropValueAliases.txt file. 10661 10662 my $file_object = shift; 10663 $file_object->insert_lines(get_old_property_value_aliases()); 10664 10665 process_PropValueAliases($file_object); 10666} 10667 10668sub process_PropValueAliases { 10669 # This file contains values that properties look like: 10670 # bc ; AL ; Arabic_Letter 10671 # blk; n/a ; Greek_And_Coptic ; Greek 10672 # 10673 # Field 0 is the property. 10674 # Field 1 is the short name of a property value or 'n/a' if no 10675 # short name exists; 10676 # Field 2 is the full property value name; 10677 # Any other fields are more synonyms for the property value. 10678 # Purely numeric property values are omitted from the file; as are some 10679 # others, fewer and fewer in later releases 10680 10681 # Entries for the ccc property have an extra field before the 10682 # abbreviation: 10683 # ccc; 0; NR ; Not_Reordered 10684 # It is the numeric value that the names are synonyms for. 10685 10686 # There are comment entries for values missing from this file: 10687 # # @missing: 0000..10FFFF; ISO_Comment; <none> 10688 # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point> 10689 10690 my $file= shift; 10691 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 10692 10693 if ($v_version lt 4.0.0) { 10694 $file->insert_lines(split /\n/, <<'END' 10695Hangul_Syllable_Type; L ; Leading_Jamo 10696Hangul_Syllable_Type; LV ; LV_Syllable 10697Hangul_Syllable_Type; LVT ; LVT_Syllable 10698Hangul_Syllable_Type; NA ; Not_Applicable 10699Hangul_Syllable_Type; T ; Trailing_Jamo 10700Hangul_Syllable_Type; V ; Vowel_Jamo 10701END 10702 ); 10703 } 10704 if ($v_version lt 4.1.0) { 10705 $file->insert_lines(split /\n/, <<'END' 10706_Perl_GCB; CN ; Control 10707_Perl_GCB; CR ; CR 10708_Perl_GCB; EX ; Extend 10709_Perl_GCB; L ; L 10710_Perl_GCB; LF ; LF 10711_Perl_GCB; LV ; LV 10712_Perl_GCB; LVT ; LVT 10713_Perl_GCB; T ; T 10714_Perl_GCB; V ; V 10715_Perl_GCB; XX ; Other 10716END 10717 ); 10718 } 10719 10720 10721 # Add any explicit cjk values 10722 $file->insert_lines(@cjk_property_values); 10723 10724 # This line is used only for testing the code that checks for name 10725 # conflicts. There is a script Inherited, and when this line is executed 10726 # it causes there to be a name conflict with the 'Inherited' that this 10727 # program generates for this block property value 10728 #$file->insert_lines('blk; n/a; Herited'); 10729 10730 # Process each line of the file ... 10731 while ($file->next_line) { 10732 10733 # Fix typo in input file 10734 s/CCC133/CCC132/g if $v_version eq v6.1.0; 10735 10736 my ($property, @data) = split /\s*;\s*/; 10737 10738 # The ccc property has an extra field at the beginning, which is the 10739 # numeric value. Move it to be after the other two, mnemonic, fields, 10740 # so that those will be used as the property value's names, and the 10741 # number will be an extra alias. (Rightmost splice removes field 1-2, 10742 # returning them in a slice; left splice inserts that before anything, 10743 # thus shifting the former field 0 to after them.) 10744 splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc'; 10745 10746 if ($v_version le v5.0.0 && $property eq 'blk' && $data[1] =~ /-/) { 10747 my $new_style = $data[1] =~ s/-/_/gr; 10748 splice @data, 1, 0, $new_style; 10749 } 10750 10751 # Field 0 is a short name unless "n/a"; field 1 is the full name. If 10752 # there is no short name, use the full one in element 1 10753 if ($data[0] eq "n/a") { 10754 $data[0] = $data[1]; 10755 } 10756 elsif ($data[0] ne $data[1] 10757 && standardize($data[0]) eq standardize($data[1]) 10758 && $data[1] !~ /[[:upper:]]/) 10759 { 10760 # Also, there is a bug in the file in which "n/a" is omitted, and 10761 # the two fields are identical except for case, and the full name 10762 # is all lower case. Copy the "short" name unto the full one to 10763 # give it some upper case. 10764 10765 $data[1] = $data[0]; 10766 } 10767 10768 # Earlier releases had the pseudo property 'qc' that should expand to 10769 # the ones that replace it below. 10770 if ($property eq 'qc') { 10771 if (lc $data[0] eq 'y') { 10772 $file->insert_lines('NFC_QC; Y ; Yes', 10773 'NFD_QC; Y ; Yes', 10774 'NFKC_QC; Y ; Yes', 10775 'NFKD_QC; Y ; Yes', 10776 ); 10777 } 10778 elsif (lc $data[0] eq 'n') { 10779 $file->insert_lines('NFC_QC; N ; No', 10780 'NFD_QC; N ; No', 10781 'NFKC_QC; N ; No', 10782 'NFKD_QC; N ; No', 10783 ); 10784 } 10785 elsif (lc $data[0] eq 'm') { 10786 $file->insert_lines('NFC_QC; M ; Maybe', 10787 'NFKC_QC; M ; Maybe', 10788 ); 10789 } 10790 else { 10791 $file->carp_bad_line("qc followed by unexpected '$data[0]"); 10792 } 10793 next; 10794 } 10795 10796 # The first field is the short name, 2nd is the full one. 10797 my $property_object = property_ref($property); 10798 my $table = $property_object->add_match_table($data[0], 10799 Full_Name => $data[1]); 10800 10801 # Start looking for more aliases after these two. 10802 for my $i (2 .. @data - 1) { 10803 $table->add_alias($data[$i]); 10804 } 10805 } # End of looping through the file 10806 10807 # As noted in the comments early in the program, it generates tables for 10808 # the default values for all releases, even those for which the concept 10809 # didn't exist at the time. Here we add those if missing. 10810 if (defined $age && ! defined $age->table('Unassigned')) { 10811 $age->add_match_table('Unassigned'); 10812 } 10813 $block->add_match_table('No_Block') if -e 'Blocks.txt' 10814 && ! defined $block->table('No_Block'); 10815 10816 10817 # Now set the default mappings of the properties from the file. This is 10818 # done after the loop because a number of properties have only @missings 10819 # entries in the file, and may not show up until the end. 10820 my @defaults = $file->get_missings; 10821 foreach my $default_ref (@defaults) { 10822 my $default = $default_ref->[0]; 10823 my $property = property_ref($default_ref->[1]); 10824 $property->set_default_map($default); 10825 } 10826 return; 10827} 10828 10829sub get_old_property_value_aliases () { 10830 # Returns what would be in PropValueAliases.txt if it existed in very old 10831 # versions of Unicode. It was derived from the one in 3.2, and pared 10832 # down. An attempt was made to use the existence of files to mean 10833 # inclusion or not of various aliases, but if this was not sufficient, 10834 # using version numbers was resorted to. 10835 10836 my @return = split /\n/, <<'END'; 10837bc ; AN ; Arabic_Number 10838bc ; B ; Paragraph_Separator 10839bc ; CS ; Common_Separator 10840bc ; EN ; European_Number 10841bc ; ES ; European_Separator 10842bc ; ET ; European_Terminator 10843bc ; L ; Left_To_Right 10844bc ; ON ; Other_Neutral 10845bc ; R ; Right_To_Left 10846bc ; WS ; White_Space 10847 10848Bidi_M; N; No; F; False 10849Bidi_M; Y; Yes; T; True 10850 10851# The standard combining classes are very much different in v1, so only use 10852# ones that look right (not checked thoroughly) 10853ccc; 0; NR ; Not_Reordered 10854ccc; 1; OV ; Overlay 10855ccc; 7; NK ; Nukta 10856ccc; 8; KV ; Kana_Voicing 10857ccc; 9; VR ; Virama 10858ccc; 202; ATBL ; Attached_Below_Left 10859ccc; 216; ATAR ; Attached_Above_Right 10860ccc; 218; BL ; Below_Left 10861ccc; 220; B ; Below 10862ccc; 222; BR ; Below_Right 10863ccc; 224; L ; Left 10864ccc; 228; AL ; Above_Left 10865ccc; 230; A ; Above 10866ccc; 232; AR ; Above_Right 10867ccc; 234; DA ; Double_Above 10868 10869dt ; can ; canonical 10870dt ; enc ; circle 10871dt ; fin ; final 10872dt ; font ; font 10873dt ; fra ; fraction 10874dt ; init ; initial 10875dt ; iso ; isolated 10876dt ; med ; medial 10877dt ; n/a ; none 10878dt ; nb ; noBreak 10879dt ; sqr ; square 10880dt ; sub ; sub 10881dt ; sup ; super 10882 10883gc ; C ; Other # Cc | Cf | Cn | Co | Cs 10884gc ; Cc ; Control 10885gc ; Cn ; Unassigned 10886gc ; Co ; Private_Use 10887gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu 10888gc ; LC ; Cased_Letter # Ll | Lt | Lu 10889gc ; Ll ; Lowercase_Letter 10890gc ; Lm ; Modifier_Letter 10891gc ; Lo ; Other_Letter 10892gc ; Lu ; Uppercase_Letter 10893gc ; M ; Mark # Mc | Me | Mn 10894gc ; Mc ; Spacing_Mark 10895gc ; Mn ; Nonspacing_Mark 10896gc ; N ; Number # Nd | Nl | No 10897gc ; Nd ; Decimal_Number 10898gc ; No ; Other_Number 10899gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps 10900gc ; Pd ; Dash_Punctuation 10901gc ; Pe ; Close_Punctuation 10902gc ; Po ; Other_Punctuation 10903gc ; Ps ; Open_Punctuation 10904gc ; S ; Symbol # Sc | Sk | Sm | So 10905gc ; Sc ; Currency_Symbol 10906gc ; Sm ; Math_Symbol 10907gc ; So ; Other_Symbol 10908gc ; Z ; Separator # Zl | Zp | Zs 10909gc ; Zl ; Line_Separator 10910gc ; Zp ; Paragraph_Separator 10911gc ; Zs ; Space_Separator 10912 10913nt ; de ; Decimal 10914nt ; di ; Digit 10915nt ; n/a ; None 10916nt ; nu ; Numeric 10917END 10918 10919 if (-e 'ArabicShaping.txt') { 10920 push @return, split /\n/, <<'END'; 10921jg ; n/a ; AIN 10922jg ; n/a ; ALEF 10923jg ; n/a ; DAL 10924jg ; n/a ; GAF 10925jg ; n/a ; LAM 10926jg ; n/a ; MEEM 10927jg ; n/a ; NO_JOINING_GROUP 10928jg ; n/a ; NOON 10929jg ; n/a ; QAF 10930jg ; n/a ; SAD 10931jg ; n/a ; SEEN 10932jg ; n/a ; TAH 10933jg ; n/a ; WAW 10934 10935jt ; C ; Join_Causing 10936jt ; D ; Dual_Joining 10937jt ; L ; Left_Joining 10938jt ; R ; Right_Joining 10939jt ; U ; Non_Joining 10940jt ; T ; Transparent 10941END 10942 if ($v_version ge v3.0.0) { 10943 push @return, split /\n/, <<'END'; 10944jg ; n/a ; ALAPH 10945jg ; n/a ; BEH 10946jg ; n/a ; BETH 10947jg ; n/a ; DALATH_RISH 10948jg ; n/a ; E 10949jg ; n/a ; FEH 10950jg ; n/a ; FINAL_SEMKATH 10951jg ; n/a ; GAMAL 10952jg ; n/a ; HAH 10953jg ; n/a ; HAMZA_ON_HEH_GOAL 10954jg ; n/a ; HE 10955jg ; n/a ; HEH 10956jg ; n/a ; HEH_GOAL 10957jg ; n/a ; HETH 10958jg ; n/a ; KAF 10959jg ; n/a ; KAPH 10960jg ; n/a ; KNOTTED_HEH 10961jg ; n/a ; LAMADH 10962jg ; n/a ; MIM 10963jg ; n/a ; NUN 10964jg ; n/a ; PE 10965jg ; n/a ; QAPH 10966jg ; n/a ; REH 10967jg ; n/a ; REVERSED_PE 10968jg ; n/a ; SADHE 10969jg ; n/a ; SEMKATH 10970jg ; n/a ; SHIN 10971jg ; n/a ; SWASH_KAF 10972jg ; n/a ; TAW 10973jg ; n/a ; TEH_MARBUTA 10974jg ; n/a ; TETH 10975jg ; n/a ; YEH 10976jg ; n/a ; YEH_BARREE 10977jg ; n/a ; YEH_WITH_TAIL 10978jg ; n/a ; YUDH 10979jg ; n/a ; YUDH_HE 10980jg ; n/a ; ZAIN 10981END 10982 } 10983 } 10984 10985 10986 if (-e 'EastAsianWidth.txt') { 10987 push @return, split /\n/, <<'END'; 10988ea ; A ; Ambiguous 10989ea ; F ; Fullwidth 10990ea ; H ; Halfwidth 10991ea ; N ; Neutral 10992ea ; Na ; Narrow 10993ea ; W ; Wide 10994END 10995 } 10996 10997 if (-e 'LineBreak.txt' || -e 'LBsubst.txt') { 10998 my @lb = split /\n/, <<'END'; 10999lb ; AI ; Ambiguous 11000lb ; AL ; Alphabetic 11001lb ; B2 ; Break_Both 11002lb ; BA ; Break_After 11003lb ; BB ; Break_Before 11004lb ; BK ; Mandatory_Break 11005lb ; CB ; Contingent_Break 11006lb ; CL ; Close_Punctuation 11007lb ; CM ; Combining_Mark 11008lb ; CR ; Carriage_Return 11009lb ; EX ; Exclamation 11010lb ; GL ; Glue 11011lb ; HY ; Hyphen 11012lb ; ID ; Ideographic 11013lb ; IN ; Inseperable 11014lb ; IS ; Infix_Numeric 11015lb ; LF ; Line_Feed 11016lb ; NS ; Nonstarter 11017lb ; NU ; Numeric 11018lb ; OP ; Open_Punctuation 11019lb ; PO ; Postfix_Numeric 11020lb ; PR ; Prefix_Numeric 11021lb ; QU ; Quotation 11022lb ; SA ; Complex_Context 11023lb ; SG ; Surrogate 11024lb ; SP ; Space 11025lb ; SY ; Break_Symbols 11026lb ; XX ; Unknown 11027lb ; ZW ; ZWSpace 11028END 11029 # If this Unicode version predates the lb property, we use our 11030 # substitute one 11031 if (-e 'LBsubst.txt') { 11032 $_ = s/^lb/_Perl_LB/r for @lb; 11033 } 11034 push @return, @lb; 11035 } 11036 11037 if (-e 'DNormalizationProps.txt') { 11038 push @return, split /\n/, <<'END'; 11039qc ; M ; Maybe 11040qc ; N ; No 11041qc ; Y ; Yes 11042END 11043 } 11044 11045 if (-e 'Scripts.txt') { 11046 push @return, split /\n/, <<'END'; 11047sc ; Arab ; Arabic 11048sc ; Armn ; Armenian 11049sc ; Beng ; Bengali 11050sc ; Bopo ; Bopomofo 11051sc ; Cans ; Canadian_Aboriginal 11052sc ; Cher ; Cherokee 11053sc ; Cyrl ; Cyrillic 11054sc ; Deva ; Devanagari 11055sc ; Dsrt ; Deseret 11056sc ; Ethi ; Ethiopic 11057sc ; Geor ; Georgian 11058sc ; Goth ; Gothic 11059sc ; Grek ; Greek 11060sc ; Gujr ; Gujarati 11061sc ; Guru ; Gurmukhi 11062sc ; Hang ; Hangul 11063sc ; Hani ; Han 11064sc ; Hebr ; Hebrew 11065sc ; Hira ; Hiragana 11066sc ; Ital ; Old_Italic 11067sc ; Kana ; Katakana 11068sc ; Khmr ; Khmer 11069sc ; Knda ; Kannada 11070sc ; Laoo ; Lao 11071sc ; Latn ; Latin 11072sc ; Mlym ; Malayalam 11073sc ; Mong ; Mongolian 11074sc ; Mymr ; Myanmar 11075sc ; Ogam ; Ogham 11076sc ; Orya ; Oriya 11077sc ; Qaai ; Inherited 11078sc ; Runr ; Runic 11079sc ; Sinh ; Sinhala 11080sc ; Syrc ; Syriac 11081sc ; Taml ; Tamil 11082sc ; Telu ; Telugu 11083sc ; Thaa ; Thaana 11084sc ; Thai ; Thai 11085sc ; Tibt ; Tibetan 11086sc ; Yiii ; Yi 11087sc ; Zyyy ; Common 11088END 11089 } 11090 11091 if ($v_version ge v2.0.0) { 11092 push @return, split /\n/, <<'END'; 11093dt ; com ; compat 11094dt ; nar ; narrow 11095dt ; sml ; small 11096dt ; vert ; vertical 11097dt ; wide ; wide 11098 11099gc ; Cf ; Format 11100gc ; Cs ; Surrogate 11101gc ; Lt ; Titlecase_Letter 11102gc ; Me ; Enclosing_Mark 11103gc ; Nl ; Letter_Number 11104gc ; Pc ; Connector_Punctuation 11105gc ; Sk ; Modifier_Symbol 11106END 11107 } 11108 if ($v_version ge v2.1.2) { 11109 push @return, "bc ; S ; Segment_Separator\n"; 11110 } 11111 if ($v_version ge v2.1.5) { 11112 push @return, split /\n/, <<'END'; 11113gc ; Pf ; Final_Punctuation 11114gc ; Pi ; Initial_Punctuation 11115END 11116 } 11117 if ($v_version ge v2.1.8) { 11118 push @return, "ccc; 240; IS ; Iota_Subscript\n"; 11119 } 11120 11121 if ($v_version ge v3.0.0) { 11122 push @return, split /\n/, <<'END'; 11123bc ; AL ; Arabic_Letter 11124bc ; BN ; Boundary_Neutral 11125bc ; LRE ; Left_To_Right_Embedding 11126bc ; LRO ; Left_To_Right_Override 11127bc ; NSM ; Nonspacing_Mark 11128bc ; PDF ; Pop_Directional_Format 11129bc ; RLE ; Right_To_Left_Embedding 11130bc ; RLO ; Right_To_Left_Override 11131 11132ccc; 233; DB ; Double_Below 11133END 11134 } 11135 11136 if ($v_version ge v3.1.0) { 11137 push @return, "ccc; 226; R ; Right\n"; 11138 } 11139 11140 return @return; 11141} 11142 11143sub process_NormalizationsTest { 11144 11145 # Each line looks like: 11146 # source code point; NFC; NFD; NFKC; NFKD 11147 # e.g. 11148 # 1E0A;1E0A;0044 0307;1E0A;0044 0307; 11149 11150 my $file= shift; 11151 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 11152 11153 # Process each line of the file ... 11154 while ($file->next_line) { 11155 11156 next if /^@/; 11157 11158 my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/; 11159 11160 foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) { 11161 $$var = pack "U0U*", map { hex } split " ", $$var; 11162 $$var =~ s/(\\)/$1$1/g; 11163 } 11164 11165 push @normalization_tests, 11166 "Test_N(q$c1, q$c2, q$c3, q$c4, q$c5);\n"; 11167 } # End of looping through the file 11168} 11169 11170sub output_perl_charnames_line ($$) { 11171 11172 # Output the entries in Perl_charnames specially, using 5 digits instead 11173 # of four. This makes the entries a constant length, and simplifies 11174 # charnames.pm which this table is for. Unicode can have 6 digit 11175 # ordinals, but they are all private use or noncharacters which do not 11176 # have names, so won't be in this table. 11177 11178 return sprintf "%05X\t%s\n", $_[0], $_[1]; 11179} 11180 11181{ # Closure 11182 11183 # These are constants to the $property_info hash in this subroutine, to 11184 # avoid using a quoted-string which might have a typo. 11185 my $TYPE = 'type'; 11186 my $DEFAULT_MAP = 'default_map'; 11187 my $DEFAULT_TABLE = 'default_table'; 11188 my $PSEUDO_MAP_TYPE = 'pseudo_map_type'; 11189 my $MISSINGS = 'missings'; 11190 11191 sub process_generic_property_file { 11192 # This processes a file containing property mappings and puts them 11193 # into internal map tables. It should be used to handle any property 11194 # files that have mappings from a code point or range thereof to 11195 # something else. This means almost all the UCD .txt files. 11196 # each_line_handlers() should be set to adjust the lines of these 11197 # files, if necessary, to what this routine understands: 11198 # 11199 # 0374 ; NFD_QC; N 11200 # 003C..003E ; Math 11201 # 11202 # the fields are: "codepoint-range ; property; map" 11203 # 11204 # meaning the codepoints in the range all have the value 'map' under 11205 # 'property'. 11206 # Beginning and trailing white space in each field are not significant. 11207 # Note there is not a trailing semi-colon in the above. A trailing 11208 # semi-colon means the map is a null-string. An omitted map, as 11209 # opposed to a null-string, is assumed to be 'Y', based on Unicode 11210 # table syntax. (This could have been hidden from this routine by 11211 # doing it in the $file object, but that would require parsing of the 11212 # line there, so would have to parse it twice, or change the interface 11213 # to pass this an array. So not done.) 11214 # 11215 # The map field may begin with a sequence of commands that apply to 11216 # this range. Each such command begins and ends with $CMD_DELIM. 11217 # These are used to indicate, for example, that the mapping for a 11218 # range has a non-default type. 11219 # 11220 # This loops through the file, calling its next_line() method, and 11221 # then taking the map and adding it to the property's table. 11222 # Complications arise because any number of properties can be in the 11223 # file, in any order, interspersed in any way. The first time a 11224 # property is seen, it gets information about that property and 11225 # caches it for quick retrieval later. It also normalizes the maps 11226 # so that only one of many synonyms is stored. The Unicode input 11227 # files do use some multiple synonyms. 11228 11229 my $file = shift; 11230 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 11231 11232 my %property_info; # To keep track of what properties 11233 # have already had entries in the 11234 # current file, and info about each, 11235 # so don't have to recompute. 11236 my $property_name; # property currently being worked on 11237 my $property_type; # and its type 11238 my $previous_property_name = ""; # name from last time through loop 11239 my $property_object; # pointer to the current property's 11240 # object 11241 my $property_addr; # the address of that object 11242 my $default_map; # the string that code points missing 11243 # from the file map to 11244 my $default_table; # For non-string properties, a 11245 # reference to the match table that 11246 # will contain the list of code 11247 # points that map to $default_map. 11248 11249 # Get the next real non-comment line 11250 LINE: 11251 while ($file->next_line) { 11252 11253 # Default replacement type; means that if parts of the range have 11254 # already been stored in our tables, the new map overrides them if 11255 # they differ more than cosmetically 11256 my $replace = $IF_NOT_EQUIVALENT; 11257 my $map_type; # Default type for the map of this range 11258 11259 #local $to_trace = 1 if main::DEBUG; 11260 trace $_ if main::DEBUG && $to_trace; 11261 11262 # Split the line into components 11263 my ($range, $property_name, $map, @remainder) 11264 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 11265 11266 # If more or less on the line than we are expecting, warn and skip 11267 # the line 11268 if (@remainder) { 11269 $file->carp_bad_line('Extra fields'); 11270 next LINE; 11271 } 11272 elsif ( ! defined $property_name) { 11273 $file->carp_bad_line('Missing property'); 11274 next LINE; 11275 } 11276 11277 # Examine the range. 11278 if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x) 11279 { 11280 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)"); 11281 next LINE; 11282 } 11283 my $low = hex $1; 11284 my $high = (defined $2) ? hex $2 : $low; 11285 11286 # If changing to a new property, get the things constant per 11287 # property 11288 if ($previous_property_name ne $property_name) { 11289 11290 $property_object = property_ref($property_name); 11291 if (! defined $property_object) { 11292 $file->carp_bad_line("Unexpected property '$property_name'. Skipped"); 11293 next LINE; 11294 } 11295 { no overloading; $property_addr = pack 'J', $property_object; } 11296 11297 # Defer changing names until have a line that is acceptable 11298 # (the 'next' statement above means is unacceptable) 11299 $previous_property_name = $property_name; 11300 11301 # If not the first time for this property, retrieve info about 11302 # it from the cache 11303 if (defined ($property_info{$property_addr}{$TYPE})) { 11304 $property_type = $property_info{$property_addr}{$TYPE}; 11305 $default_map = $property_info{$property_addr}{$DEFAULT_MAP}; 11306 $map_type 11307 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE}; 11308 $default_table 11309 = $property_info{$property_addr}{$DEFAULT_TABLE}; 11310 } 11311 else { 11312 11313 # Here, is the first time for this property. Set up the 11314 # cache. 11315 $property_type = $property_info{$property_addr}{$TYPE} 11316 = $property_object->type; 11317 $map_type 11318 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE} 11319 = $property_object->pseudo_map_type; 11320 11321 # The Unicode files are set up so that if the map is not 11322 # defined, it is a binary property 11323 if (! defined $map && $property_type != $BINARY) { 11324 if ($property_type != $UNKNOWN 11325 && $property_type != $NON_STRING) 11326 { 11327 $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map"); 11328 } 11329 else { 11330 $property_object->set_type($BINARY); 11331 $property_type 11332 = $property_info{$property_addr}{$TYPE} 11333 = $BINARY; 11334 } 11335 } 11336 11337 # Get any @missings default for this property. This 11338 # should precede the first entry for the property in the 11339 # input file, and is located in a comment that has been 11340 # stored by the Input_file class until we access it here. 11341 # It's possible that there is more than one such line 11342 # waiting for us; collect them all, and parse 11343 my @missings_list = $file->get_missings 11344 if $file->has_missings_defaults; 11345 foreach my $default_ref (@missings_list) { 11346 my $default = $default_ref->[0]; 11347 my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); }; 11348 11349 # For string properties, the default is just what the 11350 # file says, but non-string properties should already 11351 # have set up a table for the default property value; 11352 # use the table for these, so can resolve synonyms 11353 # later to a single standard one. 11354 if ($property_type == $STRING 11355 || $property_type == $UNKNOWN) 11356 { 11357 $property_info{$addr}{$MISSINGS} = $default; 11358 } 11359 else { 11360 $property_info{$addr}{$MISSINGS} 11361 = $property_object->table($default); 11362 } 11363 } 11364 11365 # Finished storing all the @missings defaults in the input 11366 # file so far. Get the one for the current property. 11367 my $missings = $property_info{$property_addr}{$MISSINGS}; 11368 11369 # But we likely have separately stored what the default 11370 # should be. (This is to accommodate versions of the 11371 # standard where the @missings lines are absent or 11372 # incomplete.) Hopefully the two will match. But check 11373 # it out. 11374 $default_map = $property_object->default_map; 11375 11376 # If the map is a ref, it means that the default won't be 11377 # processed until later, so undef it, so next few lines 11378 # will redefine it to something that nothing will match 11379 undef $default_map if ref $default_map; 11380 11381 # Create a $default_map if don't have one; maybe a dummy 11382 # that won't match anything. 11383 if (! defined $default_map) { 11384 11385 # Use any @missings line in the file. 11386 if (defined $missings) { 11387 if (ref $missings) { 11388 $default_map = $missings->full_name; 11389 $default_table = $missings; 11390 } 11391 else { 11392 $default_map = $missings; 11393 } 11394 11395 # And store it with the property for outside use. 11396 $property_object->set_default_map($default_map); 11397 } 11398 else { 11399 11400 # Neither an @missings nor a default map. Create 11401 # a dummy one, so won't have to test definedness 11402 # in the main loop. 11403 $default_map = '_Perl This will never be in a file 11404 from Unicode'; 11405 } 11406 } 11407 11408 # Here, we have $default_map defined, possibly in terms of 11409 # $missings, but maybe not, and possibly is a dummy one. 11410 if (defined $missings) { 11411 11412 # Make sure there is no conflict between the two. 11413 # $missings has priority. 11414 if (ref $missings) { 11415 $default_table 11416 = $property_object->table($default_map); 11417 if (! defined $default_table 11418 || $default_table != $missings) 11419 { 11420 if (! defined $default_table) { 11421 $default_table = $UNDEF; 11422 } 11423 $file->carp_bad_line(<<END 11424The \@missings line for $property_name in $file says that missings default to 11425$missings, but we expect it to be $default_table. $missings used. 11426END 11427 ); 11428 $default_table = $missings; 11429 $default_map = $missings->full_name; 11430 } 11431 $property_info{$property_addr}{$DEFAULT_TABLE} 11432 = $default_table; 11433 } 11434 elsif ($default_map ne $missings) { 11435 $file->carp_bad_line(<<END 11436The \@missings line for $property_name in $file says that missings default to 11437$missings, but we expect it to be $default_map. $missings used. 11438END 11439 ); 11440 $default_map = $missings; 11441 } 11442 } 11443 11444 $property_info{$property_addr}{$DEFAULT_MAP} 11445 = $default_map; 11446 11447 # If haven't done so already, find the table corresponding 11448 # to this map for non-string properties. 11449 if (! defined $default_table 11450 && $property_type != $STRING 11451 && $property_type != $UNKNOWN) 11452 { 11453 $default_table = $property_info{$property_addr} 11454 {$DEFAULT_TABLE} 11455 = $property_object->table($default_map); 11456 } 11457 } # End of is first time for this property 11458 } # End of switching properties. 11459 11460 # Ready to process the line. 11461 # The Unicode files are set up so that if the map is not defined, 11462 # it is a binary property with value 'Y' 11463 if (! defined $map) { 11464 $map = 'Y'; 11465 } 11466 else { 11467 11468 # If the map begins with a special command to us (enclosed in 11469 # delimiters), extract the command(s). 11470 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) { 11471 my $command = $1; 11472 if ($command =~ / ^ $REPLACE_CMD= (.*) /x) { 11473 $replace = $1; 11474 } 11475 elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) { 11476 $map_type = $1; 11477 } 11478 else { 11479 $file->carp_bad_line("Unknown command line: '$1'"); 11480 next LINE; 11481 } 11482 } 11483 } 11484 11485 if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x) 11486 { 11487 11488 # Here, we have a map to a particular code point, and the 11489 # default map is to a code point itself. If the range 11490 # includes the particular code point, change that portion of 11491 # the range to the default. This makes sure that in the final 11492 # table only the non-defaults are listed. 11493 my $decimal_map = hex $map; 11494 if ($low <= $decimal_map && $decimal_map <= $high) { 11495 11496 # If the range includes stuff before or after the map 11497 # we're changing, split it and process the split-off parts 11498 # later. 11499 if ($low < $decimal_map) { 11500 $file->insert_adjusted_lines( 11501 sprintf("%04X..%04X; %s; %s", 11502 $low, 11503 $decimal_map - 1, 11504 $property_name, 11505 $map)); 11506 } 11507 if ($high > $decimal_map) { 11508 $file->insert_adjusted_lines( 11509 sprintf("%04X..%04X; %s; %s", 11510 $decimal_map + 1, 11511 $high, 11512 $property_name, 11513 $map)); 11514 } 11515 $low = $high = $decimal_map; 11516 $map = $CODE_POINT; 11517 } 11518 } 11519 11520 # If we can tell that this is a synonym for the default map, use 11521 # the default one instead. 11522 if ($property_type != $STRING 11523 && $property_type != $UNKNOWN) 11524 { 11525 my $table = $property_object->table($map); 11526 if (defined $table && $table == $default_table) { 11527 $map = $default_map; 11528 } 11529 } 11530 11531 # And figure out the map type if not known. 11532 if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) { 11533 if ($map eq "") { # Nulls are always $NULL map type 11534 $map_type = $NULL; 11535 } # Otherwise, non-strings, and those that don't allow 11536 # $MULTI_CP, and those that aren't multiple code points are 11537 # 0 11538 elsif 11539 (($property_type != $STRING && $property_type != $UNKNOWN) 11540 || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP) 11541 || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x) 11542 { 11543 $map_type = 0; 11544 } 11545 else { 11546 $map_type = $MULTI_CP; 11547 } 11548 } 11549 11550 $property_object->add_map($low, $high, 11551 $map, 11552 Type => $map_type, 11553 Replace => $replace); 11554 } # End of loop through file's lines 11555 11556 return; 11557 } 11558} 11559 11560{ # Closure for UnicodeData.txt handling 11561 11562 # This file was the first one in the UCD; its design leads to some 11563 # awkwardness in processing. Here is a sample line: 11564 # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061; 11565 # The fields in order are: 11566 my $i = 0; # The code point is in field 0, and is shifted off. 11567 my $CHARNAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A") 11568 my $CATEGORY = $i++; # category (e.g. "Lu") 11569 my $CCC = $i++; # Canonical combining class (e.g. "230") 11570 my $BIDI = $i++; # directional class (e.g. "L") 11571 my $PERL_DECOMPOSITION = $i++; # decomposition mapping 11572 my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value 11573 my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript 11574 # Dual-use in this program; see below 11575 my $NUMERIC = $i++; # numeric value 11576 my $MIRRORED = $i++; # ? mirrored 11577 my $UNICODE_1_NAME = $i++; # name in Unicode 1.0 11578 my $COMMENT = $i++; # iso comment 11579 my $UPPER = $i++; # simple uppercase mapping 11580 my $LOWER = $i++; # simple lowercase mapping 11581 my $TITLE = $i++; # simple titlecase mapping 11582 my $input_field_count = $i; 11583 11584 # This routine in addition outputs these extra fields: 11585 11586 my $DECOMP_TYPE = $i++; # Decomposition type 11587 11588 # These fields are modifications of ones above, and are usually 11589 # suppressed; they must come last, as for speed, the loop upper bound is 11590 # normally set to ignore them 11591 my $NAME = $i++; # This is the strict name field, not the one that 11592 # charnames uses. 11593 my $DECOMP_MAP = $i++; # Strict decomposition mapping; not the one used 11594 # by Unicode::Normalize 11595 my $last_field = $i - 1; 11596 11597 # All these are read into an array for each line, with the indices defined 11598 # above. The empty fields in the example line above indicate that the 11599 # value is defaulted. The handler called for each line of the input 11600 # changes these to their defaults. 11601 11602 # Here are the official names of the properties, in a parallel array: 11603 my @field_names; 11604 $field_names[$BIDI] = 'Bidi_Class'; 11605 $field_names[$CATEGORY] = 'General_Category'; 11606 $field_names[$CCC] = 'Canonical_Combining_Class'; 11607 $field_names[$CHARNAME] = 'Perl_Charnames'; 11608 $field_names[$COMMENT] = 'ISO_Comment'; 11609 $field_names[$DECOMP_MAP] = 'Decomposition_Mapping'; 11610 $field_names[$DECOMP_TYPE] = 'Decomposition_Type'; 11611 $field_names[$LOWER] = 'Lowercase_Mapping'; 11612 $field_names[$MIRRORED] = 'Bidi_Mirrored'; 11613 $field_names[$NAME] = 'Name'; 11614 $field_names[$NUMERIC] = 'Numeric_Value'; 11615 $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type'; 11616 $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit'; 11617 $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping'; 11618 $field_names[$TITLE] = 'Titlecase_Mapping'; 11619 $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name'; 11620 $field_names[$UPPER] = 'Uppercase_Mapping'; 11621 11622 # Some of these need a little more explanation: 11623 # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode 11624 # property, but is used in calculating the Numeric_Type. Perl however, 11625 # creates a file from this field, so a Perl property is created from it. 11626 # Similarly, the Other_Digit field is used only for calculating the 11627 # Numeric_Type, and so it can be safely re-used as the place to store 11628 # the value for Numeric_Type; hence it is referred to as 11629 # $NUMERIC_TYPE_OTHER_DIGIT. 11630 # The input field named $PERL_DECOMPOSITION is a combination of both the 11631 # decomposition mapping and its type. Perl creates a file containing 11632 # exactly this field, so it is used for that. The two properties are 11633 # separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE. 11634 # $DECOMP_MAP is usually suppressed (unless the lists are changed to 11635 # output it), as Perl doesn't use it directly. 11636 # The input field named here $CHARNAME is used to construct the 11637 # Perl_Charnames property, which is a combination of the Name property 11638 # (which the input field contains), and the Unicode_1_Name property, and 11639 # others from other files. Since, the strict Name property is not used 11640 # by Perl, this field is used for the table that Perl does use. The 11641 # strict Name property table is usually suppressed (unless the lists are 11642 # changed to output it), so it is accumulated in a separate field, 11643 # $NAME, which to save time is discarded unless the table is actually to 11644 # be output 11645 11646 # This file is processed like most in this program. Control is passed to 11647 # process_generic_property_file() which calls filter_UnicodeData_line() 11648 # for each input line. This filter converts the input into line(s) that 11649 # process_generic_property_file() understands. There is also a setup 11650 # routine called before any of the file is processed, and a handler for 11651 # EOF processing, all in this closure. 11652 11653 # A huge speed-up occurred at the cost of some added complexity when these 11654 # routines were altered to buffer the outputs into ranges. Almost all the 11655 # lines of the input file apply to just one code point, and for most 11656 # properties, the map for the next code point up is the same as the 11657 # current one. So instead of creating a line for each property for each 11658 # input line, filter_UnicodeData_line() remembers what the previous map 11659 # of a property was, and doesn't generate a line to pass on until it has 11660 # to, as when the map changes; and that passed-on line encompasses the 11661 # whole contiguous range of code points that have the same map for that 11662 # property. This means a slight amount of extra setup, and having to 11663 # flush these buffers on EOF, testing if the maps have changed, plus 11664 # remembering state information in the closure. But it means a lot less 11665 # real time in not having to change the data base for each property on 11666 # each line. 11667 11668 # Another complication is that there are already a few ranges designated 11669 # in the input. There are two lines for each, with the same maps except 11670 # the code point and name on each line. This was actually the hardest 11671 # thing to design around. The code points in those ranges may actually 11672 # have real maps not given by these two lines. These maps will either 11673 # be algorithmically determinable, or be in the extracted files furnished 11674 # with the UCD. In the event of conflicts between these extracted files, 11675 # and this one, Unicode says that this one prevails. But it shouldn't 11676 # prevail for conflicts that occur in these ranges. The data from the 11677 # extracted files prevails in those cases. So, this program is structured 11678 # so that those files are processed first, storing maps. Then the other 11679 # files are processed, generally overwriting what the extracted files 11680 # stored. But just the range lines in this input file are processed 11681 # without overwriting. This is accomplished by adding a special string to 11682 # the lines output to tell process_generic_property_file() to turn off the 11683 # overwriting for just this one line. 11684 # A similar mechanism is used to tell it that the map is of a non-default 11685 # type. 11686 11687 sub setup_UnicodeData { # Called before any lines of the input are read 11688 my $file = shift; 11689 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 11690 11691 # Create a new property specially located that is a combination of 11692 # various Name properties: Name, Unicode_1_Name, Named Sequences, and 11693 # _Perl_Name_Alias properties. (The final one duplicates elements of the 11694 # first, and starting in v6.1, is the same as the 'Name_Alias 11695 # property.) A comment for the new property will later be constructed 11696 # based on the actual properties present and used 11697 $perl_charname = Property->new('Perl_Charnames', 11698 Default_Map => "", 11699 Directory => File::Spec->curdir(), 11700 File => 'Name', 11701 Fate => $INTERNAL_ONLY, 11702 Perl_Extension => 1, 11703 Range_Size_1 => \&output_perl_charnames_line, 11704 Type => $STRING, 11705 ); 11706 $perl_charname->set_proxy_for('Name'); 11707 11708 my $Perl_decomp = Property->new('Perl_Decomposition_Mapping', 11709 Directory => File::Spec->curdir(), 11710 File => 'Decomposition', 11711 Format => $DECOMP_STRING_FORMAT, 11712 Fate => $INTERNAL_ONLY, 11713 Perl_Extension => 1, 11714 Default_Map => $CODE_POINT, 11715 11716 # normalize.pm can't cope with these 11717 Output_Range_Counts => 0, 11718 11719 # This is a specially formatted table 11720 # explicitly for normalize.pm, which 11721 # is expecting a particular format, 11722 # which means that mappings containing 11723 # multiple code points are in the main 11724 # body of the table 11725 Map_Type => $COMPUTE_NO_MULTI_CP, 11726 Type => $STRING, 11727 To_Output_Map => $INTERNAL_MAP, 11728 ); 11729 $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type'); 11730 $Perl_decomp->add_comment(join_lines(<<END 11731This mapping is a combination of the Unicode 'Decomposition_Type' and 11732'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is 11733identical to the official Unicode 'Decomposition_Mapping' property except for 11734two things: 11735 1) It omits the algorithmically determinable Hangul syllable decompositions, 11736which normalize.pm handles algorithmically. 11737 2) It contains the decomposition type as well. Non-canonical decompositions 11738begin with a word in angle brackets, like <super>, which denotes the 11739compatible decomposition type. If the map does not begin with the <angle 11740brackets>, the decomposition is canonical. 11741END 11742 )); 11743 11744 my $Decimal_Digit = Property->new("Perl_Decimal_Digit", 11745 Default_Map => "", 11746 Perl_Extension => 1, 11747 Directory => $map_directory, 11748 Type => $STRING, 11749 To_Output_Map => $OUTPUT_ADJUSTED, 11750 ); 11751 $Decimal_Digit->add_comment(join_lines(<<END 11752This file gives the mapping of all code points which represent a single 11753decimal digit [0-9] to their respective digits, but it has ranges of 10 code 11754points, and the mapping of each non-initial element of each range is actually 11755not to "0", but to the offset that element has from its corresponding DIGIT 0. 11756These code points are those that have Numeric_Type=Decimal; not special 11757things, like subscripts nor Roman numerals. 11758END 11759 )); 11760 11761 # These properties are not used for generating anything else, and are 11762 # usually not output. By making them last in the list, we can just 11763 # change the high end of the loop downwards to avoid the work of 11764 # generating a table(s) that is/are just going to get thrown away. 11765 if (! property_ref('Decomposition_Mapping')->to_output_map 11766 && ! property_ref('Name')->to_output_map) 11767 { 11768 $last_field = min($NAME, $DECOMP_MAP) - 1; 11769 } elsif (property_ref('Decomposition_Mapping')->to_output_map) { 11770 $last_field = $DECOMP_MAP; 11771 } elsif (property_ref('Name')->to_output_map) { 11772 $last_field = $NAME; 11773 } 11774 return; 11775 } 11776 11777 my $first_time = 1; # ? Is this the first line of the file 11778 my $in_range = 0; # ? Are we in one of the file's ranges 11779 my $previous_cp; # hex code point of previous line 11780 my $decimal_previous_cp = -1; # And its decimal equivalent 11781 my @start; # For each field, the current starting 11782 # code point in hex for the range 11783 # being accumulated. 11784 my @fields; # The input fields; 11785 my @previous_fields; # And those from the previous call 11786 11787 sub filter_UnicodeData_line { 11788 # Handle a single input line from UnicodeData.txt; see comments above 11789 # Conceptually this takes a single line from the file containing N 11790 # properties, and converts it into N lines with one property per line, 11791 # which is what the final handler expects. But there are 11792 # complications due to the quirkiness of the input file, and to save 11793 # time, it accumulates ranges where the property values don't change 11794 # and only emits lines when necessary. This is about an order of 11795 # magnitude fewer lines emitted. 11796 11797 my $file = shift; 11798 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 11799 11800 # $_ contains the input line. 11801 # -1 in split means retain trailing null fields 11802 (my $cp, @fields) = split /\s*;\s*/, $_, -1; 11803 11804 #local $to_trace = 1 if main::DEBUG; 11805 trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace; 11806 if (@fields > $input_field_count) { 11807 $file->carp_bad_line('Extra fields'); 11808 $_ = ""; 11809 return; 11810 } 11811 11812 my $decimal_cp = hex $cp; 11813 11814 # We have to output all the buffered ranges when the next code point 11815 # is not exactly one after the previous one, which means there is a 11816 # gap in the ranges. 11817 my $force_output = ($decimal_cp != $decimal_previous_cp + 1); 11818 11819 # The decomposition mapping field requires special handling. It looks 11820 # like either: 11821 # 11822 # <compat> 0032 0020 11823 # 0041 0300 11824 # 11825 # The decomposition type is enclosed in <brackets>; if missing, it 11826 # means the type is canonical. There are two decomposition mapping 11827 # tables: the one for use by Perl's normalize.pm has a special format 11828 # which is this field intact; the other, for general use is of 11829 # standard format. In either case we have to find the decomposition 11830 # type. Empty fields have None as their type, and map to the code 11831 # point itself 11832 if ($fields[$PERL_DECOMPOSITION] eq "") { 11833 $fields[$DECOMP_TYPE] = 'None'; 11834 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT; 11835 } 11836 else { 11837 ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION] 11838 =~ / < ( .+? ) > \s* ( .+ ) /x; 11839 if (! defined $fields[$DECOMP_TYPE]) { 11840 $fields[$DECOMP_TYPE] = 'Canonical'; 11841 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION]; 11842 } 11843 else { 11844 $fields[$DECOMP_MAP] = $map; 11845 } 11846 } 11847 11848 # The 3 numeric fields also require special handling. The 2 digit 11849 # fields must be either empty or match the number field. This means 11850 # that if it is empty, they must be as well, and the numeric type is 11851 # None, and the numeric value is 'Nan'. 11852 # The decimal digit field must be empty or match the other digit 11853 # field. If the decimal digit field is non-empty, the code point is 11854 # a decimal digit, and the other two fields will have the same value. 11855 # If it is empty, but the other digit field is non-empty, the code 11856 # point is an 'other digit', and the number field will have the same 11857 # value as the other digit field. If the other digit field is empty, 11858 # but the number field is non-empty, the code point is a generic 11859 # numeric type. 11860 if ($fields[$NUMERIC] eq "") { 11861 if ($fields[$PERL_DECIMAL_DIGIT] ne "" 11862 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "" 11863 ) { 11864 $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway"); 11865 } 11866 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None'; 11867 $fields[$NUMERIC] = 'NaN'; 11868 } 11869 else { 11870 $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x; 11871 if ($fields[$PERL_DECIMAL_DIGIT] ne "") { 11872 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC]; 11873 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'. Processing as Decimal") if $fields[$CATEGORY] ne "Nd"; 11874 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal'; 11875 } 11876 elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") { 11877 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC]; 11878 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit'; 11879 } 11880 else { 11881 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric'; 11882 11883 # Rationals require extra effort. 11884 if ($fields[$NUMERIC] =~ qr{/}) { 11885 reduce_fraction(\$fields[$NUMERIC]); 11886 register_fraction($fields[$NUMERIC]) 11887 } 11888 } 11889 } 11890 11891 # For the properties that have empty fields in the file, and which 11892 # mean something different from empty, change them to that default. 11893 # Certain fields just haven't been empty so far in any Unicode 11894 # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC, 11895 # $CATEGORY. This leaves just the two fields, and so we hard-code in 11896 # the defaults; which are very unlikely to ever change. 11897 $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq ""; 11898 $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq ""; 11899 11900 # UAX44 says that if title is empty, it is the same as whatever upper 11901 # is, 11902 $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq ""; 11903 11904 # There are a few pairs of lines like: 11905 # AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; 11906 # D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; 11907 # that define ranges. These should be processed after the fields are 11908 # adjusted above, as they may override some of them; but mostly what 11909 # is left is to possibly adjust the $CHARNAME field. The names of all the 11910 # paired lines start with a '<', but this is also true of '<control>, 11911 # which isn't one of these special ones. 11912 if ($fields[$CHARNAME] eq '<control>') { 11913 11914 # Some code points in this file have the pseudo-name 11915 # '<control>', but the official name for such ones is the null 11916 # string. 11917 $fields[$NAME] = $fields[$CHARNAME] = ""; 11918 11919 # We had better not be in between range lines. 11920 if ($in_range) { 11921 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway"); 11922 $in_range = 0; 11923 } 11924 } 11925 elsif (substr($fields[$CHARNAME], 0, 1) ne '<') { 11926 11927 # Here is a non-range line. We had better not be in between range 11928 # lines. 11929 if ($in_range) { 11930 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway"); 11931 $in_range = 0; 11932 } 11933 if ($fields[$CHARNAME] =~ s/- $cp $//x) { 11934 11935 # These are code points whose names end in their code points, 11936 # which means the names are algorithmically derivable from the 11937 # code points. To shorten the output Name file, the algorithm 11938 # for deriving these is placed in the file instead of each 11939 # code point, so they have map type $CP_IN_NAME 11940 $fields[$CHARNAME] = $CMD_DELIM 11941 . $MAP_TYPE_CMD 11942 . '=' 11943 . $CP_IN_NAME 11944 . $CMD_DELIM 11945 . $fields[$CHARNAME]; 11946 } 11947 $fields[$NAME] = $fields[$CHARNAME]; 11948 } 11949 elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) { 11950 $fields[$CHARNAME] = $fields[$NAME] = $1; 11951 11952 # Here we are at the beginning of a range pair. 11953 if ($in_range) { 11954 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'. Trying anyway"); 11955 } 11956 $in_range = 1; 11957 11958 # Because the properties in the range do not overwrite any already 11959 # in the db, we must flush the buffers of what's already there, so 11960 # they get handled in the normal scheme. 11961 $force_output = 1; 11962 11963 } 11964 elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) { 11965 $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME]. Ignoring this line."); 11966 $_ = ""; 11967 return; 11968 } 11969 else { # Here, we are at the last line of a range pair. 11970 11971 if (! $in_range) { 11972 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one. Ignoring this line."); 11973 $_ = ""; 11974 return; 11975 } 11976 $in_range = 0; 11977 11978 $fields[$NAME] = $fields[$CHARNAME]; 11979 11980 # Check that the input is valid: that the closing of the range is 11981 # the same as the beginning. 11982 foreach my $i (0 .. $last_field) { 11983 next if $fields[$i] eq $previous_fields[$i]; 11984 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway"); 11985 } 11986 11987 # The processing differs depending on the type of range, 11988 # determined by its $CHARNAME 11989 if ($fields[$CHARNAME] =~ /^Hangul Syllable/) { 11990 11991 # Check that the data looks right. 11992 if ($decimal_previous_cp != $SBase) { 11993 $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong"); 11994 } 11995 if ($decimal_cp != $SBase + $SCount - 1) { 11996 $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong"); 11997 } 11998 11999 # The Hangul syllable range has a somewhat complicated name 12000 # generation algorithm. Each code point in it has a canonical 12001 # decomposition also computable by an algorithm. The 12002 # perl decomposition map table built from these is used only 12003 # by normalize.pm, which has the algorithm built in it, so the 12004 # decomposition maps are not needed, and are large, so are 12005 # omitted from it. If the full decomposition map table is to 12006 # be output, the decompositions are generated for it, in the 12007 # EOF handling code for this input file. 12008 12009 $previous_fields[$DECOMP_TYPE] = 'Canonical'; 12010 12011 # This range is stored in our internal structure with its 12012 # own map type, different from all others. 12013 $previous_fields[$CHARNAME] = $previous_fields[$NAME] 12014 = $CMD_DELIM 12015 . $MAP_TYPE_CMD 12016 . '=' 12017 . $HANGUL_SYLLABLE 12018 . $CMD_DELIM 12019 . $fields[$CHARNAME]; 12020 } 12021 elsif ($fields[$CATEGORY] eq 'Lo') { # Is a letter 12022 12023 # All the CJK ranges like this have the name given as a 12024 # special case in the next code line. And for the others, we 12025 # hope that Unicode continues to use the correct name in 12026 # future releases, so we don't have to make further special 12027 # cases. 12028 my $name = ($fields[$CHARNAME] =~ /^CJK/) 12029 ? 'CJK UNIFIED IDEOGRAPH' 12030 : uc $fields[$CHARNAME]; 12031 12032 # The name for these contains the code point itself, and all 12033 # are defined to have the same base name, regardless of what 12034 # is in the file. They are stored in our internal structure 12035 # with a map type of $CP_IN_NAME 12036 $previous_fields[$CHARNAME] = $previous_fields[$NAME] 12037 = $CMD_DELIM 12038 . $MAP_TYPE_CMD 12039 . '=' 12040 . $CP_IN_NAME 12041 . $CMD_DELIM 12042 . $name; 12043 12044 } 12045 elsif ($fields[$CATEGORY] eq 'Co' 12046 || $fields[$CATEGORY] eq 'Cs') 12047 { 12048 # The names of all the code points in these ranges are set to 12049 # null, as there are no names for the private use and 12050 # surrogate code points. 12051 12052 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = ""; 12053 } 12054 else { 12055 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY]. Attempting to process it."); 12056 } 12057 12058 # The first line of the range caused everything else to be output, 12059 # and then its values were stored as the beginning values for the 12060 # next set of ranges, which this one ends. Now, for each value, 12061 # add a command to tell the handler that these values should not 12062 # replace any existing ones in our database. 12063 foreach my $i (0 .. $last_field) { 12064 $previous_fields[$i] = $CMD_DELIM 12065 . $REPLACE_CMD 12066 . '=' 12067 . $NO 12068 . $CMD_DELIM 12069 . $previous_fields[$i]; 12070 } 12071 12072 # And change things so it looks like the entire range has been 12073 # gone through with this being the final part of it. Adding the 12074 # command above to each field will cause this range to be flushed 12075 # during the next iteration, as it guaranteed that the stored 12076 # field won't match whatever value the next one has. 12077 $previous_cp = $cp; 12078 $decimal_previous_cp = $decimal_cp; 12079 12080 # We are now set up for the next iteration; so skip the remaining 12081 # code in this subroutine that does the same thing, but doesn't 12082 # know about these ranges. 12083 $_ = ""; 12084 12085 return; 12086 } 12087 12088 # On the very first line, we fake it so the code below thinks there is 12089 # nothing to output, and initialize so that when it does get output it 12090 # uses the first line's values for the lowest part of the range. 12091 # (One could avoid this by using peek(), but then one would need to 12092 # know the adjustments done above and do the same ones in the setup 12093 # routine; not worth it) 12094 if ($first_time) { 12095 $first_time = 0; 12096 @previous_fields = @fields; 12097 @start = ($cp) x scalar @fields; 12098 $decimal_previous_cp = $decimal_cp - 1; 12099 } 12100 12101 # For each field, output the stored up ranges that this code point 12102 # doesn't fit in. Earlier we figured out if all ranges should be 12103 # terminated because of changing the replace or map type styles, or if 12104 # there is a gap between this new code point and the previous one, and 12105 # that is stored in $force_output. But even if those aren't true, we 12106 # need to output the range if this new code point's value for the 12107 # given property doesn't match the stored range's. 12108 #local $to_trace = 1 if main::DEBUG; 12109 foreach my $i (0 .. $last_field) { 12110 my $field = $fields[$i]; 12111 if ($force_output || $field ne $previous_fields[$i]) { 12112 12113 # Flush the buffer of stored values. 12114 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]"); 12115 12116 # Start a new range with this code point and its value 12117 $start[$i] = $cp; 12118 $previous_fields[$i] = $field; 12119 } 12120 } 12121 12122 # Set the values for the next time. 12123 $previous_cp = $cp; 12124 $decimal_previous_cp = $decimal_cp; 12125 12126 # The input line has generated whatever adjusted lines are needed, and 12127 # should not be looked at further. 12128 $_ = ""; 12129 return; 12130 } 12131 12132 sub EOF_UnicodeData { 12133 # Called upon EOF to flush the buffers, and create the Hangul 12134 # decomposition mappings if needed. 12135 12136 my $file = shift; 12137 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 12138 12139 # Flush the buffers. 12140 foreach my $i (0 .. $last_field) { 12141 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]"); 12142 } 12143 12144 if (-e 'Jamo.txt') { 12145 12146 # The algorithm is published by Unicode, based on values in 12147 # Jamo.txt, (which should have been processed before this 12148 # subroutine), and the results left in %Jamo 12149 unless (%Jamo) { 12150 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated."); 12151 return; 12152 } 12153 12154 # If the full decomposition map table is being output, insert 12155 # into it the Hangul syllable mappings. This is to avoid having 12156 # to publish a subroutine in it to compute them. (which would 12157 # essentially be this code.) This uses the algorithm published by 12158 # Unicode. (No hangul syllables in version 1) 12159 if ($v_version ge v2.0.0 12160 && property_ref('Decomposition_Mapping')->to_output_map) { 12161 for (my $S = $SBase; $S < $SBase + $SCount; $S++) { 12162 use integer; 12163 my $SIndex = $S - $SBase; 12164 my $L = $LBase + $SIndex / $NCount; 12165 my $V = $VBase + ($SIndex % $NCount) / $TCount; 12166 my $T = $TBase + $SIndex % $TCount; 12167 12168 trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace; 12169 my $decomposition = sprintf("%04X %04X", $L, $V); 12170 $decomposition .= sprintf(" %04X", $T) if $T != $TBase; 12171 $file->insert_adjusted_lines( 12172 sprintf("%04X; Decomposition_Mapping; %s", 12173 $S, 12174 $decomposition)); 12175 } 12176 } 12177 } 12178 12179 return; 12180 } 12181 12182 sub filter_v1_ucd { 12183 # Fix UCD lines in version 1. This is probably overkill, but this 12184 # fixes some glaring errors in Version 1 UnicodeData.txt. That file: 12185 # 1) had many Hangul (U+3400 - U+4DFF) code points that were later 12186 # removed. This program retains them 12187 # 2) didn't include ranges, which it should have, and which are now 12188 # added in @corrected_lines below. It was hand populated by 12189 # taking the data from Version 2, verified by analyzing 12190 # DAge.txt. 12191 # 3) There is a syntax error in the entry for U+09F8 which could 12192 # cause problems for utf8_heavy, and so is changed. It's 12193 # numeric value was simply a minus sign, without any number. 12194 # (Eventually Unicode changed the code point to non-numeric.) 12195 # 4) The decomposition types often don't match later versions 12196 # exactly, and the whole syntax of that field is different; so 12197 # the syntax is changed as well as the types to their later 12198 # terminology. Otherwise normalize.pm would be very unhappy 12199 # 5) Many ccc classes are different. These are left intact. 12200 # 6) U+FF10..U+FF19 are missing their numeric values in all three 12201 # fields. These are unchanged because it doesn't really cause 12202 # problems for Perl. 12203 # 7) A number of code points, such as controls, don't have their 12204 # Unicode Version 1 Names in this file. These are added. 12205 # 8) A number of Symbols were marked as Lm. This changes those in 12206 # the Latin1 range, so that regexes work. 12207 # 9) The odd characters U+03DB .. U+03E1 weren't encoded but are 12208 # referred to by their lc equivalents. Not fixed. 12209 12210 my @corrected_lines = split /\n/, <<'END'; 122114E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;; 122129FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; 12213E000;<Private Use, First>;Co;0;L;;;;;N;;;;; 12214F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;; 12215F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;; 12216FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;; 12217END 12218 12219 my $file = shift; 12220 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 12221 12222 #local $to_trace = 1 if main::DEBUG; 12223 trace $_ if main::DEBUG && $to_trace; 12224 12225 # -1 => retain trailing null fields 12226 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 12227 12228 # At the first place that is wrong in the input, insert all the 12229 # corrections, replacing the wrong line. 12230 if ($code_point eq '4E00') { 12231 my @copy = @corrected_lines; 12232 $_ = shift @copy; 12233 ($code_point, @fields) = split /\s*;\s*/, $_, -1; 12234 12235 $file->insert_lines(@copy); 12236 } 12237 elsif ($code_point =~ /^00/ && $fields[$CATEGORY] eq 'Lm') { 12238 12239 # There are no Lm characters in Latin1; these should be 'Sk', but 12240 # there isn't that in V1. 12241 $fields[$CATEGORY] = 'So'; 12242 } 12243 12244 if ($fields[$NUMERIC] eq '-') { 12245 $fields[$NUMERIC] = '-1'; # This is what 2.0 made it. 12246 } 12247 12248 if ($fields[$PERL_DECOMPOSITION] ne "") { 12249 12250 # Several entries have this change to superscript 2 or 3 in the 12251 # middle. Convert these to the modern version, which is to use 12252 # the actual U+00B2 and U+00B3 (the superscript forms) instead. 12253 # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes 12254 # 'HHHH HHHH 00B3 HHHH'. 12255 # It turns out that all of these that don't have another 12256 # decomposition defined at the beginning of the line have the 12257 # <square> decomposition in later releases. 12258 if ($code_point ne '00B2' && $code_point ne '00B3') { 12259 if ($fields[$PERL_DECOMPOSITION] 12260 =~ s/<\+sup> 003([23]) <-sup>/00B$1/) 12261 { 12262 if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') { 12263 $fields[$PERL_DECOMPOSITION] = '<square> ' 12264 . $fields[$PERL_DECOMPOSITION]; 12265 } 12266 } 12267 } 12268 12269 # If is like '<+circled> 0052 <-circled>', convert to 12270 # '<circled> 0052' 12271 $fields[$PERL_DECOMPOSITION] =~ 12272 s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/xg; 12273 12274 # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc. 12275 $fields[$PERL_DECOMPOSITION] =~ 12276 s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x 12277 or $fields[$PERL_DECOMPOSITION] =~ 12278 s/ <join> \s* (.*?) \s* <join> /<medial> $1/x 12279 or $fields[$PERL_DECOMPOSITION] =~ 12280 s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x 12281 or $fields[$PERL_DECOMPOSITION] =~ 12282 s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x; 12283 12284 # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc. 12285 $fields[$PERL_DECOMPOSITION] =~ 12286 s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x; 12287 12288 # Change names to modern form. 12289 $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g; 12290 $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g; 12291 $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g; 12292 $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g; 12293 12294 # One entry has weird braces 12295 $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g; 12296 12297 # One entry at U+2116 has an extra <sup> 12298 $fields[$PERL_DECOMPOSITION] =~ s/( < .*? > .* ) < .*? > \ * /$1/x; 12299 } 12300 12301 $_ = join ';', $code_point, @fields; 12302 trace $_ if main::DEBUG && $to_trace; 12303 return; 12304 } 12305 12306 sub filter_bad_Nd_ucd { 12307 # Early versions specified a value in the decimal digit field even 12308 # though the code point wasn't a decimal digit. Clear the field in 12309 # that situation, so that the main code doesn't think it is a decimal 12310 # digit. 12311 12312 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 12313 if ($fields[$PERL_DECIMAL_DIGIT] ne "" && $fields[$CATEGORY] ne 'Nd') { 12314 $fields[$PERL_DECIMAL_DIGIT] = ""; 12315 $_ = join ';', $code_point, @fields; 12316 } 12317 return; 12318 } 12319 12320 my @U1_control_names = split /\n/, <<'END'; 12321NULL 12322START OF HEADING 12323START OF TEXT 12324END OF TEXT 12325END OF TRANSMISSION 12326ENQUIRY 12327ACKNOWLEDGE 12328BELL 12329BACKSPACE 12330HORIZONTAL TABULATION 12331LINE FEED 12332VERTICAL TABULATION 12333FORM FEED 12334CARRIAGE RETURN 12335SHIFT OUT 12336SHIFT IN 12337DATA LINK ESCAPE 12338DEVICE CONTROL ONE 12339DEVICE CONTROL TWO 12340DEVICE CONTROL THREE 12341DEVICE CONTROL FOUR 12342NEGATIVE ACKNOWLEDGE 12343SYNCHRONOUS IDLE 12344END OF TRANSMISSION BLOCK 12345CANCEL 12346END OF MEDIUM 12347SUBSTITUTE 12348ESCAPE 12349FILE SEPARATOR 12350GROUP SEPARATOR 12351RECORD SEPARATOR 12352UNIT SEPARATOR 12353DELETE 12354BREAK PERMITTED HERE 12355NO BREAK HERE 12356INDEX 12357NEXT LINE 12358START OF SELECTED AREA 12359END OF SELECTED AREA 12360CHARACTER TABULATION SET 12361CHARACTER TABULATION WITH JUSTIFICATION 12362LINE TABULATION SET 12363PARTIAL LINE DOWN 12364PARTIAL LINE UP 12365REVERSE LINE FEED 12366SINGLE SHIFT TWO 12367SINGLE SHIFT THREE 12368DEVICE CONTROL STRING 12369PRIVATE USE ONE 12370PRIVATE USE TWO 12371SET TRANSMIT STATE 12372CANCEL CHARACTER 12373MESSAGE WAITING 12374START OF GUARDED AREA 12375END OF GUARDED AREA 12376START OF STRING 12377SINGLE CHARACTER INTRODUCER 12378CONTROL SEQUENCE INTRODUCER 12379STRING TERMINATOR 12380OPERATING SYSTEM COMMAND 12381PRIVACY MESSAGE 12382APPLICATION PROGRAM COMMAND 12383END 12384 12385 sub filter_early_U1_names { 12386 # Very early versions did not have the Unicode_1_name field specified. 12387 # They differed in which ones were present; make sure a U1 name 12388 # exists, so that Unicode::UCD::charinfo will work 12389 12390 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 12391 12392 12393 # @U1_control names above are entirely positional, so we pull them out 12394 # in the exact order required, with gaps for the ones that don't have 12395 # names. 12396 if ($code_point =~ /^00[01]/ 12397 || $code_point eq '007F' 12398 || $code_point =~ /^008[2-9A-F]/ 12399 || $code_point =~ /^009[0-8A-F]/) 12400 { 12401 my $u1_name = shift @U1_control_names; 12402 $fields[$UNICODE_1_NAME] = $u1_name unless $fields[$UNICODE_1_NAME]; 12403 $_ = join ';', $code_point, @fields; 12404 } 12405 return; 12406 } 12407 12408 sub filter_v2_1_5_ucd { 12409 # A dozen entries in this 2.1.5 file had the mirrored and numeric 12410 # columns swapped; These all had mirrored be 'N'. So if the numeric 12411 # column appears to be N, swap it back. 12412 12413 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 12414 if ($fields[$NUMERIC] eq 'N') { 12415 $fields[$NUMERIC] = $fields[$MIRRORED]; 12416 $fields[$MIRRORED] = 'N'; 12417 $_ = join ';', $code_point, @fields; 12418 } 12419 return; 12420 } 12421 12422 sub filter_v6_ucd { 12423 12424 # Unicode 6.0 co-opted the name BELL for U+1F514, but until 5.17, 12425 # it wasn't accepted, to allow for some deprecation cycles. This 12426 # function is not called after 5.16 12427 12428 return if $_ !~ /^(?:0007|1F514|070F);/; 12429 12430 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 12431 if ($code_point eq '0007') { 12432 $fields[$CHARNAME] = ""; 12433 } 12434 elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see 12435 # http://www.unicode.org/versions/corrigendum8.html 12436 $fields[$BIDI] = "AL"; 12437 } 12438 elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name 12439 $fields[$CHARNAME] = ""; 12440 } 12441 12442 $_ = join ';', $code_point, @fields; 12443 12444 return; 12445 } 12446} # End closure for UnicodeData 12447 12448sub process_GCB_test { 12449 12450 my $file = shift; 12451 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 12452 12453 while ($file->next_line) { 12454 push @backslash_X_tests, $_; 12455 } 12456 12457 return; 12458} 12459 12460sub process_LB_test { 12461 12462 my $file = shift; 12463 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 12464 12465 while ($file->next_line) { 12466 push @LB_tests, $_; 12467 } 12468 12469 return; 12470} 12471 12472sub process_SB_test { 12473 12474 my $file = shift; 12475 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 12476 12477 while ($file->next_line) { 12478 push @SB_tests, $_; 12479 } 12480 12481 return; 12482} 12483 12484sub process_WB_test { 12485 12486 my $file = shift; 12487 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 12488 12489 while ($file->next_line) { 12490 push @WB_tests, $_; 12491 } 12492 12493 return; 12494} 12495 12496sub process_NamedSequences { 12497 # NamedSequences.txt entries are just added to an array. Because these 12498 # don't look like the other tables, they have their own handler. 12499 # An example: 12500 # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300 12501 # 12502 # This just adds the sequence to an array for later handling 12503 12504 my $file = shift; 12505 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 12506 12507 while ($file->next_line) { 12508 my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1; 12509 if (@remainder) { 12510 $file->carp_bad_line( 12511 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'"); 12512 next; 12513 } 12514 12515 # Note single \t in keeping with special output format of 12516 # Perl_charnames. But it turns out that the code points don't have to 12517 # be 5 digits long, like the rest, based on the internal workings of 12518 # charnames.pm. This could be easily changed for consistency. 12519 push @named_sequences, "$sequence\t$name"; 12520 } 12521 return; 12522} 12523 12524{ # Closure 12525 12526 my $first_range; 12527 12528 sub filter_early_ea_lb { 12529 # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a 12530 # third field be the name of the code point, which can be ignored in 12531 # most cases. But it can be meaningful if it marks a range: 12532 # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE 12533 # 3400;W;<CJK Ideograph Extension A, First> 12534 # 12535 # We need to see the First in the example above to know it's a range. 12536 # They did not use the later range syntaxes. This routine changes it 12537 # to use the modern syntax. 12538 # $1 is the Input_file object. 12539 12540 my @fields = split /\s*;\s*/; 12541 if ($fields[2] =~ /^<.*, First>/) { 12542 $first_range = $fields[0]; 12543 $_ = ""; 12544 } 12545 elsif ($fields[2] =~ /^<.*, Last>/) { 12546 $_ = $_ = "$first_range..$fields[0]; $fields[1]"; 12547 } 12548 else { 12549 undef $first_range; 12550 $_ = "$fields[0]; $fields[1]"; 12551 } 12552 12553 return; 12554 } 12555} 12556 12557sub filter_substitute_lb { 12558 # Used on Unicodes that predate the LB property, where there is a 12559 # substitute file. This just does the regular ea_lb handling for such 12560 # files, and then substitutes the long property value name for the short 12561 # one that comes with the file. (The other break files have the long 12562 # names in them, so this is the odd one out.) The reason for doing this 12563 # kludge is that regen/mk_invlists.pl is expecting the long name. This 12564 # also fixes the typo 'Inseperable' that leads to problems. 12565 12566 filter_early_ea_lb; 12567 return unless $_; 12568 12569 my @fields = split /\s*;\s*/; 12570 $fields[1] = property_ref('_Perl_LB')->table($fields[1])->full_name; 12571 $fields[1] = 'Inseparable' if lc $fields[1] eq 'inseperable'; 12572 $_ = join '; ', @fields; 12573} 12574 12575sub filter_old_style_arabic_shaping { 12576 # Early versions used a different term for the later one. 12577 12578 my @fields = split /\s*;\s*/; 12579 $fields[3] =~ s/<no shaping>/No_Joining_Group/; 12580 $fields[3] =~ s/\s+/_/g; # Change spaces to underscores 12581 $_ = join ';', @fields; 12582 return; 12583} 12584 12585{ # Closure 12586 my $lc; # Table for lowercase mapping 12587 my $tc; 12588 my $uc; 12589 my %special_casing_code_points; 12590 12591 sub setup_special_casing { 12592 # SpecialCasing.txt contains the non-simple case change mappings. The 12593 # simple ones are in UnicodeData.txt, which should already have been 12594 # read in to the full property data structures, so as to initialize 12595 # these with the simple ones. Then the SpecialCasing.txt entries 12596 # add or overwrite the ones which have different full mappings. 12597 12598 # This routine sees if the simple mappings are to be output, and if 12599 # so, copies what has already been put into the full mapping tables, 12600 # while they still contain only the simple mappings. 12601 12602 # The reason it is done this way is that the simple mappings are 12603 # probably not going to be output, so it saves work to initialize the 12604 # full tables with the simple mappings, and then overwrite those 12605 # relatively few entries in them that have different full mappings, 12606 # and thus skip the simple mapping tables altogether. 12607 12608 my $file= shift; 12609 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 12610 12611 $lc = property_ref('lc'); 12612 $tc = property_ref('tc'); 12613 $uc = property_ref('uc'); 12614 12615 # For each of the case change mappings... 12616 foreach my $full_casing_table ($lc, $tc, $uc) { 12617 my $full_casing_name = $full_casing_table->name; 12618 my $full_casing_full_name = $full_casing_table->full_name; 12619 unless (defined $full_casing_table 12620 && ! $full_casing_table->is_empty) 12621 { 12622 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated."); 12623 } 12624 12625 # Create a table in the old-style format and with the original 12626 # file name for backwards compatibility with applications that 12627 # read it directly. The new tables contain both the simple and 12628 # full maps, and the old are missing simple maps when there is a 12629 # conflicting full one. Probably it would have been ok to add 12630 # those to the legacy version, as was already done in 5.14 to the 12631 # case folding one, but this was not done, out of an abundance of 12632 # caution. The tables are set up here before we deal with the 12633 # full maps so that as we handle those, we can override the simple 12634 # maps for them in the legacy table, and merely add them in the 12635 # new-style one. 12636 my $legacy = Property->new("Legacy_" . $full_casing_full_name, 12637 File => $full_casing_full_name 12638 =~ s/case_Mapping//r, 12639 Format => $HEX_FORMAT, 12640 Default_Map => $CODE_POINT, 12641 Initialize => $full_casing_table, 12642 Replacement_Property => $full_casing_full_name, 12643 ); 12644 12645 $full_casing_table->add_comment(join_lines( <<END 12646This file includes both the simple and full case changing maps. The simple 12647ones are in the main body of the table below, and the full ones adding to or 12648overriding them are in the hash. 12649END 12650 )); 12651 12652 # The simple version's name in each mapping merely has an 's' in 12653 # front of the full one's 12654 my $simple_name = 's' . $full_casing_name; 12655 my $simple = property_ref($simple_name); 12656 $simple->initialize($full_casing_table) if $simple->to_output_map(); 12657 } 12658 12659 return; 12660 } 12661 12662 sub filter_2_1_8_special_casing_line { 12663 12664 # This version had duplicate entries in this file. Delete all but the 12665 # first one 12666 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null 12667 # fields 12668 if (exists $special_casing_code_points{$fields[0]}) { 12669 $_ = ""; 12670 return; 12671 } 12672 12673 $special_casing_code_points{$fields[0]} = 1; 12674 filter_special_casing_line(@_); 12675 } 12676 12677 sub filter_special_casing_line { 12678 # Change the format of $_ from SpecialCasing.txt into something that 12679 # the generic handler understands. Each input line contains three 12680 # case mappings. This will generate three lines to pass to the 12681 # generic handler for each of those. 12682 12683 # The input syntax (after stripping comments and trailing white space 12684 # is like one of the following (with the final two being entries that 12685 # we ignore): 12686 # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S 12687 # 03A3; 03C2; 03A3; 03A3; Final_Sigma; 12688 # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE 12689 # Note the trailing semi-colon, unlike many of the input files. That 12690 # means that there will be an extra null field generated by the split 12691 12692 my $file = shift; 12693 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 12694 12695 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null 12696 # fields 12697 12698 # field #4 is when this mapping is conditional. If any of these get 12699 # implemented, it would be by hard-coding in the casing functions in 12700 # the Perl core, not through tables. But if there is a new condition 12701 # we don't know about, output a warning. We know about all the 12702 # conditions through 6.0 12703 if ($fields[4] ne "") { 12704 my @conditions = split ' ', $fields[4]; 12705 if ($conditions[0] ne 'tr' # We know that these languages have 12706 # conditions, and some are multiple 12707 && $conditions[0] ne 'az' 12708 && $conditions[0] ne 'lt' 12709 12710 # And, we know about a single condition Final_Sigma, but 12711 # nothing else. 12712 && ($v_version gt v5.2.0 12713 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma'))) 12714 { 12715 $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore"); 12716 } 12717 elsif ($conditions[0] ne 'Final_Sigma') { 12718 12719 # Don't print out a message for Final_Sigma, because we 12720 # have hard-coded handling for it. (But the standard 12721 # could change what the rule should be, but it wouldn't 12722 # show up here anyway. 12723 12724 print "# SKIPPING Special Casing: $_\n" 12725 if $verbosity >= $VERBOSE; 12726 } 12727 $_ = ""; 12728 return; 12729 } 12730 elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) { 12731 $file->carp_bad_line('Extra fields'); 12732 $_ = ""; 12733 return; 12734 } 12735 12736 my $decimal_code_point = hex $fields[0]; 12737 12738 # Loop to handle each of the three mappings in the input line, in 12739 # order, with $i indicating the current field number. 12740 my $i = 0; 12741 for my $object ($lc, $tc, $uc) { 12742 $i++; # First time through, $i = 0 ... 3rd time = 3 12743 12744 my $value = $object->value_of($decimal_code_point); 12745 $value = ($value eq $CODE_POINT) 12746 ? $decimal_code_point 12747 : hex $value; 12748 12749 # If this isn't a multi-character mapping, it should already have 12750 # been read in. 12751 if ($fields[$i] !~ / /) { 12752 if ($value != hex $fields[$i]) { 12753 Carp::my_carp("Bad news. UnicodeData.txt thinks " 12754 . $object->name 12755 . "(0x$fields[0]) is $value" 12756 . " and SpecialCasing.txt thinks it is " 12757 . hex($fields[$i]) 12758 . ". Good luck. Retaining UnicodeData value, and proceeding anyway."); 12759 } 12760 } 12761 else { 12762 12763 # The mapping goes into both the legacy table, in which it 12764 # replaces the simple one... 12765 $file->insert_adjusted_lines("$fields[0]; Legacy_" 12766 . $object->full_name 12767 . "; $fields[$i]"); 12768 12769 # ... and the regular table, in which it is additional, 12770 # beyond the simple mapping. 12771 $file->insert_adjusted_lines("$fields[0]; " 12772 . $object->name 12773 . "; " 12774 . $CMD_DELIM 12775 . "$REPLACE_CMD=$MULTIPLE_BEFORE" 12776 . $CMD_DELIM 12777 . $fields[$i]); 12778 } 12779 } 12780 12781 # Everything has been handled by the insert_adjusted_lines() 12782 $_ = ""; 12783 12784 return; 12785 } 12786} 12787 12788sub filter_old_style_case_folding { 12789 # This transforms $_ containing the case folding style of 3.0.1, to 3.1 12790 # and later style. Different letters were used in the earlier. 12791 12792 my $file = shift; 12793 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 12794 12795 my @fields = split /\s*;\s*/; 12796 12797 if ($fields[1] eq 'L') { 12798 $fields[1] = 'C'; # L => C always 12799 } 12800 elsif ($fields[1] eq 'E') { 12801 if ($fields[2] =~ / /) { # E => C if one code point; F otherwise 12802 $fields[1] = 'F' 12803 } 12804 else { 12805 $fields[1] = 'C' 12806 } 12807 } 12808 else { 12809 $file->carp_bad_line("Expecting L or E in second field"); 12810 $_ = ""; 12811 return; 12812 } 12813 $_ = join("; ", @fields) . ';'; 12814 return; 12815} 12816 12817{ # Closure for case folding 12818 12819 # Create the map for simple only if are going to output it, for otherwise 12820 # it takes no part in anything we do. 12821 my $to_output_simple; 12822 12823 sub setup_case_folding($) { 12824 # Read in the case foldings in CaseFolding.txt. This handles both 12825 # simple and full case folding. 12826 12827 $to_output_simple 12828 = property_ref('Simple_Case_Folding')->to_output_map; 12829 12830 if (! $to_output_simple) { 12831 property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding'); 12832 } 12833 12834 # If we ever wanted to show that these tables were combined, a new 12835 # property method could be created, like set_combined_props() 12836 property_ref('Case_Folding')->add_comment(join_lines( <<END 12837This file includes both the simple and full case folding maps. The simple 12838ones are in the main body of the table below, and the full ones adding to or 12839overriding them are in the hash. 12840END 12841 )); 12842 return; 12843 } 12844 12845 sub filter_case_folding_line { 12846 # Called for each line in CaseFolding.txt 12847 # Input lines look like: 12848 # 0041; C; 0061; # LATIN CAPITAL LETTER A 12849 # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S 12850 # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S 12851 # 12852 # 'C' means that folding is the same for both simple and full 12853 # 'F' that it is only for full folding 12854 # 'S' that it is only for simple folding 12855 # 'T' is locale-dependent, and ignored 12856 # 'I' is a type of 'F' used in some early releases. 12857 # Note the trailing semi-colon, unlike many of the input files. That 12858 # means that there will be an extra null field generated by the split 12859 # below, which we ignore and hence is not an error. 12860 12861 my $file = shift; 12862 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 12863 12864 my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1; 12865 if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) { 12866 $file->carp_bad_line('Extra fields'); 12867 $_ = ""; 12868 return; 12869 } 12870 12871 if ($type =~ / ^ [IT] $/x) { # Skip Turkic case folding, is locale dependent 12872 $_ = ""; 12873 return; 12874 } 12875 12876 # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase 12877 # I are all full foldings; S is single-char. For S, there is always 12878 # an F entry, so we must allow multiple values for the same code 12879 # point. Fortunately this table doesn't need further manipulation 12880 # which would preclude using multiple-values. The S is now included 12881 # so that _swash_inversion_hash() is able to construct closures 12882 # without having to worry about F mappings. 12883 if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') { 12884 $_ = "$range; Case_Folding; " 12885 . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map"; 12886 } 12887 else { 12888 $_ = ""; 12889 $file->carp_bad_line('Expecting C F I S or T in second field'); 12890 } 12891 12892 # C and S are simple foldings, but simple case folding is not needed 12893 # unless we explicitly want its map table output. 12894 if ($to_output_simple && $type eq 'C' || $type eq 'S') { 12895 $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map"); 12896 } 12897 12898 return; 12899 } 12900 12901} # End case fold closure 12902 12903sub filter_jamo_line { 12904 # Filter Jamo.txt lines. This routine mainly is used to populate hashes 12905 # from this file that is used in generating the Name property for Jamo 12906 # code points. But, it also is used to convert early versions' syntax 12907 # into the modern form. Here are two examples: 12908 # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax 12909 # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax 12910 # 12911 # The input is $_, the output is $_ filtered. 12912 12913 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 12914 12915 # Let the caller handle unexpected input. In earlier versions, there was 12916 # a third field which is supposed to be a comment, but did not have a '#' 12917 # before it. 12918 return if @fields > (($v_version gt v3.0.0) ? 2 : 3); 12919 12920 $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous 12921 # beginning. 12922 12923 # Some 2.1 versions had this wrong. Causes havoc with the algorithm. 12924 $fields[1] = 'R' if $fields[0] eq '1105'; 12925 12926 # Add to structure so can generate Names from it. 12927 my $cp = hex $fields[0]; 12928 my $short_name = $fields[1]; 12929 $Jamo{$cp} = $short_name; 12930 if ($cp <= $LBase + $LCount) { 12931 $Jamo_L{$short_name} = $cp - $LBase; 12932 } 12933 elsif ($cp <= $VBase + $VCount) { 12934 $Jamo_V{$short_name} = $cp - $VBase; 12935 } 12936 elsif ($cp <= $TBase + $TCount) { 12937 $Jamo_T{$short_name} = $cp - $TBase; 12938 } 12939 else { 12940 Carp::my_carp_bug("Unexpected Jamo code point in $_"); 12941 } 12942 12943 12944 # Reassemble using just the first two fields to look like a typical 12945 # property file line 12946 $_ = "$fields[0]; $fields[1]"; 12947 12948 return; 12949} 12950 12951sub register_fraction($) { 12952 # This registers the input rational number so that it can be passed on to 12953 # utf8_heavy.pl, both in rational and floating forms. 12954 12955 my $rational = shift; 12956 12957 my $float = eval $rational; 12958 $nv_floating_to_rational{$float} = $rational; 12959 return; 12960} 12961 12962sub gcd($$) { # Greatest-common-divisor; from 12963 # http://en.wikipedia.org/wiki/Euclidean_algorithm 12964 my ($a, $b) = @_; 12965 12966 use integer; 12967 12968 while ($b != 0) { 12969 my $temp = $b; 12970 $b = $a % $b; 12971 $a = $temp; 12972 } 12973 return $a; 12974} 12975 12976sub reduce_fraction($) { 12977 my $fraction_ref = shift; 12978 12979 # Reduce a fraction to lowest terms. The Unicode data may be reducible, 12980 # hence this is needed. The argument is a reference to the 12981 # string denoting the fraction, which must be of the form: 12982 if ($$fraction_ref !~ / ^ (-?) (\d+) \/ (\d+) $ /ax) { 12983 Carp::my_carp_bug("Non-fraction input '$$fraction_ref'. Unchanged"); 12984 return; 12985 } 12986 12987 my $sign = $1; 12988 my $numerator = $2; 12989 my $denominator = $3; 12990 12991 use integer; 12992 12993 # Find greatest common divisor 12994 my $gcd = gcd($numerator, $denominator); 12995 12996 # And reduce using the gcd. 12997 if ($gcd != 1) { 12998 $numerator /= $gcd; 12999 $denominator /= $gcd; 13000 $$fraction_ref = "$sign$numerator/$denominator"; 13001 } 13002 13003 return; 13004} 13005 13006sub filter_numeric_value_line { 13007 # DNumValues contains lines of a different syntax than the typical 13008 # property file: 13009 # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO 13010 # 13011 # This routine transforms $_ containing the anomalous syntax to the 13012 # typical, by filtering out the extra columns, and convert early version 13013 # decimal numbers to strings that look like rational numbers. 13014 13015 my $file = shift; 13016 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 13017 13018 # Starting in 5.1, there is a rational field. Just use that, omitting the 13019 # extra columns. Otherwise convert the decimal number in the second field 13020 # to a rational, and omit extraneous columns. 13021 my @fields = split /\s*;\s*/, $_, -1; 13022 my $rational; 13023 13024 if ($v_version ge v5.1.0) { 13025 if (@fields != 4) { 13026 $file->carp_bad_line('Not 4 semi-colon separated fields'); 13027 $_ = ""; 13028 return; 13029 } 13030 reduce_fraction(\$fields[3]) if $fields[3] =~ qr{/}; 13031 $rational = $fields[3]; 13032 13033 $_ = join '; ', @fields[ 0, 3 ]; 13034 } 13035 else { 13036 13037 # Here, is an older Unicode file, which has decimal numbers instead of 13038 # rationals in it. Use the fraction to calculate the denominator and 13039 # convert to rational. 13040 13041 if (@fields != 2 && @fields != 3) { 13042 $file->carp_bad_line('Not 2 or 3 semi-colon separated fields'); 13043 $_ = ""; 13044 return; 13045 } 13046 13047 my $codepoints = $fields[0]; 13048 my $decimal = $fields[1]; 13049 if ($decimal =~ s/\.0+$//) { 13050 13051 # Anything ending with a decimal followed by nothing but 0's is an 13052 # integer 13053 $_ = "$codepoints; $decimal"; 13054 $rational = $decimal; 13055 } 13056 else { 13057 13058 my $denominator; 13059 if ($decimal =~ /\.50*$/) { 13060 $denominator = 2; 13061 } 13062 13063 # Here have the hardcoded repeating decimals in the fraction, and 13064 # the denominator they imply. There were only a few denominators 13065 # in the older Unicode versions of this file which this code 13066 # handles, so it is easy to convert them. 13067 13068 # The 4 is because of a round-off error in the Unicode 3.2 files 13069 elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) { 13070 $denominator = 3; 13071 } 13072 elsif ($decimal =~ /\.[27]50*$/) { 13073 $denominator = 4; 13074 } 13075 elsif ($decimal =~ /\.[2468]0*$/) { 13076 $denominator = 5; 13077 } 13078 elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) { 13079 $denominator = 6; 13080 } 13081 elsif ($decimal =~ /\.(12|37|62|87)50*$/) { 13082 $denominator = 8; 13083 } 13084 if ($denominator) { 13085 my $sign = ($decimal < 0) ? "-" : ""; 13086 my $numerator = int((abs($decimal) * $denominator) + .5); 13087 $rational = "$sign$numerator/$denominator"; 13088 $_ = "$codepoints; $rational"; 13089 } 13090 else { 13091 $file->carp_bad_line("Can't cope with number '$decimal'."); 13092 $_ = ""; 13093 return; 13094 } 13095 } 13096 } 13097 13098 register_fraction($rational) if $rational =~ qr{/}; 13099 return; 13100} 13101 13102{ # Closure 13103 my %unihan_properties; 13104 13105 sub construct_unihan { 13106 13107 my $file_object = shift; 13108 13109 return unless file_exists($file_object->file); 13110 13111 if ($v_version lt v4.0.0) { 13112 push @cjk_properties, 'URS ; Unicode_Radical_Stroke'; 13113 push @cjk_property_values, split "\n", <<'END'; 13114# @missing: 0000..10FFFF; Unicode_Radical_Stroke; <none> 13115END 13116 } 13117 13118 if ($v_version ge v3.0.0) { 13119 push @cjk_properties, split "\n", <<'END'; 13120cjkIRG_GSource; kIRG_GSource 13121cjkIRG_JSource; kIRG_JSource 13122cjkIRG_KSource; kIRG_KSource 13123cjkIRG_TSource; kIRG_TSource 13124cjkIRG_VSource; kIRG_VSource 13125END 13126 push @cjk_property_values, split "\n", <<'END'; 13127# @missing: 0000..10FFFF; cjkIRG_GSource; <none> 13128# @missing: 0000..10FFFF; cjkIRG_JSource; <none> 13129# @missing: 0000..10FFFF; cjkIRG_KSource; <none> 13130# @missing: 0000..10FFFF; cjkIRG_TSource; <none> 13131# @missing: 0000..10FFFF; cjkIRG_VSource; <none> 13132END 13133 } 13134 if ($v_version ge v3.1.0) { 13135 push @cjk_properties, 'cjkIRG_HSource; kIRG_HSource'; 13136 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_HSource; <none>'; 13137 } 13138 if ($v_version ge v3.1.1) { 13139 push @cjk_properties, 'cjkIRG_KPSource; kIRG_KPSource'; 13140 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_KPSource; <none>'; 13141 } 13142 if ($v_version ge v3.2.0) { 13143 push @cjk_properties, split "\n", <<'END'; 13144cjkAccountingNumeric; kAccountingNumeric 13145cjkCompatibilityVariant; kCompatibilityVariant 13146cjkOtherNumeric; kOtherNumeric 13147cjkPrimaryNumeric; kPrimaryNumeric 13148END 13149 push @cjk_property_values, split "\n", <<'END'; 13150# @missing: 0000..10FFFF; cjkAccountingNumeric; NaN 13151# @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point> 13152# @missing: 0000..10FFFF; cjkOtherNumeric; NaN 13153# @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN 13154END 13155 } 13156 if ($v_version gt v4.0.0) { 13157 push @cjk_properties, 'cjkIRG_USource; kIRG_USource'; 13158 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_USource; <none>'; 13159 } 13160 13161 if ($v_version ge v4.1.0) { 13162 push @cjk_properties, 'cjkIICore ; kIICore'; 13163 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIICore; <none>'; 13164 } 13165 } 13166 13167 sub setup_unihan { 13168 # Do any special setup for Unihan properties. 13169 13170 # This property gives the wrong computed type, so override. 13171 my $usource = property_ref('kIRG_USource'); 13172 $usource->set_type($STRING) if defined $usource; 13173 13174 # This property is to be considered binary (it says so in 13175 # http://www.unicode.org/reports/tr38/) 13176 my $iicore = property_ref('kIICore'); 13177 if (defined $iicore) { 13178 $iicore->set_type($FORCED_BINARY); 13179 $iicore->table("Y")->add_note("Matches any code point which has a non-null value for this property; see unicode.org UAX #38."); 13180 13181 # Unicode doesn't include the maps for this property, so don't 13182 # warn that they are missing. 13183 $iicore->set_pre_declared_maps(0); 13184 $iicore->add_comment(join_lines( <<END 13185This property contains string values, but any non-empty ones are considered to 13186be 'core', so Perl creates tables for both: 1) its string values, plus 2) 13187tables so that \\p{kIICore} matches any code point which has a non-empty 13188value for this property. 13189END 13190 )); 13191 } 13192 13193 return; 13194 } 13195 13196 sub filter_unihan_line { 13197 # Change unihan db lines to look like the others in the db. Here is 13198 # an input sample: 13199 # U+341C kCangjie IEKN 13200 13201 # Tabs are used instead of semi-colons to separate fields; therefore 13202 # they may have semi-colons embedded in them. Change these to periods 13203 # so won't screw up the rest of the code. 13204 s/;/./g; 13205 13206 # Remove lines that don't look like ones we accept. 13207 if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) { 13208 $_ = ""; 13209 return; 13210 } 13211 13212 # Extract the property, and save a reference to its object. 13213 my $property = $1; 13214 if (! exists $unihan_properties{$property}) { 13215 $unihan_properties{$property} = property_ref($property); 13216 } 13217 13218 # Don't do anything unless the property is one we're handling, which 13219 # we determine by seeing if there is an object defined for it or not 13220 if (! defined $unihan_properties{$property}) { 13221 $_ = ""; 13222 return; 13223 } 13224 13225 # Convert the tab separators to our standard semi-colons, and convert 13226 # the U+HHHH notation to the rest of the standard's HHHH 13227 s/\t/;/g; 13228 s/\b U \+ (?= $code_point_re )//xg; 13229 13230 #local $to_trace = 1 if main::DEBUG; 13231 trace $_ if main::DEBUG && $to_trace; 13232 13233 return; 13234 } 13235} 13236 13237sub filter_blocks_lines { 13238 # In the Blocks.txt file, the names of the blocks don't quite match the 13239 # names given in PropertyValueAliases.txt, so this changes them so they 13240 # do match: Blanks and hyphens are changed into underscores. Also makes 13241 # early release versions look like later ones 13242 # 13243 # $_ is transformed to the correct value. 13244 13245 my $file = shift; 13246 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 13247 13248 if ($v_version lt v3.2.0) { 13249 if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted 13250 $_ = ""; 13251 return; 13252 } 13253 13254 # Old versions used a different syntax to mark the range. 13255 $_ =~ s/;\s+/../ if $v_version lt v3.1.0; 13256 } 13257 13258 my @fields = split /\s*;\s*/, $_, -1; 13259 if (@fields != 2) { 13260 $file->carp_bad_line("Expecting exactly two fields"); 13261 $_ = ""; 13262 return; 13263 } 13264 13265 # Change hyphens and blanks in the block name field only 13266 $fields[1] =~ s/[ -]/_/g; 13267 $fields[1] =~ s/_ ( [a-z] ) /_\u$1/xg; # Capitalize first letter of word 13268 13269 $_ = join("; ", @fields); 13270 return; 13271} 13272 13273{ # Closure 13274 my $current_property; 13275 13276 sub filter_old_style_proplist { 13277 # PropList.txt has been in Unicode since version 2.0. Until 3.1, it 13278 # was in a completely different syntax. Ken Whistler of Unicode says 13279 # that it was something he used as an aid for his own purposes, but 13280 # was never an official part of the standard. Many of the properties 13281 # in it were incorporated into the later PropList.txt, but some were 13282 # not. This program uses this early file to generate property tables 13283 # that are otherwise not accessible in the early UCD's. It does this 13284 # for the ones that eventually became official, and don't appear to be 13285 # too different in their contents from the later official version, and 13286 # throws away the rest. It could be argued that the ones it generates 13287 # were probably not really official at that time, so should be 13288 # ignored. You can easily modify things to skip all of them by 13289 # changing this function to just set $_ to "", and return; and to skip 13290 # certain of them by by simply removing their declarations from 13291 # get_old_property_aliases(). 13292 # 13293 # Here is a list of all the ones that are thrown away: 13294 # Alphabetic The definitions for this are very 13295 # defective, so better to not mislead 13296 # people into thinking it works. 13297 # Instead the Perl extension of the 13298 # same name is constructed from first 13299 # principles. 13300 # Bidi=* duplicates UnicodeData.txt 13301 # Combining never made into official property; 13302 # is \P{ccc=0} 13303 # Composite never made into official property. 13304 # Currency Symbol duplicates UnicodeData.txt: gc=sc 13305 # Decimal Digit duplicates UnicodeData.txt: gc=nd 13306 # Delimiter never made into official property; 13307 # removed in 3.0.1 13308 # Format Control never made into official property; 13309 # similar to gc=cf 13310 # High Surrogate duplicates Blocks.txt 13311 # Ignorable Control never made into official property; 13312 # similar to di=y 13313 # ISO Control duplicates UnicodeData.txt: gc=cc 13314 # Left of Pair never made into official property; 13315 # Line Separator duplicates UnicodeData.txt: gc=zl 13316 # Low Surrogate duplicates Blocks.txt 13317 # Non-break was actually listed as a property 13318 # in 3.2, but without any code 13319 # points. Unicode denies that this 13320 # was ever an official property 13321 # Non-spacing duplicate UnicodeData.txt: gc=mn 13322 # Numeric duplicates UnicodeData.txt: gc=cc 13323 # Paired Punctuation never made into official property; 13324 # appears to be gc=ps + gc=pe 13325 # Paragraph Separator duplicates UnicodeData.txt: gc=cc 13326 # Private Use duplicates UnicodeData.txt: gc=co 13327 # Private Use High Surrogate duplicates Blocks.txt 13328 # Punctuation duplicates UnicodeData.txt: gc=p 13329 # Space different definition than eventual 13330 # one. 13331 # Titlecase duplicates UnicodeData.txt: gc=lt 13332 # Unassigned Code Value duplicates UnicodeData.txt: gc=cn 13333 # Zero-width never made into official property; 13334 # subset of gc=cf 13335 # Most of the properties have the same names in this file as in later 13336 # versions, but a couple do not. 13337 # 13338 # This subroutine filters $_, converting it from the old style into 13339 # the new style. Here's a sample of the old-style 13340 # 13341 # ******************************************* 13342 # 13343 # Property dump for: 0x100000A0 (Join Control) 13344 # 13345 # 200C..200D (2 chars) 13346 # 13347 # In the example, the property is "Join Control". It is kept in this 13348 # closure between calls to the subroutine. The numbers beginning with 13349 # 0x were internal to Ken's program that generated this file. 13350 13351 # If this line contains the property name, extract it. 13352 if (/^Property dump for: [^(]*\((.*)\)/) { 13353 $_ = $1; 13354 13355 # Convert white space to underscores. 13356 s/ /_/g; 13357 13358 # Convert the few properties that don't have the same name as 13359 # their modern counterparts 13360 s/Identifier_Part/ID_Continue/ 13361 or s/Not_a_Character/NChar/; 13362 13363 # If the name matches an existing property, use it. 13364 if (defined property_ref($_)) { 13365 trace "new property=", $_ if main::DEBUG && $to_trace; 13366 $current_property = $_; 13367 } 13368 else { # Otherwise discard it 13369 trace "rejected property=", $_ if main::DEBUG && $to_trace; 13370 undef $current_property; 13371 } 13372 $_ = ""; # The property is saved for the next lines of the 13373 # file, but this defining line is of no further use, 13374 # so clear it so that the caller won't process it 13375 # further. 13376 } 13377 elsif (! defined $current_property || $_ !~ /^$code_point_re/) { 13378 13379 # Here, the input line isn't a header defining a property for the 13380 # following section, and either we aren't in such a section, or 13381 # the line doesn't look like one that defines the code points in 13382 # such a section. Ignore this line. 13383 $_ = ""; 13384 } 13385 else { 13386 13387 # Here, we have a line defining the code points for the current 13388 # stashed property. Anything starting with the first blank is 13389 # extraneous. Otherwise, it should look like a normal range to 13390 # the caller. Append the property name so that it looks just like 13391 # a modern PropList entry. 13392 13393 $_ =~ s/\s.*//; 13394 $_ .= "; $current_property"; 13395 } 13396 trace $_ if main::DEBUG && $to_trace; 13397 return; 13398 } 13399} # End closure for old style proplist 13400 13401sub filter_old_style_normalization_lines { 13402 # For early releases of Unicode, the lines were like: 13403 # 74..2A76 ; NFKD_NO 13404 # For later releases this became: 13405 # 74..2A76 ; NFKD_QC; N 13406 # Filter $_ to look like those in later releases. 13407 # Similarly for MAYBEs 13408 13409 s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x; 13410 13411 # Also, the property FC_NFKC was abbreviated to FNC 13412 s/FNC/FC_NFKC/; 13413 return; 13414} 13415 13416sub setup_script_extensions { 13417 # The Script_Extensions property starts out with a clone of the Script 13418 # property. 13419 13420 $scx = property_ref("Script_Extensions"); 13421 return unless defined $scx; 13422 13423 $scx->_set_format($STRING_WHITE_SPACE_LIST); 13424 $scx->initialize($script); 13425 $scx->set_default_map($script->default_map); 13426 $scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these 13427 $scx->add_comment(join_lines( <<END 13428The values for code points that appear in one script are just the same as for 13429the 'Script' property. Likewise the values for those that appear in many 13430scripts are either 'Common' or 'Inherited', same as with 'Script'. But the 13431values of code points that appear in a few scripts are a space separated list 13432of those scripts. 13433END 13434 )); 13435 13436 # Initialize scx's tables and the aliases for them to be the same as sc's 13437 foreach my $table ($script->tables) { 13438 my $scx_table = $scx->add_match_table($table->name, 13439 Full_Name => $table->full_name); 13440 foreach my $alias ($table->aliases) { 13441 $scx_table->add_alias($alias->name); 13442 } 13443 } 13444} 13445 13446sub filter_script_extensions_line { 13447 # The Scripts file comes with the full name for the scripts; the 13448 # ScriptExtensions, with the short name. The final mapping file is a 13449 # combination of these, and without adjustment, would have inconsistent 13450 # entries. This filters the latter file to convert to full names. 13451 # Entries look like this: 13452 # 064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW 13453 13454 my @fields = split /\s*;\s*/; 13455 13456 # This script was erroneously omitted in this Unicode version. 13457 $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/; 13458 13459 my @full_names; 13460 foreach my $short_name (split " ", $fields[1]) { 13461 push @full_names, $script->table($short_name)->full_name; 13462 } 13463 $fields[1] = join " ", @full_names; 13464 $_ = join "; ", @fields; 13465 13466 return; 13467} 13468 13469sub generate_hst { 13470 13471 # Populates the Hangul Syllable Type property from first principles 13472 13473 my $file= shift; 13474 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 13475 13476 # These few ranges are hard-coded in. 13477 $file->insert_lines(split /\n/, <<'END' 134781100..1159 ; L 13479115F ; L 134801160..11A2 ; V 1348111A8..11F9 ; T 13482END 13483); 13484 13485 # The Hangul syllables in version 1 are at different code points than 13486 # those that came along starting in version 2, and have different names; 13487 # they comprise about 60% of the code points of the later version. 13488 # From my (khw) research on them (see <558493EB.4000807@att.net>), the 13489 # initial set is a subset of the later version, with different English 13490 # transliterations. I did not see an easy mapping between them. The 13491 # later set includes essentially all possibilities, even ones that aren't 13492 # in modern use (if they ever were), and over 96% of the new ones are type 13493 # LVT. Mathematically, the early set must also contain a preponderance of 13494 # LVT values. In lieu of doing nothing, we just set them all to LVT, and 13495 # expect that this will be right most of the time, which is better than 13496 # not being right at all. 13497 if ($v_version lt v2.0.0) { 13498 my $property = property_ref($file->property); 13499 $file->insert_lines(sprintf("%04X..%04X; LVT\n", 13500 $FIRST_REMOVED_HANGUL_SYLLABLE, 13501 $FINAL_REMOVED_HANGUL_SYLLABLE)); 13502 push @tables_that_may_be_empty, $property->table('LV')->complete_name; 13503 return; 13504 } 13505 13506 # The algorithmically derived syllables are almost all LVT ones, so 13507 # initialize the whole range with that. 13508 $file->insert_lines(sprintf "%04X..%04X; LVT\n", 13509 $SBase, $SBase + $SCount -1); 13510 13511 # Those ones that aren't LVT are LV, and they occur at intervals of 13512 # $TCount code points, starting with the first code point, at $SBase. 13513 for (my $i = $SBase; $i < $SBase + $SCount; $i += $TCount) { 13514 $file->insert_lines(sprintf "%04X..%04X; LV\n", $i, $i); 13515 } 13516 13517 return; 13518} 13519 13520sub generate_GCB { 13521 13522 # Populates the Grapheme Cluster Break property from first principles 13523 13524 my $file= shift; 13525 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 13526 13527 # All these definitions are from 13528 # http://www.unicode.org/reports/tr29/tr29-3.html with confirmation 13529 # from http://www.unicode.org/reports/tr29/tr29-4.html 13530 13531 foreach my $range ($gc->ranges) { 13532 13533 # Extend includes gc=Me and gc=Mn, while Control includes gc=Cc 13534 # and gc=Cf 13535 if ($range->value =~ / ^ M [en] $ /x) { 13536 $file->insert_lines(sprintf "%04X..%04X; Extend", 13537 $range->start, $range->end); 13538 } 13539 elsif ($range->value =~ / ^ C [cf] $ /x) { 13540 $file->insert_lines(sprintf "%04X..%04X; Control", 13541 $range->start, $range->end); 13542 } 13543 } 13544 $file->insert_lines("2028; Control"); # Line Separator 13545 $file->insert_lines("2029; Control"); # Paragraph Separator 13546 13547 $file->insert_lines("000D; CR"); 13548 $file->insert_lines("000A; LF"); 13549 13550 # Also from http://www.unicode.org/reports/tr29/tr29-3.html. 13551 foreach my $code_point ( qw{ 13552 09BE 09D7 0B3E 0B57 0BBE 0BD7 0CC2 0CD5 0CD6 13553 0D3E 0D57 0DCF 0DDF FF9E FF9F 1D165 1D16E 1D16F 13554 } 13555 ) { 13556 my $category = $gc->value_of(hex $code_point); 13557 next if ! defined $category || $category eq 'Cn'; # But not if 13558 # unassigned in this 13559 # release 13560 $file->insert_lines("$code_point; Extend"); 13561 } 13562 13563 my $hst = property_ref('Hangul_Syllable_Type'); 13564 if ($hst->count > 0) { 13565 foreach my $range ($hst->ranges) { 13566 $file->insert_lines(sprintf "%04X..%04X; %s", 13567 $range->start, $range->end, $range->value); 13568 } 13569 } 13570 else { 13571 generate_hst($file); 13572 } 13573 13574 main::process_generic_property_file($file); 13575} 13576 13577 13578sub fixup_early_perl_name_alias { 13579 13580 # Different versions of Unicode have varying support for the name synonyms 13581 # below. Just include everything. As of 6.1, all these are correct in 13582 # the Unicode-supplied file. 13583 13584 my $file= shift; 13585 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 13586 13587 13588 # ALERT did not come along until 6.0, at which point it became preferred 13589 # over BELL. By inserting it last in early releases, BELL is preferred 13590 # over it; and vice-vers in 6.0 13591 my $type_for_bell = ($v_version lt v6.0.0) 13592 ? 'correction' 13593 : 'alternate'; 13594 $file->insert_lines(split /\n/, <<END 135950007;BELL; $type_for_bell 13596000A;LINE FEED (LF);alternate 13597000C;FORM FEED (FF);alternate 13598000D;CARRIAGE RETURN (CR);alternate 135990085;NEXT LINE (NEL);alternate 13600END 13601 13602 ); 13603 13604 # One might think that the the 'Unicode_1_Name' field, could work for most 13605 # of the above names, but sadly that field varies depending on the 13606 # release. Version 1.1.5 had no names for any of the controls; Version 13607 # 2.0 introduced names for the C0 controls, and 3.0 introduced C1 names. 13608 # 3.0.1 removed the name INDEX; and 3.2 changed some names: 13609 # changed to parenthesized versions like "NEXT LINE" to 13610 # "NEXT LINE (NEL)"; 13611 # changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD 13612 # changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;; 13613 # changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR 13614 # 13615 # All these are present in the 6.1 NameAliases.txt 13616 13617 return; 13618} 13619 13620sub filter_later_version_name_alias_line { 13621 13622 # This file has an extra entry per line for the alias type. This is 13623 # handled by creating a compound entry: "$alias: $type"; First, split 13624 # the line into components. 13625 my ($range, $alias, $type, @remainder) 13626 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 13627 13628 # This file contains multiple entries for some components, so tell the 13629 # downstream code to allow this in our internal tables; the 13630 # $MULTIPLE_AFTER preserves the input ordering. 13631 $_ = join ";", $range, $CMD_DELIM 13632 . $REPLACE_CMD 13633 . '=' 13634 . $MULTIPLE_AFTER 13635 . $CMD_DELIM 13636 . "$alias: $type", 13637 @remainder; 13638 return; 13639} 13640 13641sub filter_early_version_name_alias_line { 13642 13643 # Early versions did not have the trailing alias type field; implicitly it 13644 # was 'correction'. 13645 $_ .= "; correction"; 13646 13647 filter_later_version_name_alias_line; 13648 return; 13649} 13650 13651sub filter_all_caps_script_names { 13652 13653 # Some early Unicode releases had the script names in all CAPS. This 13654 # converts them to just the first letter of each word being capital. 13655 13656 my ($range, $script, @remainder) 13657 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 13658 my @words = split /[_-]/, $script; 13659 for my $word (@words) { 13660 $word = 13661 ucfirst(lc($word)) if $word ne 'CJK'; 13662 } 13663 $script = join "_", @words; 13664 $_ = join ";", $range, $script, @remainder; 13665} 13666 13667sub finish_Unicode() { 13668 # This routine should be called after all the Unicode files have been read 13669 # in. It: 13670 # 1) Creates properties that are missing from the version of Unicode being 13671 # compiled, and which, for whatever reason, are needed for the Perl 13672 # core to function properly. These are minimally populated as 13673 # necessary. 13674 # 2) Adds the mappings for code points missing from the files which have 13675 # defaults specified for them. 13676 # 3) At this this point all mappings are known, so it computes the type of 13677 # each property whose type hasn't been determined yet. 13678 # 4) Calculates all the regular expression match tables based on the 13679 # mappings. 13680 # 5) Calculates and adds the tables which are defined by Unicode, but 13681 # which aren't derived by them, and certain derived tables that Perl 13682 # uses. 13683 13684 # Folding information was introduced later into Unicode data. To get 13685 # Perl's case ignore (/i) to work at all in releases that don't have 13686 # folding, use the best available alternative, which is lower casing. 13687 my $fold = property_ref('Case_Folding'); 13688 if ($fold->is_empty) { 13689 $fold->initialize(property_ref('Lowercase_Mapping')); 13690 $fold->add_note(join_lines(<<END 13691WARNING: This table uses lower case as a substitute for missing fold 13692information 13693END 13694 )); 13695 } 13696 13697 # Multiple-character mapping was introduced later into Unicode data, so it 13698 # is by default the simple version. If to output the simple versions and 13699 # not present, just use the regular (which in these Unicode versions is 13700 # the simple as well). 13701 foreach my $map (qw { Uppercase_Mapping 13702 Lowercase_Mapping 13703 Titlecase_Mapping 13704 Case_Folding 13705 } ) 13706 { 13707 my $comment = <<END; 13708 13709Note that although the Perl core uses this file, it has the standard values 13710for code points from U+0000 to U+00FF compiled in, so changing this table will 13711not change the core's behavior with respect to these code points. Use 13712Unicode::Casing to override this table. 13713END 13714 if ($map eq 'Case_Folding') { 13715 $comment .= <<END; 13716(/i regex matching is not overridable except by using a custom regex engine) 13717END 13718 } 13719 property_ref($map)->add_comment(join_lines($comment)); 13720 my $simple = property_ref("Simple_$map"); 13721 next if ! $simple->is_empty; 13722 if ($simple->to_output_map) { 13723 $simple->initialize(property_ref($map)); 13724 } 13725 else { 13726 property_ref($map)->set_proxy_for($simple->name); 13727 } 13728 } 13729 13730 # For each property, fill in any missing mappings, and calculate the re 13731 # match tables. If a property has more than one missing mapping, the 13732 # default is a reference to a data structure, and may require data from 13733 # other properties to resolve. The sort is used to cause these to be 13734 # processed last, after all the other properties have been calculated. 13735 # (Fortunately, the missing properties so far don't depend on each other.) 13736 foreach my $property 13737 (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 } 13738 property_ref('*')) 13739 { 13740 # $perl has been defined, but isn't one of the Unicode properties that 13741 # need to be finished up. 13742 next if $property == $perl; 13743 13744 # Nor do we need to do anything with properties that aren't going to 13745 # be output. 13746 next if $property->fate == $SUPPRESSED; 13747 13748 # Handle the properties that have more than one possible default 13749 if (ref $property->default_map) { 13750 my $default_map = $property->default_map; 13751 13752 # These properties have stored in the default_map: 13753 # One or more of: 13754 # 1) A default map which applies to all code points in a 13755 # certain class 13756 # 2) an expression which will evaluate to the list of code 13757 # points in that class 13758 # And 13759 # 3) the default map which applies to every other missing code 13760 # point. 13761 # 13762 # Go through each list. 13763 while (my ($default, $eval) = $default_map->get_next_defaults) { 13764 13765 # Get the class list, and intersect it with all the so-far 13766 # unspecified code points yielding all the code points 13767 # in the class that haven't been specified. 13768 my $list = eval $eval; 13769 if ($@) { 13770 Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'"); 13771 last; 13772 } 13773 13774 # Narrow down the list to just those code points we don't have 13775 # maps for yet. 13776 $list = $list & $property->inverse_list; 13777 13778 # Add mappings to the property for each code point in the list 13779 foreach my $range ($list->ranges) { 13780 $property->add_map($range->start, $range->end, $default, 13781 Replace => $CROAK); 13782 } 13783 } 13784 13785 # All remaining code points have the other mapping. Set that up 13786 # so the normal single-default mapping code will work on them 13787 $property->set_default_map($default_map->other_default); 13788 13789 # And fall through to do that 13790 } 13791 13792 # We should have enough data now to compute the type of the property. 13793 my $property_name = $property->name; 13794 $property->compute_type; 13795 my $property_type = $property->type; 13796 13797 next if ! $property->to_create_match_tables; 13798 13799 # Here want to create match tables for this property 13800 13801 # The Unicode db always (so far, and they claim into the future) have 13802 # the default for missing entries in binary properties be 'N' (unless 13803 # there is a '@missing' line that specifies otherwise) 13804 if (! defined $property->default_map) { 13805 if ($property_type == $BINARY) { 13806 $property->set_default_map('N'); 13807 } 13808 elsif ($property_type == $ENUM) { 13809 Carp::my_carp("Property '$property_name doesn't have a default mapping. Using a fake one"); 13810 $property->set_default_map('XXX This makes sure there is a default map'); 13811 } 13812 } 13813 13814 # Add any remaining code points to the mapping, using the default for 13815 # missing code points. 13816 my $default_table; 13817 my $default_map = $property->default_map; 13818 if ($property_type == $FORCED_BINARY) { 13819 13820 # A forced binary property creates a 'Y' table that matches all 13821 # non-default values. The actual string values are also written out 13822 # as a map table. (The default value will almost certainly be the 13823 # empty string, so the pod glosses over the distinction, and just 13824 # talks about empty vs non-empty.) 13825 my $yes = $property->table("Y"); 13826 foreach my $range ($property->ranges) { 13827 next if $range->value eq $default_map; 13828 $yes->add_range($range->start, $range->end); 13829 } 13830 $property->table("N")->set_complement($yes); 13831 } 13832 else { 13833 if (defined $default_map) { 13834 13835 # Make sure there is a match table for the default 13836 if (! defined ($default_table = $property->table($default_map))) 13837 { 13838 $default_table = $property->add_match_table($default_map); 13839 } 13840 13841 # And, if the property is binary, the default table will just 13842 # be the complement of the other table. 13843 if ($property_type == $BINARY) { 13844 my $non_default_table; 13845 13846 # Find the non-default table. 13847 for my $table ($property->tables) { 13848 if ($table == $default_table) { 13849 if ($v_version le v5.0.0) { 13850 $table->add_alias($_) for qw(N No F False); 13851 } 13852 next; 13853 } elsif ($v_version le v5.0.0) { 13854 $table->add_alias($_) for qw(Y Yes T True); 13855 } 13856 $non_default_table = $table; 13857 } 13858 $default_table->set_complement($non_default_table); 13859 } 13860 else { 13861 13862 # This fills in any missing values with the default. It's 13863 # not necessary to do this with binary properties, as the 13864 # default is defined completely in terms of the Y table. 13865 $property->add_map(0, $MAX_WORKING_CODEPOINT, 13866 $default_map, Replace => $NO); 13867 } 13868 } 13869 13870 # Have all we need to populate the match tables. 13871 my $maps_should_be_defined = $property->pre_declared_maps; 13872 foreach my $range ($property->ranges) { 13873 my $map = $range->value; 13874 my $table = $property->table($map); 13875 if (! defined $table) { 13876 13877 # Integral and rational property values are not 13878 # necessarily defined in PropValueAliases, but whether all 13879 # the other ones should be depends on the property. 13880 if ($maps_should_be_defined 13881 && $map !~ /^ -? \d+ ( \/ \d+ )? $/x) 13882 { 13883 Carp::my_carp("Table '$property_name=$map' should " 13884 . "have been defined. Defining it now.") 13885 } 13886 $table = $property->add_match_table($map); 13887 } 13888 13889 next if $table->complement != 0; # Don't need to populate these 13890 $table->add_range($range->start, $range->end); 13891 } 13892 } 13893 13894 # For Perl 5.6 compatibility, all properties matchable in regexes can 13895 # have an optional 'Is_' prefix. This is now done in utf8_heavy.pl. 13896 # But warn if this creates a conflict with a (new) Unicode property 13897 # name, although it appears that Unicode has made a decision never to 13898 # begin a property name with 'Is_', so this shouldn't happen. 13899 foreach my $alias ($property->aliases) { 13900 my $Is_name = 'Is_' . $alias->name; 13901 if (defined (my $pre_existing = property_ref($Is_name))) { 13902 Carp::my_carp(<<END 13903There is already an alias named $Is_name (from " . $pre_existing . "), so 13904creating one for $property won't work. This is bad news. If it is not too 13905late, get Unicode to back off. Otherwise go back to the old scheme (findable 13906from the git blame log for this area of the code that suppressed individual 13907aliases that conflict with the new Unicode names. Proceeding anyway. 13908END 13909 ); 13910 } 13911 } # End of loop through aliases for this property 13912 } # End of loop through all Unicode properties. 13913 13914 # Fill in the mappings that Unicode doesn't completely furnish. First the 13915 # single letter major general categories. If Unicode were to start 13916 # delivering the values, this would be redundant, but better that than to 13917 # try to figure out if should skip and not get it right. Ths could happen 13918 # if a new major category were to be introduced, and the hard-coded test 13919 # wouldn't know about it. 13920 # This routine depends on the standard names for the general categories 13921 # being what it thinks they are, like 'Cn'. The major categories are the 13922 # union of all the general category tables which have the same first 13923 # letters. eg. L = Lu + Lt + Ll + Lo + Lm 13924 foreach my $minor_table ($gc->tables) { 13925 my $minor_name = $minor_table->name; 13926 next if length $minor_name == 1; 13927 if (length $minor_name != 2) { 13928 Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped."); 13929 next; 13930 } 13931 13932 my $major_name = uc(substr($minor_name, 0, 1)); 13933 my $major_table = $gc->table($major_name); 13934 $major_table += $minor_table; 13935 } 13936 13937 # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt 13938 # defines it as LC) 13939 my $LC = $gc->table('LC'); 13940 $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards... 13941 $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility. 13942 13943 13944 if ($LC->is_empty) { # Assume if not empty that Unicode has started to 13945 # deliver the correct values in it 13946 $LC->initialize($gc->table('Ll') + $gc->table('Lu')); 13947 13948 # Lt not in release 1. 13949 if (defined $gc->table('Lt')) { 13950 $LC += $gc->table('Lt'); 13951 $gc->table('Lt')->set_caseless_equivalent($LC); 13952 } 13953 } 13954 $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]'); 13955 13956 $gc->table('Ll')->set_caseless_equivalent($LC); 13957 $gc->table('Lu')->set_caseless_equivalent($LC); 13958 13959 # Create digit and case fold tables with the original file names for 13960 # backwards compatibility with applications that read them directly. 13961 my $Digit = Property->new("Legacy_Perl_Decimal_Digit", 13962 Default_Map => "", 13963 File => 'Digit', # Trad. location 13964 Directory => $map_directory, 13965 Type => $STRING, 13966 Replacement_Property => "Perl_Decimal_Digit", 13967 Initialize => property_ref('Perl_Decimal_Digit'), 13968 ); 13969 $Digit->add_comment(join_lines(<<END 13970This file gives the mapping of all code points which represent a single 13971decimal digit [0-9] to their respective digits. For example, the code point 13972U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those 13973that have Numeric_Type=Decimal; not special things, like subscripts nor Roman 13974numerals. 13975END 13976 )); 13977 13978 # Make sure this assumption in perl core code is valid in this Unicode 13979 # release, with known exceptions 13980 foreach my $range (property_ref('Numeric-Type')->table('Decimal')->ranges) { 13981 next if $range->end - $range->start == 9; 13982 next if $range->start == 0x1D7CE; # This whole range was added in 3.1 13983 next if $range->end == 0x19DA && $v_version eq v5.2.0; 13984 next if $range->end - $range->start < 9 && $v_version le 4.0.0; 13985 Carp::my_carp("Range $range unexpectedly doesn't contain 10" 13986 . " decimal digits. Code in regcomp.c assumes it does," 13987 . " and will have to be fixed. Proceeding anyway."); 13988 } 13989 13990 Property->new('Legacy_Case_Folding', 13991 File => "Fold", 13992 Directory => $map_directory, 13993 Default_Map => $CODE_POINT, 13994 Type => $STRING, 13995 Replacement_Property => "Case_Folding", 13996 Format => $HEX_FORMAT, 13997 Initialize => property_ref('cf'), 13998 ); 13999 14000 # The Script_Extensions property started out as a clone of the Script 14001 # property. But processing its data file caused some elements to be 14002 # replaced with different data. (These elements were for the Common and 14003 # Inherited properties.) This data is a qw() list of all the scripts that 14004 # the code points in the given range are in. An example line is: 14005 # 060C ; Arab Syrc Thaa # Po ARABIC COMMA 14006 # 14007 # The code above has created a new match table named "Arab Syrc Thaa" 14008 # which contains 060C. (The cloned table started out with this code point 14009 # mapping to "Common".) Now we add 060C to each of the Arab, Syrc, and 14010 # Thaa match tables. Then we delete the now spurious "Arab Syrc Thaa" 14011 # match table. This is repeated for all these tables and ranges. The map 14012 # data is retained in the map table for reference, but the spurious match 14013 # tables are deleted. 14014 14015 if (defined $scx) { 14016 foreach my $table ($scx->tables) { 14017 next unless $table->name =~ /\s/; # All the new and only the new 14018 # tables have a space in their 14019 # names 14020 my @scripts = split /\s+/, $table->name; 14021 foreach my $script (@scripts) { 14022 my $script_table = $scx->table($script); 14023 $script_table += $table; 14024 } 14025 $scx->delete_match_table($table); 14026 } 14027 14028 # Mark the scx table as the parent of the corresponding sc table for 14029 # those which are identical. This causes the pod for the script table 14030 # to refer to the corresponding scx one. 14031 # 14032 # This has to be in a separate loop from above, so as to wait until 14033 # the tables are stabilized before checking for equivalency. 14034 if (defined $pod_directory) { 14035 foreach my $table ($scx->tables) { 14036 my $plain_sc_equiv = $script->table($table->name); 14037 if ($table->matches_identically_to($plain_sc_equiv)) { 14038 $plain_sc_equiv->set_equivalent_to($table, Related => 1); 14039 } 14040 } 14041 } 14042 } 14043 14044 return; 14045} 14046 14047sub pre_3_dot_1_Nl () { 14048 14049 # Return a range list for gc=nl for Unicode versions prior to 3.1, which 14050 # is when Unicode's became fully usable. These code points were 14051 # determined by inspection and experimentation. gc=nl is important for 14052 # certain Perl-extension properties that should be available in all 14053 # releases. 14054 14055 my $Nl = Range_List->new(); 14056 if (defined (my $official = $gc->table('Nl'))) { 14057 $Nl += $official; 14058 } 14059 else { 14060 $Nl->add_range(0x2160, 0x2182); 14061 $Nl->add_range(0x3007, 0x3007); 14062 $Nl->add_range(0x3021, 0x3029); 14063 } 14064 $Nl->add_range(0xFE20, 0xFE23); 14065 $Nl->add_range(0x16EE, 0x16F0) if $v_version ge v3.0.0; # 3.0 was when 14066 # these were added 14067 return $Nl; 14068} 14069 14070sub calculate_Assigned() { # Set $Assigned to the gc != Cn code points; may be 14071 # called before the Cn's are completely filled. 14072 # Works on Unicodes earlier than ones that 14073 # explicitly specify Cn. 14074 return if defined $Assigned; 14075 14076 if (! defined $gc || $gc->is_empty()) { 14077 Carp::my_carp_bug("calculate_Assigned() called before $gc is populated"); 14078 } 14079 14080 $Assigned = $perl->add_match_table('Assigned', 14081 Description => "All assigned code points", 14082 ); 14083 while (defined (my $range = $gc->each_range())) { 14084 my $standard_value = standardize($range->value); 14085 next if $standard_value eq 'cn' || $standard_value eq 'unassigned'; 14086 $Assigned->add_range($range->start, $range->end); 14087 } 14088} 14089 14090sub calculate_DI() { # Set $DI to a Range_List equivalent to the 14091 # Default_Ignorable_Code_Point property. Works on 14092 # Unicodes earlier than ones that explicitly specify 14093 # DI. 14094 return if defined $DI; 14095 14096 if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) { 14097 $DI = $di->table('Y'); 14098 } 14099 else { 14100 $DI = Range_List->new(Initialize => [ 0x180B .. 0x180D, 14101 0x2060 .. 0x206F, 14102 0xFE00 .. 0xFE0F, 14103 0xFFF0 .. 0xFFFB, 14104 ]); 14105 if ($v_version ge v2.0) { 14106 $DI += $gc->table('Cf') 14107 + $gc->table('Cs'); 14108 14109 # These are above the Unicode version 1 max 14110 $DI->add_range(0xE0000, 0xE0FFF); 14111 } 14112 $DI += $gc->table('Cc') 14113 - ord("\t") 14114 - utf8::unicode_to_native(0x0A) # LINE FEED 14115 - utf8::unicode_to_native(0x0B) # VERTICAL TAB 14116 - ord("\f") 14117 - utf8::unicode_to_native(0x0D) # CARRIAGE RETURN 14118 - utf8::unicode_to_native(0x85); # NEL 14119 } 14120} 14121 14122sub calculate_NChar() { # Create a Perl extension match table which is the 14123 # same as the Noncharacter_Code_Point property, and 14124 # set $NChar to point to it. Works on Unicodes 14125 # earlier than ones that explicitly specify NChar 14126 return if defined $NChar; 14127 14128 $NChar = $perl->add_match_table('_Perl_Nchar', 14129 Perl_Extension => 1, 14130 Fate => $INTERNAL_ONLY); 14131 if (defined (my $off_nchar = property_ref('NChar'))) { 14132 $NChar->initialize($off_nchar->table('Y')); 14133 } 14134 else { 14135 $NChar->initialize([ 0xFFFE .. 0xFFFF ]); 14136 if ($v_version ge v2.0) { # First release with these nchars 14137 for (my $i = 0x1FFFE; $i <= 0x10FFFE; $i += 0x10000) { 14138 $NChar += [ $i .. $i+1 ]; 14139 } 14140 } 14141 } 14142} 14143 14144sub handle_compare_versions () { 14145 # This fixes things up for the $compare_versions capability, where we 14146 # compare Unicode version X with version Y (with Y > X), and we are 14147 # running it on the Unicode Data for version Y. 14148 # 14149 # It works by calculating the code points whose meaning has been specified 14150 # after release X, by using the Age property. The complement of this set 14151 # is the set of code points whose meaning is unchanged between the 14152 # releases. This is the set the program restricts itself to. It includes 14153 # everything whose meaning has been specified by the time version X came 14154 # along, plus those still unassigned by the time of version Y. (We will 14155 # continue to use the word 'assigned' to mean 'meaning has been 14156 # specified', as it's shorter and is accurate in all cases except the 14157 # Noncharacter code points.) 14158 # 14159 # This function is run after all the properties specified by Unicode have 14160 # been calculated for release Y. This makes sure we get all the nuances 14161 # of Y's rules. (It is done before the Perl extensions are calculated, as 14162 # those are based entirely on the Unicode ones.) But doing it after the 14163 # Unicode table calculations means we have to fix up the Unicode tables. 14164 # We do this by subtracting the code points that have been assigned since 14165 # X (which is actually done by ANDing each table of assigned code points 14166 # with the set of unchanged code points). Most Unicode properties are of 14167 # the form such that all unassigned code points have a default, grab-bag, 14168 # property value which is changed when the code point gets assigned. For 14169 # these, we just remove the changed code points from the table for the 14170 # latter property value, and add them back in to the grab-bag one. A few 14171 # other properties are not entirely of this form and have values for some 14172 # or all unassigned code points that are not the grab-bag one. These have 14173 # to be handled specially, and are hard-coded in to this routine based on 14174 # manual inspection of the Unicode character database. A list of the 14175 # outlier code points is made for each of these properties, and those 14176 # outliers are excluded from adding and removing from tables. 14177 # 14178 # Note that there are glitches when comparing against Unicode 1.1, as some 14179 # Hangul syllables in it were later ripped out and eventually replaced 14180 # with other things. 14181 14182 print "Fixing up for version comparison\n" if $verbosity >= $PROGRESS; 14183 14184 my $after_first_version = "All matching code points were added after " 14185 . "Unicode $string_compare_versions"; 14186 14187 # Calculate the delta as those code points that have been newly assigned 14188 # since the first compare version. 14189 my $delta = Range_List->new(); 14190 foreach my $table ($age->tables) { 14191 use version; 14192 next if $table == $age->table('Unassigned'); 14193 next if version->parse($table->name) 14194 le version->parse($string_compare_versions); 14195 $delta += $table; 14196 } 14197 if ($delta->is_empty) { 14198 die ("No changes; perhaps you need a 'DAge.txt' file?"); 14199 } 14200 14201 my $unchanged = ~ $delta; 14202 14203 calculate_Assigned() if ! defined $Assigned; 14204 $Assigned &= $unchanged; 14205 14206 # $Assigned now contains the code points that were assigned as of Unicode 14207 # version X. 14208 14209 # A block is all or nothing. If nothing is assigned in it, it all goes 14210 # back to the No_Block pool; but if even one code point is assigned, the 14211 # block is retained. 14212 my $no_block = $block->table('No_Block'); 14213 foreach my $this_block ($block->tables) { 14214 next if $this_block == $no_block 14215 || ! ($this_block & $Assigned)->is_empty; 14216 $this_block->set_fate($SUPPRESSED, $after_first_version); 14217 foreach my $range ($this_block->ranges) { 14218 $block->replace_map($range->start, $range->end, 'No_Block') 14219 } 14220 $no_block += $this_block; 14221 } 14222 14223 my @special_delta_properties; # List of properties that have to be 14224 # handled specially. 14225 my %restricted_delta; # Keys are the entries in 14226 # @special_delta_properties; values 14227 # are the range list of the code points 14228 # that behave normally when they get 14229 # assigned. 14230 14231 # In the next three properties, the Default Ignorable code points are 14232 # outliers. 14233 calculate_DI(); 14234 $DI &= $unchanged; 14235 14236 push @special_delta_properties, property_ref('_Perl_GCB'); 14237 $restricted_delta{$special_delta_properties[-1]} = ~ $DI; 14238 14239 if (defined (my $cwnfkcc = property_ref('Changes_When_NFKC_Casefolded'))) 14240 { 14241 push @special_delta_properties, $cwnfkcc; 14242 $restricted_delta{$special_delta_properties[-1]} = ~ $DI; 14243 } 14244 14245 calculate_NChar(); # Non-character code points 14246 $NChar &= $unchanged; 14247 14248 # This may have to be updated from time-to-time to get the most accurate 14249 # results. 14250 my $default_BC_non_LtoR = Range_List->new(Initialize => 14251 # These came from the comments in v8.0 DBidiClass.txt 14252 [ # AL 14253 0x0600 .. 0x07BF, 14254 0x08A0 .. 0x08FF, 14255 0xFB50 .. 0xFDCF, 14256 0xFDF0 .. 0xFDFF, 14257 0xFE70 .. 0xFEFF, 14258 0x1EE00 .. 0x1EEFF, 14259 # R 14260 0x0590 .. 0x05FF, 14261 0x07C0 .. 0x089F, 14262 0xFB1D .. 0xFB4F, 14263 0x10800 .. 0x10FFF, 14264 0x1E800 .. 0x1EDFF, 14265 0x1EF00 .. 0x1EFFF, 14266 # ET 14267 0x20A0 .. 0x20CF, 14268 ] 14269 ); 14270 $default_BC_non_LtoR += $DI + $NChar; 14271 push @special_delta_properties, property_ref('BidiClass'); 14272 $restricted_delta{$special_delta_properties[-1]} = ~ $default_BC_non_LtoR; 14273 14274 if (defined (my $eaw = property_ref('East_Asian_Width'))) { 14275 14276 my $default_EA_width_W = Range_List->new(Initialize => 14277 # From comments in v8.0 EastAsianWidth.txt 14278 [ 14279 0x3400 .. 0x4DBF, 14280 0x4E00 .. 0x9FFF, 14281 0xF900 .. 0xFAFF, 14282 0x20000 .. 0x2A6DF, 14283 0x2A700 .. 0x2B73F, 14284 0x2B740 .. 0x2B81F, 14285 0x2B820 .. 0x2CEAF, 14286 0x2F800 .. 0x2FA1F, 14287 0x20000 .. 0x2FFFD, 14288 0x30000 .. 0x3FFFD, 14289 ] 14290 ); 14291 push @special_delta_properties, $eaw; 14292 $restricted_delta{$special_delta_properties[-1]} 14293 = ~ $default_EA_width_W; 14294 14295 # Line break came along in the same release as East_Asian_Width, and 14296 # the non-grab-bag default set is a superset of the EAW one. 14297 if (defined (my $lb = property_ref('Line_Break'))) { 14298 my $default_LB_non_XX = Range_List->new(Initialize => 14299 # From comments in v8.0 LineBreak.txt 14300 [ 0x20A0 .. 0x20CF ]); 14301 $default_LB_non_XX += $default_EA_width_W; 14302 push @special_delta_properties, $lb; 14303 $restricted_delta{$special_delta_properties[-1]} 14304 = ~ $default_LB_non_XX; 14305 } 14306 } 14307 14308 # Go through every property, skipping those we've already worked on, those 14309 # that are immutable, and the perl ones that will be calculated after this 14310 # routine has done its fixup. 14311 foreach my $property (property_ref('*')) { 14312 next if $property == $perl # Done later in the program 14313 || $property == $block # Done just above 14314 || $property == $DI # Done just above 14315 || $property == $NChar # Done just above 14316 14317 # The next two are invariant across Unicode versions 14318 || $property == property_ref('Pattern_Syntax') 14319 || $property == property_ref('Pattern_White_Space'); 14320 14321 # Find the grab-bag value. 14322 my $default_map = $property->default_map; 14323 14324 if (! $property->to_create_match_tables) { 14325 14326 # Here there aren't any match tables. So far, all such properties 14327 # have a default map, and don't require special handling. Just 14328 # change each newly assigned code point back to the default map, 14329 # as if they were unassigned. 14330 foreach my $range ($delta->ranges) { 14331 $property->add_map($range->start, 14332 $range->end, 14333 $default_map, 14334 Replace => $UNCONDITIONALLY); 14335 } 14336 } 14337 else { # Here there are match tables. Find the one (if any) for the 14338 # grab-bag value that unassigned code points go to. 14339 my $default_table; 14340 if (defined $default_map) { 14341 $default_table = $property->table($default_map); 14342 } 14343 14344 # If some code points don't go back to the the grab-bag when they 14345 # are considered unassigned, exclude them from the list that does 14346 # that. 14347 my $this_delta = $delta; 14348 my $this_unchanged = $unchanged; 14349 if (grep { $_ == $property } @special_delta_properties) { 14350 $this_delta = $delta & $restricted_delta{$property}; 14351 $this_unchanged = ~ $this_delta; 14352 } 14353 14354 # Fix up each match table for this property. 14355 foreach my $table ($property->tables) { 14356 if (defined $default_table && $table == $default_table) { 14357 14358 # The code points assigned after release X (the ones we 14359 # are excluding in this routine) go back on to the default 14360 # (grab-bag) table. However, some of these tables don't 14361 # actually exist, but are specified solely by the other 14362 # tables. (In a binary property, we don't need to 14363 # actually have an 'N' table, as it's just the complement 14364 # of the 'Y' table.) Such tables will be locked, so just 14365 # skip those. 14366 $table += $this_delta unless $table->locked; 14367 } 14368 else { 14369 14370 # Here the table is not for the default value. We need to 14371 # subtract the code points we are ignoring for this 14372 # comparison (the deltas) from it. But if the table 14373 # started out with nothing, no need to exclude anything, 14374 # and want to skip it here anyway, so it gets listed 14375 # properly in the pod. 14376 next if $table->is_empty; 14377 14378 # Save the deltas for later, before we do the subtraction 14379 my $deltas = $table & $this_delta; 14380 14381 $table &= $this_unchanged; 14382 14383 # Suppress the table if the subtraction left it with 14384 # nothing in it 14385 if ($table->is_empty) { 14386 if ($property->type == $BINARY) { 14387 push @tables_that_may_be_empty, $table->complete_name; 14388 } 14389 else { 14390 $table->set_fate($SUPPRESSED, $after_first_version); 14391 } 14392 } 14393 14394 # Now we add the removed code points to the property's 14395 # map, as they should now map to the grab-bag default 14396 # property (which they did in the first comparison 14397 # version). But we don't have to do this if the map is 14398 # only for internal use. 14399 if (defined $default_map && $property->to_output_map) { 14400 14401 # The gc property has pseudo property values whose names 14402 # have length 1. These are the union of all the 14403 # property values whose name is longer than 1 and 14404 # whose first letter is all the same. The replacement 14405 # is done once for the longer-named tables. 14406 next if $property == $gc && length $table->name == 1; 14407 14408 foreach my $range ($deltas->ranges) { 14409 $property->add_map($range->start, 14410 $range->end, 14411 $default_map, 14412 Replace => $UNCONDITIONALLY); 14413 } 14414 } 14415 } 14416 } 14417 } 14418 } 14419 14420 # The above code doesn't work on 'gc=C', as it is a superset of the default 14421 # ('Cn') table. It's easiest to just special case it here. 14422 my $C = $gc->table('C'); 14423 $C += $gc->table('Cn'); 14424 14425 return; 14426} 14427 14428sub compile_perl() { 14429 # Create perl-defined tables. Almost all are part of the pseudo-property 14430 # named 'perl' internally to this program. Many of these are recommended 14431 # in UTS#18 "Unicode Regular Expressions", and their derivations are based 14432 # on those found there. 14433 # Almost all of these are equivalent to some Unicode property. 14434 # A number of these properties have equivalents restricted to the ASCII 14435 # range, with their names prefaced by 'Posix', to signify that these match 14436 # what the Posix standard says they should match. A couple are 14437 # effectively this, but the name doesn't have 'Posix' in it because there 14438 # just isn't any Posix equivalent. 'XPosix' are the Posix tables extended 14439 # to the full Unicode range, by our guesses as to what is appropriate. 14440 14441 # 'All' is all code points. As an error check, instead of just setting it 14442 # to be that, construct it to be the union of all the major categories 14443 $All = $perl->add_match_table('All', 14444 Description 14445 => "All code points, including those above Unicode. Same as qr/./s", 14446 Matches_All => 1); 14447 14448 foreach my $major_table ($gc->tables) { 14449 14450 # Major categories are the ones with single letter names. 14451 next if length($major_table->name) != 1; 14452 14453 $All += $major_table; 14454 } 14455 14456 if ($All->max != $MAX_WORKING_CODEPOINT) { 14457 Carp::my_carp_bug("Generated highest code point (" 14458 . sprintf("%X", $All->max) 14459 . ") doesn't match expected value $MAX_WORKING_CODEPOINT_STRING.") 14460 } 14461 if ($All->range_count != 1 || $All->min != 0) { 14462 Carp::my_carp_bug("Generated table 'All' doesn't match all code points.") 14463 } 14464 14465 my $Any = $perl->add_match_table('Any', 14466 Description => "All Unicode code points"); 14467 $Any->add_range(0, $MAX_UNICODE_CODEPOINT); 14468 $Any->add_alias('Unicode'); 14469 14470 calculate_Assigned(); 14471 14472 # Our internal-only property should be treated as more than just a 14473 # synonym; grandfather it in to the pod. 14474 $perl->add_match_table('_CombAbove', Re_Pod_Entry => 1, 14475 Fate => $INTERNAL_ONLY, Status => $DISCOURAGED) 14476 ->set_equivalent_to(property_ref('ccc')->table('Above'), 14477 Related => 1); 14478 14479 my $ASCII = $perl->add_match_table('ASCII'); 14480 if (defined $block) { # This is equivalent to the block if have it. 14481 my $Unicode_ASCII = $block->table('Basic_Latin'); 14482 if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) { 14483 $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1); 14484 } 14485 } 14486 14487 # Very early releases didn't have blocks, so initialize ASCII ourselves if 14488 # necessary 14489 if ($ASCII->is_empty) { 14490 if (! NON_ASCII_PLATFORM) { 14491 $ASCII->add_range(0, 127); 14492 } 14493 else { 14494 for my $i (0 .. 127) { 14495 $ASCII->add_range(utf8::unicode_to_native($i), 14496 utf8::unicode_to_native($i)); 14497 } 14498 } 14499 } 14500 14501 # Get the best available case definitions. Early Unicode versions didn't 14502 # have Uppercase and Lowercase defined, so use the general category 14503 # instead for them, modified by hard-coding in the code points each is 14504 # missing. 14505 my $Lower = $perl->add_match_table('XPosixLower'); 14506 my $Unicode_Lower = property_ref('Lowercase'); 14507 if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) { 14508 $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1); 14509 14510 } 14511 else { 14512 $Lower += $gc->table('Lowercase_Letter'); 14513 14514 # There are quite a few code points in Lower, that aren't in gc=lc, 14515 # and not all are in all releases. 14516 my $temp = Range_List->new(Initialize => [ 14517 utf8::unicode_to_native(0xAA), 14518 utf8::unicode_to_native(0xBA), 14519 0x02B0 .. 0x02B8, 14520 0x02C0 .. 0x02C1, 14521 0x02E0 .. 0x02E4, 14522 0x0345, 14523 0x037A, 14524 0x1D2C .. 0x1D6A, 14525 0x1D78, 14526 0x1D9B .. 0x1DBF, 14527 0x2071, 14528 0x207F, 14529 0x2090 .. 0x209C, 14530 0x2170 .. 0x217F, 14531 0x24D0 .. 0x24E9, 14532 0x2C7C .. 0x2C7D, 14533 0xA770, 14534 0xA7F8 .. 0xA7F9, 14535 ]); 14536 $Lower += $temp & $Assigned; 14537 } 14538 my $Posix_Lower = $perl->add_match_table("PosixLower", 14539 Initialize => $Lower & $ASCII, 14540 ); 14541 14542 my $Upper = $perl->add_match_table("XPosixUpper"); 14543 my $Unicode_Upper = property_ref('Uppercase'); 14544 if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) { 14545 $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1); 14546 } 14547 else { 14548 14549 # Unlike Lower, there are only two ranges in Upper that aren't in 14550 # gc=Lu, and all code points were assigned in all releases. 14551 $Upper += $gc->table('Uppercase_Letter'); 14552 $Upper->add_range(0x2160, 0x216F); # Uppercase Roman numerals 14553 $Upper->add_range(0x24B6, 0x24CF); # Circled Latin upper case letters 14554 } 14555 my $Posix_Upper = $perl->add_match_table("PosixUpper", 14556 Initialize => $Upper & $ASCII, 14557 ); 14558 14559 # Earliest releases didn't have title case. Initialize it to empty if not 14560 # otherwise present 14561 my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase', 14562 Description => '(= \p{Gc=Lt})'); 14563 my $lt = $gc->table('Lt'); 14564 14565 # Earlier versions of mktables had this related to $lt since they have 14566 # identical code points, but their caseless equivalents are not the same, 14567 # one being 'Cased' and the other being 'LC', and so now must be kept as 14568 # separate entities. 14569 if (defined $lt) { 14570 $Title += $lt; 14571 } 14572 else { 14573 push @tables_that_may_be_empty, $Title->complete_name; 14574 } 14575 14576 my $Unicode_Cased = property_ref('Cased'); 14577 if (defined $Unicode_Cased) { 14578 my $yes = $Unicode_Cased->table('Y'); 14579 my $no = $Unicode_Cased->table('N'); 14580 $Title->set_caseless_equivalent($yes); 14581 if (defined $Unicode_Upper) { 14582 $Unicode_Upper->table('Y')->set_caseless_equivalent($yes); 14583 $Unicode_Upper->table('N')->set_caseless_equivalent($no); 14584 } 14585 $Upper->set_caseless_equivalent($yes); 14586 if (defined $Unicode_Lower) { 14587 $Unicode_Lower->table('Y')->set_caseless_equivalent($yes); 14588 $Unicode_Lower->table('N')->set_caseless_equivalent($no); 14589 } 14590 $Lower->set_caseless_equivalent($yes); 14591 } 14592 else { 14593 # If this Unicode version doesn't have Cased, set up the Perl 14594 # extension from first principles. From Unicode 5.1: Definition D120: 14595 # A character C is defined to be cased if and only if C has the 14596 # Lowercase or Uppercase property or has a General_Category value of 14597 # Titlecase_Letter. 14598 my $cased = $perl->add_match_table('Cased', 14599 Initialize => $Lower + $Upper + $Title, 14600 Description => 'Uppercase or Lowercase or Titlecase', 14601 ); 14602 # $notcased is purely for the caseless equivalents below 14603 my $notcased = $perl->add_match_table('_Not_Cased', 14604 Initialize => ~ $cased, 14605 Fate => $INTERNAL_ONLY, 14606 Description => 'All not-cased code points'); 14607 $Title->set_caseless_equivalent($cased); 14608 if (defined $Unicode_Upper) { 14609 $Unicode_Upper->table('Y')->set_caseless_equivalent($cased); 14610 $Unicode_Upper->table('N')->set_caseless_equivalent($notcased); 14611 } 14612 $Upper->set_caseless_equivalent($cased); 14613 if (defined $Unicode_Lower) { 14614 $Unicode_Lower->table('Y')->set_caseless_equivalent($cased); 14615 $Unicode_Lower->table('N')->set_caseless_equivalent($notcased); 14616 } 14617 $Lower->set_caseless_equivalent($cased); 14618 } 14619 14620 # Similarly, set up our own Case_Ignorable property if this Unicode 14621 # version doesn't have it. From Unicode 5.1: Definition D121: A character 14622 # C is defined to be case-ignorable if C has the value MidLetter or the 14623 # value MidNumLet for the Word_Break property or its General_Category is 14624 # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf), 14625 # Modifier_Letter (Lm), or Modifier_Symbol (Sk). 14626 14627 # Perl has long had an internal-only alias for this property; grandfather 14628 # it in to the pod, but discourage its use. 14629 my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable', 14630 Re_Pod_Entry => 1, 14631 Fate => $INTERNAL_ONLY, 14632 Status => $DISCOURAGED); 14633 my $case_ignorable = property_ref('Case_Ignorable'); 14634 if (defined $case_ignorable && ! $case_ignorable->is_empty) { 14635 $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'), 14636 Related => 1); 14637 } 14638 else { 14639 14640 $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm')); 14641 14642 # The following three properties are not in early releases 14643 $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me'); 14644 $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf'); 14645 $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk'); 14646 14647 # For versions 4.1 - 5.0, there is no MidNumLet property, and 14648 # correspondingly the case-ignorable definition lacks that one. For 14649 # 4.0, it appears that it was meant to be the same definition, but was 14650 # inadvertently omitted from the standard's text, so add it if the 14651 # property actually is there 14652 my $wb = property_ref('Word_Break'); 14653 if (defined $wb) { 14654 my $midlet = $wb->table('MidLetter'); 14655 $perl_case_ignorable += $midlet if defined $midlet; 14656 my $midnumlet = $wb->table('MidNumLet'); 14657 $perl_case_ignorable += $midnumlet if defined $midnumlet; 14658 } 14659 else { 14660 14661 # In earlier versions of the standard, instead of the above two 14662 # properties , just the following characters were used: 14663 $perl_case_ignorable += 14664 ord("'") 14665 + utf8::unicode_to_native(0xAD) # SOFT HYPHEN (SHY) 14666 + 0x2019; # RIGHT SINGLE QUOTATION MARK 14667 } 14668 } 14669 14670 # The remaining perl defined tables are mostly based on Unicode TR 18, 14671 # "Annex C: Compatibility Properties". All of these have two versions, 14672 # one whose name generally begins with Posix that is posix-compliant, and 14673 # one that matches Unicode characters beyond the Posix, ASCII range 14674 14675 my $Alpha = $perl->add_match_table('XPosixAlpha'); 14676 14677 # Alphabetic was not present in early releases 14678 my $Alphabetic = property_ref('Alphabetic'); 14679 if (defined $Alphabetic && ! $Alphabetic->is_empty) { 14680 $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1); 14681 } 14682 else { 14683 14684 # The Alphabetic property doesn't exist for early releases, so 14685 # generate it. The actual definition, in 5.2 terms is: 14686 # 14687 # gc=L + gc=Nl + Other_Alphabetic 14688 # 14689 # Other_Alphabetic is also not defined in these early releases, but it 14690 # contains one gc=So range plus most of gc=Mn and gc=Mc, so we add 14691 # those last two as well, then subtract the relatively few of them that 14692 # shouldn't have been added. (The gc=So range is the circled capital 14693 # Latin characters. Early releases mistakenly didn't also include the 14694 # lower-case versions of these characters, and so we don't either, to 14695 # maintain consistency with those releases that first had this 14696 # property. 14697 $Alpha->initialize($gc->table('Letter') 14698 + pre_3_dot_1_Nl() 14699 + $gc->table('Mn') 14700 + $gc->table('Mc') 14701 ); 14702 $Alpha->add_range(0x24D0, 0x24E9); # gc=So 14703 foreach my $range ( [ 0x0300, 0x0344 ], 14704 [ 0x0346, 0x034E ], 14705 [ 0x0360, 0x0362 ], 14706 [ 0x0483, 0x0486 ], 14707 [ 0x0591, 0x05AF ], 14708 [ 0x06DF, 0x06E0 ], 14709 [ 0x06EA, 0x06EC ], 14710 [ 0x0740, 0x074A ], 14711 0x093C, 14712 0x094D, 14713 [ 0x0951, 0x0954 ], 14714 0x09BC, 14715 0x09CD, 14716 0x0A3C, 14717 0x0A4D, 14718 0x0ABC, 14719 0x0ACD, 14720 0x0B3C, 14721 0x0B4D, 14722 0x0BCD, 14723 0x0C4D, 14724 0x0CCD, 14725 0x0D4D, 14726 0x0DCA, 14727 [ 0x0E47, 0x0E4C ], 14728 0x0E4E, 14729 [ 0x0EC8, 0x0ECC ], 14730 [ 0x0F18, 0x0F19 ], 14731 0x0F35, 14732 0x0F37, 14733 0x0F39, 14734 [ 0x0F3E, 0x0F3F ], 14735 [ 0x0F82, 0x0F84 ], 14736 [ 0x0F86, 0x0F87 ], 14737 0x0FC6, 14738 0x1037, 14739 0x1039, 14740 [ 0x17C9, 0x17D3 ], 14741 [ 0x20D0, 0x20DC ], 14742 0x20E1, 14743 [ 0x302A, 0x302F ], 14744 [ 0x3099, 0x309A ], 14745 [ 0xFE20, 0xFE23 ], 14746 [ 0x1D165, 0x1D169 ], 14747 [ 0x1D16D, 0x1D172 ], 14748 [ 0x1D17B, 0x1D182 ], 14749 [ 0x1D185, 0x1D18B ], 14750 [ 0x1D1AA, 0x1D1AD ], 14751 ) { 14752 if (ref $range) { 14753 $Alpha->delete_range($range->[0], $range->[1]); 14754 } 14755 else { 14756 $Alpha->delete_range($range, $range); 14757 } 14758 } 14759 $Alpha->add_description('Alphabetic'); 14760 $Alpha->add_alias('Alphabetic'); 14761 } 14762 my $Posix_Alpha = $perl->add_match_table("PosixAlpha", 14763 Initialize => $Alpha & $ASCII, 14764 ); 14765 $Posix_Upper->set_caseless_equivalent($Posix_Alpha); 14766 $Posix_Lower->set_caseless_equivalent($Posix_Alpha); 14767 14768 my $Alnum = $perl->add_match_table('Alnum', Full_Name => 'XPosixAlnum', 14769 Description => 'Alphabetic and (decimal) Numeric', 14770 Initialize => $Alpha + $gc->table('Decimal_Number'), 14771 ); 14772 $perl->add_match_table("PosixAlnum", 14773 Initialize => $Alnum & $ASCII, 14774 ); 14775 14776 my $Word = $perl->add_match_table('Word', Full_Name => 'XPosixWord', 14777 Description => '\w, including beyond ASCII;' 14778 . ' = \p{Alnum} + \pM + \p{Pc}' 14779 . ' + \p{Join_Control}', 14780 Initialize => $Alnum + $gc->table('Mark'), 14781 ); 14782 my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1 14783 if (defined $Pc) { 14784 $Word += $Pc; 14785 } 14786 else { 14787 $Word += ord('_'); # Make sure this is a $Word 14788 } 14789 my $JC = property_ref('Join_Control'); # Wasn't in release 1 14790 if (defined $JC) { 14791 $Word += $JC->table('Y'); 14792 } 14793 else { 14794 $Word += 0x200C + 0x200D; 14795 } 14796 14797 # This is a Perl extension, so the name doesn't begin with Posix. 14798 my $PerlWord = $perl->add_match_table('PosixWord', 14799 Description => '\w, restricted to ASCII', 14800 Initialize => $Word & $ASCII, 14801 ); 14802 $PerlWord->add_alias('PerlWord'); 14803 14804 my $Blank = $perl->add_match_table('Blank', Full_Name => 'XPosixBlank', 14805 Description => '\h, Horizontal white space', 14806 14807 # 200B is Zero Width Space which is for line 14808 # break control, and was listed as 14809 # Space_Separator in early releases 14810 Initialize => $gc->table('Space_Separator') 14811 + ord("\t") 14812 - 0x200B, # ZWSP 14813 ); 14814 $Blank->add_alias('HorizSpace'); # Another name for it. 14815 $perl->add_match_table("PosixBlank", 14816 Initialize => $Blank & $ASCII, 14817 ); 14818 14819 my $VertSpace = $perl->add_match_table('VertSpace', 14820 Description => '\v', 14821 Initialize => 14822 $gc->table('Line_Separator') 14823 + $gc->table('Paragraph_Separator') 14824 + utf8::unicode_to_native(0x0A) # LINE FEED 14825 + utf8::unicode_to_native(0x0B) # VERTICAL TAB 14826 + ord("\f") 14827 + utf8::unicode_to_native(0x0D) # CARRIAGE RETURN 14828 + utf8::unicode_to_native(0x85) # NEL 14829 ); 14830 # No Posix equivalent for vertical space 14831 14832 my $Space = $perl->add_match_table('XPosixSpace', 14833 Description => '\s including beyond ASCII and vertical tab', 14834 Initialize => $Blank + $VertSpace, 14835 ); 14836 $Space->add_alias('XPerlSpace'); # Pre-existing synonyms 14837 $Space->add_alias('SpacePerl'); 14838 $Space->add_alias('Space') if $v_version lt v4.1.0; 14839 14840 my $Posix_space = $perl->add_match_table("PosixSpace", 14841 Initialize => $Space & $ASCII, 14842 ); 14843 $Posix_space->add_alias('PerlSpace'); # A pre-existing synonym 14844 14845 my $Cntrl = $perl->add_match_table('Cntrl', Full_Name => 'XPosixCntrl', 14846 Description => 'Control characters'); 14847 $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1); 14848 $perl->add_match_table("PosixCntrl", 14849 Description => "ASCII control characters", 14850 Definition => "ACK, BEL, BS, CAN, CR, DC1, DC2," 14851 . " DC3, DC4, DEL, DLE, ENQ, EOM," 14852 . " EOT, ESC, ETB, ETX, FF, FS, GS," 14853 . " HT, LF, NAK, NUL, RS, SI, SO," 14854 . " SOH, STX, SUB, SYN, US, VT", 14855 Initialize => $Cntrl & $ASCII, 14856 ); 14857 14858 my $perl_surrogate = $perl->add_match_table('_Perl_Surrogate'); 14859 my $Cs = $gc->table('Cs'); 14860 if (defined $Cs && ! $Cs->is_empty) { 14861 $perl_surrogate += $Cs; 14862 } 14863 else { 14864 push @tables_that_may_be_empty, '_Perl_Surrogate'; 14865 } 14866 14867 # $controls is a temporary used to construct Graph. 14868 my $controls = Range_List->new(Initialize => $gc->table('Unassigned') 14869 + $gc->table('Control') 14870 + $perl_surrogate); 14871 14872 # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls) 14873 my $Graph = $perl->add_match_table('Graph', Full_Name => 'XPosixGraph', 14874 Description => 'Characters that are graphical', 14875 Initialize => ~ ($Space + $controls), 14876 ); 14877 $perl->add_match_table("PosixGraph", 14878 Initialize => $Graph & $ASCII, 14879 ); 14880 14881 $print = $perl->add_match_table('Print', Full_Name => 'XPosixPrint', 14882 Description => 'Characters that are graphical plus space characters (but no controls)', 14883 Initialize => $Blank + $Graph - $gc->table('Control'), 14884 ); 14885 $perl->add_match_table("PosixPrint", 14886 Initialize => $print & $ASCII, 14887 ); 14888 14889 my $Punct = $perl->add_match_table('Punct'); 14890 $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1); 14891 14892 # \p{punct} doesn't include the symbols, which posix does 14893 my $XPosixPunct = $perl->add_match_table('XPosixPunct', 14894 Description => '\p{Punct} + ASCII-range \p{Symbol}', 14895 Initialize => $gc->table('Punctuation') 14896 + ($ASCII & $gc->table('Symbol')), 14897 Perl_Extension => 1 14898 ); 14899 $perl->add_match_table('PosixPunct', Perl_Extension => 1, 14900 Initialize => $ASCII & $XPosixPunct, 14901 ); 14902 14903 my $Digit = $perl->add_match_table('Digit', Full_Name => 'XPosixDigit', 14904 Description => '[0-9] + all other decimal digits'); 14905 $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1); 14906 my $PosixDigit = $perl->add_match_table("PosixDigit", 14907 Initialize => $Digit & $ASCII, 14908 ); 14909 14910 # Hex_Digit was not present in first release 14911 my $Xdigit = $perl->add_match_table('XDigit', Full_Name => 'XPosixXDigit'); 14912 my $Hex = property_ref('Hex_Digit'); 14913 if (defined $Hex && ! $Hex->is_empty) { 14914 $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1); 14915 } 14916 else { 14917 $Xdigit->initialize([ ord('0') .. ord('9'), 14918 ord('A') .. ord('F'), 14919 ord('a') .. ord('f'), 14920 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]); 14921 } 14922 14923 # AHex was not present in early releases 14924 my $PosixXDigit = $perl->add_match_table('PosixXDigit'); 14925 my $AHex = property_ref('ASCII_Hex_Digit'); 14926 if (defined $AHex && ! $AHex->is_empty) { 14927 $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1); 14928 } 14929 else { 14930 $PosixXDigit->initialize($Xdigit & $ASCII); 14931 $PosixXDigit->add_alias('AHex'); 14932 $PosixXDigit->add_alias('Ascii_Hex_Digit'); 14933 } 14934 14935 my $any_folds = $perl->add_match_table("_Perl_Any_Folds", 14936 Description => "Code points that particpate in some fold", 14937 ); 14938 my $loc_problem_folds = $perl->add_match_table( 14939 "_Perl_Problematic_Locale_Folds", 14940 Description => 14941 "Code points that are in some way problematic under locale", 14942 ); 14943 14944 # This allows regexec.c to skip some work when appropriate. Some of the 14945 # entries in _Perl_Problematic_Locale_Folds are multi-character folds, 14946 my $loc_problem_folds_start = $perl->add_match_table( 14947 "_Perl_Problematic_Locale_Foldeds_Start", 14948 Description => 14949 "The first character of every sequence in _Perl_Problematic_Locale_Folds", 14950 ); 14951 14952 my $cf = property_ref('Case_Folding'); 14953 14954 # Every character 0-255 is problematic because what each folds to depends 14955 # on the current locale 14956 $loc_problem_folds->add_range(0, 255); 14957 $loc_problem_folds_start += $loc_problem_folds; 14958 14959 # Also problematic are anything these fold to outside the range. Likely 14960 # forever the only thing folded to by these outside the 0-255 range is the 14961 # GREEK SMALL MU (from the MICRO SIGN), but it's easy to make the code 14962 # completely general, which should catch any unexpected changes or errors. 14963 # We look at each code point 0-255, and add its fold (including each part 14964 # of a multi-char fold) to the list. See commit message 14965 # 31f05a37c4e9c37a7263491f2fc0237d836e1a80 for a more complete description 14966 # of the MU issue. 14967 foreach my $range ($loc_problem_folds->ranges) { 14968 foreach my $code_point ($range->start .. $range->end) { 14969 my $fold_range = $cf->containing_range($code_point); 14970 next unless defined $fold_range; 14971 14972 # Skip if folds to itself 14973 next if $fold_range->value eq $CODE_POINT; 14974 14975 my @hex_folds = split " ", $fold_range->value; 14976 my $start_cp = $hex_folds[0]; 14977 next if $start_cp eq $CODE_POINT; 14978 $start_cp = hex $start_cp; 14979 foreach my $i (0 .. @hex_folds - 1) { 14980 my $cp = $hex_folds[$i]; 14981 next if $cp eq $CODE_POINT; 14982 $cp = hex $cp; 14983 next unless $cp > 255; # Already have the < 256 ones 14984 14985 $loc_problem_folds->add_range($cp, $cp); 14986 $loc_problem_folds_start->add_range($start_cp, $start_cp); 14987 } 14988 } 14989 } 14990 14991 my $folds_to_multi_char = $perl->add_match_table( 14992 "_Perl_Folds_To_Multi_Char", 14993 Description => 14994 "Code points whose fold is a string of more than one character", 14995 ); 14996 if ($v_version lt v3.0.1) { 14997 push @tables_that_may_be_empty, '_Perl_Folds_To_Multi_Char'; 14998 } 14999 15000 # Look through all the known folds to populate these tables. 15001 foreach my $range ($cf->ranges) { 15002 next if $range->value eq $CODE_POINT; 15003 my $start = $range->start; 15004 my $end = $range->end; 15005 $any_folds->add_range($start, $end); 15006 15007 my @hex_folds = split " ", $range->value; 15008 if (@hex_folds > 1) { # Is multi-char fold 15009 $folds_to_multi_char->add_range($start, $end); 15010 } 15011 15012 my $found_locale_problematic = 0; 15013 15014 # Look at each of the folded-to characters... 15015 foreach my $i (0 .. @hex_folds - 1) { 15016 my $cp = hex $hex_folds[$i]; 15017 $any_folds->add_range($cp, $cp); 15018 15019 # The fold is problematic if any of the folded-to characters is 15020 # already considered problematic. 15021 if ($loc_problem_folds->contains($cp)) { 15022 $loc_problem_folds->add_range($start, $end); 15023 $found_locale_problematic = 1; 15024 } 15025 } 15026 15027 # If this is a problematic fold, add to the start chars the 15028 # folding-from characters and first folded-to character. 15029 if ($found_locale_problematic) { 15030 $loc_problem_folds_start->add_range($start, $end); 15031 my $cp = hex $hex_folds[0]; 15032 $loc_problem_folds_start->add_range($cp, $cp); 15033 } 15034 } 15035 15036 my $dt = property_ref('Decomposition_Type'); 15037 $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical', 15038 Initialize => ~ ($dt->table('None') + $dt->table('Canonical')), 15039 Perl_Extension => 1, 15040 Note => 'Union of all non-canonical decompositions', 15041 ); 15042 15043 # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier 15044 # than SD appeared, construct it ourselves, based on the first release SD 15045 # was in. A pod entry is grandfathered in for it 15046 my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ', Re_Pod_Entry => 1, 15047 Perl_Extension => 1, 15048 Fate => $INTERNAL_ONLY, 15049 Status => $DISCOURAGED); 15050 my $soft_dotted = property_ref('Soft_Dotted'); 15051 if (defined $soft_dotted && ! $soft_dotted->is_empty) { 15052 $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1); 15053 } 15054 else { 15055 15056 # This list came from 3.2 Soft_Dotted; all of these code points are in 15057 # all releases 15058 $CanonDCIJ->initialize([ ord('i'), 15059 ord('j'), 15060 0x012F, 15061 0x0268, 15062 0x0456, 15063 0x0458, 15064 0x1E2D, 15065 0x1ECB, 15066 ]); 15067 $CanonDCIJ = $CanonDCIJ & $Assigned; 15068 } 15069 15070 # For backward compatibility, Perl has its own definition for IDStart. 15071 # It is regular XID_Start plus the underscore, but all characters must be 15072 # Word characters as well 15073 my $XID_Start = property_ref('XID_Start'); 15074 my $perl_xids = $perl->add_match_table('_Perl_IDStart', 15075 Perl_Extension => 1, 15076 Fate => $INTERNAL_ONLY, 15077 Initialize => ord('_') 15078 ); 15079 if (defined $XID_Start 15080 || defined ($XID_Start = property_ref('ID_Start'))) 15081 { 15082 $perl_xids += $XID_Start->table('Y'); 15083 } 15084 else { 15085 # For Unicode versions that don't have the property, construct our own 15086 # from first principles. The actual definition is: 15087 # Letters 15088 # + letter numbers (Nl) 15089 # - Pattern_Syntax 15090 # - Pattern_White_Space 15091 # + stability extensions 15092 # - NKFC modifications 15093 # 15094 # What we do in the code below is to include the identical code points 15095 # that are in the first release that had Unicode's version of this 15096 # property, essentially extrapolating backwards. There were no 15097 # stability extensions until v4.1, so none are included; likewise in 15098 # no Unicode version so far do subtracting PatSyn and PatWS make any 15099 # difference, so those also are ignored. 15100 $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl(); 15101 15102 # We do subtract the NFKC modifications that are in the first version 15103 # that had this property. We don't bother to test if they are in the 15104 # version in question, because if they aren't, the operation is a 15105 # no-op. The NKFC modifications are discussed in 15106 # http://www.unicode.org/reports/tr31/#NFKC_Modifications 15107 foreach my $range ( 0x037A, 15108 0x0E33, 15109 0x0EB3, 15110 [ 0xFC5E, 0xFC63 ], 15111 [ 0xFDFA, 0xFE70 ], 15112 [ 0xFE72, 0xFE76 ], 15113 0xFE78, 15114 0xFE7A, 15115 0xFE7C, 15116 0xFE7E, 15117 [ 0xFF9E, 0xFF9F ], 15118 ) { 15119 if (ref $range) { 15120 $perl_xids->delete_range($range->[0], $range->[1]); 15121 } 15122 else { 15123 $perl_xids->delete_range($range, $range); 15124 } 15125 } 15126 } 15127 15128 $perl_xids &= $Word; 15129 15130 my $perl_xidc = $perl->add_match_table('_Perl_IDCont', 15131 Perl_Extension => 1, 15132 Fate => $INTERNAL_ONLY); 15133 my $XIDC = property_ref('XID_Continue'); 15134 if (defined $XIDC 15135 || defined ($XIDC = property_ref('ID_Continue'))) 15136 { 15137 $perl_xidc += $XIDC->table('Y'); 15138 } 15139 else { 15140 # Similarly, we construct our own XIDC if necessary for early Unicode 15141 # versions. The definition is: 15142 # everything in XIDS 15143 # + Gc=Mn 15144 # + Gc=Mc 15145 # + Gc=Nd 15146 # + Gc=Pc 15147 # - Pattern_Syntax 15148 # - Pattern_White_Space 15149 # + stability extensions 15150 # - NFKC modifications 15151 # 15152 # The same thing applies to this as with XIDS for the PatSyn, PatWS, 15153 # and stability extensions. There is a somewhat different set of NFKC 15154 # mods to remove (and add in this case). The ones below make this 15155 # have identical code points as in the first release that defined it. 15156 $perl_xidc += $perl_xids 15157 + $gc->table('L') 15158 + $gc->table('Mn') 15159 + $gc->table('Mc') 15160 + $gc->table('Nd') 15161 + utf8::unicode_to_native(0xB7) 15162 ; 15163 if (defined (my $pc = $gc->table('Pc'))) { 15164 $perl_xidc += $pc; 15165 } 15166 else { # 1.1.5 didn't have Pc, but these should have been in it 15167 $perl_xidc += 0xFF3F; 15168 $perl_xidc->add_range(0x203F, 0x2040); 15169 $perl_xidc->add_range(0xFE33, 0xFE34); 15170 $perl_xidc->add_range(0xFE4D, 0xFE4F); 15171 } 15172 15173 # Subtract the NFKC mods 15174 foreach my $range ( 0x037A, 15175 [ 0xFC5E, 0xFC63 ], 15176 [ 0xFDFA, 0xFE1F ], 15177 0xFE70, 15178 [ 0xFE72, 0xFE76 ], 15179 0xFE78, 15180 0xFE7A, 15181 0xFE7C, 15182 0xFE7E, 15183 ) { 15184 if (ref $range) { 15185 $perl_xidc->delete_range($range->[0], $range->[1]); 15186 } 15187 else { 15188 $perl_xidc->delete_range($range, $range); 15189 } 15190 } 15191 } 15192 15193 $perl_xidc &= $Word; 15194 15195 my $charname_begin = $perl->add_match_table('_Perl_Charname_Begin', 15196 Perl_Extension => 1, 15197 Fate => $INTERNAL_ONLY, 15198 Initialize => $gc->table('Letter') & $Alpha & $perl_xids, 15199 ); 15200 15201 my $charname_continue = $perl->add_match_table('_Perl_Charname_Continue', 15202 Perl_Extension => 1, 15203 Fate => $INTERNAL_ONLY, 15204 Initialize => $perl_xidc 15205 + ord(" ") 15206 + ord("(") 15207 + ord(")") 15208 + ord("-") 15209 ); 15210 15211 my @composition = ('Name', 'Unicode_1_Name', '_Perl_Name_Alias'); 15212 15213 if (@named_sequences) { 15214 push @composition, 'Named_Sequence'; 15215 foreach my $sequence (@named_sequences) { 15216 $perl_charname->add_anomalous_entry($sequence); 15217 } 15218 } 15219 15220 my $alias_sentence = ""; 15221 my %abbreviations; 15222 my $alias = property_ref('_Perl_Name_Alias'); 15223 $perl_charname->set_proxy_for('_Perl_Name_Alias'); 15224 15225 # Add each entry in _Perl_Name_Alias to Perl_Charnames. Where these go 15226 # with respect to any existing entry depends on the entry type. 15227 # Corrections go before said entry, as they should be returned in 15228 # preference over the existing entry. (A correction to a correction 15229 # should be later in the _Perl_Name_Alias table, so it will correctly 15230 # precede the erroneous correction in Perl_Charnames.) 15231 # 15232 # Abbreviations go after everything else, so they are saved temporarily in 15233 # a hash for later. 15234 # 15235 # Everything else is added added afterwards, which preserves the input 15236 # ordering 15237 15238 foreach my $range ($alias->ranges) { 15239 next if $range->value eq ""; 15240 my $code_point = $range->start; 15241 if ($code_point != $range->end) { 15242 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;"); 15243 } 15244 my ($value, $type) = split ': ', $range->value; 15245 my $replace_type; 15246 if ($type eq 'correction') { 15247 $replace_type = $MULTIPLE_BEFORE; 15248 } 15249 elsif ($type eq 'abbreviation') { 15250 15251 # Save for later 15252 $abbreviations{$value} = $code_point; 15253 next; 15254 } 15255 else { 15256 $replace_type = $MULTIPLE_AFTER; 15257 } 15258 15259 # Actually add; before or after current entry(ies) as determined 15260 # above. 15261 15262 $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type); 15263 } 15264 $alias_sentence = <<END; 15265The _Perl_Name_Alias property adds duplicate code point entries that are 15266alternatives to the original name. If an addition is a corrected 15267name, it will be physically first in the table. The original (less correct, 15268but still valid) name will be next; then any alternatives, in no particular 15269order; and finally any abbreviations, again in no particular order. 15270END 15271 15272 # Now add the Unicode_1 names for the controls. The Unicode_1 names had 15273 # precedence before 6.1, including the awful ones like "LINE FEED (LF)", 15274 # so should be first in the file; the other names have precedence starting 15275 # in 6.1, 15276 my $before_or_after = ($v_version lt v6.1.0) 15277 ? $MULTIPLE_BEFORE 15278 : $MULTIPLE_AFTER; 15279 15280 foreach my $range (property_ref('Unicode_1_Name')->ranges) { 15281 my $code_point = $range->start; 15282 my $unicode_1_value = $range->value; 15283 next if $unicode_1_value eq ""; # Skip if name doesn't exist. 15284 15285 if ($code_point != $range->end) { 15286 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;"); 15287 } 15288 15289 # To handle EBCDIC, we don't hard code in the code points of the 15290 # controls; instead realizing that all of them are below 256. 15291 last if $code_point > 255; 15292 15293 # We only add in the controls. 15294 next if $gc->value_of($code_point) ne 'Cc'; 15295 15296 # We reject this Unicode1 name for later Perls, as it is used for 15297 # another code point 15298 next if $unicode_1_value eq 'BELL' && $^V ge v5.17.0; 15299 15300 # This won't add an exact duplicate. 15301 $perl_charname->add_duplicate($code_point, $unicode_1_value, 15302 Replace => $before_or_after); 15303 } 15304 15305 # Now that have everything added, add in abbreviations after 15306 # everything else. Sort so results don't change between runs of this 15307 # program 15308 foreach my $value (sort keys %abbreviations) { 15309 $perl_charname->add_duplicate($abbreviations{$value}, $value, 15310 Replace => $MULTIPLE_AFTER); 15311 } 15312 15313 my $comment; 15314 if (@composition <= 2) { # Always at least 2 15315 $comment = join " and ", @composition; 15316 } 15317 else { 15318 $comment = join ", ", @composition[0 .. scalar @composition - 2]; 15319 $comment .= ", and $composition[-1]"; 15320 } 15321 15322 $perl_charname->add_comment(join_lines( <<END 15323This file is for charnames.pm. It is the union of the $comment properties. 15324Unicode_1_Name entries are used only for nameless code points in the Name 15325property. 15326$alias_sentence 15327This file doesn't include the algorithmically determinable names. For those, 15328use 'unicore/Name.pm' 15329END 15330 )); 15331 property_ref('Name')->add_comment(join_lines( <<END 15332This file doesn't include the algorithmically determinable names. For those, 15333use 'unicore/Name.pm' 15334END 15335 )); 15336 15337 # Construct the Present_In property from the Age property. 15338 if (-e 'DAge.txt' && defined $age) { 15339 my $default_map = $age->default_map; 15340 my $in = Property->new('In', 15341 Default_Map => $default_map, 15342 Full_Name => "Present_In", 15343 Perl_Extension => 1, 15344 Type => $ENUM, 15345 Initialize => $age, 15346 ); 15347 $in->add_comment(join_lines(<<END 15348THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE. The values in this file are the 15349same as for $age, and not for what $in really means. This is because anything 15350defined in a given release should have multiple values: that release and all 15351higher ones. But only one value per code point can be represented in a table 15352like this. 15353END 15354 )); 15355 15356 # The Age tables are named like 1.5, 2.0, 2.1, .... Sort so that the 15357 # lowest numbered (earliest) come first, with the non-numeric one 15358 # last. 15359 my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/) 15360 ? 1 15361 : ($b->name !~ /^[\d.]*$/) 15362 ? -1 15363 : $a->name <=> $b->name 15364 } $age->tables; 15365 15366 # The Present_In property is the cumulative age properties. The first 15367 # one hence is identical to the first age one. 15368 my $previous_in = $in->add_match_table($first_age->name); 15369 $previous_in->set_equivalent_to($first_age, Related => 1); 15370 15371 my $description_start = "Code point's usage introduced in version "; 15372 $first_age->add_description($description_start . $first_age->name); 15373 15374 # To construct the accumulated values, for each of the age tables 15375 # starting with the 2nd earliest, merge the earliest with it, to get 15376 # all those code points existing in the 2nd earliest. Repeat merging 15377 # the new 2nd earliest with the 3rd earliest to get all those existing 15378 # in the 3rd earliest, and so on. 15379 foreach my $current_age (@rest_ages) { 15380 next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric 15381 15382 my $current_in = $in->add_match_table( 15383 $current_age->name, 15384 Initialize => $current_age + $previous_in, 15385 Description => $description_start 15386 . $current_age->name 15387 . ' or earlier', 15388 ); 15389 foreach my $alias ($current_age->aliases) { 15390 $current_in->add_alias($alias->name); 15391 } 15392 $previous_in = $current_in; 15393 15394 # Add clarifying material for the corresponding age file. This is 15395 # in part because of the confusing and contradictory information 15396 # given in the Standard's documentation itself, as of 5.2. 15397 $current_age->add_description( 15398 "Code point's usage was introduced in version " 15399 . $current_age->name); 15400 $current_age->add_note("See also $in"); 15401 15402 } 15403 15404 # And finally the code points whose usages have yet to be decided are 15405 # the same in both properties. Note that permanently unassigned code 15406 # points actually have their usage assigned (as being permanently 15407 # unassigned), so that these tables are not the same as gc=cn. 15408 my $unassigned = $in->add_match_table($default_map); 15409 my $age_default = $age->table($default_map); 15410 $age_default->add_description(<<END 15411Code point's usage has not been assigned in any Unicode release thus far. 15412END 15413 ); 15414 $unassigned->set_equivalent_to($age_default, Related => 1); 15415 } 15416 15417 my $patws = $perl->add_match_table('_Perl_PatWS', 15418 Perl_Extension => 1, 15419 Fate => $INTERNAL_ONLY); 15420 if (defined (my $off_patws = property_ref('Pattern_White_Space'))) { 15421 $patws->initialize($off_patws->table('Y')); 15422 } 15423 else { 15424 $patws->initialize([ ord("\t"), 15425 ord("\n"), 15426 utf8::unicode_to_native(0x0B), # VT 15427 ord("\f"), 15428 ord("\r"), 15429 ord(" "), 15430 utf8::unicode_to_native(0x85), # NEL 15431 0x200E..0x200F, # Left, Right marks 15432 0x2028..0x2029 # Line, Paragraph seps 15433 ] ); 15434 } 15435 15436 # See L<perlfunc/quotemeta> 15437 my $quotemeta = $perl->add_match_table('_Perl_Quotemeta', 15438 Perl_Extension => 1, 15439 Fate => $INTERNAL_ONLY, 15440 15441 # Initialize to what's common in 15442 # all Unicode releases. 15443 Initialize => 15444 $gc->table('Control') 15445 + $Space 15446 + $patws 15447 + ((~ $Word) & $ASCII) 15448 ); 15449 15450 if (defined (my $patsyn = property_ref('Pattern_Syntax'))) { 15451 $quotemeta += $patsyn->table('Y'); 15452 } 15453 else { 15454 $quotemeta += ((~ $Word) & Range->new(0, 255)) 15455 - utf8::unicode_to_native(0xA8) 15456 - utf8::unicode_to_native(0xAF) 15457 - utf8::unicode_to_native(0xB2) 15458 - utf8::unicode_to_native(0xB3) 15459 - utf8::unicode_to_native(0xB4) 15460 - utf8::unicode_to_native(0xB7) 15461 - utf8::unicode_to_native(0xB8) 15462 - utf8::unicode_to_native(0xB9) 15463 - utf8::unicode_to_native(0xBC) 15464 - utf8::unicode_to_native(0xBD) 15465 - utf8::unicode_to_native(0xBE); 15466 $quotemeta += [ # These are above-Latin1 patsyn; hence should be the 15467 # same in all releases 15468 0x2010 .. 0x2027, 15469 0x2030 .. 0x203E, 15470 0x2041 .. 0x2053, 15471 0x2055 .. 0x205E, 15472 0x2190 .. 0x245F, 15473 0x2500 .. 0x2775, 15474 0x2794 .. 0x2BFF, 15475 0x2E00 .. 0x2E7F, 15476 0x3001 .. 0x3003, 15477 0x3008 .. 0x3020, 15478 0x3030 .. 0x3030, 15479 0xFD3E .. 0xFD3F, 15480 0xFE45 .. 0xFE46 15481 ]; 15482 } 15483 15484 if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) { 15485 $quotemeta += $di->table('Y') 15486 } 15487 else { 15488 if ($v_version ge v2.0) { 15489 $quotemeta += $gc->table('Cf') 15490 + $gc->table('Cs'); 15491 15492 # These are above the Unicode version 1 max 15493 $quotemeta->add_range(0xE0000, 0xE0FFF); 15494 } 15495 $quotemeta += $gc->table('Cc') 15496 - $Space; 15497 my $temp = Range_List->new(Initialize => [ 0x180B .. 0x180D, 15498 0x2060 .. 0x206F, 15499 0xFE00 .. 0xFE0F, 15500 0xFFF0 .. 0xFFFB, 15501 ]); 15502 $temp->add_range(0xE0000, 0xE0FFF) if $v_version ge v2.0; 15503 $quotemeta += $temp; 15504 } 15505 calculate_DI(); 15506 $quotemeta += $DI; 15507 15508 calculate_NChar(); 15509 15510 # Finished creating all the perl properties. All non-internal non-string 15511 # ones have a synonym of 'Is_' prefixed. (Internal properties begin with 15512 # an underscore.) These do not get a separate entry in the pod file 15513 foreach my $table ($perl->tables) { 15514 foreach my $alias ($table->aliases) { 15515 next if $alias->name =~ /^_/; 15516 $table->add_alias('Is_' . $alias->name, 15517 Re_Pod_Entry => 0, 15518 UCD => 0, 15519 Status => $alias->status, 15520 OK_as_Filename => 0); 15521 } 15522 } 15523 15524 # Perl tailors the WordBreak property so that \b{wb} doesn't split 15525 # adjacent spaces into separate words. First create a copy of the regular 15526 # WB property as '_Perl_WB'. (On Unicode releases earlier than when WB 15527 # was defined for, this will already have been done by the substitute file 15528 # portion for 'Input_file' code for WB.) 15529 my $perl_wb = property_ref('_Perl_WB'); 15530 if (! defined $perl_wb) { 15531 $perl_wb = Property->new('_Perl_WB', 15532 Fate => $INTERNAL_ONLY, 15533 Perl_Extension => 1, 15534 Directory => $map_directory, 15535 Type => $STRING); 15536 my $wb = property_ref('Word_Break'); 15537 $perl_wb->initialize($wb); 15538 $perl_wb->set_default_map($wb->default_map); 15539 } 15540 15541 # And simply replace the mappings of horizontal space characters that 15542 # otherwise would map to the default to instead map to our tailoring. 15543 my $default = $perl_wb->default_map; 15544 for my $range ($Blank->ranges) { 15545 for my $i ($range->start .. $range->end) { 15546 next unless $perl_wb->value_of($i) eq $default; 15547 $perl_wb->add_map($i, $i, 'Perl_Tailored_HSpace', 15548 Replace => $UNCONDITIONALLY); 15549 } 15550 } 15551 15552 # Create a version of the LineBreak property with the mappings that are 15553 # omitted in the default algorithm remapped to what 15554 # http://www.unicode.org/reports/tr14 says they should be. 15555 # 15556 # Original Resolved General_Category 15557 # AI, SG, XX AL Any 15558 # SA CM Only Mn or Mc 15559 # SA AL Any except Mn and Mc 15560 # CJ NS Any 15561 # 15562 # All property values are also written out in their long form, as 15563 # regen/mk_invlist.pl expects that. This also fixes occurrences of the 15564 # typo in early Unicode versions: 'inseperable'. 15565 my $perl_lb = property_ref('_Perl_LB'); 15566 if (! defined $perl_lb) { 15567 $perl_lb = Property->new('_Perl_LB', 15568 Fate => $INTERNAL_ONLY, 15569 Perl_Extension => 1, 15570 Directory => $map_directory, 15571 Type => $STRING); 15572 my $lb = property_ref('Line_Break'); 15573 15574 # Populate from $lb, but use full name and fix typo. 15575 foreach my $range ($lb->ranges) { 15576 my $full_name = $lb->table($range->value)->full_name; 15577 $full_name = 'Inseparable' 15578 if standardize($full_name) eq 'inseperable'; 15579 $perl_lb->add_map($range->start, $range->end, $full_name); 15580 } 15581 } 15582 15583 $perl_lb->set_default_map('Alphabetic', 'full_name'); # XX -> AL 15584 15585 for my $range ($perl_lb->ranges) { 15586 my $value = standardize($range->value); 15587 if ( $value eq standardize('Unknown') 15588 || $value eq standardize('Ambiguous') 15589 || $value eq standardize('Surrogate')) 15590 { 15591 $perl_lb->add_map($range->start, $range->end, 'Alphabetic', 15592 Replace => $UNCONDITIONALLY); 15593 } 15594 elsif ($value eq standardize('Conditional_Japanese_Starter')) { 15595 $perl_lb->add_map($range->start, $range->end, 'Nonstarter', 15596 Replace => $UNCONDITIONALLY); 15597 } 15598 elsif ($value eq standardize('Complex_Context')) { 15599 for my $i ($range->start .. $range->end) { 15600 my $gc_val = $gc->value_of($i); 15601 if ($gc_val eq 'Mn' || $gc_val eq 'Mc') { 15602 $perl_lb->add_map($i, $i, 'Combining_Mark', 15603 Replace => $UNCONDITIONALLY); 15604 } 15605 else { 15606 $perl_lb->add_map($i, $i, 'Alphabetic', 15607 Replace => $UNCONDITIONALLY); 15608 } 15609 } 15610 } 15611 } 15612 15613 # This property is a modification of the scx property 15614 my $perl_scx = Property->new('_Perl_SCX', 15615 Fate => $INTERNAL_ONLY, 15616 Perl_Extension => 1, 15617 Directory => $map_directory, 15618 Type => $ENUM); 15619 my $source; 15620 15621 # Use scx if available; otherwise sc; if neither is there (a very old 15622 # Unicode version, just say that everything is 'Common' 15623 if (defined $scx) { 15624 $source = $scx; 15625 $perl_scx->set_default_map('Unknown'); 15626 } 15627 elsif (defined $script) { 15628 $source = $script; 15629 15630 # Early versions of 'sc', had everything be 'Common' 15631 if (defined $script->table('Unknown')) { 15632 $perl_scx->set_default_map('Unknown'); 15633 } 15634 else { 15635 $perl_scx->set_default_map('Common'); 15636 } 15637 } else { 15638 $perl_scx->add_match_table('Common'); 15639 $perl_scx->add_map(0, $MAX_UNICODE_CODEPOINT, 'Common'); 15640 15641 $perl_scx->add_match_table('Unknown'); 15642 $perl_scx->set_default_map('Unknown'); 15643 } 15644 15645 $perl_scx->_set_format($STRING_WHITE_SPACE_LIST); 15646 $perl_scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these 15647 15648 if (defined $source) { 15649 $perl_scx->initialize($source); 15650 15651 # UTS 39 says that the scx property should be modified for these 15652 # countries where certain mixed scripts are commonly used. 15653 for my $range ($perl_scx->ranges) { 15654 my $value = $range->value; 15655 my $changed = $value =~ s/ ( \b Han i? \b ) /$1 Hanb Jpan Kore/xi; 15656 $changed |= $value =~ s/ ( \b Hira (gana)? \b ) /$1 Jpan/xi; 15657 $changed |= $value =~ s/ ( \b Kata (kana)? \b ) /$1 Jpan/xi; 15658 $changed |= $value =~ s{ ( \b Katakana_or_Hiragana \b ) } 15659 {$1 Katakana Hiragana Jpan}xi; 15660 $changed |= $value =~ s/ ( \b Hang (ul)? \b ) /$1 Kore/xi; 15661 $changed |= $value =~ s/ ( \b Bopo (mofo)? \b ) /$1 Hanb/xi; 15662 15663 if ($changed) { 15664 $value = join " ", uniques split " ", $value; 15665 $range->set_value($value) 15666 } 15667 } 15668 15669 foreach my $table ($source->tables) { 15670 my $scx_table = $perl_scx->add_match_table($table->name, 15671 Full_Name => $table->full_name); 15672 foreach my $alias ($table->aliases) { 15673 $scx_table->add_alias($alias->name); 15674 } 15675 } 15676 } 15677 15678 # Here done with all the basic stuff. Ready to populate the information 15679 # about each character if annotating them. 15680 if ($annotate) { 15681 15682 # See comments at its declaration 15683 $annotate_ranges = Range_Map->new; 15684 15685 # This separates out the non-characters from the other unassigneds, so 15686 # can give different annotations for each. 15687 $unassigned_sans_noncharacters = Range_List->new( 15688 Initialize => $gc->table('Unassigned')); 15689 $unassigned_sans_noncharacters &= (~ $NChar); 15690 15691 for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT + 1; $i++ ) { 15692 $i = populate_char_info($i); # Note sets $i so may cause skips 15693 15694 } 15695 } 15696 15697 return; 15698} 15699 15700sub add_perl_synonyms() { 15701 # A number of Unicode tables have Perl synonyms that are expressed in 15702 # the single-form, \p{name}. These are: 15703 # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and 15704 # \p{Is_Name} as synonyms 15705 # \p{Script_Extensions=Value} gets \p{Value}, \p{Is_Value} as synonyms 15706 # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms 15707 # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no 15708 # conflict, \p{Value} and \p{Is_Value} as well 15709 # 15710 # This routine generates these synonyms, warning of any unexpected 15711 # conflicts. 15712 15713 # Construct the list of tables to get synonyms for. Start with all the 15714 # binary and the General_Category ones. 15715 my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY } 15716 property_ref('*'); 15717 push @tables, $gc->tables; 15718 15719 # If the version of Unicode includes the Script Extensions (preferably), 15720 # or Script property, add its tables 15721 if (defined $scx) { 15722 push @tables, $scx->tables; 15723 } 15724 else { 15725 push @tables, $script->tables if defined $script; 15726 } 15727 15728 # The Block tables are kept separate because they are treated differently. 15729 # And the earliest versions of Unicode didn't include them, so add only if 15730 # there are some. 15731 my @blocks; 15732 push @blocks, $block->tables if defined $block; 15733 15734 # Here, have the lists of tables constructed. Process blocks last so that 15735 # if there are name collisions with them, blocks have lowest priority. 15736 # Should there ever be other collisions, manual intervention would be 15737 # required. See the comments at the beginning of the program for a 15738 # possible way to handle those semi-automatically. 15739 foreach my $table (@tables, @blocks) { 15740 15741 # For non-binary properties, the synonym is just the name of the 15742 # table, like Greek, but for binary properties the synonym is the name 15743 # of the property, and means the code points in its 'Y' table. 15744 my $nominal = $table; 15745 my $nominal_property = $nominal->property; 15746 my $actual; 15747 if (! $nominal->isa('Property')) { 15748 $actual = $table; 15749 } 15750 else { 15751 15752 # Here is a binary property. Use the 'Y' table. Verify that is 15753 # there 15754 my $yes = $nominal->table('Y'); 15755 unless (defined $yes) { # Must be defined, but is permissible to 15756 # be empty. 15757 Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping."); 15758 next; 15759 } 15760 $actual = $yes; 15761 } 15762 15763 foreach my $alias ($nominal->aliases) { 15764 15765 # Attempt to create a table in the perl directory for the 15766 # candidate table, using whatever aliases in it that don't 15767 # conflict. Also add non-conflicting aliases for all these 15768 # prefixed by 'Is_' (and/or 'In_' for Block property tables) 15769 PREFIX: 15770 foreach my $prefix ("", 'Is_', 'In_') { 15771 15772 # Only Block properties can have added 'In_' aliases. 15773 next if $prefix eq 'In_' and $nominal_property != $block; 15774 15775 my $proposed_name = $prefix . $alias->name; 15776 15777 # No Is_Is, In_In, nor combinations thereof 15778 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x; 15779 next if $proposed_name =~ /^ I [ns] _I [ns] _/x; 15780 15781 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace; 15782 15783 # Get a reference to any existing table in the perl 15784 # directory with the desired name. 15785 my $pre_existing = $perl->table($proposed_name); 15786 15787 if (! defined $pre_existing) { 15788 15789 # No name collision, so OK to add the perl synonym. 15790 15791 my $make_re_pod_entry; 15792 my $ok_as_filename; 15793 my $status = $alias->status; 15794 if ($nominal_property == $block) { 15795 15796 # For block properties, only the compound form is 15797 # preferred for external use; the others are 15798 # discouraged. The pod file contains wild cards for 15799 # the 'In' and 'Is' forms so no entries for those; and 15800 # we don't want people using the name without any 15801 # prefix, so discourage that. 15802 if ($prefix eq "") { 15803 $make_re_pod_entry = 1; 15804 $status = $status || $DISCOURAGED; 15805 $ok_as_filename = 0; 15806 } 15807 elsif ($prefix eq 'In_') { 15808 $make_re_pod_entry = 0; 15809 $status = $status || $DISCOURAGED; 15810 $ok_as_filename = 1; 15811 } 15812 else { 15813 $make_re_pod_entry = 0; 15814 $status = $status || $DISCOURAGED; 15815 $ok_as_filename = 0; 15816 } 15817 } 15818 elsif ($prefix ne "") { 15819 15820 # The 'Is' prefix is handled in the pod by a wild 15821 # card, and we won't use it for an external name 15822 $make_re_pod_entry = 0; 15823 $status = $status || $NORMAL; 15824 $ok_as_filename = 0; 15825 } 15826 else { 15827 15828 # Here, is an empty prefix, non block. This gets its 15829 # own pod entry and can be used for an external name. 15830 $make_re_pod_entry = 1; 15831 $status = $status || $NORMAL; 15832 $ok_as_filename = 1; 15833 } 15834 15835 # Here, there isn't a perl pre-existing table with the 15836 # name. Look through the list of equivalents of this 15837 # table to see if one is a perl table. 15838 foreach my $equivalent ($actual->leader->equivalents) { 15839 next if $equivalent->property != $perl; 15840 15841 # Here, have found a table for $perl. Add this alias 15842 # to it, and are done with this prefix. 15843 $equivalent->add_alias($proposed_name, 15844 Re_Pod_Entry => $make_re_pod_entry, 15845 15846 # Currently don't output these in the 15847 # ucd pod, as are strongly discouraged 15848 # from being used 15849 UCD => 0, 15850 15851 Status => $status, 15852 OK_as_Filename => $ok_as_filename); 15853 trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace; 15854 next PREFIX; 15855 } 15856 15857 # Here, $perl doesn't already have a table that is a 15858 # synonym for this property, add one. 15859 my $added_table = $perl->add_match_table($proposed_name, 15860 Re_Pod_Entry => $make_re_pod_entry, 15861 15862 # See UCD comment just above 15863 UCD => 0, 15864 15865 Status => $status, 15866 OK_as_Filename => $ok_as_filename); 15867 # And it will be related to the actual table, since it is 15868 # based on it. 15869 $added_table->set_equivalent_to($actual, Related => 1); 15870 trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace; 15871 next; 15872 } # End of no pre-existing. 15873 15874 # Here, there is a pre-existing table that has the proposed 15875 # name. We could be in trouble, but not if this is just a 15876 # synonym for another table that we have already made a child 15877 # of the pre-existing one. 15878 if ($pre_existing->is_set_equivalent_to($actual)) { 15879 trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace; 15880 $pre_existing->add_alias($proposed_name); 15881 next; 15882 } 15883 15884 # Here, there is a name collision, but it still could be OK if 15885 # the tables match the identical set of code points, in which 15886 # case, we can combine the names. Compare each table's code 15887 # point list to see if they are identical. 15888 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace; 15889 if ($pre_existing->matches_identically_to($actual)) { 15890 15891 # Here, they do match identically. Not a real conflict. 15892 # Make the perl version a child of the Unicode one, except 15893 # in the non-obvious case of where the perl name is 15894 # already a synonym of another Unicode property. (This is 15895 # excluded by the test for it being its own parent.) The 15896 # reason for this exclusion is that then the two Unicode 15897 # properties become related; and we don't really know if 15898 # they are or not. We generate documentation based on 15899 # relatedness, and this would be misleading. Code 15900 # later executed in the process will cause the tables to 15901 # be represented by a single file anyway, without making 15902 # it look in the pod like they are necessarily related. 15903 if ($pre_existing->parent == $pre_existing 15904 && ($pre_existing->property == $perl 15905 || $actual->property == $perl)) 15906 { 15907 trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace; 15908 $pre_existing->set_equivalent_to($actual, Related => 1); 15909 } 15910 elsif (main::DEBUG && $to_trace) { 15911 trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases"; 15912 trace $pre_existing->parent; 15913 } 15914 next PREFIX; 15915 } 15916 15917 # Here they didn't match identically, there is a real conflict 15918 # between our new name and a pre-existing property. 15919 $actual->add_conflicting($proposed_name, 'p', $pre_existing); 15920 $pre_existing->add_conflicting($nominal->full_name, 15921 'p', 15922 $actual); 15923 15924 # Don't output a warning for aliases for the block 15925 # properties (unless they start with 'In_') as it is 15926 # expected that there will be conflicts and the block 15927 # form loses. 15928 if ($verbosity >= $NORMAL_VERBOSITY 15929 && ($actual->property != $block || $prefix eq 'In_')) 15930 { 15931 print simple_fold(join_lines(<<END 15932There is already an alias named $proposed_name (from $pre_existing), 15933so not creating this alias for $actual 15934END 15935 ), "", 4); 15936 } 15937 15938 # Keep track for documentation purposes. 15939 $has_In_conflicts++ if $prefix eq 'In_'; 15940 $has_Is_conflicts++ if $prefix eq 'Is_'; 15941 } 15942 } 15943 } 15944 15945 # There are some properties which have No and Yes (and N and Y) as 15946 # property values, but aren't binary, and could possibly be confused with 15947 # binary ones. So create caveats for them. There are tables that are 15948 # named 'No', and tables that are named 'N', but confusion is not likely 15949 # unless they are the same table. For example, N meaning Number or 15950 # Neutral is not likely to cause confusion, so don't add caveats to things 15951 # like them. 15952 foreach my $property (grep { $_->type != $BINARY 15953 && $_->type != $FORCED_BINARY } 15954 property_ref('*')) 15955 { 15956 my $yes = $property->table('Yes'); 15957 if (defined $yes) { 15958 my $y = $property->table('Y'); 15959 if (defined $y && $yes == $y) { 15960 foreach my $alias ($property->aliases) { 15961 $yes->add_conflicting($alias->name); 15962 } 15963 } 15964 } 15965 my $no = $property->table('No'); 15966 if (defined $no) { 15967 my $n = $property->table('N'); 15968 if (defined $n && $no == $n) { 15969 foreach my $alias ($property->aliases) { 15970 $no->add_conflicting($alias->name, 'P'); 15971 } 15972 } 15973 } 15974 } 15975 15976 return; 15977} 15978 15979sub register_file_for_name($$$) { 15980 # Given info about a table and a datafile that it should be associated 15981 # with, register that association 15982 15983 my $table = shift; 15984 my $directory_ref = shift; # Array of the directory path for the file 15985 my $file = shift; # The file name in the final directory. 15986 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 15987 15988 trace "table=$table, file=$file, directory=@$directory_ref, fate=", $table->fate if main::DEBUG && $to_trace; 15989 15990 if ($table->isa('Property')) { 15991 $table->set_file_path(@$directory_ref, $file); 15992 push @map_properties, $table; 15993 15994 # No swash means don't do the rest of this. 15995 return if $table->fate != $ORDINARY 15996 && ! ($table->name =~ /^_/ && $table->fate == $INTERNAL_ONLY); 15997 15998 # Get the path to the file 15999 my @path = $table->file_path; 16000 16001 # Use just the file name if no subdirectory. 16002 shift @path if $path[0] eq File::Spec->curdir(); 16003 16004 my $file = join '/', @path; 16005 16006 # Create a hash entry for utf8_heavy to get the file that stores this 16007 # property's map table 16008 foreach my $alias ($table->aliases) { 16009 my $name = $alias->name; 16010 if ($name =~ /^_/) { 16011 $strict_property_to_file_of{lc $name} = $file; 16012 } 16013 else { 16014 $loose_property_to_file_of{standardize($name)} = $file; 16015 } 16016 } 16017 16018 # And a way for utf8_heavy to find the proper key in the SwashInfo 16019 # hash for this property. 16020 $file_to_swash_name{$file} = "To" . $table->swash_name; 16021 return; 16022 } 16023 16024 # Do all of the work for all equivalent tables when called with the leader 16025 # table, so skip if isn't the leader. 16026 return if $table->leader != $table; 16027 16028 # If this is a complement of another file, use that other file instead, 16029 # with a ! prepended to it. 16030 my $complement; 16031 if (($complement = $table->complement) != 0) { 16032 my @directories = $complement->file_path; 16033 16034 # This assumes that the 0th element is something like 'lib', 16035 # the 1th element the property name (in its own directory), like 16036 # 'AHex', and the 2th element the file like 'Y' which will have a .pl 16037 # appended to it later. 16038 $directories[1] =~ s/^/!/; 16039 $file = pop @directories; 16040 $directory_ref =\@directories; 16041 } 16042 16043 # Join all the file path components together, using slashes. 16044 my $full_filename = join('/', @$directory_ref, $file); 16045 16046 # All go in the same subdirectory of unicore, or the special 16047 # pseudo-directory '#' 16048 if ($directory_ref->[0] !~ / ^ $matches_directory | \# $ /x) { 16049 Carp::my_carp("Unexpected directory in " 16050 . join('/', @{$directory_ref}, $file)); 16051 } 16052 16053 # For this table and all its equivalents ... 16054 foreach my $table ($table, $table->equivalents) { 16055 16056 # Associate it with its file internally. Don't include the 16057 # $matches_directory first component 16058 $table->set_file_path(@$directory_ref, $file); 16059 16060 # No swash means don't do the rest of this. 16061 next if $table->isa('Map_Table') && $table->fate != $ORDINARY; 16062 16063 my $sub_filename = join('/', $directory_ref->[1, -1], $file); 16064 16065 my $property = $table->property; 16066 my $property_name = ($property == $perl) 16067 ? "" # 'perl' is never explicitly stated 16068 : standardize($property->name) . '='; 16069 16070 my $is_default = 0; # Is this table the default one for the property? 16071 16072 # To calculate $is_default, we find if this table is the same as the 16073 # default one for the property. But this is complicated by the 16074 # possibility that there is a master table for this one, and the 16075 # information is stored there instead of here. 16076 my $parent = $table->parent; 16077 my $leader_prop = $parent->property; 16078 my $default_map = $leader_prop->default_map; 16079 if (defined $default_map) { 16080 my $default_table = $leader_prop->table($default_map); 16081 $is_default = 1 if defined $default_table && $parent == $default_table; 16082 } 16083 16084 # Calculate the loose name for this table. Mostly it's just its name, 16085 # standardized. But in the case of Perl tables that are single-form 16086 # equivalents to Unicode properties, it is the latter's name. 16087 my $loose_table_name = 16088 ($property != $perl || $leader_prop == $perl) 16089 ? standardize($table->name) 16090 : standardize($parent->name); 16091 16092 my $deprecated = ($table->status eq $DEPRECATED) 16093 ? $table->status_info 16094 : ""; 16095 my $caseless_equivalent = $table->caseless_equivalent; 16096 16097 # And for each of the table's aliases... This inner loop eventually 16098 # goes through all aliases in the UCD that we generate regex match 16099 # files for 16100 foreach my $alias ($table->aliases) { 16101 my $standard = utf8_heavy_name($table, $alias); 16102 16103 # Generate an entry in either the loose or strict hashes, which 16104 # will translate the property and alias names combination into the 16105 # file where the table for them is stored. 16106 if ($alias->loose_match) { 16107 if (exists $loose_to_file_of{$standard}) { 16108 Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'."); 16109 } 16110 else { 16111 $loose_to_file_of{$standard} = $sub_filename; 16112 } 16113 } 16114 else { 16115 if (exists $stricter_to_file_of{$standard}) { 16116 Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'."); 16117 } 16118 else { 16119 $stricter_to_file_of{$standard} = $sub_filename; 16120 16121 # Tightly coupled with how utf8_heavy.pl works, for a 16122 # floating point number that is a whole number, get rid of 16123 # the trailing decimal point and 0's, so that utf8_heavy 16124 # will work. Also note that this assumes that such a 16125 # number is matched strictly; so if that were to change, 16126 # this would be wrong. 16127 if ((my $integer_name = $alias->name) 16128 =~ s/^ ( -? \d+ ) \.0+ $ /$1/x) 16129 { 16130 $stricter_to_file_of{$property_name . $integer_name} 16131 = $sub_filename; 16132 } 16133 } 16134 } 16135 16136 # For Unicode::UCD, create a mapping of the prop=value to the 16137 # canonical =value for that property. 16138 if ($standard =~ /=/) { 16139 16140 # This could happen if a strict name mapped into an existing 16141 # loose name. In that event, the strict names would have to 16142 # be moved to a new hash. 16143 if (exists($loose_to_standard_value{$standard})) { 16144 Carp::my_carp_bug("'$standard' conflicts with a pre-existing use. Bad News. Continuing anyway"); 16145 } 16146 $loose_to_standard_value{$standard} = $loose_table_name; 16147 } 16148 16149 # Keep a list of the deprecated properties and their filenames 16150 if ($deprecated && $complement == 0) { 16151 $utf8::why_deprecated{$sub_filename} = $deprecated; 16152 } 16153 16154 # And a substitute table, if any, for case-insensitive matching 16155 if ($caseless_equivalent != 0) { 16156 $caseless_equivalent_to{$standard} = $caseless_equivalent; 16157 } 16158 16159 # Add to defaults list if the table this alias belongs to is the 16160 # default one 16161 $loose_defaults{$standard} = 1 if $is_default; 16162 } 16163 } 16164 16165 return; 16166} 16167 16168{ # Closure 16169 my %base_names; # Names already used for avoiding DOS 8.3 filesystem 16170 # conflicts 16171 my %full_dir_name_of; # Full length names of directories used. 16172 16173 sub construct_filename($$$) { 16174 # Return a file name for a table, based on the table name, but perhaps 16175 # changed to get rid of non-portable characters in it, and to make 16176 # sure that it is unique on a file system that allows the names before 16177 # any period to be at most 8 characters (DOS). While we're at it 16178 # check and complain if there are any directory conflicts. 16179 16180 my $name = shift; # The name to start with 16181 my $mutable = shift; # Boolean: can it be changed? If no, but 16182 # yet it must be to work properly, a warning 16183 # is given 16184 my $directories_ref = shift; # A reference to an array containing the 16185 # path to the file, with each element one path 16186 # component. This is used because the same 16187 # name can be used in different directories. 16188 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 16189 16190 my $warn = ! defined wantarray; # If true, then if the name is 16191 # changed, a warning is issued as well. 16192 16193 if (! defined $name) { 16194 Carp::my_carp("Undefined name in directory " 16195 . File::Spec->join(@$directories_ref) 16196 . ". '_' used"); 16197 return '_'; 16198 } 16199 16200 # Make sure that no directory names conflict with each other. Look at 16201 # each directory in the input file's path. If it is already in use, 16202 # assume it is correct, and is merely being re-used, but if we 16203 # truncate it to 8 characters, and find that there are two directories 16204 # that are the same for the first 8 characters, but differ after that, 16205 # then that is a problem. 16206 foreach my $directory (@$directories_ref) { 16207 my $short_dir = substr($directory, 0, 8); 16208 if (defined $full_dir_name_of{$short_dir}) { 16209 next if $full_dir_name_of{$short_dir} eq $directory; 16210 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}. Bad News. Continuing anyway"); 16211 } 16212 else { 16213 $full_dir_name_of{$short_dir} = $directory; 16214 } 16215 } 16216 16217 my $path = join '/', @$directories_ref; 16218 $path .= '/' if $path; 16219 16220 # Remove interior underscores. 16221 (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg; 16222 16223 # Convert the dot in floating point numbers to an underscore 16224 $filename =~ s/\./_/ if $filename =~ / ^ \d+ \. \d+ $ /x; 16225 16226 my $suffix = ""; 16227 16228 # Extract any suffix, delete any non-word character, and truncate to 3 16229 # after the dot 16230 if ($filename =~ m/ ( .*? ) ( \. .* ) /x) { 16231 $filename = $1; 16232 $suffix = $2; 16233 $suffix =~ s/\W+//g; 16234 substr($suffix, 4) = "" if length($suffix) > 4; 16235 } 16236 16237 # Change any non-word character outside the suffix into an underscore, 16238 # and truncate to 8. 16239 $filename =~ s/\W+/_/g; # eg., "L&" -> "L_" 16240 substr($filename, 8) = "" if length($filename) > 8; 16241 16242 # Make sure the basename doesn't conflict with something we 16243 # might have already written. If we have, say, 16244 # InGreekExtended1 16245 # InGreekExtended2 16246 # they become 16247 # InGreekE 16248 # InGreek2 16249 my $warned = 0; 16250 while (my $num = $base_names{$path}{lc "$filename$suffix"}++) { 16251 $num++; # so basenames with numbers start with '2', which 16252 # just looks more natural. 16253 16254 # Want to append $num, but if it'll make the basename longer 16255 # than 8 characters, pre-truncate $filename so that the result 16256 # is acceptable. 16257 my $delta = length($filename) + length($num) - 8; 16258 if ($delta > 0) { 16259 substr($filename, -$delta) = $num; 16260 } 16261 else { 16262 $filename .= $num; 16263 } 16264 if ($warn && ! $warned) { 16265 $warned = 1; 16266 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway."); 16267 } 16268 } 16269 16270 return $filename if $mutable; 16271 16272 # If not changeable, must return the input name, but warn if needed to 16273 # change it beyond shortening it. 16274 if ($name ne $filename 16275 && substr($name, 0, length($filename)) ne $filename) { 16276 Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway."); 16277 } 16278 return $name; 16279 } 16280} 16281 16282# The pod file contains a very large table. Many of the lines in that table 16283# would exceed a typical output window's size, and so need to be wrapped with 16284# a hanging indent to make them look good. The pod language is really 16285# insufficient here. There is no general construct to do that in pod, so it 16286# is done here by beginning each such line with a space to cause the result to 16287# be output without formatting, and doing all the formatting here. This leads 16288# to the result that if the eventual display window is too narrow it won't 16289# look good, and if the window is too wide, no advantage is taken of that 16290# extra width. A further complication is that the output may be indented by 16291# the formatter so that there is less space than expected. What I (khw) have 16292# done is to assume that that indent is a particular number of spaces based on 16293# what it is in my Linux system; people can always resize their windows if 16294# necessary, but this is obviously less than desirable, but the best that can 16295# be expected. 16296my $automatic_pod_indent = 8; 16297 16298# Try to format so that uses fewest lines, but few long left column entries 16299# slide into the right column. An experiment on 5.1 data yielded the 16300# following percentages that didn't cut into the other side along with the 16301# associated first-column widths 16302# 69% = 24 16303# 80% not too bad except for a few blocks 16304# 90% = 33; # , cuts 353/3053 lines from 37 = 12% 16305# 95% = 37; 16306my $indent_info_column = 27; # 75% of lines didn't have overlap 16307 16308my $FILLER = 3; # Length of initial boiler-plate columns in a pod line 16309 # The 3 is because of: 16310 # 1 for the leading space to tell the pod formatter to 16311 # output as-is 16312 # 1 for the flag 16313 # 1 for the space between the flag and the main data 16314 16315sub format_pod_line ($$$;$$) { 16316 # Take a pod line and return it, formatted properly 16317 16318 my $first_column_width = shift; 16319 my $entry = shift; # Contents of left column 16320 my $info = shift; # Contents of right column 16321 16322 my $status = shift || ""; # Any flag 16323 16324 my $loose_match = shift; # Boolean. 16325 $loose_match = 1 unless defined $loose_match; 16326 16327 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 16328 16329 my $flags = ""; 16330 $flags .= $STRICTER if ! $loose_match; 16331 16332 $flags .= $status if $status; 16333 16334 # There is a blank in the left column to cause the pod formatter to 16335 # output the line as-is. 16336 return sprintf " %-*s%-*s %s\n", 16337 # The first * in the format is replaced by this, the -1 is 16338 # to account for the leading blank. There isn't a 16339 # hard-coded blank after this to separate the flags from 16340 # the rest of the line, so that in the unlikely event that 16341 # multiple flags are shown on the same line, they both 16342 # will get displayed at the expense of that separation, 16343 # but since they are left justified, a blank will be 16344 # inserted in the normal case. 16345 $FILLER - 1, 16346 $flags, 16347 16348 # The other * in the format is replaced by this number to 16349 # cause the first main column to right fill with blanks. 16350 # The -1 is for the guaranteed blank following it. 16351 $first_column_width - $FILLER - 1, 16352 $entry, 16353 $info; 16354} 16355 16356my @zero_match_tables; # List of tables that have no matches in this release 16357 16358sub make_re_pod_entries($) { 16359 # This generates the entries for the pod file for a given table. 16360 # Also done at this time are any children tables. The output looks like: 16361 # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178) 16362 16363 my $input_table = shift; # Table the entry is for 16364 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 16365 16366 # Generate parent and all its children at the same time. 16367 return if $input_table->parent != $input_table; 16368 16369 my $property = $input_table->property; 16370 my $type = $property->type; 16371 my $full_name = $property->full_name; 16372 16373 my $count = $input_table->count; 16374 my $unicode_count; 16375 my $non_unicode_string; 16376 if ($count > $MAX_UNICODE_CODEPOINTS) { 16377 $unicode_count = $count - ($MAX_WORKING_CODEPOINT 16378 - $MAX_UNICODE_CODEPOINT); 16379 $non_unicode_string = " plus all above-Unicode code points"; 16380 } 16381 else { 16382 $unicode_count = $count; 16383 $non_unicode_string = ""; 16384 } 16385 16386 my $string_count = clarify_number($unicode_count) . $non_unicode_string; 16387 16388 my $definition = $input_table->calculate_table_definition; 16389 if ($definition) { 16390 16391 # Save the definition for later use. 16392 $input_table->set_definition($definition); 16393 16394 $definition = ": $definition"; 16395 } 16396 16397 my $status = $input_table->status; 16398 my $status_info = $input_table->status_info; 16399 my $caseless_equivalent = $input_table->caseless_equivalent; 16400 16401 # Don't mention a placeholder equivalent as it isn't to be listed in the 16402 # pod 16403 $caseless_equivalent = 0 if $caseless_equivalent != 0 16404 && $caseless_equivalent->fate > $ORDINARY; 16405 16406 my $entry_for_first_table; # The entry for the first table output. 16407 # Almost certainly, it is the parent. 16408 16409 # For each related table (including itself), we will generate a pod entry 16410 # for each name each table goes by 16411 foreach my $table ($input_table, $input_table->children) { 16412 16413 # utf8_heavy.pl cannot deal with null string property values, so skip 16414 # any tables that have no non-null names. 16415 next if ! grep { $_->name ne "" } $table->aliases; 16416 16417 # First, gather all the info that applies to this table as a whole. 16418 16419 push @zero_match_tables, $table if $count == 0 16420 # Don't mention special tables 16421 # as being zero length 16422 && $table->fate == $ORDINARY; 16423 16424 my $table_property = $table->property; 16425 16426 # The short name has all the underscores removed, while the full name 16427 # retains them. Later, we decide whether to output a short synonym 16428 # for the full one, we need to compare apples to apples, so we use the 16429 # short name's length including underscores. 16430 my $table_property_short_name_length; 16431 my $table_property_short_name 16432 = $table_property->short_name(\$table_property_short_name_length); 16433 my $table_property_full_name = $table_property->full_name; 16434 16435 # Get how much savings there is in the short name over the full one 16436 # (delta will always be <= 0) 16437 my $table_property_short_delta = $table_property_short_name_length 16438 - length($table_property_full_name); 16439 my @table_description = $table->description; 16440 my @table_note = $table->note; 16441 16442 # Generate an entry for each alias in this table. 16443 my $entry_for_first_alias; # saves the first one encountered. 16444 foreach my $alias ($table->aliases) { 16445 16446 # Skip if not to go in pod. 16447 next unless $alias->make_re_pod_entry; 16448 16449 # Start gathering all the components for the entry 16450 my $name = $alias->name; 16451 16452 # Skip if name is empty, as can't be accessed by regexes. 16453 next if $name eq ""; 16454 16455 my $entry; # Holds the left column, may include extras 16456 my $entry_ref; # To refer to the left column's contents from 16457 # another entry; has no extras 16458 16459 # First the left column of the pod entry. Tables for the $perl 16460 # property always use the single form. 16461 if ($table_property == $perl) { 16462 $entry = "\\p{$name}"; 16463 $entry .= " \\p$name" if length $name == 1; # Show non-braced 16464 # form too 16465 $entry_ref = "\\p{$name}"; 16466 } 16467 else { # Compound form. 16468 16469 # Only generate one entry for all the aliases that mean true 16470 # or false in binary properties. Append a '*' to indicate 16471 # some are missing. (The heading comment notes this.) 16472 my $rhs; 16473 if ($type == $BINARY) { 16474 next if $name ne 'N' && $name ne 'Y'; 16475 $rhs = "$name*"; 16476 } 16477 elsif ($type != $FORCED_BINARY) { 16478 $rhs = $name; 16479 } 16480 else { 16481 16482 # Forced binary properties require special handling. It 16483 # has two sets of tables, one set is true/false; and the 16484 # other set is everything else. Entries are generated for 16485 # each set. Use the Bidi_Mirrored property (which appears 16486 # in all Unicode versions) to get a list of the aliases 16487 # for the true/false tables. Of these, only output the N 16488 # and Y ones, the same as, a regular binary property. And 16489 # output all the rest, same as a non-binary property. 16490 my $bm = property_ref("Bidi_Mirrored"); 16491 if ($name eq 'N' || $name eq 'Y') { 16492 $rhs = "$name*"; 16493 } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases, 16494 $bm->table("N")->aliases) 16495 { 16496 next; 16497 } 16498 else { 16499 $rhs = $name; 16500 } 16501 } 16502 16503 # Colon-space is used to give a little more space to be easier 16504 # to read; 16505 $entry = "\\p{" 16506 . $table_property_full_name 16507 . ": $rhs}"; 16508 16509 # But for the reference to this entry, which will go in the 16510 # right column, where space is at a premium, use equals 16511 # without a space 16512 $entry_ref = "\\p{" . $table_property_full_name . "=$name}"; 16513 } 16514 16515 # Then the right (info) column. This is stored as components of 16516 # an array for the moment, then joined into a string later. For 16517 # non-internal only properties, begin the info with the entry for 16518 # the first table we encountered (if any), as things are ordered 16519 # so that that one is the most descriptive. This leads to the 16520 # info column of an entry being a more descriptive version of the 16521 # name column 16522 my @info; 16523 if ($name =~ /^_/) { 16524 push @info, 16525 '(For internal use by Perl, not necessarily stable)'; 16526 } 16527 elsif ($entry_for_first_alias) { 16528 push @info, $entry_for_first_alias; 16529 } 16530 16531 # If this entry is equivalent to another, add that to the info, 16532 # using the first such table we encountered 16533 if ($entry_for_first_table) { 16534 if (@info) { 16535 push @info, "(= $entry_for_first_table)"; 16536 } 16537 else { 16538 push @info, $entry_for_first_table; 16539 } 16540 } 16541 16542 # If the name is a large integer, add an equivalent with an 16543 # exponent for better readability 16544 if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) { 16545 push @info, sprintf "(= %.1e)", $name 16546 } 16547 16548 my $parenthesized = ""; 16549 if (! $entry_for_first_alias) { 16550 16551 # This is the first alias for the current table. The alias 16552 # array is ordered so that this is the fullest, most 16553 # descriptive alias, so it gets the fullest info. The other 16554 # aliases are mostly merely pointers to this one, using the 16555 # information already added above. 16556 16557 # Display any status message, but only on the parent table 16558 if ($status && ! $entry_for_first_table) { 16559 push @info, $status_info; 16560 } 16561 16562 # Put out any descriptive info 16563 if (@table_description || @table_note) { 16564 push @info, join "; ", @table_description, @table_note; 16565 } 16566 16567 # Look to see if there is a shorter name we can point people 16568 # at 16569 my $standard_name = standardize($name); 16570 my $short_name; 16571 my $proposed_short = $table->short_name; 16572 if (defined $proposed_short) { 16573 my $standard_short = standardize($proposed_short); 16574 16575 # If the short name is shorter than the standard one, or 16576 # even it it's not, but the combination of it and its 16577 # short property name (as in \p{prop=short} ($perl doesn't 16578 # have this form)) saves at least two characters, then, 16579 # cause it to be listed as a shorter synonym. 16580 if (length $standard_short < length $standard_name 16581 || ($table_property != $perl 16582 && (length($standard_short) 16583 - length($standard_name) 16584 + $table_property_short_delta) # (<= 0) 16585 < -2)) 16586 { 16587 $short_name = $proposed_short; 16588 if ($table_property != $perl) { 16589 $short_name = $table_property_short_name 16590 . "=$short_name"; 16591 } 16592 $short_name = "\\p{$short_name}"; 16593 } 16594 } 16595 16596 # And if this is a compound form name, see if there is a 16597 # single form equivalent 16598 my $single_form; 16599 if ($table_property != $perl && $table_property != $block) { 16600 16601 # Special case the binary N tables, so that will print 16602 # \P{single}, but use the Y table values to populate 16603 # 'single', as we haven't likewise populated the N table. 16604 # For forced binary tables, we can't just look at the N 16605 # table, but must see if this table is equivalent to the N 16606 # one, as there are two equivalent beasts in these 16607 # properties. 16608 my $test_table; 16609 my $p; 16610 if ( ($type == $BINARY 16611 && $input_table == $property->table('No')) 16612 || ($type == $FORCED_BINARY 16613 && $property->table('No')-> 16614 is_set_equivalent_to($input_table))) 16615 { 16616 $test_table = $property->table('Yes'); 16617 $p = 'P'; 16618 } 16619 else { 16620 $test_table = $input_table; 16621 $p = 'p'; 16622 } 16623 16624 # Look for a single form amongst all the children. 16625 foreach my $table ($test_table->children) { 16626 next if $table->property != $perl; 16627 my $proposed_name = $table->short_name; 16628 next if ! defined $proposed_name; 16629 16630 # Don't mention internal-only properties as a possible 16631 # single form synonym 16632 next if substr($proposed_name, 0, 1) eq '_'; 16633 16634 $proposed_name = "\\$p\{$proposed_name}"; 16635 if (! defined $single_form 16636 || length($proposed_name) < length $single_form) 16637 { 16638 $single_form = $proposed_name; 16639 16640 # The goal here is to find a single form; not the 16641 # shortest possible one. We've already found a 16642 # short name. So, stop at the first single form 16643 # found, which is likely to be closer to the 16644 # original. 16645 last; 16646 } 16647 } 16648 } 16649 16650 # Output both short and single in the same parenthesized 16651 # expression, but with only one of 'Single', 'Short' if there 16652 # are both items. 16653 if ($short_name || $single_form || $table->conflicting) { 16654 $parenthesized .= "Short: $short_name" if $short_name; 16655 if ($short_name && $single_form) { 16656 $parenthesized .= ', '; 16657 } 16658 elsif ($single_form) { 16659 $parenthesized .= 'Single: '; 16660 } 16661 $parenthesized .= $single_form if $single_form; 16662 } 16663 } 16664 16665 if ($caseless_equivalent != 0) { 16666 $parenthesized .= '; ' if $parenthesized ne ""; 16667 $parenthesized .= "/i= " . $caseless_equivalent->complete_name; 16668 } 16669 16670 16671 # Warn if this property isn't the same as one that a 16672 # semi-casual user might expect. The other components of this 16673 # parenthesized structure are calculated only for the first entry 16674 # for this table, but the conflicting is deemed important enough 16675 # to go on every entry. 16676 my $conflicting = join " NOR ", $table->conflicting; 16677 if ($conflicting) { 16678 $parenthesized .= '; ' if $parenthesized ne ""; 16679 $parenthesized .= "NOT $conflicting"; 16680 } 16681 16682 push @info, "($parenthesized)" if $parenthesized; 16683 16684 if ($name =~ /_$/ && $alias->loose_match) { 16685 push @info, "Note the trailing '_' matters in spite of loose matching rules."; 16686 } 16687 16688 if ($table_property != $perl && $table->perl_extension) { 16689 push @info, '(Perl extension)'; 16690 } 16691 my $definition = $table->definition // ""; 16692 $definition = "" if $entry_for_first_alias; 16693 $definition = ": $definition" if $definition; 16694 push @info, "($string_count$definition)"; 16695 16696 # Now, we have both the entry and info so add them to the 16697 # list of all the properties. 16698 push @match_properties, 16699 format_pod_line($indent_info_column, 16700 $entry, 16701 join( " ", @info), 16702 $alias->status, 16703 $alias->loose_match); 16704 16705 $entry_for_first_alias = $entry_ref unless $entry_for_first_alias; 16706 } # End of looping through the aliases for this table. 16707 16708 if (! $entry_for_first_table) { 16709 $entry_for_first_table = $entry_for_first_alias; 16710 } 16711 } # End of looping through all the related tables 16712 return; 16713} 16714 16715sub make_ucd_table_pod_entries { 16716 my $table = shift; 16717 16718 # Generate the entries for the UCD section of the pod for $table. This 16719 # also calculates if names are ambiguous, so has to be called even if the 16720 # pod is not being output 16721 16722 my $short_name = $table->name; 16723 my $standard_short_name = standardize($short_name); 16724 my $full_name = $table->full_name; 16725 my $standard_full_name = standardize($full_name); 16726 16727 my $full_info = ""; # Text of info column for full-name entries 16728 my $other_info = ""; # Text of info column for short-name entries 16729 my $short_info = ""; # Text of info column for other entries 16730 my $meaning = ""; # Synonym of this table 16731 16732 my $property = ($table->isa('Property')) 16733 ? $table 16734 : $table->parent->property; 16735 16736 my $perl_extension = $table->perl_extension; 16737 my $is_perl_extension_match_table_but_not_dollar_perl 16738 = $property != $perl 16739 && $perl_extension 16740 && $property != $table; 16741 16742 # Get the more official name for for perl extensions that aren't 16743 # stand-alone properties 16744 if ($is_perl_extension_match_table_but_not_dollar_perl) { 16745 if ($property->type == $BINARY) { 16746 $meaning = $property->full_name; 16747 } 16748 else { 16749 $meaning = $table->parent->complete_name; 16750 } 16751 } 16752 16753 # There are three types of info column. One for the short name, one for 16754 # the full name, and one for everything else. They mostly are the same, 16755 # so initialize in the same loop. 16756 16757 foreach my $info_ref (\$full_info, \$short_info, \$other_info) { 16758 if ($info_ref != \$full_info) { 16759 16760 # The non-full name columns include the full name 16761 $$info_ref .= $full_name; 16762 } 16763 16764 16765 if ($is_perl_extension_match_table_but_not_dollar_perl) { 16766 16767 # Add the synonymous name for the non-full name entries; and to 16768 # the full-name entry if it adds extra information 16769 if ( standardize($meaning) ne $standard_full_name 16770 || $info_ref == \$other_info 16771 || $info_ref == \$short_info) 16772 { 16773 my $parenthesized = $info_ref != \$full_info; 16774 $$info_ref .= " " if $$info_ref && $parenthesized; 16775 $$info_ref .= "(=" if $parenthesized; 16776 $$info_ref .= "$meaning"; 16777 $$info_ref .= ")" if $parenthesized; 16778 $$info_ref .= "."; 16779 } 16780 } 16781 16782 # And the full-name entry includes the short name, if shorter 16783 if ($info_ref == \$full_info 16784 && length $standard_short_name < length $standard_full_name) 16785 { 16786 $full_info =~ s/\.\Z//; 16787 $full_info .= " " if $full_info; 16788 $full_info .= "(Short: $short_name)"; 16789 } 16790 16791 if ($table->perl_extension) { 16792 $$info_ref =~ s/\.\Z//; 16793 $$info_ref .= ". " if $$info_ref; 16794 $$info_ref .= "(Perl extension)"; 16795 } 16796 } 16797 16798 my $definition; 16799 my $definition_table; 16800 my $type = $table->property->type; 16801 if ($type == $BINARY || $type == $FORCED_BINARY) { 16802 $definition_table = $table->property->table('Y'); 16803 } 16804 elsif ($table->isa('Match_Table')) { 16805 $definition_table = $table; 16806 } 16807 16808 $definition = $definition_table->calculate_table_definition 16809 if defined $definition_table 16810 && $definition_table != 0; 16811 16812 # Add any extra annotations to the full name entry 16813 foreach my $more_info ($table->description, 16814 $definition, 16815 $table->note, 16816 $table->status_info) 16817 { 16818 next unless $more_info; 16819 $full_info =~ s/\.\Z//; 16820 $full_info .= ". " if $full_info; 16821 $full_info .= $more_info; 16822 } 16823 if ($table->property->type == $FORCED_BINARY) { 16824 if ($full_info) { 16825 $full_info =~ s/\.\Z//; 16826 $full_info .= ". "; 16827 } 16828 $full_info .= "This is a combination property which has both:" 16829 . " 1) a map to various string values; and" 16830 . " 2) a map to boolean Y/N, where 'Y' means the" 16831 . " string value is non-empty. Add the prefix 'is'" 16832 . " to the prop_invmap() call to get the latter"; 16833 } 16834 16835 # These keep track if have created full and short name pod entries for the 16836 # property 16837 my $done_full = 0; 16838 my $done_short = 0; 16839 16840 # Every possible name is kept track of, even those that aren't going to be 16841 # output. This way we can be sure to find the ambiguities. 16842 foreach my $alias ($table->aliases) { 16843 my $name = $alias->name; 16844 my $standard = standardize($name); 16845 my $info; 16846 my $output_this = $alias->ucd; 16847 16848 # If the full and short names are the same, we want to output the full 16849 # one's entry, so it has priority. 16850 if ($standard eq $standard_full_name) { 16851 next if $done_full; 16852 $done_full = 1; 16853 $info = $full_info; 16854 } 16855 elsif ($standard eq $standard_short_name) { 16856 next if $done_short; 16857 $done_short = 1; 16858 next if $standard_short_name eq $standard_full_name; 16859 $info = $short_info; 16860 } 16861 else { 16862 $info = $other_info; 16863 } 16864 16865 $combination_property{$standard} = 1 16866 if $table->property->type == $FORCED_BINARY; 16867 16868 # Here, we have set up the two columns for this entry. But if an 16869 # entry already exists for this name, we have to decide which one 16870 # we're going to later output. 16871 if (exists $ucd_pod{$standard}) { 16872 16873 # If the two entries refer to the same property, it's not going to 16874 # be ambiguous. (Likely it's because the names when standardized 16875 # are the same.) But that means if they are different properties, 16876 # there is ambiguity. 16877 if ($ucd_pod{$standard}->{'property'} != $property) { 16878 16879 # Here, we have an ambiguity. This code assumes that one is 16880 # scheduled to be output and one not and that one is a perl 16881 # extension (which is not to be output) and the other isn't. 16882 # If those assumptions are wrong, things have to be rethought. 16883 if ($ucd_pod{$standard}{'output_this'} == $output_this 16884 || $ucd_pod{$standard}{'perl_extension'} == $perl_extension 16885 || $output_this == $perl_extension) 16886 { 16887 Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations. Proceeding anyway."); 16888 } 16889 16890 # We modify the info column of the one being output to 16891 # indicate the ambiguity. Set $which to point to that one's 16892 # info. 16893 my $which; 16894 if ($ucd_pod{$standard}{'output_this'}) { 16895 $which = \$ucd_pod{$standard}->{'info'}; 16896 } 16897 else { 16898 $which = \$info; 16899 $meaning = $ucd_pod{$standard}{'meaning'}; 16900 } 16901 16902 chomp $$which; 16903 $$which =~ s/\.\Z//; 16904 $$which .= "; NOT '$standard' meaning '$meaning'"; 16905 16906 $ambiguous_names{$standard} = 1; 16907 } 16908 16909 # Use the non-perl-extension variant 16910 next unless $ucd_pod{$standard}{'perl_extension'}; 16911 } 16912 16913 # Store enough information about this entry that we can later look for 16914 # ambiguities, and output it properly. 16915 $ucd_pod{$standard} = { 'name' => $name, 16916 'info' => $info, 16917 'meaning' => $meaning, 16918 'output_this' => $output_this, 16919 'perl_extension' => $perl_extension, 16920 'property' => $property, 16921 'status' => $alias->status, 16922 }; 16923 } # End of looping through all this table's aliases 16924 16925 return; 16926} 16927 16928sub pod_alphanumeric_sort { 16929 # Sort pod entries alphanumerically. 16930 16931 # The first few character columns are filler, plus the '\p{'; and get rid 16932 # of all the trailing stuff, starting with the trailing '}', so as to sort 16933 # on just 'Name=Value' 16934 (my $a = lc $a) =~ s/^ .*? \{ //x; 16935 $a =~ s/}.*//; 16936 (my $b = lc $b) =~ s/^ .*? \{ //x; 16937 $b =~ s/}.*//; 16938 16939 # Determine if the two operands are both internal only or both not. 16940 # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3 16941 # should be the underscore that begins internal only 16942 my $a_is_internal = (substr($a, 0, 1) eq '_'); 16943 my $b_is_internal = (substr($b, 0, 1) eq '_'); 16944 16945 # Sort so the internals come last in the table instead of first (which the 16946 # leading underscore would otherwise indicate). 16947 if ($a_is_internal != $b_is_internal) { 16948 return 1 if $a_is_internal; 16949 return -1 16950 } 16951 16952 # Determine if the two operands are compound or not, and if so if are 16953 # "numeric" property values or not, like \p{Age: 3.0}. But there are also 16954 # things like \p{Canonical_Combining_Class: CCC133} and \p{Age: V10_0}, 16955 # all of which this considers numeric, and for sorting, looks just at the 16956 # numeric parts. It can also be a rational like \p{Numeric Value=-1/2}. 16957 my $split_re = qr/ 16958 ^ ( [^:=]+ ) # $1 is undef if not a compound form, otherwise is the 16959 # property name 16960 [:=] \s* # The syntax for the compound form 16961 (?: # followed by ... 16962 ( # $2 gets defined if what follows is a "numeric" 16963 # expression, which is ... 16964 ( -? \d+ (?: [.\/] \d+)? # An integer, float, or rational 16965 # number, optionally signed 16966 | [[:alpha:]]{2,} \d+ $ ) # or something like CCC131. Either 16967 # of these go into $3 16968 | ( V \d+ _ \d+ ) # or a Unicode's Age property version 16969 # number, into $4 16970 ) 16971 | .* $ # If not "numeric", accept anything so that $1 gets 16972 # defined if it is any compound form 16973 ) /ix; 16974 my ($a_initial, $a_numeric, $a_number, $a_version) = ($a =~ $split_re); 16975 my ($b_initial, $b_numeric, $b_number, $b_version) = ($b =~ $split_re); 16976 16977 # Sort alphabeticlly on the whole property name if either operand isn't 16978 # compound, or they differ. 16979 return $a cmp $b if ! defined $a_initial 16980 || ! defined $b_initial 16981 || $a_initial ne $b_initial; 16982 16983 if (! defined $a_numeric) { 16984 16985 # If neither is numeric, use alpha sort 16986 return $a cmp $b if ! defined $b_numeric; 16987 return 1; # Sort numeric ahead of alpha 16988 } 16989 16990 # Here $a is numeric 16991 return -1 if ! defined $b_numeric; # Numeric sorts before alpha 16992 16993 # Here they are both numeric in the same property. 16994 # Convert version numbers into regular numbers 16995 if (defined $a_version) { 16996 ($a_number = $a_version) =~ s/^V//i; 16997 $a_number =~ s/_/./; 16998 } 16999 else { # Otherwise get rid of the, e.g., CCC in CCC9 */ 17000 $a_number =~ s/ ^ [[:alpha:]]+ //x; 17001 } 17002 if (defined $b_version) { 17003 ($b_number = $b_version) =~ s/^V//i; 17004 $b_number =~ s/_/./; 17005 } 17006 else { 17007 $b_number =~ s/ ^ [[:alpha:]]+ //x; 17008 } 17009 17010 # Convert rationals to floating for the comparison. 17011 $a_number = eval $a_number if $a_number =~ qr{/}; 17012 $b_number = eval $b_number if $b_number =~ qr{/}; 17013 17014 return $a_number <=> $b_number || $a cmp $b; 17015} 17016 17017sub make_pod () { 17018 # Create the .pod file. This generates the various subsections and then 17019 # combines them in one big HERE document. 17020 17021 my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)"; 17022 17023 return unless defined $pod_directory; 17024 print "Making pod file\n" if $verbosity >= $PROGRESS; 17025 17026 my $exception_message = 17027 '(Any exceptions are individually noted beginning with the word NOT.)'; 17028 my @block_warning; 17029 if (-e 'Blocks.txt') { 17030 17031 # Add the line: '\p{In_*} \p{Block: *}', with the warning message 17032 # if the global $has_In_conflicts indicates we have them. 17033 push @match_properties, format_pod_line($indent_info_column, 17034 '\p{In_*}', 17035 '\p{Block: *}' 17036 . (($has_In_conflicts) 17037 ? " $exception_message" 17038 : ""), 17039 $DISCOURAGED); 17040 @block_warning = << "END"; 17041 17042In particular, matches in the Block property have single forms 17043defined by Perl that begin with C<"In_">, C<"Is_>, or even with no prefix at 17044all, Like all B<DISCOURAGED> forms, these are not stable. For example, 17045C<\\p{Block=Deseret}> can currently be written as C<\\p{In_Deseret}>, 17046C<\\p{Is_Deseret}>, or C<\\p{Deseret}>. But, a new Unicode version may 17047come along that would force Perl to change the meaning of one or more of 17048these, and your program would no longer be correct. Currently there are no 17049such conflicts with the form that begins C<"In_">, but there are many with the 17050other two shortcuts, and Unicode continues to define new properties that begin 17051with C<"In">, so it's quite possible that a conflict will occur in the future. 17052The compound form is guaranteed to not become obsolete, and its meaning is 17053clearer anyway. See L<perlunicode/"Blocks"> for more information about this. 17054END 17055 } 17056 my $text = $Is_flags_text; 17057 $text = "$exception_message $text" if $has_Is_conflicts; 17058 17059 # And the 'Is_ line'; 17060 push @match_properties, format_pod_line($indent_info_column, 17061 '\p{Is_*}', 17062 "\\p{*} $text"); 17063 17064 # Sort the properties array for output. It is sorted alphabetically 17065 # except numerically for numeric properties, and only output unique lines. 17066 @match_properties = sort pod_alphanumeric_sort uniques @match_properties; 17067 17068 my $formatted_properties = simple_fold(\@match_properties, 17069 "", 17070 # indent succeeding lines by two extra 17071 # which looks better 17072 $indent_info_column + 2, 17073 17074 # shorten the line length by how much 17075 # the formatter indents, so the folded 17076 # line will fit in the space 17077 # presumably available 17078 $automatic_pod_indent); 17079 # Add column headings, indented to be a little more centered, but not 17080 # exactly 17081 $formatted_properties = format_pod_line($indent_info_column, 17082 ' NAME', 17083 ' INFO') 17084 . "\n" 17085 . $formatted_properties; 17086 17087 # Generate pod documentation lines for the tables that match nothing 17088 my $zero_matches = ""; 17089 if (@zero_match_tables) { 17090 @zero_match_tables = uniques(@zero_match_tables); 17091 $zero_matches = join "\n\n", 17092 map { $_ = '=item \p{' . $_->complete_name . "}" } 17093 sort { $a->complete_name cmp $b->complete_name } 17094 @zero_match_tables; 17095 17096 $zero_matches = <<END; 17097 17098=head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters 17099 17100Unicode has some property-value pairs that currently don't match anything. 17101This happens generally either because they are obsolete, or they exist for 17102symmetry with other forms, but no language has yet been encoded that uses 17103them. In this version of Unicode, the following match zero code points: 17104 17105=over 4 17106 17107$zero_matches 17108 17109=back 17110 17111END 17112 } 17113 17114 # Generate list of properties that we don't accept, grouped by the reasons 17115 # why. This is so only put out the 'why' once, and then list all the 17116 # properties that have that reason under it. 17117 17118 my %why_list; # The keys are the reasons; the values are lists of 17119 # properties that have the key as their reason 17120 17121 # For each property, add it to the list that are suppressed for its reason 17122 # The sort will cause the alphabetically first properties to be added to 17123 # each list first, so each list will be sorted. 17124 foreach my $property (sort keys %why_suppressed) { 17125 next unless $why_suppressed{$property}; 17126 push @{$why_list{$why_suppressed{$property}}}, $property; 17127 } 17128 17129 # For each reason (sorted by the first property that has that reason)... 17130 my @bad_re_properties; 17131 foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] } 17132 keys %why_list) 17133 { 17134 # Add to the output, all the properties that have that reason. 17135 my $has_item = 0; # Flag if actually output anything. 17136 foreach my $name (@{$why_list{$why}}) { 17137 17138 # Split compound names into $property and $table components 17139 my $property = $name; 17140 my $table; 17141 if ($property =~ / (.*) = (.*) /x) { 17142 $property = $1; 17143 $table = $2; 17144 } 17145 17146 # This release of Unicode may not have a property that is 17147 # suppressed, so don't reference a non-existent one. 17148 $property = property_ref($property); 17149 next if ! defined $property; 17150 17151 # And since this list is only for match tables, don't list the 17152 # ones that don't have match tables. 17153 next if ! $property->to_create_match_tables; 17154 17155 # Find any abbreviation, and turn it into a compound name if this 17156 # is a property=value pair. 17157 my $short_name = $property->name; 17158 $short_name .= '=' . $property->table($table)->name if $table; 17159 17160 # Start with an empty line. 17161 push @bad_re_properties, "\n\n" unless $has_item; 17162 17163 # And add the property as an item for the reason. 17164 push @bad_re_properties, "\n=item I<$name> ($short_name)\n"; 17165 $has_item = 1; 17166 } 17167 17168 # And add the reason under the list of properties, if such a list 17169 # actually got generated. Note that the header got added 17170 # unconditionally before. But pod ignores extra blank lines, so no 17171 # harm. 17172 push @bad_re_properties, "\n$why\n" if $has_item; 17173 17174 } # End of looping through each reason. 17175 17176 if (! @bad_re_properties) { 17177 push @bad_re_properties, 17178 "*** This installation accepts ALL non-Unihan properties ***"; 17179 } 17180 else { 17181 # Add =over only if non-empty to avoid an empty =over/=back section, 17182 # which is considered bad form. 17183 unshift @bad_re_properties, "\n=over 4\n"; 17184 push @bad_re_properties, "\n=back\n"; 17185 } 17186 17187 # Similarly, generate a list of files that we don't use, grouped by the 17188 # reasons why (Don't output if the reason is empty). First, create a hash 17189 # whose keys are the reasons, and whose values are anonymous arrays of all 17190 # the files that share that reason. 17191 my %grouped_by_reason; 17192 foreach my $file (keys %skipped_files) { 17193 next unless $skipped_files{$file}; 17194 push @{$grouped_by_reason{$skipped_files{$file}}}, $file; 17195 } 17196 17197 # Then, sort each group. 17198 foreach my $group (keys %grouped_by_reason) { 17199 @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b } 17200 @{$grouped_by_reason{$group}} ; 17201 } 17202 17203 # Finally, create the output text. For each reason (sorted by the 17204 # alphabetically first file that has that reason)... 17205 my @unused_files; 17206 foreach my $reason (sort { lc $grouped_by_reason{$a}->[0] 17207 cmp lc $grouped_by_reason{$b}->[0] 17208 } 17209 keys %grouped_by_reason) 17210 { 17211 # Add all the files that have that reason to the output. Start 17212 # with an empty line. 17213 push @unused_files, "\n\n"; 17214 push @unused_files, map { "\n=item F<$_> \n" } 17215 @{$grouped_by_reason{$reason}}; 17216 # And add the reason under the list of files 17217 push @unused_files, "\n$reason\n"; 17218 } 17219 17220 # Similarly, create the output text for the UCD section of the pod 17221 my @ucd_pod; 17222 foreach my $key (keys %ucd_pod) { 17223 next unless $ucd_pod{$key}->{'output_this'}; 17224 push @ucd_pod, format_pod_line($indent_info_column, 17225 $ucd_pod{$key}->{'name'}, 17226 $ucd_pod{$key}->{'info'}, 17227 $ucd_pod{$key}->{'status'}, 17228 ); 17229 } 17230 17231 # Sort alphabetically, and fold for output 17232 @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod; 17233 my $ucd_pod = simple_fold(\@ucd_pod, 17234 ' ', 17235 $indent_info_column, 17236 $automatic_pod_indent); 17237 $ucd_pod = format_pod_line($indent_info_column, 'NAME', ' INFO') 17238 . "\n" 17239 . $ucd_pod; 17240 my $space_hex = sprintf("%02x", ord " "); 17241 local $" = ""; 17242 17243 # Everything is ready to assemble. 17244 my @OUT = << "END"; 17245=begin comment 17246 17247$HEADER 17248 17249To change this file, edit $0 instead. 17250 17251=end comment 17252 17253=head1 NAME 17254 17255$pod_file - Index of Unicode Version $unicode_version character properties in Perl 17256 17257=head1 DESCRIPTION 17258 17259This document provides information about the portion of the Unicode database 17260that deals with character properties, that is the portion that is defined on 17261single code points. (L</Other information in the Unicode data base> 17262below briefly mentions other data that Unicode provides.) 17263 17264Perl can provide access to all non-provisional Unicode character properties, 17265though not all are enabled by default. The omitted ones are the Unihan 17266properties (accessible via the CPAN module L<Unicode::Unihan>) and certain 17267deprecated or Unicode-internal properties. (An installation may choose to 17268recompile Perl's tables to change this. See L<Unicode character 17269properties that are NOT accepted by Perl>.) 17270 17271For most purposes, access to Unicode properties from the Perl core is through 17272regular expression matches, as described in the next section. 17273For some special purposes, and to access the properties that are not suitable 17274for regular expression matching, all the Unicode character properties that 17275Perl handles are accessible via the standard L<Unicode::UCD> module, as 17276described in the section L</Properties accessible through Unicode::UCD>. 17277 17278Perl also provides some additional extensions and short-cut synonyms 17279for Unicode properties. 17280 17281This document merely lists all available properties and does not attempt to 17282explain what each property really means. There is a brief description of each 17283Perl extension; see L<perlunicode/Other Properties> for more information on 17284these. There is some detail about Blocks, Scripts, General_Category, 17285and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the 17286official Unicode properties, refer to the Unicode standard. A good starting 17287place is L<$unicode_reference_url>. 17288 17289Note that you can define your own properties; see 17290L<perlunicode/"User-Defined Character Properties">. 17291 17292=head1 Properties accessible through C<\\p{}> and C<\\P{}> 17293 17294The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to 17295most of the Unicode character properties. The table below shows all these 17296constructs, both single and compound forms. 17297 17298B<Compound forms> consist of two components, separated by an equals sign or a 17299colon. The first component is the property name, and the second component is 17300the particular value of the property to match against, for example, 17301C<\\p{Script_Extensions: Greek}> and C<\\p{Script_Extensions=Greek}> both mean 17302to match characters whose Script_Extensions property value is Greek. 17303(C<Script_Extensions> is an improved version of the C<Script> property.) 17304 17305B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for 17306their equivalent compound forms. The table shows these equivalences. (In our 17307example, C<\\p{Greek}> is a just a shortcut for 17308C<\\p{Script_Extensions=Greek}>). There are also a few Perl-defined single 17309forms that are not shortcuts for a compound form. One such is C<\\p{Word}>. 17310These are also listed in the table. 17311 17312In parsing these constructs, Perl always ignores Upper/lower case differences 17313everywhere within the {braces}. Thus C<\\p{Greek}> means the same thing as 17314C<\\p{greek}>. But note that changing the case of the C<"p"> or C<"P"> before 17315the left brace completely changes the meaning of the construct, from "match" 17316(for C<\\p{}>) to "doesn't match" (for C<\\P{}>). Casing in this document is 17317for improved legibility. 17318 17319Also, white space, hyphens, and underscores are normally ignored 17320everywhere between the {braces}, and hence can be freely added or removed 17321even if the C</x> modifier hasn't been specified on the regular expression. 17322But in the table below $a_bold_stricter at the beginning of an entry 17323means that tighter (stricter) rules are used for that entry: 17324 17325=over 4 17326 17327=over 4 17328 17329=item Single form (C<\\p{name}>) tighter rules: 17330 17331White space, hyphens, and underscores ARE significant 17332except for: 17333 17334=over 4 17335 17336=item * white space adjacent to a non-word character 17337 17338=item * underscores separating digits in numbers 17339 17340=back 17341 17342That means, for example, that you can freely add or remove white space 17343adjacent to (but within) the braces without affecting the meaning. 17344 17345=item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules: 17346 17347The tighter rules given above for the single form apply to everything to the 17348right of the colon or equals; the looser rules still apply to everything to 17349the left. 17350 17351That means, for example, that you can freely add or remove white space 17352adjacent to (but within) the braces and the colon or equal sign. 17353 17354=back 17355 17356=back 17357 17358Some properties are considered obsolete by Unicode, but still available. 17359There are several varieties of obsolescence: 17360 17361=over 4 17362 17363=over 4 17364 17365=item Stabilized 17366 17367A property may be stabilized. Such a determination does not indicate 17368that the property should or should not be used; instead it is a declaration 17369that the property will not be maintained nor extended for newly encoded 17370characters. Such properties are marked with $a_bold_stabilized in the 17371table. 17372 17373=item Deprecated 17374 17375A property may be deprecated, perhaps because its original intent 17376has been replaced by another property, or because its specification was 17377somehow defective. This means that its use is strongly 17378discouraged, so much so that a warning will be issued if used, unless the 17379regular expression is in the scope of a C<S<no warnings 'deprecated'>> 17380statement. $A_bold_deprecated flags each such entry in the table, and 17381the entry there for the longest, most descriptive version of the property will 17382give the reason it is deprecated, and perhaps advice. Perl may issue such a 17383warning, even for properties that aren't officially deprecated by Unicode, 17384when there used to be characters or code points that were matched by them, but 17385no longer. This is to warn you that your program may not work like it did on 17386earlier Unicode releases. 17387 17388A deprecated property may be made unavailable in a future Perl version, so it 17389is best to move away from them. 17390 17391A deprecated property may also be stabilized, but this fact is not shown. 17392 17393=item Obsolete 17394 17395Properties marked with $a_bold_obsolete in the table are considered (plain) 17396obsolete. Generally this designation is given to properties that Unicode once 17397used for internal purposes (but not any longer). 17398 17399=item Discouraged 17400 17401This is not actually a Unicode-specified obsolescence, but applies to certain 17402Perl extensions that are present for backwards compatibility, but are 17403discouraged from being used. These are not obsolete, but their meanings are 17404not stable. Future Unicode versions could force any of these extensions to be 17405removed without warning, replaced by another property with the same name that 17406means something different. $A_bold_discouraged flags each such entry in the 17407table. Use the equivalent shown instead. 17408 17409@block_warning 17410 17411=back 17412 17413=back 17414 17415The table below has two columns. The left column contains the C<\\p{}> 17416constructs to look up, possibly preceded by the flags mentioned above; and 17417the right column contains information about them, like a description, or 17418synonyms. The table shows both the single and compound forms for each 17419property that has them. If the left column is a short name for a property, 17420the right column will give its longer, more descriptive name; and if the left 17421column is the longest name, the right column will show any equivalent shortest 17422name, in both single and compound forms if applicable. 17423 17424If braces are not needed to specify a property (e.g., C<\\pL>), the left 17425column contains both forms, with and without braces. 17426 17427The right column will also caution you if a property means something different 17428than what might normally be expected. 17429 17430All single forms are Perl extensions; a few compound forms are as well, and 17431are noted as such. 17432 17433Numbers in (parentheses) indicate the total number of Unicode code points 17434matched by the property. For the entries that give the longest, most 17435descriptive version of the property, the count is followed by a list of some 17436of the code points matched by it. The list includes all the matched 17437characters in the 0-255 range, enclosed in the familiar [brackets] the same as 17438a regular expression bracketed character class. Following that, the next few 17439higher matching ranges are also given. To avoid visual ambiguity, the SPACE 17440character is represented as C<\\x$space_hex>. 17441 17442For emphasis, those properties that match no code points at all are listed as 17443well in a separate section following the table. 17444 17445Most properties match the same code points regardless of whether C<"/i"> 17446case-insensitive matching is specified or not. But a few properties are 17447affected. These are shown with the notation S<C<(/i= I<other_property>)>> 17448in the second column. Under case-insensitive matching they match the 17449same code pode points as the property I<other_property>. 17450 17451There is no description given for most non-Perl defined properties (See 17452L<$unicode_reference_url> for that). 17453 17454For compactness, 'B<*>' is used as a wildcard instead of showing all possible 17455combinations. For example, entries like: 17456 17457 \\p{Gc: *} \\p{General_Category: *} 17458 17459mean that 'Gc' is a synonym for 'General_Category', and anything that is valid 17460for the latter is also valid for the former. Similarly, 17461 17462 \\p{Is_*} \\p{*} 17463 17464means that if and only if, for example, C<\\p{Foo}> exists, then 17465C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing. 17466And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and 17467C<\\p{IsFoo=Bar}>. "*" here is restricted to something not beginning with an 17468underscore. 17469 17470Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'. 17471And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and 17472'N*' to indicate this, and doesn't have separate entries for the other 17473possibilities. Note that not all properties which have values 'Yes' and 'No' 17474are binary, and they have all their values spelled out without using this wild 17475card, and a C<NOT> clause in their description that highlights their not being 17476binary. These also require the compound form to match them, whereas true 17477binary properties have both single and compound forms available. 17478 17479Note that all non-essential underscores are removed in the display of the 17480short names below. 17481 17482B<Legend summary:> 17483 17484=over 4 17485 17486=item Z<>B<*> is a wild-card 17487 17488=item B<(\\d+)> in the info column gives the number of Unicode code points matched 17489by this property. 17490 17491=item B<$DEPRECATED> means this is deprecated. 17492 17493=item B<$OBSOLETE> means this is obsolete. 17494 17495=item B<$STABILIZED> means this is stabilized. 17496 17497=item B<$STRICTER> means tighter (stricter) name matching applies. 17498 17499=item B<$DISCOURAGED> means use of this form is discouraged, and may not be 17500stable. 17501 17502=back 17503 17504$formatted_properties 17505 17506$zero_matches 17507 17508=head1 Properties accessible through Unicode::UCD 17509 17510The value of any Unicode (not including Perl extensions) character 17511property mentioned above for any single code point is available through 17512L<Unicode::UCD/charprop()>. L<Unicode::UCD/charprops_all()> returns the 17513values of all the Unicode properties for a given code point. 17514 17515Besides these, all the Unicode character properties mentioned above 17516(except for those marked as for internal use by Perl) are also 17517accessible by L<Unicode::UCD/prop_invlist()>. 17518 17519Due to their nature, not all Unicode character properties are suitable for 17520regular expression matches, nor C<prop_invlist()>. The remaining 17521non-provisional, non-internal ones are accessible via 17522L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation 17523hasn't included; see L<below for which those are|/Unicode character properties 17524that are NOT accepted by Perl>). 17525 17526For compatibility with other parts of Perl, all the single forms given in the 17527table in the L<section above|/Properties accessible through \\p{} and \\P{}> 17528are recognized. BUT, there are some ambiguities between some Perl extensions 17529and the Unicode properties, all of which are silently resolved in favor of the 17530official Unicode property. To avoid surprises, you should only use 17531C<prop_invmap()> for forms listed in the table below, which omits the 17532non-recommended ones. The affected forms are the Perl single form equivalents 17533of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of 17534C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property, 17535whose short name is C<sc>. The table indicates the current ambiguities in the 17536INFO column, beginning with the word C<"NOT">. 17537 17538The standard Unicode properties listed below are documented in 17539L<$unicode_reference_url>; Perl_Decimal_Digit is documented in 17540L<Unicode::UCD/prop_invmap()>. The other Perl extensions are in 17541L<perlunicode/Other Properties>; 17542 17543The first column in the table is a name for the property; the second column is 17544an alternative name, if any, plus possibly some annotations. The alternative 17545name is the property's full name, unless that would simply repeat the first 17546column, in which case the second column indicates the property's short name 17547(if different). The annotations are given only in the entry for the full 17548name. The annotations for binary properties include a list of the first few 17549ranges that the property matches. To avoid any ambiguity, the SPACE character 17550is represented as C<\\x$space_hex>. 17551 17552If a property is obsolete, etc, the entry will be flagged with the same 17553characters used in the table in the L<section above|/Properties accessible 17554through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>. 17555 17556$ucd_pod 17557 17558=head1 Properties accessible through other means 17559 17560Certain properties are accessible also via core function calls. These are: 17561 17562 Lowercase_Mapping lc() and lcfirst() 17563 Titlecase_Mapping ucfirst() 17564 Uppercase_Mapping uc() 17565 17566Also, Case_Folding is accessible through the C</i> modifier in regular 17567expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>> 17568operator. 17569 17570And, the Name and Name_Aliases properties are accessible through the C<\\N{}> 17571interpolation in double-quoted strings and regular expressions; and functions 17572C<charnames::viacode()>, C<charnames::vianame()>, and 17573C<charnames::string_vianame()> (which require a C<use charnames ();> to be 17574specified. 17575 17576Finally, most properties related to decomposition are accessible via 17577L<Unicode::Normalize>. 17578 17579=head1 Unicode character properties that are NOT accepted by Perl 17580 17581Perl will generate an error for a few character properties in Unicode when 17582used in a regular expression. The non-Unihan ones are listed below, with the 17583reasons they are not accepted, perhaps with work-arounds. The short names for 17584the properties are listed enclosed in (parentheses). 17585As described after the list, an installation can change the defaults and choose 17586to accept any of these. The list is machine generated based on the 17587choices made for the installation that generated this document. 17588 17589@bad_re_properties 17590 17591An installation can choose to allow any of these to be matched by downloading 17592the Unicode database from L<http://www.unicode.org/Public/> to 17593C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the 17594controlling lists contained in the program 17595C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing. 17596(C<\%Config> is available from the Config module). 17597 17598Also, perl can be recompiled to operate on an earlier version of the Unicode 17599standard. Further information is at 17600C<\$Config{privlib}>/F<unicore/README.perl>. 17601 17602=head1 Other information in the Unicode data base 17603 17604The Unicode data base is delivered in two different formats. The XML version 17605is valid for more modern Unicode releases. The other version is a collection 17606of files. The two are intended to give equivalent information. Perl uses the 17607older form; this allows you to recompile Perl to use early Unicode releases. 17608 17609The only non-character property that Perl currently supports is Named 17610Sequences, in which a sequence of code points 17611is given a name and generally treated as a single entity. (Perl supports 17612these via the C<\\N{...}> double-quotish construct, 17613L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>. 17614 17615Below is a list of the files in the Unicode data base that Perl doesn't 17616currently use, along with very brief descriptions of their purposes. 17617Some of the names of the files have been shortened from those that Unicode 17618uses, in order to allow them to be distinguishable from similarly named files 17619on file systems for which only the first 8 characters of a name are 17620significant. 17621 17622=over 4 17623 17624@unused_files 17625 17626=back 17627 17628=head1 SEE ALSO 17629 17630L<$unicode_reference_url> 17631 17632L<perlrecharclass> 17633 17634L<perlunicode> 17635 17636END 17637 17638 # And write it. The 0 means no utf8. 17639 main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT); 17640 return; 17641} 17642 17643sub make_Heavy () { 17644 # Create and write Heavy.pl, which passes info about the tables to 17645 # utf8_heavy.pl 17646 17647 # Stringify structures for output 17648 my $loose_property_name_of 17649 = simple_dumper(\%loose_property_name_of, ' ' x 4); 17650 chomp $loose_property_name_of; 17651 17652 my $strict_property_name_of 17653 = simple_dumper(\%strict_property_name_of, ' ' x 4); 17654 chomp $strict_property_name_of; 17655 17656 my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4); 17657 chomp $stricter_to_file_of; 17658 17659 my $inline_definitions = simple_dumper(\@inline_definitions, " " x 4); 17660 chomp $inline_definitions; 17661 17662 my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4); 17663 chomp $loose_to_file_of; 17664 17665 my $nv_floating_to_rational 17666 = simple_dumper(\%nv_floating_to_rational, ' ' x 4); 17667 chomp $nv_floating_to_rational; 17668 17669 my $why_deprecated = simple_dumper(\%utf8::why_deprecated, ' ' x 4); 17670 chomp $why_deprecated; 17671 17672 # We set the key to the file when we associated files with tables, but we 17673 # couldn't do the same for the value then, as we might not have the file 17674 # for the alternate table figured out at that time. 17675 foreach my $cased (keys %caseless_equivalent_to) { 17676 my @path = $caseless_equivalent_to{$cased}->file_path; 17677 my $path; 17678 if ($path[0] eq "#") { # Pseudo-directory '#' 17679 $path = join '/', @path; 17680 } 17681 else { # Gets rid of lib/ 17682 $path = join '/', @path[1, -1]; 17683 } 17684 $caseless_equivalent_to{$cased} = $path; 17685 } 17686 my $caseless_equivalent_to 17687 = simple_dumper(\%caseless_equivalent_to, ' ' x 4); 17688 chomp $caseless_equivalent_to; 17689 17690 my $loose_property_to_file_of 17691 = simple_dumper(\%loose_property_to_file_of, ' ' x 4); 17692 chomp $loose_property_to_file_of; 17693 17694 my $strict_property_to_file_of 17695 = simple_dumper(\%strict_property_to_file_of, ' ' x 4); 17696 chomp $strict_property_to_file_of; 17697 17698 my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4); 17699 chomp $file_to_swash_name; 17700 17701 my @heavy = <<END; 17702$HEADER 17703$INTERNAL_ONLY_HEADER 17704 17705# This file is for the use of utf8_heavy.pl and Unicode::UCD 17706 17707# Maps Unicode (not Perl single-form extensions) property names in loose 17708# standard form to their corresponding standard names 17709\%utf8::loose_property_name_of = ( 17710$loose_property_name_of 17711); 17712 17713# Same, but strict names 17714\%utf8::strict_property_name_of = ( 17715$strict_property_name_of 17716); 17717 17718# Gives the definitions (in the form of inversion lists) for those properties 17719# whose definitions aren't kept in files 17720\@utf8::inline_definitions = ( 17721$inline_definitions 17722); 17723 17724# Maps property, table to file for those using stricter matching. For paths 17725# whose directory is '#', the file is in the form of a numeric index into 17726# \@inline_definitions 17727\%utf8::stricter_to_file_of = ( 17728$stricter_to_file_of 17729); 17730 17731# Maps property, table to file for those using loose matching. For paths 17732# whose directory is '#', the file is in the form of a numeric index into 17733# \@inline_definitions 17734\%utf8::loose_to_file_of = ( 17735$loose_to_file_of 17736); 17737 17738# Maps floating point to fractional form 17739\%utf8::nv_floating_to_rational = ( 17740$nv_floating_to_rational 17741); 17742 17743# If a floating point number doesn't have enough digits in it to get this 17744# close to a fraction, it isn't considered to be that fraction even if all the 17745# digits it does have match. 17746\$utf8::max_floating_slop = $MAX_FLOATING_SLOP; 17747 17748# Deprecated tables to generate a warning for. The key is the file containing 17749# the table, so as to avoid duplication, as many property names can map to the 17750# file, but we only need one entry for all of them. 17751\%utf8::why_deprecated = ( 17752$why_deprecated 17753); 17754 17755# A few properties have different behavior under /i matching. This maps 17756# those to substitute files to use under /i. 17757\%utf8::caseless_equivalent = ( 17758$caseless_equivalent_to 17759); 17760 17761# Property names to mapping files 17762\%utf8::loose_property_to_file_of = ( 17763$loose_property_to_file_of 17764); 17765 17766# Property names to mapping files 17767\%utf8::strict_property_to_file_of = ( 17768$strict_property_to_file_of 17769); 17770 17771# Files to the swash names within them. 17772\%utf8::file_to_swash_name = ( 17773$file_to_swash_name 17774); 17775 177761; 17777END 17778 17779 main::write("Heavy.pl", 0, \@heavy); # The 0 means no utf8. 17780 return; 17781} 17782 17783sub make_Name_pm () { 17784 # Create and write Name.pm, which contains subroutines and data to use in 17785 # conjunction with Name.pl 17786 17787 # Maybe there's nothing to do. 17788 return unless $has_hangul_syllables || @code_points_ending_in_code_point; 17789 17790 my @name = <<END; 17791$HEADER 17792$INTERNAL_ONLY_HEADER 17793END 17794 17795 # Convert these structures to output format. 17796 my $code_points_ending_in_code_point = 17797 main::simple_dumper(\@code_points_ending_in_code_point, 17798 ' ' x 8); 17799 my $names = main::simple_dumper(\%names_ending_in_code_point, 17800 ' ' x 8); 17801 my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point, 17802 ' ' x 8); 17803 17804 # Do the same with the Hangul names, 17805 my $jamo; 17806 my $jamo_l; 17807 my $jamo_v; 17808 my $jamo_t; 17809 my $jamo_re; 17810 if ($has_hangul_syllables) { 17811 17812 # Construct a regular expression of all the possible 17813 # combinations of the Hangul syllables. 17814 my @L_re; # Leading consonants 17815 for my $i ($LBase .. $LBase + $LCount - 1) { 17816 push @L_re, $Jamo{$i} 17817 } 17818 my @V_re; # Middle vowels 17819 for my $i ($VBase .. $VBase + $VCount - 1) { 17820 push @V_re, $Jamo{$i} 17821 } 17822 my @T_re; # Trailing consonants 17823 for my $i ($TBase + 1 .. $TBase + $TCount - 1) { 17824 push @T_re, $Jamo{$i} 17825 } 17826 17827 # The whole re is made up of the L V T combination. 17828 $jamo_re = '(' 17829 . join ('|', sort @L_re) 17830 . ')(' 17831 . join ('|', sort @V_re) 17832 . ')(' 17833 . join ('|', sort @T_re) 17834 . ')?'; 17835 17836 # These hashes needed by the algorithm were generated 17837 # during reading of the Jamo.txt file 17838 $jamo = main::simple_dumper(\%Jamo, ' ' x 8); 17839 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8); 17840 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8); 17841 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8); 17842 } 17843 17844 push @name, <<END; 17845 17846package charnames; 17847 17848# This module contains machine-generated tables and code for the 17849# algorithmically-determinable Unicode character names. The following 17850# routines can be used to translate between name and code point and vice versa 17851 17852{ # Closure 17853 17854 # Matches legal code point. 4-6 hex numbers, If there are 6, the first 17855 # two must be 10; if there are 5, the first must not be a 0. Written this 17856 # way to decrease backtracking. The first regex allows the code point to 17857 # be at the end of a word, but to work properly, the word shouldn't end 17858 # with a valid hex character. The second one won't match a code point at 17859 # the end of a word, and doesn't have the run-on issue 17860 my \$run_on_code_point_re = qr/$run_on_code_point_re/; 17861 my \$code_point_re = qr/$code_point_re/; 17862 17863 # In the following hash, the keys are the bases of names which include 17864 # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The value 17865 # of each key is another hash which is used to get the low and high ends 17866 # for each range of code points that apply to the name. 17867 my %names_ending_in_code_point = ( 17868$names 17869 ); 17870 17871 # The following hash is a copy of the previous one, except is for loose 17872 # matching, so each name has blanks and dashes squeezed out 17873 my %loose_names_ending_in_code_point = ( 17874$loose_names 17875 ); 17876 17877 # And the following array gives the inverse mapping from code points to 17878 # names. Lowest code points are first 17879 my \@code_points_ending_in_code_point = ( 17880$code_points_ending_in_code_point 17881 ); 17882END 17883 # Earlier releases didn't have Jamos. No sense outputting 17884 # them unless will be used. 17885 if ($has_hangul_syllables) { 17886 push @name, <<END; 17887 17888 # Convert from code point to Jamo short name for use in composing Hangul 17889 # syllable names 17890 my %Jamo = ( 17891$jamo 17892 ); 17893 17894 # Leading consonant (can be null) 17895 my %Jamo_L = ( 17896$jamo_l 17897 ); 17898 17899 # Vowel 17900 my %Jamo_V = ( 17901$jamo_v 17902 ); 17903 17904 # Optional trailing consonant 17905 my %Jamo_T = ( 17906$jamo_t 17907 ); 17908 17909 # Computed re that splits up a Hangul name into LVT or LV syllables 17910 my \$syllable_re = qr/$jamo_re/; 17911 17912 my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE "; 17913 my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE"; 17914 17915 # These constants names and values were taken from the Unicode standard, 17916 # version 5.1, section 3.12. They are used in conjunction with Hangul 17917 # syllables 17918 my \$SBase = $SBase_string; 17919 my \$LBase = $LBase_string; 17920 my \$VBase = $VBase_string; 17921 my \$TBase = $TBase_string; 17922 my \$SCount = $SCount; 17923 my \$LCount = $LCount; 17924 my \$VCount = $VCount; 17925 my \$TCount = $TCount; 17926 my \$NCount = \$VCount * \$TCount; 17927END 17928 } # End of has Jamos 17929 17930 push @name, << 'END'; 17931 17932 sub name_to_code_point_special { 17933 my ($name, $loose) = @_; 17934 17935 # Returns undef if not one of the specially handled names; otherwise 17936 # returns the code point equivalent to the input name 17937 # $loose is non-zero if to use loose matching, 'name' in that case 17938 # must be input as upper case with all blanks and dashes squeezed out. 17939END 17940 if ($has_hangul_syllables) { 17941 push @name, << 'END'; 17942 17943 if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//) 17944 || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//)) 17945 { 17946 return if $name !~ qr/^$syllable_re$/; 17947 my $L = $Jamo_L{$1}; 17948 my $V = $Jamo_V{$2}; 17949 my $T = (defined $3) ? $Jamo_T{$3} : 0; 17950 return ($L * $VCount + $V) * $TCount + $T + $SBase; 17951 } 17952END 17953 } 17954 push @name, << 'END'; 17955 17956 # Name must end in 'code_point' for this to handle. 17957 return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x) 17958 || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x)); 17959 17960 my $base = $1; 17961 my $code_point = CORE::hex $2; 17962 my $names_ref; 17963 17964 if ($loose) { 17965 $names_ref = \%loose_names_ending_in_code_point; 17966 } 17967 else { 17968 return if $base !~ s/-$//; 17969 $names_ref = \%names_ending_in_code_point; 17970 } 17971 17972 # Name must be one of the ones which has the code point in it. 17973 return if ! $names_ref->{$base}; 17974 17975 # Look through the list of ranges that apply to this name to see if 17976 # the code point is in one of them. 17977 for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) { 17978 return if $names_ref->{$base}{'low'}->[$i] > $code_point; 17979 next if $names_ref->{$base}{'high'}->[$i] < $code_point; 17980 17981 # Here, the code point is in the range. 17982 return $code_point; 17983 } 17984 17985 # Here, looked like the name had a code point number in it, but 17986 # did not match one of the valid ones. 17987 return; 17988 } 17989 17990 sub code_point_to_name_special { 17991 my $code_point = shift; 17992 17993 # Returns the name of a code point if algorithmically determinable; 17994 # undef if not 17995END 17996 if ($has_hangul_syllables) { 17997 push @name, << 'END'; 17998 17999 # If in the Hangul range, calculate the name based on Unicode's 18000 # algorithm 18001 if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) { 18002 use integer; 18003 my $SIndex = $code_point - $SBase; 18004 my $L = $LBase + $SIndex / $NCount; 18005 my $V = $VBase + ($SIndex % $NCount) / $TCount; 18006 my $T = $TBase + $SIndex % $TCount; 18007 $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}"; 18008 $name .= $Jamo{$T} if $T != $TBase; 18009 return $name; 18010 } 18011END 18012 } 18013 push @name, << 'END'; 18014 18015 # Look through list of these code points for one in range. 18016 foreach my $hash (@code_points_ending_in_code_point) { 18017 return if $code_point < $hash->{'low'}; 18018 if ($code_point <= $hash->{'high'}) { 18019 return sprintf("%s-%04X", $hash->{'name'}, $code_point); 18020 } 18021 } 18022 return; # None found 18023 } 18024} # End closure 18025 180261; 18027END 18028 18029 main::write("Name.pm", 0, \@name); # The 0 means no utf8. 18030 return; 18031} 18032 18033sub make_UCD () { 18034 # Create and write UCD.pl, which passes info about the tables to 18035 # Unicode::UCD 18036 18037 # Create a mapping from each alias of Perl single-form extensions to all 18038 # its equivalent aliases, for quick look-up. 18039 my %perlprop_to_aliases; 18040 foreach my $table ($perl->tables) { 18041 18042 # First create the list of the aliases of each extension 18043 my @aliases_list; # List of legal aliases for this extension 18044 18045 my $table_name = $table->name; 18046 my $standard_table_name = standardize($table_name); 18047 my $table_full_name = $table->full_name; 18048 my $standard_table_full_name = standardize($table_full_name); 18049 18050 # Make sure that the list has both the short and full names 18051 push @aliases_list, $table_name, $table_full_name; 18052 18053 my $found_ucd = 0; # ? Did we actually get an alias that should be 18054 # output for this table 18055 18056 # Go through all the aliases (including the two just added), and add 18057 # any new unique ones to the list 18058 foreach my $alias ($table->aliases) { 18059 18060 # Skip non-legal names 18061 next unless $alias->ok_as_filename; 18062 next unless $alias->ucd; 18063 18064 $found_ucd = 1; # have at least one legal name 18065 18066 my $name = $alias->name; 18067 my $standard = standardize($name); 18068 18069 # Don't repeat a name that is equivalent to one already on the 18070 # list 18071 next if $standard eq $standard_table_name; 18072 next if $standard eq $standard_table_full_name; 18073 18074 push @aliases_list, $name; 18075 } 18076 18077 # If there were no legal names, don't output anything. 18078 next unless $found_ucd; 18079 18080 # To conserve memory in the program reading these in, omit full names 18081 # that are identical to the short name, when those are the only two 18082 # aliases for the property. 18083 if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) { 18084 pop @aliases_list; 18085 } 18086 18087 # Here, @aliases_list is the list of all the aliases that this 18088 # extension legally has. Now can create a map to it from each legal 18089 # standardized alias 18090 foreach my $alias ($table->aliases) { 18091 next unless $alias->ucd; 18092 next unless $alias->ok_as_filename; 18093 push @{$perlprop_to_aliases{standardize($alias->name)}}, 18094 @aliases_list; 18095 } 18096 } 18097 18098 # Make a list of all combinations of properties/values that are suppressed. 18099 my @suppressed; 18100 if (! $debug_skip) { # This tends to fail in this debug mode 18101 foreach my $property_name (keys %why_suppressed) { 18102 18103 # Just the value 18104 my $value_name = $1 if $property_name =~ s/ = ( .* ) //x; 18105 18106 # The hash may contain properties not in this release of Unicode 18107 next unless defined (my $property = property_ref($property_name)); 18108 18109 # Find all combinations 18110 foreach my $prop_alias ($property->aliases) { 18111 my $prop_alias_name = standardize($prop_alias->name); 18112 18113 # If no =value, there's just one combination possible for this 18114 if (! $value_name) { 18115 18116 # The property may be suppressed, but there may be a proxy 18117 # for it, so it shouldn't be listed as suppressed 18118 next if $prop_alias->ucd; 18119 push @suppressed, $prop_alias_name; 18120 } 18121 else { # Otherwise 18122 foreach my $value_alias 18123 ($property->table($value_name)->aliases) 18124 { 18125 next if $value_alias->ucd; 18126 18127 push @suppressed, "$prop_alias_name=" 18128 . standardize($value_alias->name); 18129 } 18130 } 18131 } 18132 } 18133 } 18134 @suppressed = sort @suppressed; # So doesn't change between runs of this 18135 # program 18136 18137 # Convert the structure below (designed for Name.pm) to a form that UCD 18138 # wants, so it doesn't have to modify it at all; i.e. so that it includes 18139 # an element for the Hangul syllables in the appropriate place, and 18140 # otherwise changes the name to include the "-<code point>" suffix. 18141 my @algorithm_names; 18142 my $done_hangul = $v_version lt v2.0.0; # Hanguls as we know them came 18143 # along in this version 18144 # Copy it linearly. 18145 for my $i (0 .. @code_points_ending_in_code_point - 1) { 18146 18147 # Insert the hanguls in the correct place. 18148 if (! $done_hangul 18149 && $code_points_ending_in_code_point[$i]->{'low'} > $SBase) 18150 { 18151 $done_hangul = 1; 18152 push @algorithm_names, { low => $SBase, 18153 high => $SBase + $SCount - 1, 18154 name => '<hangul syllable>', 18155 }; 18156 } 18157 18158 # Copy the current entry, modified. 18159 push @algorithm_names, { 18160 low => $code_points_ending_in_code_point[$i]->{'low'}, 18161 high => $code_points_ending_in_code_point[$i]->{'high'}, 18162 name => 18163 "$code_points_ending_in_code_point[$i]->{'name'}-<code point>", 18164 }; 18165 } 18166 18167 # Serialize these structures for output. 18168 my $loose_to_standard_value 18169 = simple_dumper(\%loose_to_standard_value, ' ' x 4); 18170 chomp $loose_to_standard_value; 18171 18172 my $string_property_loose_to_name 18173 = simple_dumper(\%string_property_loose_to_name, ' ' x 4); 18174 chomp $string_property_loose_to_name; 18175 18176 my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4); 18177 chomp $perlprop_to_aliases; 18178 18179 my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4); 18180 chomp $prop_aliases; 18181 18182 my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4); 18183 chomp $prop_value_aliases; 18184 18185 my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : ""; 18186 chomp $suppressed; 18187 18188 my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4); 18189 chomp $algorithm_names; 18190 18191 my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4); 18192 chomp $ambiguous_names; 18193 18194 my $combination_property = simple_dumper(\%combination_property, ' ' x 4); 18195 chomp $combination_property; 18196 18197 my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4); 18198 chomp $loose_defaults; 18199 18200 my @ucd = <<END; 18201$HEADER 18202$INTERNAL_ONLY_HEADER 18203 18204# This file is for the use of Unicode::UCD 18205 18206# Highest legal Unicode code point 18207\$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING; 18208 18209# Hangul syllables 18210\$Unicode::UCD::HANGUL_BEGIN = $SBase_string; 18211\$Unicode::UCD::HANGUL_COUNT = $SCount; 18212 18213# Keys are all the possible "prop=value" combinations, in loose form; values 18214# are the standard loose name for the 'value' part of the key 18215\%Unicode::UCD::loose_to_standard_value = ( 18216$loose_to_standard_value 18217); 18218 18219# String property loose names to standard loose name 18220\%Unicode::UCD::string_property_loose_to_name = ( 18221$string_property_loose_to_name 18222); 18223 18224# Keys are Perl extensions in loose form; values are each one's list of 18225# aliases 18226\%Unicode::UCD::loose_perlprop_to_name = ( 18227$perlprop_to_aliases 18228); 18229 18230# Keys are standard property name; values are each one's aliases 18231\%Unicode::UCD::prop_aliases = ( 18232$prop_aliases 18233); 18234 18235# Keys of top level are standard property name; values are keys to another 18236# hash, Each one is one of the property's values, in standard form. The 18237# values are that prop-val's aliases. If only one specified, the short and 18238# long alias are identical. 18239\%Unicode::UCD::prop_value_aliases = ( 18240$prop_value_aliases 18241); 18242 18243# Ordered (by code point ordinal) list of the ranges of code points whose 18244# names are algorithmically determined. Each range entry is an anonymous hash 18245# of the start and end points and a template for the names within it. 18246\@Unicode::UCD::algorithmic_named_code_points = ( 18247$algorithm_names 18248); 18249 18250# The properties that as-is have two meanings, and which must be disambiguated 18251\%Unicode::UCD::ambiguous_names = ( 18252$ambiguous_names 18253); 18254 18255# Keys are the prop-val combinations which are the default values for the 18256# given property, expressed in standard loose form 18257\%Unicode::UCD::loose_defaults = ( 18258$loose_defaults 18259); 18260 18261# The properties that are combinations, in that they have both a map table and 18262# a match table. This is actually for UCD.t, so it knows how to test for 18263# these. 18264\%Unicode::UCD::combination_property = ( 18265$combination_property 18266); 18267 18268# All combinations of names that are suppressed. 18269# This is actually for UCD.t, so it knows which properties shouldn't have 18270# entries. If it got any bigger, would probably want to put it in its own 18271# file to use memory only when it was needed, in testing. 18272\@Unicode::UCD::suppressed_properties = ( 18273$suppressed 18274); 18275 182761; 18277END 18278 18279 main::write("UCD.pl", 0, \@ucd); # The 0 means no utf8. 18280 return; 18281} 18282 18283sub write_all_tables() { 18284 # Write out all the tables generated by this program to files, as well as 18285 # the supporting data structures, pod file, and .t file. 18286 18287 my @writables; # List of tables that actually get written 18288 my %match_tables_to_write; # Used to collapse identical match tables 18289 # into one file. Each key is a hash function 18290 # result to partition tables into buckets. 18291 # Each value is an array of the tables that 18292 # fit in the bucket. 18293 18294 # For each property ... 18295 # (sort so that if there is an immutable file name, it has precedence, so 18296 # some other property can't come in and take over its file name. (We 18297 # don't care if both defined, as they had better be different anyway.) 18298 # The property named 'Perl' needs to be first (it doesn't have any 18299 # immutable file name) because empty properties are defined in terms of 18300 # its table named 'All' under the -annotate option.) We also sort by 18301 # the property's name. This is just for repeatability of the outputs 18302 # between runs of this program, but does not affect correctness. 18303 PROPERTY: 18304 foreach my $property ($perl, 18305 sort { return -1 if defined $a->file; 18306 return 1 if defined $b->file; 18307 return $a->name cmp $b->name; 18308 } grep { $_ != $perl } property_ref('*')) 18309 { 18310 my $type = $property->type; 18311 18312 # And for each table for that property, starting with the mapping 18313 # table for it ... 18314 TABLE: 18315 foreach my $table($property, 18316 18317 # and all the match tables for it (if any), sorted so 18318 # the ones with the shortest associated file name come 18319 # first. The length sorting prevents problems of a 18320 # longer file taking a name that might have to be used 18321 # by a shorter one. The alphabetic sorting prevents 18322 # differences between releases 18323 sort { my $ext_a = $a->external_name; 18324 return 1 if ! defined $ext_a; 18325 my $ext_b = $b->external_name; 18326 return -1 if ! defined $ext_b; 18327 18328 # But return the non-complement table before 18329 # the complement one, as the latter is defined 18330 # in terms of the former, and needs to have 18331 # the information for the former available. 18332 return 1 if $a->complement != 0; 18333 return -1 if $b->complement != 0; 18334 18335 # Similarly, return a subservient table after 18336 # a leader 18337 return 1 if $a->leader != $a; 18338 return -1 if $b->leader != $b; 18339 18340 my $cmp = length $ext_a <=> length $ext_b; 18341 18342 # Return result if lengths not equal 18343 return $cmp if $cmp; 18344 18345 # Alphabetic if lengths equal 18346 return $ext_a cmp $ext_b 18347 } $property->tables 18348 ) 18349 { 18350 18351 # Here we have a table associated with a property. It could be 18352 # the map table (done first for each property), or one of the 18353 # other tables. Determine which type. 18354 my $is_property = $table->isa('Property'); 18355 18356 my $name = $table->name; 18357 my $complete_name = $table->complete_name; 18358 18359 # See if should suppress the table if is empty, but warn if it 18360 # contains something. 18361 my $suppress_if_empty_warn_if_not 18362 = $why_suppress_if_empty_warn_if_not{$complete_name} || 0; 18363 18364 # Calculate if this table should have any code points associated 18365 # with it or not. 18366 my $expected_empty = 18367 18368 # $perl should be empty 18369 ($is_property && ($table == $perl)) 18370 18371 # Match tables in properties we skipped populating should be 18372 # empty 18373 || (! $is_property && ! $property->to_create_match_tables) 18374 18375 # Tables and properties that are expected to have no code 18376 # points should be empty 18377 || $suppress_if_empty_warn_if_not 18378 ; 18379 18380 # Set a boolean if this table is the complement of an empty binary 18381 # table 18382 my $is_complement_of_empty_binary = 18383 $type == $BINARY && 18384 (($table == $property->table('Y') 18385 && $property->table('N')->is_empty) 18386 || ($table == $property->table('N') 18387 && $property->table('Y')->is_empty)); 18388 18389 if ($table->is_empty) { 18390 18391 if ($suppress_if_empty_warn_if_not) { 18392 $table->set_fate($SUPPRESSED, 18393 $suppress_if_empty_warn_if_not); 18394 } 18395 18396 # Suppress (by skipping them) expected empty tables. 18397 next TABLE if $expected_empty; 18398 18399 # And setup to later output a warning for those that aren't 18400 # known to be allowed to be empty. Don't do the warning if 18401 # this table is a child of another one to avoid duplicating 18402 # the warning that should come from the parent one. 18403 if (($table == $property || $table->parent == $table) 18404 && $table->fate != $SUPPRESSED 18405 && $table->fate != $MAP_PROXIED 18406 && ! grep { $complete_name =~ /^$_$/ } 18407 @tables_that_may_be_empty) 18408 { 18409 push @unhandled_properties, "$table"; 18410 } 18411 18412 # The old way of expressing an empty match list was to 18413 # complement the list that matches everything. The new way is 18414 # to create an empty inversion list, but this doesn't work for 18415 # annotating, so use the old way then. 18416 $table->set_complement($All) if $annotate 18417 && $table != $property; 18418 } 18419 elsif ($expected_empty) { 18420 my $because = ""; 18421 if ($suppress_if_empty_warn_if_not) { 18422 $because = " because $suppress_if_empty_warn_if_not"; 18423 } 18424 18425 Carp::my_carp("Not expecting property $table$because. Generating file for it anyway."); 18426 } 18427 18428 # Some tables should match everything 18429 my $expected_full = 18430 ($table->fate == $SUPPRESSED) 18431 ? 0 18432 : ($is_property) 18433 ? # All these types of map tables will be full because 18434 # they will have been populated with defaults 18435 ($type == $ENUM) 18436 18437 : # A match table should match everything if its method 18438 # shows it should 18439 ($table->matches_all 18440 18441 # The complement of an empty binary table will match 18442 # everything 18443 || $is_complement_of_empty_binary 18444 ) 18445 ; 18446 18447 my $count = $table->count; 18448 if ($expected_full) { 18449 if ($count != $MAX_WORKING_CODEPOINTS) { 18450 Carp::my_carp("$table matches only " 18451 . clarify_number($count) 18452 . " Unicode code points but should match " 18453 . clarify_number($MAX_WORKING_CODEPOINTS) 18454 . " (off by " 18455 . clarify_number(abs($MAX_WORKING_CODEPOINTS - $count)) 18456 . "). Proceeding anyway."); 18457 } 18458 18459 # Here is expected to be full. If it is because it is the 18460 # complement of an (empty) binary table that is to be 18461 # suppressed, then suppress this one as well. 18462 if ($is_complement_of_empty_binary) { 18463 my $opposing_name = ($name eq 'Y') ? 'N' : 'Y'; 18464 my $opposing = $property->table($opposing_name); 18465 my $opposing_status = $opposing->status; 18466 if ($opposing_status) { 18467 $table->set_status($opposing_status, 18468 $opposing->status_info); 18469 } 18470 } 18471 } 18472 elsif ($count == $MAX_UNICODE_CODEPOINTS 18473 && $name ne "Any" 18474 && ($table == $property || $table->leader == $table) 18475 && $table->property->status ne $NORMAL) 18476 { 18477 Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway."); 18478 } 18479 18480 if ($table->fate >= $SUPPRESSED) { 18481 if (! $is_property) { 18482 my @children = $table->children; 18483 foreach my $child (@children) { 18484 if ($child->fate < $SUPPRESSED) { 18485 Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't"); 18486 } 18487 } 18488 } 18489 next TABLE; 18490 18491 } 18492 18493 if (! $is_property) { 18494 18495 make_ucd_table_pod_entries($table) if $table->property == $perl; 18496 18497 # Several things need to be done just once for each related 18498 # group of match tables. Do them on the parent. 18499 if ($table->parent == $table) { 18500 18501 # Add an entry in the pod file for the table; it also does 18502 # the children. 18503 make_re_pod_entries($table) if defined $pod_directory; 18504 18505 # See if the table matches identical code points with 18506 # something that has already been processed and is ready 18507 # for output. In that case, no need to have two files 18508 # with the same code points in them. We use the table's 18509 # hash() method to store these in buckets, so that it is 18510 # quite likely that if two tables are in the same bucket 18511 # they will be identical, so don't have to compare tables 18512 # frequently. The tables have to have the same status to 18513 # share a file, so add this to the bucket hash. (The 18514 # reason for this latter is that Heavy.pl associates a 18515 # status with a file.) We don't check tables that are 18516 # inverses of others, as it would lead to some coding 18517 # complications, and checking all the regular ones should 18518 # find everything. 18519 if ($table->complement == 0) { 18520 my $hash = $table->hash . ';' . $table->status; 18521 18522 # Look at each table that is in the same bucket as 18523 # this one would be. 18524 foreach my $comparison 18525 (@{$match_tables_to_write{$hash}}) 18526 { 18527 # If the table doesn't point back to this one, we 18528 # see if it matches identically 18529 if ( $comparison->leader != $table 18530 && $table->matches_identically_to($comparison)) 18531 { 18532 $table->set_equivalent_to($comparison, 18533 Related => 0); 18534 next TABLE; 18535 } 18536 } 18537 18538 # Here, not equivalent, add this table to the bucket. 18539 push @{$match_tables_to_write{$hash}}, $table; 18540 } 18541 } 18542 } 18543 else { 18544 18545 # Here is the property itself. 18546 # Don't write out or make references to the $perl property 18547 next if $table == $perl; 18548 18549 make_ucd_table_pod_entries($table); 18550 18551 # There is a mapping stored of the various synonyms to the 18552 # standardized name of the property for utf8_heavy.pl. 18553 # Also, the pod file contains entries of the form: 18554 # \p{alias: *} \p{full: *} 18555 # rather than show every possible combination of things. 18556 18557 my @property_aliases = $property->aliases; 18558 18559 my $full_property_name = $property->full_name; 18560 my $property_name = $property->name; 18561 my $standard_property_name = standardize($property_name); 18562 my $standard_property_full_name 18563 = standardize($full_property_name); 18564 18565 # We also create for Unicode::UCD a list of aliases for 18566 # the property. The list starts with the property name; 18567 # then its full name. Legacy properties are not listed in 18568 # Unicode::UCD. 18569 my @property_list; 18570 my @standard_list; 18571 if ( $property->fate <= $MAP_PROXIED) { 18572 @property_list = ($property_name, $full_property_name); 18573 @standard_list = ($standard_property_name, 18574 $standard_property_full_name); 18575 } 18576 18577 # For each synonym ... 18578 for my $i (0 .. @property_aliases - 1) { 18579 my $alias = $property_aliases[$i]; 18580 my $alias_name = $alias->name; 18581 my $alias_standard = standardize($alias_name); 18582 18583 18584 # Add other aliases to the list of property aliases 18585 if ($property->fate <= $MAP_PROXIED 18586 && ! grep { $alias_standard eq $_ } @standard_list) 18587 { 18588 push @property_list, $alias_name; 18589 push @standard_list, $alias_standard; 18590 } 18591 18592 # For utf8_heavy, set the mapping of the alias to the 18593 # property 18594 if ($type == $STRING) { 18595 if ($property->fate <= $MAP_PROXIED) { 18596 $string_property_loose_to_name{$alias_standard} 18597 = $standard_property_name; 18598 } 18599 } 18600 else { 18601 my $hash_ref = ($alias_standard =~ /^_/) 18602 ? \%strict_property_name_of 18603 : \%loose_property_name_of; 18604 if (exists $hash_ref->{$alias_standard}) { 18605 Carp::my_carp("There already is a property with the same standard name as $alias_name: $hash_ref->{$alias_standard}. Old name is retained"); 18606 } 18607 else { 18608 $hash_ref->{$alias_standard} 18609 = $standard_property_name; 18610 } 18611 18612 # Now for the re pod entry for this alias. Skip if not 18613 # outputting a pod; skip the first one, which is the 18614 # full name so won't have an entry like: '\p{full: *} 18615 # \p{full: *}', and skip if don't want an entry for 18616 # this one. 18617 next if $i == 0 18618 || ! defined $pod_directory 18619 || ! $alias->make_re_pod_entry; 18620 18621 my $rhs = "\\p{$full_property_name: *}"; 18622 if ($property != $perl && $table->perl_extension) { 18623 $rhs .= ' (Perl extension)'; 18624 } 18625 push @match_properties, 18626 format_pod_line($indent_info_column, 18627 '\p{' . $alias->name . ': *}', 18628 $rhs, 18629 $alias->status); 18630 } 18631 } 18632 18633 # The list of all possible names is attached to each alias, so 18634 # lookup is easy 18635 if (@property_list) { 18636 push @{$prop_aliases{$standard_list[0]}}, @property_list; 18637 } 18638 18639 if ($property->fate <= $MAP_PROXIED) { 18640 18641 # Similarly, we create for Unicode::UCD a list of 18642 # property-value aliases. 18643 18644 # Look at each table in the property... 18645 foreach my $table ($property->tables) { 18646 my @values_list; 18647 my $table_full_name = $table->full_name; 18648 my $standard_table_full_name 18649 = standardize($table_full_name); 18650 my $table_name = $table->name; 18651 my $standard_table_name = standardize($table_name); 18652 18653 # The list starts with the table name and its full 18654 # name. 18655 push @values_list, $table_name, $table_full_name; 18656 18657 # We add to the table each unique alias that isn't 18658 # discouraged from use. 18659 foreach my $alias ($table->aliases) { 18660 next if $alias->status 18661 && $alias->status eq $DISCOURAGED; 18662 my $name = $alias->name; 18663 my $standard = standardize($name); 18664 next if $standard eq $standard_table_name; 18665 next if $standard eq $standard_table_full_name; 18666 push @values_list, $name; 18667 } 18668 18669 # Here @values_list is a list of all the aliases for 18670 # the table. That is, all the property-values given 18671 # by this table. By agreement with Unicode::UCD, 18672 # if the name and full name are identical, and there 18673 # are no other names, drop the duplicate entry to save 18674 # memory. 18675 if (@values_list == 2 18676 && $values_list[0] eq $values_list[1]) 18677 { 18678 pop @values_list 18679 } 18680 18681 # To save memory, unlike the similar list for property 18682 # aliases above, only the standard forms have the list. 18683 # This forces an extra step of converting from input 18684 # name to standard name, but the savings are 18685 # considerable. (There is only marginal savings if we 18686 # did this with the property aliases.) 18687 push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list; 18688 } 18689 } 18690 18691 # Don't write out a mapping file if not desired. 18692 next if ! $property->to_output_map; 18693 } 18694 18695 # Here, we know we want to write out the table, but don't do it 18696 # yet because there may be other tables that come along and will 18697 # want to share the file, and the file's comments will change to 18698 # mention them. So save for later. 18699 push @writables, $table; 18700 18701 } # End of looping through the property and all its tables. 18702 } # End of looping through all properties. 18703 18704 # Now have all the tables that will have files written for them. Do it. 18705 foreach my $table (@writables) { 18706 my @directory; 18707 my $filename; 18708 my $property = $table->property; 18709 my $is_property = ($table == $property); 18710 18711 # For very short tables, instead of writing them out to actual files, 18712 # we in-line their inversion list definitions into Heavy.pl. The 18713 # definition replaces the file name, and the special pseudo-directory 18714 # '#' is used to signal this. This significantly cuts down the number 18715 # of files written at little extra cost to the hashes in Heavy.pl. 18716 # And it means, no run-time files to read to get the definitions. 18717 if (! $is_property 18718 && ! $annotate # For annotation, we want to explicitly show 18719 # everything, so keep in files 18720 && $table->ranges <= 3) 18721 { 18722 my @ranges = $table->ranges; 18723 my $count = @ranges; 18724 if ($count == 0) { # 0th index reserved for 0-length lists 18725 $filename = 0; 18726 } 18727 elsif ($table->leader != $table) { 18728 18729 # Here, is a table that is equivalent to another; code 18730 # in register_file_for_name() causes its leader's definition 18731 # to be used 18732 18733 next; 18734 } 18735 else { # No equivalent table so far. 18736 18737 # Build up its definition range-by-range. 18738 my $definition = ""; 18739 while (defined (my $range = shift @ranges)) { 18740 my $end = $range->end; 18741 if ($end < $MAX_WORKING_CODEPOINT) { 18742 $count++; 18743 $end = "\n" . ($end + 1); 18744 } 18745 else { # Extends to infinity, hence no 'end' 18746 $end = ""; 18747 } 18748 $definition .= "\n" . $range->start . $end; 18749 } 18750 $definition = "V$count" . $definition; 18751 $filename = @inline_definitions; 18752 push @inline_definitions, $definition; 18753 } 18754 @directory = "#"; 18755 register_file_for_name($table, \@directory, $filename); 18756 next; 18757 } 18758 18759 if (! $is_property) { 18760 # Match tables for the property go in lib/$subdirectory, which is 18761 # the property's name. Don't use the standard file name for this, 18762 # as may get an unfamiliar alias 18763 @directory = ($matches_directory, $property->external_name); 18764 } 18765 else { 18766 18767 @directory = $table->directory; 18768 $filename = $table->file; 18769 } 18770 18771 # Use specified filename if available, or default to property's 18772 # shortest name. We need an 8.3 safe filename (which means "an 8 18773 # safe" filename, since after the dot is only 'pl', which is < 3) 18774 # The 2nd parameter is if the filename shouldn't be changed, and 18775 # it shouldn't iff there is a hard-coded name for this table. 18776 $filename = construct_filename( 18777 $filename || $table->external_name, 18778 ! $filename, # mutable if no filename 18779 \@directory); 18780 18781 register_file_for_name($table, \@directory, $filename); 18782 18783 # Only need to write one file when shared by more than one 18784 # property 18785 next if ! $is_property 18786 && ($table->leader != $table || $table->complement != 0); 18787 18788 # Construct a nice comment to add to the file 18789 $table->set_final_comment; 18790 18791 $table->write; 18792 } 18793 18794 18795 # Write out the pod file 18796 make_pod; 18797 18798 # And Heavy.pl, Name.pm, UCD.pl 18799 make_Heavy; 18800 make_Name_pm; 18801 make_UCD; 18802 18803 make_property_test_script() if $make_test_script; 18804 make_normalization_test_script() if $make_norm_test_script; 18805 return; 18806} 18807 18808my @white_space_separators = ( # This used only for making the test script. 18809 "", 18810 ' ', 18811 "\t", 18812 ' ' 18813 ); 18814 18815sub generate_separator($) { 18816 # This used only for making the test script. It generates the colon or 18817 # equal separator between the property and property value, with random 18818 # white space surrounding the separator 18819 18820 my $lhs = shift; 18821 18822 return "" if $lhs eq ""; # No separator if there's only one (the r) side 18823 18824 # Choose space before and after randomly 18825 my $spaces_before =$white_space_separators[rand(@white_space_separators)]; 18826 my $spaces_after = $white_space_separators[rand(@white_space_separators)]; 18827 18828 # And return the whole complex, half the time using a colon, half the 18829 # equals 18830 return $spaces_before 18831 . (rand() < 0.5) ? '=' : ':' 18832 . $spaces_after; 18833} 18834 18835sub generate_tests($$$$$) { 18836 # This used only for making the test script. It generates test cases that 18837 # are expected to compile successfully in perl. Note that the LHS and 18838 # RHS are assumed to already be as randomized as the caller wants. 18839 18840 my $lhs = shift; # The property: what's to the left of the colon 18841 # or equals separator 18842 my $rhs = shift; # The property value; what's to the right 18843 my $valid_code = shift; # A code point that's known to be in the 18844 # table given by LHS=RHS; undef if table is 18845 # empty 18846 my $invalid_code = shift; # A code point known to not be in the table; 18847 # undef if the table is all code points 18848 my $warning = shift; 18849 18850 # Get the colon or equal 18851 my $separator = generate_separator($lhs); 18852 18853 # The whole 'property=value' 18854 my $name = "$lhs$separator$rhs"; 18855 18856 my @output; 18857 # Create a complete set of tests, with complements. 18858 if (defined $valid_code) { 18859 push @output, <<"EOC" 18860Expect(1, $valid_code, '\\p{$name}', $warning); 18861Expect(0, $valid_code, '\\p{^$name}', $warning); 18862Expect(0, $valid_code, '\\P{$name}', $warning); 18863Expect(1, $valid_code, '\\P{^$name}', $warning); 18864EOC 18865 } 18866 if (defined $invalid_code) { 18867 push @output, <<"EOC" 18868Expect(0, $invalid_code, '\\p{$name}', $warning); 18869Expect(1, $invalid_code, '\\p{^$name}', $warning); 18870Expect(1, $invalid_code, '\\P{$name}', $warning); 18871Expect(0, $invalid_code, '\\P{^$name}', $warning); 18872EOC 18873 } 18874 return @output; 18875} 18876 18877sub generate_error($$$) { 18878 # This used only for making the test script. It generates test cases that 18879 # are expected to not only not match, but to be syntax or similar errors 18880 18881 my $lhs = shift; # The property: what's to the left of the 18882 # colon or equals separator 18883 my $rhs = shift; # The property value; what's to the right 18884 my $already_in_error = shift; # Boolean; if true it's known that the 18885 # unmodified LHS and RHS will cause an error. 18886 # This routine should not force another one 18887 # Get the colon or equal 18888 my $separator = generate_separator($lhs); 18889 18890 # Since this is an error only, don't bother to randomly decide whether to 18891 # put the error on the left or right side; and assume that the RHS is 18892 # loosely matched, again for convenience rather than rigor. 18893 $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error; 18894 18895 my $property = $lhs . $separator . $rhs; 18896 18897 return <<"EOC"; 18898Error('\\p{$property}'); 18899Error('\\P{$property}'); 18900EOC 18901} 18902 18903# These are used only for making the test script 18904# XXX Maybe should also have a bad strict seps, which includes underscore. 18905 18906my @good_loose_seps = ( 18907 " ", 18908 "-", 18909 "\t", 18910 "", 18911 "_", 18912 ); 18913my @bad_loose_seps = ( 18914 "/a/", 18915 ':=', 18916 ); 18917 18918sub randomize_stricter_name { 18919 # This used only for making the test script. Take the input name and 18920 # return a randomized, but valid version of it under the stricter matching 18921 # rules. 18922 18923 my $name = shift; 18924 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 18925 18926 # If the name looks like a number (integer, floating, or rational), do 18927 # some extra work 18928 if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) { 18929 my $sign = $1; 18930 my $number = $2; 18931 my $separator = $3; 18932 18933 # If there isn't a sign, part of the time add a plus 18934 # Note: Not testing having any denominator having a minus sign 18935 if (! $sign) { 18936 $sign = '+' if rand() <= .3; 18937 } 18938 18939 # And add 0 or more leading zeros. 18940 $name = $sign . ('0' x int rand(10)) . $number; 18941 18942 if (defined $separator) { 18943 my $extra_zeros = '0' x int rand(10); 18944 18945 if ($separator eq '.') { 18946 18947 # Similarly, add 0 or more trailing zeros after a decimal 18948 # point 18949 $name .= $extra_zeros; 18950 } 18951 else { 18952 18953 # Or, leading zeros before the denominator 18954 $name =~ s,/,/$extra_zeros,; 18955 } 18956 } 18957 } 18958 18959 # For legibility of the test, only change the case of whole sections at a 18960 # time. To do this, first split into sections. The split returns the 18961 # delimiters 18962 my @sections; 18963 for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) { 18964 trace $section if main::DEBUG && $to_trace; 18965 18966 if (length $section > 1 && $section !~ /\D/) { 18967 18968 # If the section is a sequence of digits, about half the time 18969 # randomly add underscores between some of them. 18970 if (rand() > .5) { 18971 18972 # Figure out how many underscores to add. max is 1 less than 18973 # the number of digits. (But add 1 at the end to make sure 18974 # result isn't 0, and compensate earlier by subtracting 2 18975 # instead of 1) 18976 my $num_underscores = int rand(length($section) - 2) + 1; 18977 18978 # And add them evenly throughout, for convenience, not rigor 18979 use integer; 18980 my $spacing = (length($section) - 1)/ $num_underscores; 18981 my $temp = $section; 18982 $section = ""; 18983 for my $i (1 .. $num_underscores) { 18984 $section .= substr($temp, 0, $spacing, "") . '_'; 18985 } 18986 $section .= $temp; 18987 } 18988 push @sections, $section; 18989 } 18990 else { 18991 18992 # Here not a sequence of digits. Change the case of the section 18993 # randomly 18994 my $switch = int rand(4); 18995 if ($switch == 0) { 18996 push @sections, uc $section; 18997 } 18998 elsif ($switch == 1) { 18999 push @sections, lc $section; 19000 } 19001 elsif ($switch == 2) { 19002 push @sections, ucfirst $section; 19003 } 19004 else { 19005 push @sections, $section; 19006 } 19007 } 19008 } 19009 trace "returning", join "", @sections if main::DEBUG && $to_trace; 19010 return join "", @sections; 19011} 19012 19013sub randomize_loose_name($;$) { 19014 # This used only for making the test script 19015 19016 my $name = shift; 19017 my $want_error = shift; # if true, make an error 19018 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 19019 19020 $name = randomize_stricter_name($name); 19021 19022 my @parts; 19023 push @parts, $good_loose_seps[rand(@good_loose_seps)]; 19024 19025 # Preserve trailing ones for the sake of not stripping the underscore from 19026 # 'L_' 19027 for my $part (split /[-\s_]+ (?= . )/, $name) { 19028 if (@parts) { 19029 if ($want_error and rand() < 0.3) { 19030 push @parts, $bad_loose_seps[rand(@bad_loose_seps)]; 19031 $want_error = 0; 19032 } 19033 else { 19034 push @parts, $good_loose_seps[rand(@good_loose_seps)]; 19035 } 19036 } 19037 push @parts, $part; 19038 } 19039 my $new = join("", @parts); 19040 trace "$name => $new" if main::DEBUG && $to_trace; 19041 19042 if ($want_error) { 19043 if (rand() >= 0.5) { 19044 $new .= $bad_loose_seps[rand(@bad_loose_seps)]; 19045 } 19046 else { 19047 $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new; 19048 } 19049 } 19050 return $new; 19051} 19052 19053# Used to make sure don't generate duplicate test cases. 19054my %test_generated; 19055 19056sub make_property_test_script() { 19057 # This used only for making the test script 19058 # this written directly -- it's huge. 19059 19060 print "Making test script\n" if $verbosity >= $PROGRESS; 19061 19062 # This uses randomness to test different possibilities without testing all 19063 # possibilities. To ensure repeatability, set the seed to 0. But if 19064 # tests are added, it will perturb all later ones in the .t file 19065 srand 0; 19066 19067 $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name 19068 19069 # Keep going down an order of magnitude 19070 # until find that adding this quantity to 19071 # 1 remains 1; but put an upper limit on 19072 # this so in case this algorithm doesn't 19073 # work properly on some platform, that we 19074 # won't loop forever. 19075 my $digits = 0; 19076 my $min_floating_slop = 1; 19077 while (1+ $min_floating_slop != 1 19078 && $digits++ < 50) 19079 { 19080 my $next = $min_floating_slop / 10; 19081 last if $next == 0; # If underflows, 19082 # use previous one 19083 $min_floating_slop = $next; 19084 } 19085 19086 # It doesn't matter whether the elements of this array contain single lines 19087 # or multiple lines. main::write doesn't count the lines. 19088 my @output; 19089 19090 push @output, <<'EOF_CODE'; 19091Error('\p{Script=InGreek}'); # Bug #69018 19092Test_GCB("1100 $nobreak 1161"); # Bug #70940 19093Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722 19094Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722 19095Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726 19096 19097# Make sure this gets tested; it was not part of the official test suite at 19098# the time this was added. Note that this is as it would appear in the 19099# official suite, and gets modified to check for the perl tailoring by 19100# Test_WB() 19101Test_WB("$breakable 0020 $breakable 0020 $breakable 0308 $breakable"); 19102Test_LB("$nobreak 200B $nobreak 0020 $nobreak 0020 $breakable 2060 $breakable"); 19103EOF_CODE 19104 19105 # Sort these so get results in same order on different runs of this 19106 # program 19107 foreach my $property (sort { $a->has_dependency <=> $b->has_dependency 19108 or 19109 lc $a->name cmp lc $b->name 19110 } property_ref('*')) 19111 { 19112 # Non-binary properties should not match \p{}; Test all for that. 19113 if ($property->type != $BINARY) { 19114 my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } 19115 $property->aliases; 19116 foreach my $property_alias ($property->aliases) { 19117 my $name = standardize($property_alias->name); 19118 19119 # But some names are ambiguous, meaning a binary property with 19120 # the same name when used in \p{}, and a different 19121 # (non-binary) property in other contexts. 19122 next if grep { $name eq $_ } keys %ambiguous_names; 19123 19124 push @output, <<"EOF_CODE"; 19125Error('\\p{$name}'); 19126Error('\\P{$name}'); 19127EOF_CODE 19128 } 19129 } 19130 foreach my $table (sort { $a->has_dependency <=> $b->has_dependency 19131 or 19132 lc $a->name cmp lc $b->name 19133 } $property->tables) 19134 { 19135 19136 # Find code points that match, and don't match this table. 19137 my $valid = $table->get_valid_code_point; 19138 my $invalid = $table->get_invalid_code_point; 19139 my $warning = ($table->status eq $DEPRECATED) 19140 ? "'deprecated'" 19141 : '""'; 19142 19143 # Test each possible combination of the property's aliases with 19144 # the table's. If this gets to be too many, could do what is done 19145 # in the set_final_comment() for Tables 19146 my @table_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->aliases; 19147 next unless @table_aliases; 19148 my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->property->aliases; 19149 next unless @property_aliases; 19150 19151 # Every property can be optionally be prefixed by 'Is_', so test 19152 # that those work, by creating such a new alias for each 19153 # pre-existing one. 19154 push @property_aliases, map { Alias->new("Is_" . $_->name, 19155 $_->loose_match, 19156 $_->make_re_pod_entry, 19157 $_->ok_as_filename, 19158 $_->status, 19159 $_->ucd, 19160 ) 19161 } @property_aliases; 19162 my $max = max(scalar @table_aliases, scalar @property_aliases); 19163 for my $j (0 .. $max - 1) { 19164 19165 # The current alias for property is the next one on the list, 19166 # or if beyond the end, start over. Similarly for table 19167 my $property_name 19168 = $property_aliases[$j % @property_aliases]->name; 19169 19170 $property_name = "" if $table->property == $perl; 19171 my $table_alias = $table_aliases[$j % @table_aliases]; 19172 my $table_name = $table_alias->name; 19173 my $loose_match = $table_alias->loose_match; 19174 19175 # If the table doesn't have a file, any test for it is 19176 # already guaranteed to be in error 19177 my $already_error = ! $table->file_path; 19178 19179 # Generate error cases for this alias. 19180 push @output, generate_error($property_name, 19181 $table_name, 19182 $already_error); 19183 19184 # If the table is guaranteed to always generate an error, 19185 # quit now without generating success cases. 19186 next if $already_error; 19187 19188 # Now for the success cases. 19189 my $random; 19190 if ($loose_match) { 19191 19192 # For loose matching, create an extra test case for the 19193 # standard name. 19194 my $standard = standardize($table_name); 19195 19196 # $test_name should be a unique combination for each test 19197 # case; used just to avoid duplicate tests 19198 my $test_name = "$property_name=$standard"; 19199 19200 # Don't output duplicate test cases. 19201 if (! exists $test_generated{$test_name}) { 19202 $test_generated{$test_name} = 1; 19203 push @output, generate_tests($property_name, 19204 $standard, 19205 $valid, 19206 $invalid, 19207 $warning, 19208 ); 19209 } 19210 $random = randomize_loose_name($table_name) 19211 } 19212 else { # Stricter match 19213 $random = randomize_stricter_name($table_name); 19214 } 19215 19216 # Now for the main test case for this alias. 19217 my $test_name = "$property_name=$random"; 19218 if (! exists $test_generated{$test_name}) { 19219 $test_generated{$test_name} = 1; 19220 push @output, generate_tests($property_name, 19221 $random, 19222 $valid, 19223 $invalid, 19224 $warning, 19225 ); 19226 19227 # If the name is a rational number, add tests for the 19228 # floating point equivalent. 19229 if ($table_name =~ qr{/}) { 19230 19231 # Calculate the float, and find just the fraction. 19232 my $float = eval $table_name; 19233 my ($whole, $fraction) 19234 = $float =~ / (.*) \. (.*) /x; 19235 19236 # Starting with one digit after the decimal point, 19237 # create a test for each possible precision (number of 19238 # digits past the decimal point) until well beyond the 19239 # native number found on this machine. (If we started 19240 # with 0 digits, it would be an integer, which could 19241 # well match an unrelated table) 19242 PLACE: 19243 for my $i (1 .. $min_floating_slop + 3) { 19244 my $table_name = sprintf("%.*f", $i, $float); 19245 if ($i < $MIN_FRACTION_LENGTH) { 19246 19247 # If the test case has fewer digits than the 19248 # minimum acceptable precision, it shouldn't 19249 # succeed, so we expect an error for it. 19250 # E.g., 2/3 = .7 at one decimal point, and we 19251 # shouldn't say it matches .7. We should make 19252 # it be .667 at least before agreeing that the 19253 # intent was to match 2/3. But at the 19254 # less-than- acceptable level of precision, it 19255 # might actually match an unrelated number. 19256 # So don't generate a test case if this 19257 # conflating is possible. In our example, we 19258 # don't want 2/3 matching 7/10, if there is 19259 # a 7/10 code point. 19260 19261 # First, integers are not in the rationals 19262 # table. Don't generate an error if this 19263 # rounds to an integer using the given 19264 # precision. 19265 my $round = sprintf "%.0f", $table_name; 19266 next PLACE if abs($table_name - $round) 19267 < $MAX_FLOATING_SLOP; 19268 19269 # Here, isn't close enough to an integer to be 19270 # confusable with one. Now, see it it's 19271 # "close" to a known rational 19272 for my $existing 19273 (keys %nv_floating_to_rational) 19274 { 19275 next PLACE 19276 if abs($table_name - $existing) 19277 < $MAX_FLOATING_SLOP; 19278 } 19279 push @output, generate_error($property_name, 19280 $table_name, 19281 1 # 1 => already an error 19282 ); 19283 } 19284 else { 19285 19286 # Here the number of digits exceeds the 19287 # minimum we think is needed. So generate a 19288 # success test case for it. 19289 push @output, generate_tests($property_name, 19290 $table_name, 19291 $valid, 19292 $invalid, 19293 $warning, 19294 ); 19295 } 19296 } 19297 } 19298 } 19299 } 19300 $table->DESTROY(); 19301 } 19302 $property->DESTROY(); 19303 } 19304 19305 # Make any test of the boundary (break) properties TODO if the code 19306 # doesn't match the version being compiled 19307 my $TODO_FAILING_BREAKS = ($version_of_mk_invlist_bounds ne $v_version) 19308 ? "\nsub TODO_FAILING_BREAKS { 1 }\n" 19309 : "\nsub TODO_FAILING_BREAKS { 0 }\n"; 19310 19311 @output= map { 19312 map s/^/ /mgr, 19313 map "$_;\n", 19314 split /;\n/, $_ 19315 } @output; 19316 19317 # Cause there to be 'if' statements to only execute a portion of this 19318 # long-running test each time, so that we can have a bunch of .t's running 19319 # in parallel 19320 my $chunks = 10 # Number of test files 19321 - 1 # For GCB & SB 19322 - 1 # For WB 19323 - 4; # LB split into this many files 19324 my @output_chunked; 19325 my $chunk_count=0; 19326 my $chunk_size= int(@output / $chunks) + 1; 19327 while (@output) { 19328 $chunk_count++; 19329 my @chunk= splice @output, 0, $chunk_size; 19330 push @output_chunked, 19331 "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n", 19332 @chunk, 19333 "}\n"; 19334 } 19335 19336 $chunk_count++; 19337 push @output_chunked, 19338 "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n", 19339 (map {" Test_GCB('$_');\n"} @backslash_X_tests), 19340 (map {" Test_SB('$_');\n"} @SB_tests), 19341 "}\n"; 19342 19343 19344 $chunk_size= int(@LB_tests / 4) + 1; 19345 @LB_tests = map {" Test_LB('$_');\n"} @LB_tests; 19346 while (@LB_tests) { 19347 $chunk_count++; 19348 my @chunk= splice @LB_tests, 0, $chunk_size; 19349 push @output_chunked, 19350 "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n", 19351 @chunk, 19352 "}\n"; 19353 } 19354 19355 $chunk_count++; 19356 push @output_chunked, 19357 "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n", 19358 (map {" Test_WB('$_');\n"} @WB_tests), 19359 "}\n"; 19360 19361 &write($t_path, 19362 0, # Not utf8; 19363 [$HEADER, 19364 $TODO_FAILING_BREAKS, 19365 <DATA>, 19366 @output_chunked, 19367 "Finished();\n", 19368 ]); 19369 19370 return; 19371} 19372 19373sub make_normalization_test_script() { 19374 print "Making normalization test script\n" if $verbosity >= $PROGRESS; 19375 19376 my $n_path = 'TestNorm.pl'; 19377 19378 unshift @normalization_tests, <<'END'; 19379use utf8; 19380use Test::More; 19381 19382sub ord_string { # Convert packed ords to printable string 19383 use charnames (); 19384 return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' } 19385 unpack "U*", shift) . "'"; 19386 #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) . "'"; 19387} 19388 19389sub Test_N { 19390 my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_; 19391 my $display_source = ord_string($source); 19392 my $display_nfc = ord_string($nfc); 19393 my $display_nfd = ord_string($nfd); 19394 my $display_nfkc = ord_string($nfkc); 19395 my $display_nfkd = ord_string($nfkd); 19396 19397 use Unicode::Normalize; 19398 # NFC 19399 # nfc == toNFC(source) == toNFC(nfc) == toNFC(nfd) 19400 # nfkc == toNFC(nfkc) == toNFC(nfkd) 19401 # 19402 # NFD 19403 # nfd == toNFD(source) == toNFD(nfc) == toNFD(nfd) 19404 # nfkd == toNFD(nfkc) == toNFD(nfkd) 19405 # 19406 # NFKC 19407 # nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) == 19408 # toNFKC(nfkc) == toNFKC(nfkd) 19409 # 19410 # NFKD 19411 # nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) == 19412 # toNFKD(nfkc) == toNFKD(nfkd) 19413 19414 is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc"); 19415 is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc"); 19416 is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc"); 19417 is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc"); 19418 is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc"); 19419 19420 is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd"); 19421 is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd"); 19422 is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd"); 19423 is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd"); 19424 is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd"); 19425 19426 is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc"); 19427 is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc"); 19428 is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc"); 19429 is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc"); 19430 is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc"); 19431 19432 is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd"); 19433 is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd"); 19434 is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd"); 19435 is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd"); 19436 is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd"); 19437} 19438END 19439 19440 &write($n_path, 19441 1, # Is utf8; 19442 [ 19443 @normalization_tests, 19444 'done_testing();' 19445 ]); 19446 return; 19447} 19448 19449# Skip reasons, so will be exact same text and hence the files with each 19450# reason will get grouped together in perluniprops. 19451my $Documentation = "Documentation"; 19452my $Indic_Skip 19453 = "Provisional; for the analysis and processing of Indic scripts"; 19454my $Validation = "Validation Tests"; 19455my $Validation_Documentation = "Documentation of validation Tests"; 19456 19457# This is a list of the input files and how to handle them. The files are 19458# processed in their order in this list. Some reordering is possible if 19459# desired, but the PropertyAliases and PropValueAliases files should be first, 19460# and the extracted before the others (as data in an extracted file can be 19461# over-ridden by the non-extracted. Some other files depend on data derived 19462# from an earlier file, like UnicodeData requires data from Jamo, and the case 19463# changing and folding requires data from Unicode. Mostly, it is safest to 19464# order by first version releases in (except the Jamo). 19465# 19466# The version strings allow the program to know whether to expect a file or 19467# not, but if a file exists in the directory, it will be processed, even if it 19468# is in a version earlier than expected, so you can copy files from a later 19469# release into an earlier release's directory. 19470my @input_file_objects = ( 19471 Input_file->new('PropertyAliases.txt', v3.2, 19472 Handler => \&process_PropertyAliases, 19473 Early => [ \&substitute_PropertyAliases ], 19474 Required_Even_in_Debug_Skip => 1, 19475 ), 19476 Input_file->new(undef, v0, # No file associated with this 19477 Progress_Message => 'Finishing property setup', 19478 Handler => \&finish_property_setup, 19479 ), 19480 Input_file->new('PropValueAliases.txt', v3.2, 19481 Handler => \&process_PropValueAliases, 19482 Early => [ \&substitute_PropValueAliases ], 19483 Has_Missings_Defaults => $NOT_IGNORED, 19484 Required_Even_in_Debug_Skip => 1, 19485 ), 19486 Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0, 19487 Property => 'General_Category', 19488 ), 19489 Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0, 19490 Property => 'Canonical_Combining_Class', 19491 Has_Missings_Defaults => $NOT_IGNORED, 19492 ), 19493 Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0, 19494 Property => 'Numeric_Type', 19495 Has_Missings_Defaults => $NOT_IGNORED, 19496 ), 19497 Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0, 19498 Property => 'East_Asian_Width', 19499 Has_Missings_Defaults => $NOT_IGNORED, 19500 ), 19501 Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0, 19502 Property => 'Line_Break', 19503 Has_Missings_Defaults => $NOT_IGNORED, 19504 ), 19505 Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1, 19506 Property => 'Bidi_Class', 19507 Has_Missings_Defaults => $NOT_IGNORED, 19508 ), 19509 Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0, 19510 Property => 'Decomposition_Type', 19511 Has_Missings_Defaults => $NOT_IGNORED, 19512 ), 19513 Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0), 19514 Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0, 19515 Property => 'Numeric_Value', 19516 Each_Line_Handler => \&filter_numeric_value_line, 19517 Has_Missings_Defaults => $NOT_IGNORED, 19518 ), 19519 Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0, 19520 Property => 'Joining_Group', 19521 Has_Missings_Defaults => $NOT_IGNORED, 19522 ), 19523 19524 Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0, 19525 Property => 'Joining_Type', 19526 Has_Missings_Defaults => $NOT_IGNORED, 19527 ), 19528 Input_file->new("${EXTRACTED}DName.txt", v10.0.0, 19529 Skip => 'This file adds no new information not already' 19530 . ' present in other files', 19531 # And it's unnecessary programmer work to handle this new 19532 # format. Previous Derived files actually had bug fixes 19533 # in them that were useful, but that should not be the 19534 # case here. 19535 ), 19536 Input_file->new('Jamo.txt', v2.0.0, 19537 Property => 'Jamo_Short_Name', 19538 Each_Line_Handler => \&filter_jamo_line, 19539 ), 19540 Input_file->new('UnicodeData.txt', v1.1.5, 19541 Pre_Handler => \&setup_UnicodeData, 19542 19543 # We clean up this file for some early versions. 19544 Each_Line_Handler => [ (($v_version lt v2.0.0 ) 19545 ? \&filter_v1_ucd 19546 : ($v_version eq v2.1.5) 19547 ? \&filter_v2_1_5_ucd 19548 19549 # And for 5.14 Perls with 6.0, 19550 # have to also make changes 19551 : ($v_version ge v6.0.0 19552 && $^V lt v5.17.0) 19553 ? \&filter_v6_ucd 19554 : undef), 19555 19556 # Early versions did not have the 19557 # proper Unicode_1 names for the 19558 # controls 19559 (($v_version lt v3.0.0) 19560 ? \&filter_early_U1_names 19561 : undef), 19562 19563 # Early versions did not correctly 19564 # use the later method for giving 19565 # decimal digit values 19566 (($v_version le v3.2.0) 19567 ? \&filter_bad_Nd_ucd 19568 : undef), 19569 19570 # And the main filter 19571 \&filter_UnicodeData_line, 19572 ], 19573 EOF_Handler => \&EOF_UnicodeData, 19574 ), 19575 Input_file->new('CJKXREF.TXT', v1.1.5, 19576 Withdrawn => v2.0.0, 19577 Skip => 'Gives the mapping of CJK code points ' 19578 . 'between Unicode and various other standards', 19579 ), 19580 Input_file->new('ArabicShaping.txt', v2.0.0, 19581 Each_Line_Handler => 19582 ($v_version lt 4.1.0) 19583 ? \&filter_old_style_arabic_shaping 19584 : undef, 19585 # The first field after the range is a "schematic name" 19586 # not used by Perl 19587 Properties => [ '<ignored>', 'Joining_Type', 'Joining_Group' ], 19588 Has_Missings_Defaults => $NOT_IGNORED, 19589 ), 19590 Input_file->new('Blocks.txt', v2.0.0, 19591 Property => 'Block', 19592 Has_Missings_Defaults => $NOT_IGNORED, 19593 Each_Line_Handler => \&filter_blocks_lines 19594 ), 19595 Input_file->new('Index.txt', v2.0.0, 19596 Skip => 'Alphabetical index of Unicode characters', 19597 ), 19598 Input_file->new('NamesList.txt', v2.0.0, 19599 Skip => 'Annotated list of characters', 19600 ), 19601 Input_file->new('PropList.txt', v2.0.0, 19602 Each_Line_Handler => (($v_version lt v3.1.0) 19603 ? \&filter_old_style_proplist 19604 : undef), 19605 ), 19606 Input_file->new('Props.txt', v2.0.0, 19607 Withdrawn => v3.0.0, 19608 Skip => 'A subset of F<PropList.txt> (which is used instead)', 19609 ), 19610 Input_file->new('ReadMe.txt', v2.0.0, 19611 Skip => $Documentation, 19612 ), 19613 Input_file->new('Unihan.txt', v2.0.0, 19614 Withdrawn => v5.2.0, 19615 Construction_Time_Handler => \&construct_unihan, 19616 Pre_Handler => \&setup_unihan, 19617 Optional => [ "", 19618 'Unicode_Radical_Stroke' 19619 ], 19620 Each_Line_Handler => \&filter_unihan_line, 19621 ), 19622 Input_file->new('SpecialCasing.txt', v2.1.8, 19623 Each_Line_Handler => ($v_version eq 2.1.8) 19624 ? \&filter_2_1_8_special_casing_line 19625 : \&filter_special_casing_line, 19626 Pre_Handler => \&setup_special_casing, 19627 Has_Missings_Defaults => $IGNORED, 19628 ), 19629 Input_file->new( 19630 'LineBreak.txt', v3.0.0, 19631 Has_Missings_Defaults => $NOT_IGNORED, 19632 Property => 'Line_Break', 19633 # Early versions had problematic syntax 19634 Each_Line_Handler => ($v_version ge v3.1.0) 19635 ? undef 19636 : ($v_version lt v3.0.0) 19637 ? \&filter_substitute_lb 19638 : \&filter_early_ea_lb, 19639 # Must use long names for property values see comments at 19640 # sub filter_substitute_lb 19641 Early => [ "LBsubst.txt", '_Perl_LB', 'Alphabetic', 19642 'Alphabetic', # default to this because XX -> 19643 # AL 19644 19645 # Don't use _Perl_LB as a synonym for 19646 # Line_Break in later perls, as it is tailored 19647 # and isn't the same as Line_Break 19648 'ONLY_EARLY' ], 19649 ), 19650 Input_file->new('EastAsianWidth.txt', v3.0.0, 19651 Property => 'East_Asian_Width', 19652 Has_Missings_Defaults => $NOT_IGNORED, 19653 # Early versions had problematic syntax 19654 Each_Line_Handler => (($v_version lt v3.1.0) 19655 ? \&filter_early_ea_lb 19656 : undef), 19657 ), 19658 Input_file->new('CompositionExclusions.txt', v3.0.0, 19659 Property => 'Composition_Exclusion', 19660 ), 19661 Input_file->new('UnicodeData.html', v3.0.0, 19662 Withdrawn => v4.0.1, 19663 Skip => $Documentation, 19664 ), 19665 Input_file->new('BidiMirroring.txt', v3.0.1, 19666 Property => 'Bidi_Mirroring_Glyph', 19667 Has_Missings_Defaults => ($v_version lt v6.2.0) 19668 ? $NO_DEFAULTS 19669 # Is <none> which doesn't mean 19670 # anything to us, we will use the 19671 # null string 19672 : $IGNORED, 19673 ), 19674 Input_file->new('NamesList.html', v3.0.0, 19675 Skip => 'Describes the format and contents of ' 19676 . 'F<NamesList.txt>', 19677 ), 19678 Input_file->new('UnicodeCharacterDatabase.html', v3.0.0, 19679 Withdrawn => v5.1, 19680 Skip => $Documentation, 19681 ), 19682 Input_file->new('CaseFolding.txt', v3.0.1, 19683 Pre_Handler => \&setup_case_folding, 19684 Each_Line_Handler => 19685 [ ($v_version lt v3.1.0) 19686 ? \&filter_old_style_case_folding 19687 : undef, 19688 \&filter_case_folding_line 19689 ], 19690 Has_Missings_Defaults => $IGNORED, 19691 ), 19692 Input_file->new("NormTest.txt", v3.0.1, 19693 Handler => \&process_NormalizationsTest, 19694 Skip => ($make_norm_test_script) ? 0 : $Validation, 19695 ), 19696 Input_file->new('DCoreProperties.txt', v3.1.0, 19697 # 5.2 changed this file 19698 Has_Missings_Defaults => (($v_version ge v5.2.0) 19699 ? $NOT_IGNORED 19700 : $NO_DEFAULTS), 19701 ), 19702 Input_file->new('DProperties.html', v3.1.0, 19703 Withdrawn => v3.2.0, 19704 Skip => $Documentation, 19705 ), 19706 Input_file->new('PropList.html', v3.1.0, 19707 Withdrawn => v5.1, 19708 Skip => $Documentation, 19709 ), 19710 Input_file->new('Scripts.txt', v3.1.0, 19711 Property => 'Script', 19712 Each_Line_Handler => (($v_version le v4.0.0) 19713 ? \&filter_all_caps_script_names 19714 : undef), 19715 Has_Missings_Defaults => $NOT_IGNORED, 19716 ), 19717 Input_file->new('DNormalizationProps.txt', v3.1.0, 19718 Has_Missings_Defaults => $NOT_IGNORED, 19719 Each_Line_Handler => (($v_version lt v4.0.1) 19720 ? \&filter_old_style_normalization_lines 19721 : undef), 19722 ), 19723 Input_file->new('DerivedProperties.html', v3.1.1, 19724 Withdrawn => v5.1, 19725 Skip => $Documentation, 19726 ), 19727 Input_file->new('DAge.txt', v3.2.0, 19728 Has_Missings_Defaults => $NOT_IGNORED, 19729 Property => 'Age' 19730 ), 19731 Input_file->new('HangulSyllableType.txt', v4.0, 19732 Has_Missings_Defaults => $NOT_IGNORED, 19733 Early => [ \&generate_hst, 'Hangul_Syllable_Type' ], 19734 Property => 'Hangul_Syllable_Type' 19735 ), 19736 Input_file->new('NormalizationCorrections.txt', v3.2.0, 19737 # This documents the cumulative fixes to erroneous 19738 # normalizations in earlier Unicode versions. Its main 19739 # purpose is so that someone running on an earlier 19740 # version can use this file to override what got 19741 # published in that earlier release. It would be easy 19742 # for mktables to handle this file. But all the 19743 # corrections in it should already be in the other files 19744 # for the release it is. To get it to actually mean 19745 # something useful, someone would have to be using an 19746 # earlier Unicode release, and copy it into the directory 19747 # for that release and recompile. So far there has been 19748 # no demand to do that, so this hasn't been implemented. 19749 Skip => 'Documentation of corrections already ' 19750 . 'incorporated into the Unicode data base', 19751 ), 19752 Input_file->new('StandardizedVariants.html', v3.2.0, 19753 Skip => 'Obsoleted as of Unicode 9.0, but previously ' 19754 . 'provided a visual display of the standard ' 19755 . 'variant sequences derived from ' 19756 . 'F<StandardizedVariants.txt>.', 19757 # I don't know why the html came earlier than the 19758 # .txt, but both are skipped anyway, so it doesn't 19759 # matter. 19760 ), 19761 Input_file->new('StandardizedVariants.txt', v4.0.0, 19762 Skip => 'Certain glyph variations for character display ' 19763 . 'are standardized. This lists the non-Unihan ' 19764 . 'ones; the Unihan ones are also not used by ' 19765 . 'Perl, and are in a separate Unicode data base ' 19766 . 'L<http://www.unicode.org/ivd>', 19767 ), 19768 Input_file->new('UCD.html', v4.0.0, 19769 Withdrawn => v5.2, 19770 Skip => $Documentation, 19771 ), 19772 Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0, 19773 Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter', 19774 19775 # Don't use _Perl_WB as a synonym for 19776 # Word_Break in later perls, as it is tailored 19777 # and isn't the same as Word_Break 19778 'ONLY_EARLY' ], 19779 Property => 'Word_Break', 19780 Has_Missings_Defaults => $NOT_IGNORED, 19781 ), 19782 Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1, 19783 Early => [ \&generate_GCB, '_Perl_GCB' ], 19784 Property => 'Grapheme_Cluster_Break', 19785 Has_Missings_Defaults => $NOT_IGNORED, 19786 ), 19787 Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0, 19788 Handler => \&process_GCB_test, 19789 retain_trailing_comments => 1, 19790 ), 19791 Input_file->new("$AUXILIARY/GraphemeBreakTest.html", v4.1.0, 19792 Skip => $Validation_Documentation, 19793 ), 19794 Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0, 19795 Handler => \&process_SB_test, 19796 retain_trailing_comments => 1, 19797 ), 19798 Input_file->new("$AUXILIARY/SentenceBreakTest.html", v4.1.0, 19799 Skip => $Validation_Documentation, 19800 ), 19801 Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0, 19802 Handler => \&process_WB_test, 19803 retain_trailing_comments => 1, 19804 ), 19805 Input_file->new("$AUXILIARY/WordBreakTest.html", v4.1.0, 19806 Skip => $Validation_Documentation, 19807 ), 19808 Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0, 19809 Property => 'Sentence_Break', 19810 Early => [ "SBsubst.txt", '_Perl_SB', 'OLetter' ], 19811 Has_Missings_Defaults => $NOT_IGNORED, 19812 ), 19813 Input_file->new('NamedSequences.txt', v4.1.0, 19814 Handler => \&process_NamedSequences 19815 ), 19816 Input_file->new('Unihan.html', v4.1.0, 19817 Withdrawn => v5.2, 19818 Skip => $Documentation, 19819 ), 19820 Input_file->new('NameAliases.txt', v5.0, 19821 Property => 'Name_Alias', 19822 Each_Line_Handler => ($v_version le v6.0.0) 19823 ? \&filter_early_version_name_alias_line 19824 : \&filter_later_version_name_alias_line, 19825 ), 19826 # NameAliases.txt came along in v5.0. The above constructor handles 19827 # this. But until 6.1, it was lacking some information needed by core 19828 # perl. The constructor below handles that. It is either a kludge or 19829 # clever, depending on your point of view. The 'Withdrawn' parameter 19830 # indicates not to use it at all starting in 6.1 (so the above 19831 # constructor applies), and the 'v6.1' parameter indicates to use the 19832 # Early parameter before 6.1. Therefore 'Early" is always used, 19833 # yielding the internal-only property '_Perl_Name_Alias', which it 19834 # gets from a NameAliases.txt from 6.1 or later stored in 19835 # N_Asubst.txt. In combination with the above constructor, 19836 # 'Name_Alias' is publicly accessible starting with v5.0, and the 19837 # better 6.1 version is accessible to perl core in all releases. 19838 Input_file->new("NameAliases.txt", v6.1, 19839 Withdrawn => v6.1, 19840 Early => [ "N_Asubst.txt", '_Perl_Name_Alias', "" ], 19841 Property => 'Name_Alias', 19842 EOF_Handler => \&fixup_early_perl_name_alias, 19843 Each_Line_Handler => 19844 \&filter_later_version_name_alias_line, 19845 ), 19846 Input_file->new('NamedSqProv.txt', v5.0.0, 19847 Skip => 'Named sequences proposed for inclusion in a ' 19848 . 'later version of the Unicode Standard; if you ' 19849 . 'need them now, you can append this file to ' 19850 . 'F<NamedSequences.txt> and recompile perl', 19851 ), 19852 Input_file->new("$AUXILIARY/LBTest.txt", v5.1.0, 19853 Handler => \&process_LB_test, 19854 retain_trailing_comments => 1, 19855 ), 19856 Input_file->new("$AUXILIARY/LineBreakTest.html", v5.1.0, 19857 Skip => $Validation_Documentation, 19858 ), 19859 Input_file->new("BidiTest.txt", v5.2.0, 19860 Skip => $Validation, 19861 ), 19862 Input_file->new('UnihanIndicesDictionary.txt', v5.2.0, 19863 Optional => "", 19864 Each_Line_Handler => \&filter_unihan_line, 19865 ), 19866 Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0, 19867 Optional => "", 19868 Each_Line_Handler => \&filter_unihan_line, 19869 ), 19870 Input_file->new('UnihanIRGSources.txt', v5.2.0, 19871 Optional => [ "", 19872 'kCompatibilityVariant', 19873 'kIICore', 19874 'kIRG_GSource', 19875 'kIRG_HSource', 19876 'kIRG_JSource', 19877 'kIRG_KPSource', 19878 'kIRG_MSource', 19879 'kIRG_KSource', 19880 'kIRG_TSource', 19881 'kIRG_USource', 19882 'kIRG_VSource', 19883 ], 19884 Pre_Handler => \&setup_unihan, 19885 Each_Line_Handler => \&filter_unihan_line, 19886 ), 19887 Input_file->new('UnihanNumericValues.txt', v5.2.0, 19888 Optional => [ "", 19889 'kAccountingNumeric', 19890 'kOtherNumeric', 19891 'kPrimaryNumeric', 19892 ], 19893 Each_Line_Handler => \&filter_unihan_line, 19894 ), 19895 Input_file->new('UnihanOtherMappings.txt', v5.2.0, 19896 Optional => "", 19897 Each_Line_Handler => \&filter_unihan_line, 19898 ), 19899 Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0, 19900 Optional => [ "", 19901 'Unicode_Radical_Stroke' 19902 ], 19903 Each_Line_Handler => \&filter_unihan_line, 19904 ), 19905 Input_file->new('UnihanReadings.txt', v5.2.0, 19906 Optional => "", 19907 Each_Line_Handler => \&filter_unihan_line, 19908 ), 19909 Input_file->new('UnihanVariants.txt', v5.2.0, 19910 Optional => "", 19911 Each_Line_Handler => \&filter_unihan_line, 19912 ), 19913 Input_file->new('CJKRadicals.txt', v5.2.0, 19914 Skip => 'Maps the kRSUnicode property values to ' 19915 . 'corresponding code points', 19916 ), 19917 Input_file->new('EmojiSources.txt', v6.0.0, 19918 Skip => 'Maps certain Unicode code points to their ' 19919 . 'legacy Japanese cell-phone values', 19920 ), 19921 Input_file->new('ScriptExtensions.txt', v6.0.0, 19922 Property => 'Script_Extensions', 19923 Early => [ sub {} ], # Doesn't do anything but ensures 19924 # that this isn't skipped for early 19925 # versions 19926 Pre_Handler => \&setup_script_extensions, 19927 Each_Line_Handler => \&filter_script_extensions_line, 19928 Has_Missings_Defaults => (($v_version le v6.0.0) 19929 ? $NO_DEFAULTS 19930 : $IGNORED), 19931 ), 19932 # These two Indic files are actually not usable as-is until 6.1.0, 19933 # because their property values are missing from PropValueAliases.txt 19934 # until that release, so that further work would have to be done to get 19935 # them to work properly, which isn't worth it because of them being 19936 # provisional. 19937 Input_file->new('IndicMatraCategory.txt', v6.0.0, 19938 Withdrawn => v8.0.0, 19939 Property => 'Indic_Matra_Category', 19940 Has_Missings_Defaults => $NOT_IGNORED, 19941 Skip => $Indic_Skip, 19942 ), 19943 Input_file->new('IndicSyllabicCategory.txt', v6.0.0, 19944 Property => 'Indic_Syllabic_Category', 19945 Has_Missings_Defaults => $NOT_IGNORED, 19946 Skip => (($v_version lt v8.0.0) 19947 ? $Indic_Skip 19948 : 0), 19949 ), 19950 Input_file->new('USourceData.txt', v6.2.0, 19951 Skip => 'Documentation of status and cross reference of ' 19952 . 'proposals for encoding by Unicode of Unihan ' 19953 . 'characters', 19954 ), 19955 Input_file->new('USourceGlyphs.pdf', v6.2.0, 19956 Skip => 'Pictures of the characters in F<USourceData.txt>', 19957 ), 19958 Input_file->new('BidiBrackets.txt', v6.3.0, 19959 Properties => [ 'Bidi_Paired_Bracket', 19960 'Bidi_Paired_Bracket_Type' 19961 ], 19962 Has_Missings_Defaults => $NO_DEFAULTS, 19963 ), 19964 Input_file->new("BidiCharacterTest.txt", v6.3.0, 19965 Skip => $Validation, 19966 ), 19967 Input_file->new('IndicPositionalCategory.txt', v8.0.0, 19968 Property => 'Indic_Positional_Category', 19969 Has_Missings_Defaults => $NOT_IGNORED, 19970 ), 19971 Input_file->new('TangutSources.txt', v9.0.0, 19972 Skip => 'Specifies source mappings for Tangut ideographs' 19973 . ' and components. This data file also includes' 19974 . ' informative radical-stroke values that are used' 19975 . ' internally by Unicode', 19976 ), 19977 Input_file->new('VerticalOrientation.txt', v10.0.0, 19978 Property => 'Vertical_Orientation', 19979 Has_Missings_Defaults => $NOT_IGNORED, 19980 ), 19981 Input_file->new('NushuSources.txt', v10.0.0, 19982 Skip => 'Specifies source material for Nushu characters', 19983 ), 19984); 19985 19986# End of all the preliminaries. 19987# Do it... 19988 19989if (@missing_early_files) { 19990 print simple_fold(join_lines(<<END 19991 19992The compilation cannot be completed because one or more required input files, 19993listed below, are missing. This is because you are compiling Unicode version 19994$unicode_version, which predates the existence of these file(s). To fully 19995function, perl needs the data that these files would have contained if they 19996had been in this release. To work around this, create copies of later 19997versions of the missing files in the directory containing '$0'. (Perl will 19998make the necessary adjustments to the data to compensate for it not being the 19999same version as is being compiled.) The files are available from unicode.org, 20000via either ftp or http. If using http, they will be under 20001www.unicode.org/versions/. Below are listed the source file name of each 20002missing file, the Unicode version to copy it from, and the name to store it 20003as. (Note that the listed source file name may not be exactly the one that 20004Unicode calls it. If you don't find it, you can look it up in 'README.perl' 20005to get the correct name.) 20006END 20007 )); 20008 print simple_fold(join_lines("\n$_")) for @missing_early_files; 20009 exit 2; 20010} 20011 20012if ($compare_versions) { 20013 Carp::my_carp(<<END 20014Warning. \$compare_versions is set. Output is not suitable for production 20015END 20016 ); 20017} 20018 20019# Put into %potential_files a list of all the files in the directory structure 20020# that could be inputs to this program 20021File::Find::find({ 20022 wanted=>sub { 20023 return unless / \. ( txt | htm l? ) $ /xi; # Some platforms change the 20024 # name's case 20025 my $full = lc(File::Spec->rel2abs($_)); 20026 $potential_files{$full} = 1; 20027 return; 20028 } 20029}, File::Spec->curdir()); 20030 20031my @mktables_list_output_files; 20032my $old_start_time = 0; 20033my $old_options = ""; 20034 20035if (! -e $file_list) { 20036 print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE; 20037 $write_unchanged_files = 1; 20038} elsif ($write_unchanged_files) { 20039 print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE; 20040} 20041else { 20042 print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE; 20043 my $file_handle; 20044 if (! open $file_handle, "<", $file_list) { 20045 Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!"); 20046 $glob_list = 1; 20047 } 20048 else { 20049 my @input; 20050 20051 # Read and parse mktables.lst, placing the results from the first part 20052 # into @input, and the second part into @mktables_list_output_files 20053 for my $list ( \@input, \@mktables_list_output_files ) { 20054 while (<$file_handle>) { 20055 s/^ \s+ | \s+ $//xg; 20056 if (/^ \s* \# \s* Autogenerated\ starting\ on\ (\d+)/x) { 20057 $old_start_time = $1; 20058 next; 20059 } 20060 if (/^ \s* \# \s* From\ options\ (.+) /x) { 20061 $old_options = $1; 20062 next; 20063 } 20064 next if /^ \s* (?: \# .* )? $/x; 20065 last if /^ =+ $/x; 20066 my ( $file ) = split /\t/; 20067 push @$list, $file; 20068 } 20069 @$list = uniques(@$list); 20070 next; 20071 } 20072 20073 # Look through all the input files 20074 foreach my $input (@input) { 20075 next if $input eq 'version'; # Already have checked this. 20076 20077 # Ignore if doesn't exist. The checking about whether we care or 20078 # not is done via the Input_file object. 20079 next if ! file_exists($input); 20080 20081 # The paths are stored with relative names, and with '/' as the 20082 # delimiter; convert to absolute on this machine 20083 my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input))); 20084 $potential_files{lc $full} = 1; 20085 } 20086 } 20087 20088 close $file_handle; 20089} 20090 20091if ($glob_list) { 20092 20093 # Here wants to process all .txt files in the directory structure. 20094 # Convert them to full path names. They are stored in the platform's 20095 # relative style 20096 my @known_files; 20097 foreach my $object (@input_file_objects) { 20098 my $file = $object->file; 20099 next unless defined $file; 20100 push @known_files, File::Spec->rel2abs($file); 20101 } 20102 20103 my @unknown_input_files; 20104 foreach my $file (keys %potential_files) { # The keys are stored in lc 20105 next if grep { $file eq lc($_) } @known_files; 20106 20107 # Here, the file is unknown to us. Get relative path name 20108 $file = File::Spec->abs2rel($file); 20109 push @unknown_input_files, $file; 20110 20111 # What will happen is we create a data structure for it, and add it to 20112 # the list of input files to process. First get the subdirectories 20113 # into an array 20114 my (undef, $directories, undef) = File::Spec->splitpath($file); 20115 $directories =~ s;/$;;; # Can have extraneous trailing '/' 20116 my @directories = File::Spec->splitdir($directories); 20117 20118 # If the file isn't extracted (meaning none of the directories is the 20119 # extracted one), just add it to the end of the list of inputs. 20120 if (! grep { $EXTRACTED_DIR eq $_ } @directories) { 20121 push @input_file_objects, Input_file->new($file, v0); 20122 } 20123 else { 20124 20125 # Here, the file is extracted. It needs to go ahead of most other 20126 # processing. Search for the first input file that isn't a 20127 # special required property (that is, find one whose first_release 20128 # is non-0), and isn't extracted. Also, the Age property file is 20129 # processed before the extracted ones, just in case 20130 # $compare_versions is set. 20131 for (my $i = 0; $i < @input_file_objects; $i++) { 20132 if ($input_file_objects[$i]->first_released ne v0 20133 && lc($input_file_objects[$i]->file) ne 'dage.txt' 20134 && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i) 20135 { 20136 splice @input_file_objects, $i, 0, 20137 Input_file->new($file, v0); 20138 last; 20139 } 20140 } 20141 20142 } 20143 } 20144 if (@unknown_input_files) { 20145 print STDERR simple_fold(join_lines(<<END 20146 20147The following files are unknown as to how to handle. Assuming they are 20148typical property files. You'll know by later error messages if it worked or 20149not: 20150END 20151 ) . " " . join(", ", @unknown_input_files) . "\n\n"); 20152 } 20153} # End of looking through directory structure for more .txt files. 20154 20155# Create the list of input files from the objects we have defined, plus 20156# version 20157my @input_files = qw(version Makefile); 20158foreach my $object (@input_file_objects) { 20159 my $file = $object->file; 20160 next if ! defined $file; # Not all objects have files 20161 next if defined $object->skip;; 20162 push @input_files, $file; 20163} 20164 20165if ( $verbosity >= $VERBOSE ) { 20166 print "Expecting ".scalar( @input_files )." input files. ", 20167 "Checking ".scalar( @mktables_list_output_files )." output files.\n"; 20168} 20169 20170# We set $most_recent to be the most recently changed input file, including 20171# this program itself (done much earlier in this file) 20172foreach my $in (@input_files) { 20173 next unless -e $in; # Keep going even if missing a file 20174 my $mod_time = (stat $in)[9]; 20175 $most_recent = $mod_time if $mod_time > $most_recent; 20176 20177 # See that the input files have distinct names, to warn someone if they 20178 # are adding a new one 20179 if ($make_list) { 20180 my ($volume, $directories, $file ) = File::Spec->splitpath($in); 20181 $directories =~ s;/$;;; # Can have extraneous trailing '/' 20182 my @directories = File::Spec->splitdir($directories); 20183 construct_filename($file, 'mutable', \@directories); 20184 } 20185} 20186 20187# We use 'Makefile' just to see if it has changed since the last time we 20188# rebuilt. Now discard it. 20189@input_files = grep { $_ ne 'Makefile' } @input_files; 20190 20191my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild 20192 || ! scalar @mktables_list_output_files # or if no outputs known 20193 || $old_start_time < $most_recent # or out-of-date 20194 || $old_options ne $command_line_arguments; # or with different 20195 # options 20196 20197# Now we check to see if any output files are older than youngest, if 20198# they are, we need to continue on, otherwise we can presumably bail. 20199if (! $rebuild) { 20200 foreach my $out (@mktables_list_output_files) { 20201 if ( ! file_exists($out)) { 20202 print "'$out' is missing.\n" if $verbosity >= $VERBOSE; 20203 $rebuild = 1; 20204 last; 20205 } 20206 #local $to_trace = 1 if main::DEBUG; 20207 trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace; 20208 if ( (stat $out)[9] <= $most_recent ) { 20209 #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace; 20210 print "'$out' is too old.\n" if $verbosity >= $VERBOSE; 20211 $rebuild = 1; 20212 last; 20213 } 20214 } 20215} 20216if (! $rebuild) { 20217 print "$0: Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n"; 20218 exit(0); 20219} 20220print "$0: Must rebuild tables.\n" if $verbosity >= $VERBOSE; 20221 20222# Ready to do the major processing. First create the perl pseudo-property. 20223$perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1); 20224 20225# Process each input file 20226foreach my $file (@input_file_objects) { 20227 $file->run; 20228} 20229 20230# Finish the table generation. 20231 20232print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS; 20233finish_Unicode(); 20234 20235# For the very specialized case of comparing two Unicode versions... 20236if (DEBUG && $compare_versions) { 20237 handle_compare_versions(); 20238} 20239 20240print "Compiling Perl properties\n" if $verbosity >= $PROGRESS; 20241compile_perl(); 20242 20243print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS; 20244add_perl_synonyms(); 20245 20246print "Writing tables\n" if $verbosity >= $PROGRESS; 20247write_all_tables(); 20248 20249# Write mktables.lst 20250if ( $file_list and $make_list ) { 20251 20252 print "Updating '$file_list'\n" if $verbosity >= $PROGRESS; 20253 foreach my $file (@input_files, @files_actually_output) { 20254 my (undef, $directories, $basefile) = File::Spec->splitpath($file); 20255 my @directories = grep length, File::Spec->splitdir($directories); 20256 $file = join '/', @directories, $basefile; 20257 } 20258 20259 my $ofh; 20260 if (! open $ofh,">",$file_list) { 20261 Carp::my_carp("Can't write to '$file_list'. Skipping: $!"); 20262 return 20263 } 20264 else { 20265 my $localtime = localtime $start_time; 20266 print $ofh <<"END"; 20267# 20268# $file_list -- File list for $0. 20269# 20270# Autogenerated starting on $start_time ($localtime) 20271# From options $command_line_arguments 20272# 20273# - First section is input files 20274# ($0 itself is not listed but is automatically considered an input) 20275# - Section separator is /^=+\$/ 20276# - Second section is a list of output files. 20277# - Lines matching /^\\s*#/ are treated as comments 20278# which along with blank lines are ignored. 20279# 20280 20281# Input files: 20282 20283END 20284 print $ofh "$_\n" for sort(@input_files); 20285 print $ofh "\n=================================\n# Output files:\n\n"; 20286 print $ofh "$_\n" for sort @files_actually_output; 20287 print $ofh "\n# ",scalar(@input_files)," input files\n", 20288 "# ",scalar(@files_actually_output)+1," output files\n\n", 20289 "# End list\n"; 20290 close $ofh 20291 or Carp::my_carp("Failed to close $ofh: $!"); 20292 20293 print "Filelist has ",scalar(@input_files)," input files and ", 20294 scalar(@files_actually_output)+1," output files\n" 20295 if $verbosity >= $VERBOSE; 20296 } 20297} 20298 20299# Output these warnings unless -q explicitly specified. 20300if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) { 20301 if (@unhandled_properties) { 20302 print "\nProperties and tables that unexpectedly have no code points\n"; 20303 foreach my $property (sort @unhandled_properties) { 20304 print $property, "\n"; 20305 } 20306 } 20307 20308 if (%potential_files) { 20309 print "\nInput files that are not considered:\n"; 20310 foreach my $file (sort keys %potential_files) { 20311 print File::Spec->abs2rel($file), "\n"; 20312 } 20313 } 20314 print "\nAll done\n" if $verbosity >= $VERBOSE; 20315} 20316 20317if ($version_of_mk_invlist_bounds lt $v_version) { 20318 Carp::my_carp("WARNING: \\b{} algorithms (regen/mk_invlist.pl) need" 20319 . " to be checked and possibly updated to Unicode" 20320 . " $string_version"); 20321} 20322 20323exit(0); 20324 20325# TRAILING CODE IS USED BY make_property_test_script() 20326__DATA__ 20327 20328use strict; 20329use warnings; 20330 20331# Test qr/\X/ and the \p{} regular expression constructs. This file is 20332# constructed by mktables from the tables it generates, so if mktables is 20333# buggy, this won't necessarily catch those bugs. Tests are generated for all 20334# feasible properties; a few aren't currently feasible; see 20335# is_code_point_usable() in mktables for details. 20336 20337# Standard test packages are not used because this manipulates SIG_WARN. It 20338# exits 0 if every non-skipped test succeeded; -1 if any failed. 20339 20340my $Tests = 0; 20341my $Fails = 0; 20342 20343# loc_tools.pl requires this function to be defined 20344sub ok($@) { 20345 my ($pass, @msg) = @_; 20346 print "not " unless $pass; 20347 print "ok "; 20348 print ++$Tests; 20349 print " - ", join "", @msg if @msg; 20350 print "\n"; 20351} 20352 20353sub Expect($$$$) { 20354 my $expected = shift; 20355 my $ord = shift; 20356 my $regex = shift; 20357 my $warning_type = shift; # Type of warning message, like 'deprecated' 20358 # or empty if none 20359 my $line = (caller)[2]; 20360 20361 # Convert the code point to hex form 20362 my $string = sprintf "\"\\x{%04X}\"", $ord; 20363 20364 my @tests = ""; 20365 20366 # The first time through, use all warnings. If the input should generate 20367 # a warning, add another time through with them turned off 20368 push @tests, "no warnings '$warning_type';" if $warning_type; 20369 20370 foreach my $no_warnings (@tests) { 20371 20372 # Store any warning messages instead of outputting them 20373 local $SIG{__WARN__} = $SIG{__WARN__}; 20374 my $warning_message; 20375 $SIG{__WARN__} = sub { $warning_message = $_[0] }; 20376 20377 $Tests++; 20378 20379 # A string eval is needed because of the 'no warnings'. 20380 # Assumes no parentheses in the regular expression 20381 my $result = eval "$no_warnings 20382 my \$RegObj = qr($regex); 20383 $string =~ \$RegObj ? 1 : 0"; 20384 if (not defined $result) { 20385 print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n"; 20386 $Fails++; 20387 } 20388 elsif ($result ^ $expected) { 20389 print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n"; 20390 $Fails++; 20391 } 20392 elsif ($warning_message) { 20393 if (! $warning_type || ($warning_type && $no_warnings)) { 20394 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n"; 20395 $Fails++; 20396 } 20397 else { 20398 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n"; 20399 } 20400 } 20401 elsif ($warning_type && ! $no_warnings) { 20402 print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n"; 20403 $Fails++; 20404 } 20405 else { 20406 print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n"; 20407 } 20408 } 20409 return; 20410} 20411 20412sub Error($) { 20413 my $regex = shift; 20414 $Tests++; 20415 if (eval { 'x' =~ qr/$regex/; 1 }) { 20416 $Fails++; 20417 my $line = (caller)[2]; 20418 print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n"; 20419 } 20420 else { 20421 my $line = (caller)[2]; 20422 print "ok $Tests - got and expected error for qr/$regex/; line $line\n"; 20423 } 20424 return; 20425} 20426 20427# Break test files (e.g. GCBTest.txt) character that break allowed here 20428my $breakable_utf8 = my $breakable = chr(utf8::unicode_to_native(0xF7)); 20429utf8::upgrade($breakable_utf8); 20430 20431# Break test files (e.g. GCBTest.txt) character that indicates can't break 20432# here 20433my $nobreak_utf8 = my $nobreak = chr(utf8::unicode_to_native(0xD7)); 20434utf8::upgrade($nobreak_utf8); 20435 20436my $are_ctype_locales_available; 20437my $utf8_locale; 20438chdir 't' if -d 't'; 20439eval { require "./loc_tools.pl" }; 20440if (defined &locales_enabled) { 20441 $are_ctype_locales_available = locales_enabled('LC_CTYPE'); 20442 if ($are_ctype_locales_available) { 20443 $utf8_locale = &find_utf8_ctype_locale; 20444 } 20445} 20446 20447# Eval'd so can run on versions earlier than the property is available in 20448my $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}\p{WB=ZWJ}]/'; 20449if (! defined $WB_Extend_or_Format_re) { 20450 $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}]/'; 20451} 20452 20453sub _test_break($$) { 20454 # Test various break property matches. The 2nd parameter gives the 20455 # property name. The input is a line from auxiliary/*Test.txt for the 20456 # given property. Each such line is a sequence of Unicode (not native) 20457 # code points given by their hex numbers, separated by the two characters 20458 # defined just before this subroutine that indicate that either there can 20459 # or cannot be a break between the adjacent code points. All these are 20460 # tested. 20461 # 20462 # For the gcb property extra tests are made. if there isn't a break, that 20463 # means the sequence forms an extended grapheme cluster, which means that 20464 # \X should match the whole thing. If there is a break, \X should stop 20465 # there. This is all converted by this routine into a match: $string =~ 20466 # /(\X)/, Each \X should match the next cluster; and that is what is 20467 # checked. 20468 20469 my $template = shift; 20470 my $break_type = shift; 20471 20472 my $line = (caller 1)[2]; # Line number 20473 my $comment = ""; 20474 20475 if ($template =~ / ( .*? ) \s* \# (.*) /x) { 20476 $template = $1; 20477 $comment = $2; 20478 20479 # Replace leading spaces with a single one. 20480 $comment =~ s/ ^ \s* / # /x; 20481 } 20482 20483 # The line contains characters above the ASCII range, but in Latin1. It 20484 # may or may not be in utf8, and if it is, it may or may not know it. So, 20485 # convert these characters to 8 bits. If knows is in utf8, simply 20486 # downgrade. 20487 if (utf8::is_utf8($template)) { 20488 utf8::downgrade($template); 20489 } else { 20490 20491 # Otherwise, if it is in utf8, but doesn't know it, the next lines 20492 # convert the two problematic characters to their 8-bit equivalents. 20493 # If it isn't in utf8, they don't harm anything. 20494 use bytes; 20495 $template =~ s/$nobreak_utf8/$nobreak/g; 20496 $template =~ s/$breakable_utf8/$breakable/g; 20497 } 20498 20499 # Perl customizes wb. So change the official tests accordingly 20500 if ($break_type eq 'wb' && $WB_Extend_or_Format_re) { 20501 20502 # Split into elements that alternate between code point and 20503 # break/no-break 20504 my @line = split / +/, $template; 20505 20506 # Look at each code point and its following one 20507 for (my $i = 1; $i < @line - 1 - 1; $i+=2) { 20508 20509 # The customization only involves changing some breaks to 20510 # non-breaks. 20511 next if $line[$i+1] =~ /$nobreak/; 20512 20513 my $lhs = chr utf8::unicode_to_native(hex $line[$i]); 20514 my $rhs = chr utf8::unicode_to_native(hex $line[$i+2]); 20515 20516 # And it only affects adjacent space characters. 20517 next if $lhs !~ /\s/u; 20518 20519 # But, we want to make sure to test spaces followed by a Extend 20520 # or Format. 20521 next if $rhs !~ /\s|$WB_Extend_or_Format_re/; 20522 20523 # To test the customization, add some white-space before this to 20524 # create a span. The $lhs white space may or may not be bound to 20525 # that span, and also with the $rhs. If the $rhs is a binding 20526 # character, the $lhs is bound to it and not to the span, unless 20527 # $lhs is vertical space. In all other cases, the $lhs is bound 20528 # to the span. If the $rhs is white space, it is bound to the 20529 # $lhs 20530 my $bound; 20531 my $span; 20532 if ($rhs =~ /$WB_Extend_or_Format_re/) { 20533 if ($lhs =~ /\v/) { 20534 $bound = $breakable; 20535 $span = $nobreak; 20536 } 20537 else { 20538 $bound = $nobreak; 20539 $span = $breakable; 20540 } 20541 } 20542 else { 20543 $span = $nobreak; 20544 $bound = $nobreak; 20545 } 20546 20547 splice @line, $i, 0, ( '0020', $nobreak, '0020', $span); 20548 $i += 4; 20549 $line[$i+1] = $bound; 20550 } 20551 $template = join " ", @line; 20552 } 20553 20554 # The input is just the break/no-break symbols and sequences of Unicode 20555 # code points as hex digits separated by spaces for legibility. e.g.: 20556 # ÷ 0020 × 0308 ÷ 0020 ÷ 20557 # Convert to native \x format 20558 $template =~ s/ \s* ( [[:xdigit:]]+ ) \s* /sprintf("\\x{%02X}", utf8::unicode_to_native(hex $1))/gex; 20559 $template =~ s/ \s* //gx; # Probably the line above removed all spaces; 20560 # but be sure 20561 20562 # Make a copy of the input with the symbols replaced by \b{} and \B{} as 20563 # appropriate 20564 my $break_pattern = $template =~ s/ $breakable /\\b{$break_type}/grx; 20565 $break_pattern =~ s/ $nobreak /\\B{$break_type}/gx; 20566 20567 my $display_string = $template =~ s/[$breakable$nobreak]//gr; 20568 my $string = eval "\"$display_string\""; 20569 20570 # The remaining massaging of the input is for the \X tests. Get rid of 20571 # the leading and trailing breakables 20572 $template =~ s/^ \s* $breakable \s* //x; 20573 $template =~ s/ \s* $breakable \s* $ //x; 20574 20575 # Delete no-breaks 20576 $template =~ s/ \s* $nobreak \s* //xg; 20577 20578 # Split the input into segments that are breakable between them. 20579 my @should_display = split /\s*$breakable\s*/, $template; 20580 my @should_match = map { eval "\"$_\"" } @should_display; 20581 20582 # If a string can be represented in both non-ut8 and utf8, test both cases 20583 my $display_upgrade = ""; 20584 UPGRADE: 20585 for my $to_upgrade (0 .. 1) { 20586 20587 if ($to_upgrade) { 20588 20589 # If already in utf8, would just be a repeat 20590 next UPGRADE if utf8::is_utf8($string); 20591 20592 utf8::upgrade($string); 20593 $display_upgrade = " (utf8-upgraded)"; 20594 } 20595 20596 my @modifiers = qw(a aa d u i); 20597 if ($are_ctype_locales_available) { 20598 push @modifiers, "l$utf8_locale" if defined $utf8_locale; 20599 20600 # The /l modifier has C after it to indicate the locale to try 20601 push @modifiers, "lC"; 20602 } 20603 20604 # Test for each of the regex modifiers. 20605 for my $modifier (@modifiers) { 20606 my $display_locale = ""; 20607 20608 # For /l, set the locale to what it says to. 20609 if ($modifier =~ / ^ l (.*) /x) { 20610 my $locale = $1; 20611 $display_locale = "(locale = $locale)"; 20612 POSIX::setlocale(&POSIX::LC_CTYPE, $locale); 20613 $modifier = 'l'; 20614 } 20615 20616 no warnings qw(locale regexp surrogate); 20617 my $pattern = "(?$modifier:$break_pattern)"; 20618 20619 # Actually do the test 20620 my $matched_text; 20621 my $matched = $string =~ qr/$pattern/; 20622 if ($matched) { 20623 $matched_text = "matched"; 20624 } 20625 else { 20626 $matched_text = "failed to match"; 20627 print "not "; 20628 20629 if (TODO_FAILING_BREAKS) { 20630 $comment = " # $comment" unless $comment =~ / ^ \s* \# /x; 20631 $comment =~ s/#/# TODO/; 20632 } 20633 } 20634 print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$pattern/$display_upgrade; line $line $display_locale$comment\n"; 20635 20636 # Only print the comment on the first use of this line 20637 $comment = ""; 20638 20639 # Repeat with the first \B{} in the pattern. This makes sure the 20640 # code in regexec.c:find_byclass() for \B gets executed 20641 if ($pattern =~ / ( .*? : ) .* ( \\B\{ .* ) /x) { 20642 my $B_pattern = "$1$2"; 20643 $matched = $string =~ qr/$B_pattern/; 20644 print "not " unless $matched; 20645 $matched_text = ($matched) ? "matched" : "failed to match"; 20646 print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$B_pattern/$display_upgrade; line $line $display_locale"; 20647 print " # TODO" if TODO_FAILING_BREAKS && ! $matched; 20648 print "\n"; 20649 } 20650 } 20651 20652 next if $break_type ne 'gcb'; 20653 20654 # Finally, do the \X match. 20655 my @matches = $string =~ /(\X)/g; 20656 20657 # Look through each matched cluster to verify that it matches what we 20658 # expect. 20659 my $min = (@matches < @should_match) ? @matches : @should_match; 20660 for my $i (0 .. $min - 1) { 20661 $Tests++; 20662 if ($matches[$i] eq $should_match[$i]) { 20663 print "ok $Tests - "; 20664 if ($i == 0) { 20665 print "In \"$display_string\" =~ /(\\X)/g, \\X #1"; 20666 } else { 20667 print "And \\X #", $i + 1, 20668 } 20669 print " correctly matched $should_display[$i]; line $line\n"; 20670 } else { 20671 $matches[$i] = join("", map { sprintf "\\x{%04X}", ord $_ } 20672 split "", $matches[$i]); 20673 print "not ok $Tests -"; 20674 print " # TODO" if TODO_FAILING_BREAKS; 20675 print " In \"$display_string\" =~ /(\\X)/g, \\X #", 20676 $i + 1, 20677 " should have matched $should_display[$i]", 20678 " but instead matched $matches[$i]", 20679 ". Abandoning rest of line $line\n"; 20680 next UPGRADE; 20681 } 20682 } 20683 20684 # And the number of matches should equal the number of expected matches. 20685 $Tests++; 20686 if (@matches == @should_match) { 20687 print "ok $Tests - Nothing was left over; line $line\n"; 20688 } else { 20689 print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line"; 20690 print " # TODO" if TODO_FAILING_BREAKS; 20691 print "\n"; 20692 } 20693 } 20694 20695 return; 20696} 20697 20698sub Test_GCB($) { 20699 _test_break(shift, 'gcb'); 20700} 20701 20702sub Test_LB($) { 20703 _test_break(shift, 'lb'); 20704} 20705 20706sub Test_SB($) { 20707 _test_break(shift, 'sb'); 20708} 20709 20710sub Test_WB($) { 20711 _test_break(shift, 'wb'); 20712} 20713 20714sub Finished() { 20715 print "1..$Tests\n"; 20716 exit($Fails ? -1 : 0); 20717} 20718 20719