1#!/usr/bin/perl -w 2use strict; 3use Carp; 4 5die "$0: Please run me as ./mktables to avoid unnecessary differences\n" 6 unless $0 eq "./mktables"; 7 8## 9## mktables -- create the runtime Perl Unicode files (lib/unicore/**/*.pl) 10## from the Unicode database files (lib/unicore/*.txt). 11## 12 13mkdir("lib", 0755); 14mkdir("To", 0755); 15 16## 17## Process any args. 18## 19my $Verbose = 0; 20my $MakeTestScript = 0; 21 22while (@ARGV) 23{ 24 my $arg = shift @ARGV; 25 if ($arg eq '-v') { 26 $Verbose = 1; 27 } elsif ($arg eq '-q') { 28 $Verbose = 0; 29 } elsif ($arg eq '-maketest') { 30 $MakeTestScript = 1; 31 } else { 32 die "usage: $0 [-v|-q] [-maketest]"; 33 } 34} 35 36my $LastUnicodeCodepoint = 0x10FFFF; # As of Unicode 3.1.1. 37 38my $HEADER=<<"EOF"; 39# !!!!!!! DO NOT EDIT THIS FILE !!!!!!! 40# This file is built by $0 from e.g. UnicodeData.txt. 41# Any changes made here will be lost! 42 43EOF 44 45 46## 47## Given a filename and a reference to an array of lines, 48## write the lines to the file only if the contents have not changed. 49## 50sub WriteIfChanged($\@) 51{ 52 my $file = shift; 53 my $lines = shift; 54 55 my $TextToWrite = join '', @$lines; 56 if (open IN, $file) { 57 local($/) = undef; 58 my $PreviousText = <IN>; 59 close IN; 60 if ($PreviousText eq $TextToWrite) { 61 print "$file unchanged.\n" if $Verbose; 62 return; 63 } 64 } 65 if (not open OUT, ">$file") { 66 die "$0: can't open $file for output: $!\n"; 67 } 68 print "$file written.\n" if $Verbose; 69 70 print OUT $TextToWrite; 71 close OUT; 72} 73 74## 75## The main datastructure (a "Table") represents a set of code points that 76## are part of a particular quality (that are part of \pL, \p{InGreek}, 77## etc.). They are kept as ranges of code points (starting and ending of 78## each range). 79## 80## For example, a range ASCII LETTERS would be represented as: 81## [ [ 0x41 => 0x5A, 'UPPER' ], 82## [ 0x61 => 0x7A, 'LOWER, ] ] 83## 84sub RANGE_START() { 0 } ## index into range element 85sub RANGE_END() { 1 } ## index into range element 86sub RANGE_NAME() { 2 } ## index into range element 87 88## Conceptually, these should really be folded into the 'Table' objects 89my %TableInfo; 90my %TableDesc; 91my %FuzzyNames; 92my %AliasInfo; 93my %CanonicalToOrig; 94 95## 96## Turn something like 97## OLD-ITALIC 98## into 99## OldItalic 100## 101sub CanonicalName($) 102{ 103 my $orig = shift; 104 my $name = lc $orig; 105 $name =~ s/(?<![a-z])(\w)/\u$1/g; 106 $name =~ s/[-_\s]+//g; 107 108 $CanonicalToOrig{$name} = $orig if not $CanonicalToOrig{$name}; 109 return $name; 110} 111 112## 113## Associates a property ("Greek", "Lu", "Assigned",...) with a Table. 114## 115## Called like: 116## New_Prop(In => 'Greek', $Table, Desc => 'Greek Block', Fuzzy => 1); 117## 118## Normally, these parameters are set when the Table is created (when the 119## Table->New constructor is called), but there are times when it needs to 120## be done after-the-fact...) 121## 122sub New_Prop($$$@) 123{ 124 my $Type = shift; ## "Is" or "In"; 125 my $Name = shift; 126 my $Table = shift; 127 128 ## remaining args are optional key/val 129 my %Args = @_; 130 131 my $Fuzzy = delete $Args{Fuzzy}; 132 my $Desc = delete $Args{Desc}; # description 133 134 $Name = CanonicalName($Name) if $Fuzzy; 135 136 ## sanity check a few args 137 if (%Args or ($Type ne 'Is' and $Type ne 'In') or not ref $Table) { 138 confess "$0: bad args to New_Prop" 139 } 140 141 if (not $TableInfo{$Type}->{$Name}) 142 { 143 $TableInfo{$Type}->{$Name} = $Table; 144 $TableDesc{$Type}->{$Name} = $Desc; 145 if ($Fuzzy) { 146 $FuzzyNames{$Type}->{$Name} = $Name; 147 } 148 } 149} 150 151 152## 153## Creates a new Table object. 154## 155## Args are key/value pairs: 156## In => Name -- Name of "In" property to be associated with 157## Is => Name -- Name of "Is" property to be associated with 158## Fuzzy => Boolean -- True if name can be accessed "fuzzily" 159## Desc => String -- Description of the property 160## 161## No args are required. 162## 163sub Table::New 164{ 165 my $class = shift; 166 my %Args = @_; 167 168 my $Table = bless [], $class; 169 170 my $Fuzzy = delete $Args{Fuzzy}; 171 my $Desc = delete $Args{Desc}; 172 173 for my $Type ('Is', 'In') 174 { 175 if (my $Name = delete $Args{$Type}) { 176 New_Prop($Type => $Name, $Table, Desc => $Desc, Fuzzy => $Fuzzy); 177 } 178 } 179 180 ## shouldn't have any left over 181 if (%Args) { 182 confess "$0: bad args to Table->New" 183 } 184 185 return $Table; 186} 187 188## 189## Returns true if the Table has no code points 190## 191sub Table::IsEmpty 192{ 193 my $Table = shift; #self 194 return not @$Table; 195} 196 197## 198## Returns true if the Table has code points 199## 200sub Table::NotEmpty 201{ 202 my $Table = shift; #self 203 return @$Table; 204} 205 206## 207## Returns the maximum code point currently in the table. 208## 209sub Table::Max 210{ 211 my $Table = shift; #self 212 confess "oops" if $Table->IsEmpty; ## must have code points to have a max 213 return $Table->[-1]->[RANGE_END]; 214} 215 216## 217## Replaces the codepoints in the Table with those in the Table given 218## as an arg. (NOTE: this is not a "deep copy"). 219## 220sub Table::Replace($$) 221{ 222 my $Table = shift; #self 223 my $New = shift; 224 225 @$Table = @$New; 226} 227 228## 229## Given a new code point, make the last range of the Table extend to 230## include the new (and all intervening) code points. 231## 232sub Table::Extend 233{ 234 my $Table = shift; #self 235 my $codepoint = shift; 236 237 my $PrevMax = $Table->Max; 238 239 confess "oops ($codepoint <= $PrevMax)" if $codepoint <= $PrevMax; 240 241 $Table->[-1]->[RANGE_END] = $codepoint; 242} 243 244## 245## Given a code point range start and end (and optional name), blindly 246## append them to the list of ranges for the Table. 247## 248## NOTE: Code points must be added in strictly ascending numeric order. 249## 250sub Table::RawAppendRange 251{ 252 my $Table = shift; #self 253 my $start = shift; 254 my $end = shift; 255 my $name = shift; 256 $name = "" if not defined $name; ## warning: $name can be "0" 257 258 push @$Table, [ $start, # RANGE_START 259 $end, # RANGE_END 260 $name ]; # RANGE_NAME 261} 262 263## 264## Given a code point (and optional name), add it to the Table. 265## 266## NOTE: Code points must be added in strictly ascending numeric order. 267## 268sub Table::Append 269{ 270 my $Table = shift; #self 271 my $codepoint = shift; 272 my $name = shift; 273 $name = "" if not defined $name; ## warning: $name can be "0" 274 275 ## 276 ## If we've already got a range working, and this code point is the next 277 ## one in line, and if the name is the same, just extend the current range. 278 ## 279 if ($Table->NotEmpty 280 and 281 $Table->Max == $codepoint - 1 282 and 283 $Table->[-1]->[RANGE_NAME] eq $name) 284 { 285 $Table->Extend($codepoint); 286 } 287 else 288 { 289 $Table->RawAppendRange($codepoint, $codepoint, $name); 290 } 291} 292 293## 294## Given a code point range starting value and ending value (and name), 295## Add the range to teh Table. 296## 297## NOTE: Code points must be added in strictly ascending numeric order. 298## 299sub Table::AppendRange 300{ 301 my $Table = shift; #self 302 my $start = shift; 303 my $end = shift; 304 my $name = shift; 305 $name = "" if not defined $name; ## warning: $name can be "0" 306 307 $Table->Append($start, $name); 308 $Table->Extend($end) if $end > $start; 309} 310 311## 312## Return a new Table that represents all code points not in the Table. 313## 314sub Table::Invert 315{ 316 my $Table = shift; #self 317 318 my $New = Table->New(); 319 my $max = -1; 320 for my $range (@$Table) 321 { 322 my $start = $range->[RANGE_START]; 323 my $end = $range->[RANGE_END]; 324 if ($start-1 >= $max+1) { 325 $New->AppendRange($max+1, $start-1, ""); 326 } 327 $max = $end; 328 } 329 if ($max+1 < $LastUnicodeCodepoint) { 330 $New->AppendRange($max+1, $LastUnicodeCodepoint); 331 } 332 return $New; 333} 334 335## 336## Merges any number of other tables with $self, returning the new table. 337## (existing tables are not modified) 338## 339## 340## Args may be Tables, or individual code points (as integers). 341## 342## Can be called as either a constructor or a method. 343## 344sub Table::Merge 345{ 346 shift(@_) if not ref $_[0]; ## if called as a constructor, lose the class 347 my @Tables = @_; 348 349 ## Accumulate all records from all tables 350 my @Records; 351 for my $Arg (@Tables) 352 { 353 if (ref $Arg) { 354 ## arg is a table -- get its ranges 355 push @Records, @$Arg; 356 } else { 357 ## arg is a codepoint, make a range 358 push @Records, [ $Arg, $Arg ] 359 } 360 } 361 362 ## sort by range start, with longer ranges coming first. 363 my ($first, @Rest) = sort { 364 ($a->[RANGE_START] <=> $b->[RANGE_START]) 365 or 366 ($b->[RANGE_END] <=> $b->[RANGE_END]) 367 } @Records; 368 369 my $New = Table->New(); 370 371 ## Ensuring the first range is there makes the subsequent loop easier 372 $New->AppendRange($first->[RANGE_START], 373 $first->[RANGE_END]); 374 375 ## Fold in records so long as they add new information. 376 for my $set (@Rest) 377 { 378 my $start = $set->[RANGE_START]; 379 my $end = $set->[RANGE_END]; 380 if ($start > $New->Max) { 381 $New->AppendRange($start, $end); 382 } elsif ($end > $New->Max) { 383 $New->Extend($end); 384 } 385 } 386 387 return $New; 388} 389 390## 391## Given a filename, write a representation of the Table to a file. 392## May have an optional comment as a 2nd arg. 393## 394sub Table::Write 395{ 396 my $Table = shift; #self 397 my $filename = shift; 398 my $comment = shift; 399 400 my @OUT = $HEADER; 401 if (defined $comment) { 402 $comment =~ s/\s+\Z//; 403 $comment =~ s/^/# /gm; 404 push @OUT, "#\n$comment\n#\n"; 405 } 406 push @OUT, "return <<'END';\n"; 407 408 for my $set (@$Table) 409 { 410 my $start = $set->[RANGE_START]; 411 my $end = $set->[RANGE_END]; 412 my $name = $set->[RANGE_NAME]; 413 414 if ($start == $end) { 415 push @OUT, sprintf "%04X\t\t%s\n", $start, $name; 416 } else { 417 push @OUT, sprintf "%04X\t%04X\t%s\n", $start, $end, $name; 418 } 419 } 420 421 push @OUT, "END\n"; 422 423 WriteIfChanged($filename, @OUT); 424} 425 426## This used only for making the test script. 427## helper function 428sub IsUsable($) 429{ 430 my $code = shift; 431 return 0 if $code <= 0x0000; ## don't use null 432 return 0 if $code >= $LastUnicodeCodepoint; ## keep in range 433 return 0 if ($code >= 0xD800 and $code <= 0xDFFF); ## no surrogates 434 return 0 if ($code >= 0xFDD0 and $code <= 0xFDEF); ## utf8.c says no good 435 return 0 if (($code & 0xFFFF) == 0xFFFE); ## utf8.c says no good 436 return 0 if (($code & 0xFFFF) == 0xFFFF); ## utf8.c says no good 437 return 1; 438} 439 440## Return a code point that's part of the table. 441## Returns nothing if the table is empty (or covers only surrogates). 442## This used only for making the test script. 443sub Table::ValidCode 444{ 445 my $Table = shift; #self 446 for my $set (@$Table) { 447 return $set->[RANGE_END] if IsUsable($set->[RANGE_END]); 448 } 449 return (); 450} 451 452## Return a code point that's not part of the table 453## Returns nothing if the table covers all code points. 454## This used only for making the test script. 455sub Table::InvalidCode 456{ 457 my $Table = shift; #self 458 459 return 0x1234 if $Table->IsEmpty(); 460 461 for my $set (@$Table) 462 { 463 if (IsUsable($set->[RANGE_END] + 1)) 464 { 465 return $set->[RANGE_END] + 1; 466 } 467 468 if (IsUsable($set->[RANGE_START] - 1)) 469 { 470 return $set->[RANGE_START] - 1; 471 } 472 } 473 return (); 474} 475 476########################################################################### 477########################################################################### 478########################################################################### 479 480 481## 482## Called like: 483## New_Alias(Is => 'All', SameAs => 'Any', Fuzzy => 1); 484## 485## The args must be in that order, although the Fuzzy pair may be omitted. 486## 487## This creates 'IsAll' as an alias for 'IsAny' 488## 489sub New_Alias($$$@) 490{ 491 my $Type = shift; ## "Is" or "In" 492 my $Alias = shift; 493 my $SameAs = shift; # expecting "SameAs" -- just ignored 494 my $Name = shift; 495 496 ## remaining args are optional key/val 497 my %Args = @_; 498 499 my $Fuzzy = delete $Args{Fuzzy}; 500 501 ## sanity check a few args 502 if (%Args or ($Type ne 'Is' and $Type ne 'In') or $SameAs ne 'SameAs') { 503 confess "$0: bad args to New_Alias" 504 } 505 506 $Alias = CanonicalName($Alias) if $Fuzzy; 507 508 if (not $TableInfo{$Type}->{$Name}) 509 { 510 my $CName = CanonicalName($Name); 511 if ($TableInfo{$Type}->{$CName}) { 512 confess "$0: Use canonical form '$CName' instead of '$Name' for alias."; 513 } else { 514 confess "$0: don't have orignial $Type => $Name to make alias"; 515 } 516 } 517 if ($TableInfo{$Alias}) { 518 confess "$0: already have original $Type => $Alias; can't make alias"; 519 } 520 $AliasInfo{$Type}->{$Name} = $Alias; 521 if ($Fuzzy) { 522 $FuzzyNames{$Type}->{$Alias} = $Name; 523 } 524 525} 526 527 528## All assigned code points 529my $Assigned = Table->New(Is => 'Assigned', 530 Desc => "All assigned code points", 531 Fuzzy => 0); 532 533my $Name = Table->New(); ## all characters, individually by name 534my $General = Table->New(); ## all characters, grouped by category 535my %General; 536my %Cat; 537 538## 539## Process UnicodeData.txt (Categories, etc.) 540## 541sub UnicodeData_Txt() 542{ 543 my $Bidi = Table->New(); 544 my $Deco = Table->New(); 545 my $Comb = Table->New(); 546 my $Number = Table->New(); 547 my $Mirrored = Table->New(Is => 'Mirrored', 548 Desc => "Mirrored in bidirectional text", 549 Fuzzy => 0); 550 551 my %DC; 552 my %Bidi; 553 my %Deco; 554 $Deco{Canon} = Table->New(Is => 'Canon', 555 Desc => 'Decomposes to multiple characters', 556 Fuzzy => 0); 557 $Deco{Compat} = Table->New(Is => 'Compat', 558 Desc => 'Compatible with a more-basic character', 559 Fuzzy => 0); 560 561 ## Initialize Perl-generated categories 562 ## (Categories from UnicodeData.txt are auto-initialized in gencat) 563 $Cat{Alnum} = 564 Table->New(Is => 'Alnum', Desc => "[[:Alnum:]]", Fuzzy => 0); 565 $Cat{Alpha} = 566 Table->New(Is => 'Alpha', Desc => "[[:Alpha:]]", Fuzzy => 0); 567 $Cat{ASCII} = 568 Table->New(Is => 'ASCII', Desc => "[[:ASCII:]]", Fuzzy => 0); 569 $Cat{Blank} = 570 Table->New(Is => 'Blank', Desc => "[[:Blank:]]", Fuzzy => 0); 571 $Cat{Cntrl} = 572 Table->New(Is => 'Cntrl', Desc => "[[:Cntrl:]]", Fuzzy => 0); 573 $Cat{Digit} = 574 Table->New(Is => 'Digit', Desc => "[[:Digit:]]", Fuzzy => 0); 575 $Cat{Graph} = 576 Table->New(Is => 'Graph', Desc => "[[:Graph:]]", Fuzzy => 0); 577 $Cat{Lower} = 578 Table->New(Is => 'Lower', Desc => "[[:Lower:]]", Fuzzy => 0); 579 $Cat{Print} = 580 Table->New(Is => 'Print', Desc => "[[:Print:]]", Fuzzy => 0); 581 $Cat{Punct} = 582 Table->New(Is => 'Punct', Desc => "[[:Punct:]]", Fuzzy => 0); 583 $Cat{Space} = 584 Table->New(Is => 'Space', Desc => "[[:Space:]]", Fuzzy => 0); 585 $Cat{Title} = 586 Table->New(Is => 'Title', Desc => "[[:Title:]]", Fuzzy => 0); 587 $Cat{Upper} = 588 Table->New(Is => 'Upper', Desc => "[[:Upper:]]", Fuzzy => 0); 589 $Cat{XDigit} = 590 Table->New(Is => 'XDigit', Desc => "[[:XDigit:]]", Fuzzy => 0); 591 $Cat{Word} = 592 Table->New(Is => 'Word', Desc => "[[:Word:]]", Fuzzy => 0); 593 $Cat{SpacePerl} = 594 Table->New(Is => 'SpacePerl', Desc => '\s', Fuzzy => 0); 595 596 my %To; 597 $To{Upper} = Table->New(); 598 $To{Lower} = Table->New(); 599 $To{Title} = Table->New(); 600 $To{Digit} = Table->New(); 601 602 sub gencat($$$$) 603 { 604 my ($name, ## Name ("LATIN CAPITAL LETTER A") 605 $cat, ## Category ("Lu", "Zp", "Nd", etc.) 606 $code, ## Code point (as an integer) 607 $op) = @_; 608 609 my $MajorCat = substr($cat, 0, 1); ## L, M, Z, S, etc 610 611 $Assigned->$op($code); 612 $Name->$op($code, $name); 613 $General->$op($code, $cat); 614 615 ## add to the sub category (e.g. "Lu", "Nd", "Cf", ..) 616 $Cat{$cat} ||= Table->New(Is => $cat, 617 Desc => "General Category '$cat'", 618 Fuzzy => 0); 619 $Cat{$cat}->$op($code); 620 621 ## add to the major category (e.g. "L", "N", "C", ...) 622 $Cat{$MajorCat} ||= Table->New(Is => $MajorCat, 623 Desc => "Major Category '$MajorCat'", 624 Fuzzy => 0); 625 $Cat{$MajorCat}->$op($code); 626 627 ($General{$name} ||= Table->New)->$op($code, $name); 628 629 # 005F: SPACING UNDERSCORE 630 $Cat{Word}->$op($code) if $cat =~ /^[LMN]|Pc/; 631 $Cat{Alnum}->$op($code) if $cat =~ /^[LM]|Nd/; 632 $Cat{Alpha}->$op($code) if $cat =~ /^[LM]/; 633 634 my $isspace = 635 ($cat =~ /Zs|Zl|Zp/ && 636 $code != 0x200B) # 200B is ZWSP which is for line break control 637 # and therefore it is not part of "space" even while it is "Zs". 638 || $code == 0x0009 # 0009: HORIZONTAL TAB 639 || $code == 0x000A # 000A: LINE FEED 640 || $code == 0x000B # 000B: VERTICAL TAB 641 || $code == 0x000C # 000C: FORM FEED 642 || $code == 0x000D # 000D: CARRIAGE RETURN 643 || $code == 0x0085 # 0085: NEL 644 645 ; 646 647 $Cat{Space}->$op($code) if $isspace; 648 649 $Cat{SpacePerl}->$op($code) if $isspace 650 && $code != 0x000B; # Backward compat. 651 652 $Cat{Blank}->$op($code) if $isspace 653 && !($code == 0x000A || 654 $code == 0x000B || 655 $code == 0x000C || 656 $code == 0x000D || 657 $code == 0x0085 || 658 $cat =~ /^Z[lp]/); 659 660 $Cat{Digit}->$op($code) if $cat eq "Nd"; 661 $Cat{Upper}->$op($code) if $cat eq "Lu"; 662 $Cat{Lower}->$op($code) if $cat eq "Ll"; 663 $Cat{Title}->$op($code) if $cat eq "Lt"; 664 $Cat{ASCII}->$op($code) if $code <= 0x007F; 665 $Cat{Cntrl}->$op($code) if $cat =~ /^C/; 666 my $isgraph = !$isspace && $cat !~ /Cc|Cs|Cn/; 667 $Cat{Graph}->$op($code) if $isgraph; 668 $Cat{Print}->$op($code) if $isgraph || $isspace; 669 $Cat{Punct}->$op($code) if $cat =~ /^P/; 670 671 $Cat{XDigit}->$op($code) if ($code >= 0x30 && $code <= 0x39) ## 0..9 672 || ($code >= 0x41 && $code <= 0x46) ## A..F 673 || ($code >= 0x61 && $code <= 0x66); ## a..f 674 } 675 676 ## open ane read file..... 677 if (not open IN, "UnicodeData.txt") { 678 die "$0: UnicodeData.txt: $!\n"; 679 } 680 681 ## 682 ## For building \p{_CombAbove} and \p{_CanonDCIJ} 683 ## 684 my %_Above_HexCodes; ## Hexcodes for chars with $comb == 230 ("ABOVE") 685 686 my %CodeToDeco; ## Maps code to decomp. list for chars with first 687 ## decomp. char an "i" or "j" (for \p{_CanonDCIJ}) 688 689 ## This is filled in as we go.... 690 my $CombAbove = Table->New(Is => '_CombAbove', 691 Desc => '(for internal casefolding use)', 692 Fuzzy => 0); 693 694 while (<IN>) 695 { 696 next unless /^[0-9A-Fa-f]+;/; 697 s/\s+$//; 698 699 my ($hexcode, ## code point in hex (e.g. "0041") 700 $name, ## character name (e.g. "LATIN CAPITAL LETTER A") 701 $cat, ## category (e.g. "Lu") 702 $comb, ## Canonical combining class (e.t. "230") 703 $bidi, ## directional category (e.g. "L") 704 $deco, ## decomposition mapping 705 $decimal, ## decimal digit value 706 $digit, ## digit value 707 $number, ## numeric value 708 $mirrored, ## mirrored 709 $unicode10, ## name in Unicode 1.0 710 $comment, ## comment field 711 $upper, ## uppercase mapping 712 $lower, ## lowercase mapping 713 $title, ## titlecase mapping 714 ) = split(/\s*;\s*/); 715 716 # Note that in Unicode 3.2 there will be names like 717 # LINE FEED (LF), which probably means that \N{} needs 718 # to cope also with LINE FEED and LF. 719 $name = $unicode10 if $name eq '<control>' && $unicode10 ne ''; 720 721 my $code = hex($hexcode); 722 723 if ($comb and $comb == 230) { 724 $CombAbove->Append($code); 725 $_Above_HexCodes{$hexcode} = 1; 726 } 727 728 ## Used in building \p{_CanonDCIJ} 729 if ($deco and $deco =~ m/^006[9A]\b/) { 730 $CodeToDeco{$code} = $deco; 731 } 732 733 ## 734 ## There are a few pairs of lines like: 735 ## AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; 736 ## D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; 737 ## that define ranges. 738 ## 739 if ($name =~ /^<(.+), (First|Last)>$/) 740 { 741 $name = $1; 742 gencat($name, $cat, $code, $2 eq 'First' ? 'Append' : 'Extend'); 743 #New_Prop(In => $name, $General{$name}, Fuzzy => 1); 744 } 745 else 746 { 747 ## normal (single-character) lines 748 gencat($name, $cat, $code, 'Append'); 749 750 # No Append() here since since several codes may map into one. 751 $To{Upper}->RawAppendRange($code, $code, $upper) if $upper; 752 $To{Lower}->RawAppendRange($code, $code, $lower) if $lower; 753 $To{Title}->RawAppendRange($code, $code, $title) if $title; 754 $To{Digit}->Append($code, $decimal) if length $decimal; 755 756 $Bidi->Append($code, $bidi); 757 $Comb->Append($code, $comb) if $comb; 758 $Number->Append($code, $number) if length $number; 759 760 $Mirrored->Append($code) if $mirrored eq "Y"; 761 762 $Bidi{$bidi} ||= Table->New(Is => "Bidi$bidi", 763 Desc => "Bi-directional category '$bidi'", 764 Fuzzy => 0); 765 $Bidi{$bidi}->Append($code); 766 767 if ($deco) 768 { 769 $Deco->Append($code, $deco); 770 if ($deco =~/^<(\w+)>/) 771 { 772 $Deco{Compat}->Append($code); 773 774 $DC{$1} ||= Table->New(Is => "DC$1", 775 Desc => "Compatible with '$1'", 776 Fuzzy => 0); 777 $DC{$1}->Append($code); 778 } 779 else 780 { 781 $Deco{Canon}->Append($code); 782 } 783 } 784 } 785 } 786 close IN; 787 788 ## 789 ## Tidy up a few special cases.... 790 ## 791 792 $Cat{Cn} = $Assigned->Invert; ## Cn is everything that doesn't exist 793 New_Prop(Is => 'Cn', 794 $Cat{Cn}, 795 Desc => "General Category 'Cn' [not functional in Perl]", 796 Fuzzy => 0); 797 798 ## Unassigned is the same as 'Cn' 799 New_Alias(Is => 'Unassigned', SameAs => 'Cn', Fuzzy => 0); 800 801 $Cat{C}->Replace($Cat{C}->Merge($Cat{Cn})); ## Now merge in Cn into C 802 803 804 # L& is Ll, Lu, and Lt. 805 New_Prop(Is => 'L&', 806 Table->Merge(@Cat{qw[Ll Lu Lt]}), 807 Desc => '[\p{Ll}\p{Lu}\p{Lt}]', 808 Fuzzy => 0); 809 810 ## Any and All are all code points. 811 my $Any = Table->New(Is => 'Any', 812 Desc => sprintf("[\\x{0000}-\\x{%X}]", 813 $LastUnicodeCodepoint), 814 Fuzzy => 0); 815 $Any->RawAppendRange(0, $LastUnicodeCodepoint); 816 817 New_Alias(Is => 'All', SameAs => 'Any', Fuzzy => 0); 818 819 ## 820 ## Build special properties for Perl's internal case-folding needs: 821 ## \p{_CaseIgnorable} 822 ## \p{_CanonDCIJ} 823 ## \p{_CombAbove} 824 ## _CombAbove was built above. Others are built here.... 825 ## 826 827 ## \p{_CaseIgnorable} is [\p{Mn}\0x00AD\x2010] 828 New_Prop(Is => '_CaseIgnorable', 829 Table->Merge($Cat{Mn}, 830 0x00AD, #SOFT HYPHEN 831 0x2010), #HYPHEN 832 Desc => '(for internal casefolding use)', 833 Fuzzy => 0); 834 835 836 ## \p{_CanonDCIJ} is fairly complex... 837 my $CanonCDIJ = Table->New(Is => '_CanonDCIJ', 838 Desc => '(for internal casefolding use)', 839 Fuzzy => 0); 840 ## It contains the ASCII 'i' and 'j'.... 841 $CanonCDIJ->Append(0x0069); # ASCII ord("i") 842 $CanonCDIJ->Append(0x006A); # ASCII ord("j") 843 ## ...and any character with a decomposition that starts with either of 844 ## those code points, but only if the decomposition does not have any 845 ## combining character with the "ABOVE" canonical combining class. 846 for my $code (sort { $a <=> $b} keys %CodeToDeco) 847 { 848 ## Need to ensure that all decomposition characters do not have 849 ## a %HexCodeToComb in %AboveCombClasses. 850 my $want = 1; 851 for my $deco_hexcode (split / /, $CodeToDeco{$code}) 852 { 853 if (exists $_Above_HexCodes{$deco_hexcode}) { 854 ## one of the decmposition chars has an ABOVE combination 855 ## class, so we're not interested in this one 856 $want = 0; 857 last; 858 } 859 } 860 if ($want) { 861 $CanonCDIJ->Append($code); 862 } 863 } 864 865 866 867 ## 868 ## Now dump the files. 869 ## 870 $Name->Write("Name.pl"); 871 $Bidi->Write("Bidirectional.pl"); 872 $Comb->Write("CombiningClass.pl"); 873 $Deco->Write("Decomposition.pl"); 874 $Number->Write("Number.pl"); 875 $General->Write("Category.pl"); 876 877 for my $to (sort keys %To) { 878 $To{$to}->Write("To/$to.pl"); 879 } 880} 881 882## 883## Process LineBreak.txt 884## 885sub LineBreak_Txt() 886{ 887 if (not open IN, "LineBreak.txt") { 888 die "$0: LineBreak.txt: $!\n"; 889 } 890 891 my $Lbrk = Table->New(); 892 my %Lbrk; 893 894 while (<IN>) 895 { 896 next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)/; 897 898 my ($first, $last, $lbrk) = (hex($1), hex($2||""), $3); 899 900 $Lbrk->Append($first, $lbrk); 901 902 $Lbrk{$lbrk} ||= Table->New(Is => "Lbrk$lbrk", 903 Desc => "Linebreak category '$lbrk'", 904 Fuzzy => 0); 905 $Lbrk{$lbrk}->Append($first); 906 907 if ($last) { 908 $Lbrk->Extend($last); 909 $Lbrk{$lbrk}->Extend($last); 910 } 911 } 912 close IN; 913 914 $Lbrk->Write("Lbrk.pl"); 915} 916 917## 918## Process ArabicShaping.txt. 919## 920sub ArabicShaping_txt() 921{ 922 if (not open IN, "ArabicShaping.txt") { 923 die "$0: ArabicShaping.txt: $!\n"; 924 } 925 926 my $ArabLink = Table->New(); 927 my $ArabLinkGroup = Table->New(); 928 929 while (<IN>) 930 { 931 next unless /^[0-9A-Fa-f]+;/; 932 s/\s+$//; 933 934 my ($hexcode, $name, $link, $linkgroup) = split(/\s*;\s*/); 935 my $code = hex($hexcode); 936 $ArabLink->Append($code, $link); 937 $ArabLinkGroup->Append($code, $linkgroup); 938 } 939 close IN; 940 941 $ArabLink->Write("ArabLink.pl"); 942 $ArabLinkGroup->Write("ArabLnkGrp.pl"); 943} 944 945## 946## Process Jamo.txt. 947## 948sub Jamo_txt() 949{ 950 if (not open IN, "Jamo.txt") { 951 die "$0: Jamo.txt: $!\n"; 952 } 953 my $Short = Table->New(); 954 955 while (<IN>) 956 { 957 next unless /^([0-9A-Fa-f]+)\s*;\s*(\w*)/; 958 my ($code, $short) = (hex($1), $2); 959 960 $Short->Append($code, $short); 961 } 962 close IN; 963 $Short->Write("JamoShort.pl"); 964} 965 966## 967## Process Scripts.txt. 968## 969sub Scripts_txt() 970{ 971 my @ScriptInfo; 972 973 if (not open(IN, "Scripts.txt")) { 974 die "$0: Scripts.txt: $!\n"; 975 } 976 while (<IN>) { 977 next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/; 978 979 # Wait until all the scripts have been read since 980 # they are not listed in numeric order. 981 push @ScriptInfo, [ hex($1), hex($2||""), $3 ]; 982 } 983 close IN; 984 985 # Now append the scripts properties in their code point order. 986 987 my %Script; 988 my $Scripts = Table->New(); 989 990 for my $script (sort { $a->[0] <=> $b->[0] } @ScriptInfo) 991 { 992 my ($first, $last, $name) = @$script; 993 $Scripts->Append($first, $name); 994 995 $Script{$name} ||= Table->New(Is => $name, 996 Desc => "Script '$name'", 997 Fuzzy => 1); 998 $Script{$name}->Append($first, $name); 999 1000 if ($last) { 1001 $Scripts->Extend($last); 1002 $Script{$name}->Extend($last); 1003 } 1004 } 1005 1006 $Scripts->Write("Scripts.pl"); 1007 1008 ## Common is everything not explicitly assigned to a Script 1009 ## 1010 ## ***shouldn't this be intersected with \p{Assigned}? ****** 1011 ## 1012 New_Prop(Is => 'Common', 1013 $Scripts->Invert, 1014 Desc => 'Pseudo-Script of codepoints not in other Unicode scripts', 1015 Fuzzy => 1); 1016} 1017 1018## 1019## Given a name like "Close Punctuation", return a regex (that when applied 1020## with /i) matches any valid form of that name (e.g. "ClosePunctuation", 1021## "Close-Punctuation", etc.) 1022## 1023## Accept any space, dash, or underbar where in the official name there is 1024## space or a dash (or underbar, but there never is). 1025## 1026## 1027sub NameToRegex($) 1028{ 1029 my $Name = shift; 1030 $Name =~ s/[- _]/(?:[-_]|\\s+)?/g; 1031 return $Name; 1032} 1033 1034## 1035## Process Blocks.txt. 1036## 1037sub Blocks_txt() 1038{ 1039 my $Blocks = Table->New(); 1040 my %Blocks; 1041 1042 if (not open IN, "Blocks.txt") { 1043 die "$0: Blocks.txt: $!\n"; 1044 } 1045 1046 while (<IN>) 1047 { 1048 #next if not /Private Use$/; 1049 next if not /^([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.+?)\s*$/; 1050 1051 my ($first, $last, $name) = (hex($1), hex($2), $3); 1052 1053 $Blocks->Append($first, $name); 1054 1055 $Blocks{$name} ||= Table->New(In => $name, 1056 Desc => "Block '$name'", 1057 Fuzzy => 1); 1058 $Blocks{$name}->Append($first, $name); 1059 1060 if ($last and $last != $first) { 1061 $Blocks->Extend($last); 1062 $Blocks{$name}->Extend($last); 1063 } 1064 } 1065 close IN; 1066 1067 $Blocks->Write("Blocks.pl"); 1068} 1069 1070## 1071## Read in the PropList.txt. It contains extended properties not 1072## listed in the UnicodeData.txt, such as 'Other_Alphabetic': 1073## alphabetic but not of the general category L; many modifiers 1074## belong to this extended property category: while they are not 1075## alphabets, they are alphabetic in nature. 1076## 1077sub PropList_txt() 1078{ 1079 my @PropInfo; 1080 1081 if (not open IN, "PropList.txt") { 1082 die "$0: PropList.txt: $!\n"; 1083 } 1084 1085 while (<IN>) 1086 { 1087 next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/; 1088 1089 # Wait until all the extended properties have been read since 1090 # they are not listed in numeric order. 1091 push @PropInfo, [ hex($1), hex($2||""), $3 ]; 1092 } 1093 close IN; 1094 1095 # Now append the extended properties in their code point order. 1096 my $Props = Table->New(); 1097 my %Prop; 1098 1099 for my $prop (sort { $a->[0] <=> $b->[0] } @PropInfo) 1100 { 1101 my ($first, $last, $name) = @$prop; 1102 $Props->Append($first, $name); 1103 1104 $Prop{$name} ||= Table->New(Is => $name, 1105 Desc => "Extended property '$name'", 1106 Fuzzy => 1); 1107 $Prop{$name}->Append($first, $name); 1108 1109 if ($last) { 1110 $Props->Extend($last); 1111 $Prop{$name}->Extend($last); 1112 } 1113 } 1114 1115 # Alphabetic is L and Other_Alphabetic. 1116 New_Prop(Is => 'Alphabetic', 1117 Table->Merge($Cat{L}, $Prop{Other_Alphabetic}), 1118 Desc => '[\p{L}\p{OtherAlphabetic}]', # use canonical names here 1119 Fuzzy => 1); 1120 1121 # Lowercase is Ll and Other_Lowercase. 1122 New_Prop(Is => 'Lowercase', 1123 Table->Merge($Cat{Ll}, $Prop{Other_Lowercase}), 1124 Desc => '[\p{Ll}\p{OtherLowercase}]', # use canonical names here 1125 Fuzzy => 1); 1126 1127 # Uppercase is Lu and Other_Uppercase. 1128 New_Prop(Is => 'Uppercase', 1129 Table->Merge($Cat{Lu}, $Prop{Other_Uppercase}), 1130 Desc => '[\p{Lu}\p{Other_Uppercase}]', # use canonical names here 1131 Fuzzy => 1); 1132 1133 # Math is Sm and Other_Math. 1134 New_Prop(Is => 'Math', 1135 Table->Merge($Cat{Sm}, $Prop{Other_Math}), 1136 Desc => '[\p{Sm}\p{OtherMath}]', # use canonical names here 1137 Fuzzy => 1); 1138 1139 # ID_Start is Ll, Lu, Lt, Lm, Lo, and Nl. 1140 New_Prop(Is => 'ID_Start', 1141 Table->Merge(@Cat{qw[Ll Lu Lt Lm Lo Nl]}), 1142 Desc => '[\p{Ll}\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{Nl}]', 1143 Fuzzy => 1); 1144 1145 # ID_Continue is ID_Start, Mn, Mc, Nd, and Pc. 1146 New_Prop(Is => 'ID_Continue', 1147 Table->Merge(@Cat{qw[Ll Lu Lt Lm Lo Nl Mn Mc Nd Pc ]}), 1148 Desc => '[\p{ID_Start}\p{Mn}\p{Mc}\p{Nd}\p{Pc}]', 1149 Fuzzy => 1); 1150} 1151 1152sub Make_GC_Aliases() 1153{ 1154 ## 1155 ## The mapping from General Category long forms to short forms is 1156 ## currently hardwired here since no simple data file in the UCD 1157 ## seems to do that. Unicode 3.2 will assumedly correct this. 1158 ## 1159 my %Is = ( 1160 'Letter' => 'L', 1161 'Uppercase_Letter' => 'Lu', 1162 'Lowercase_Letter' => 'Ll', 1163 'Titlecase_Letter' => 'Lt', 1164 'Modifier_Letter' => 'Lm', 1165 'Other_Letter' => 'Lo', 1166 1167 'Mark' => 'M', 1168 'Non_Spacing_Mark' => 'Mn', 1169 'Spacing_Mark' => 'Mc', 1170 'Enclosing_Mark' => 'Me', 1171 1172 'Separator' => 'Z', 1173 'Space_Separator' => 'Zs', 1174 'Line_Separator' => 'Zl', 1175 'Paragraph_Separator' => 'Zp', 1176 1177 'Number' => 'N', 1178 'Decimal_Number' => 'Nd', 1179 'Letter_Number' => 'Nl', 1180 'Other_Number' => 'No', 1181 1182 'Punctuation' => 'P', 1183 'Connector_Punctuation' => 'Pc', 1184 'Dash_Punctuation' => 'Pd', 1185 'Open_Punctuation' => 'Ps', 1186 'Close_Punctuation' => 'Pe', 1187 'Initial_Punctuation' => 'Pi', 1188 'Final_Punctuation' => 'Pf', 1189 'Other_Punctuation' => 'Po', 1190 1191 'Symbol' => 'S', 1192 'Math_Symbol' => 'Sm', 1193 'Currency_Symbol' => 'Sc', 1194 'Modifier_Symbol' => 'Sk', 1195 'Other_Symbol' => 'So', 1196 1197 'Other' => 'C', 1198 'Control' => 'Cc', 1199 'Format' => 'Cf', 1200 'Surrogate' => 'Cs', 1201 'Private Use' => 'Co', 1202 'Unassigned' => 'Cn', 1203 ); 1204 1205 ## make the aliases.... 1206 while (my ($Alias, $Name) = each %Is) { 1207 New_Alias(Is => $Alias, SameAs => $Name, Fuzzy => 1); 1208 } 1209} 1210 1211 1212## 1213## These are used in: 1214## MakePropTestScript() 1215## WriteAllMappings() 1216## for making the test script. 1217## 1218my %FuzzyNameToTest; 1219my %ExactNameToTest; 1220 1221 1222## This used only for making the test script 1223sub GenTests($$$$) 1224{ 1225 my $FH = shift; 1226 my $Prop = shift; 1227 my $MatchCode = shift; 1228 my $FailCode = shift; 1229 1230 if (defined $MatchCode) { 1231 printf $FH qq/Expect(1, "\\x{%04X}", '\\p{$Prop}' );\n/, $MatchCode; 1232 printf $FH qq/Expect(0, "\\x{%04X}", '\\p{^$Prop}');\n/, $MatchCode; 1233 printf $FH qq/Expect(0, "\\x{%04X}", '\\P{$Prop}' );\n/, $MatchCode; 1234 printf $FH qq/Expect(1, "\\x{%04X}", '\\P{^$Prop}');\n/, $MatchCode; 1235 } 1236 if (defined $FailCode) { 1237 printf $FH qq/Expect(0, "\\x{%04X}", '\\p{$Prop}' );\n/, $FailCode; 1238 printf $FH qq/Expect(1, "\\x{%04X}", '\\p{^$Prop}');\n/, $FailCode; 1239 printf $FH qq/Expect(1, "\\x{%04X}", '\\P{$Prop}' );\n/, $FailCode; 1240 printf $FH qq/Expect(0, "\\x{%04X}", '\\P{^$Prop}');\n/, $FailCode; 1241 } 1242} 1243 1244## This used only for making the test script 1245sub ExpectError($$) 1246{ 1247 my $FH = shift; 1248 my $prop = shift; 1249 1250 print $FH qq/Error('\\p{$prop}');\n/; 1251 print $FH qq/Error('\\P{$prop}');\n/; 1252} 1253 1254## This used only for making the test script 1255my @GoodSeps = ( 1256 " ", 1257 "-", 1258 " \t ", 1259 "", 1260 "", 1261 "_", 1262 ); 1263my @BadSeps = ( 1264 "--", 1265 "__", 1266 " _", 1267 "/" 1268 ); 1269 1270## This used only for making the test script 1271sub RandomlyFuzzifyName($;$) 1272{ 1273 my $Name = shift; 1274 my $WantError = shift; ## if true, make an error 1275 1276 my @parts; 1277 for my $part (split /[-\s_]+/, $Name) 1278 { 1279 if (@parts) { 1280 if ($WantError and rand() < 0.3) { 1281 push @parts, $BadSeps[rand(@BadSeps)]; 1282 $WantError = 0; 1283 } else { 1284 push @parts, $GoodSeps[rand(@GoodSeps)]; 1285 } 1286 } 1287 my $switch = int rand(4); 1288 if ($switch == 0) { 1289 push @parts, uc $part; 1290 } elsif ($switch == 1) { 1291 push @parts, lc $part; 1292 } elsif ($switch == 2) { 1293 push @parts, ucfirst $part; 1294 } else { 1295 push @parts, $part; 1296 } 1297 } 1298 my $new = join('', @parts); 1299 1300 if ($WantError) { 1301 if (rand() >= 0.5) { 1302 $new .= $BadSeps[rand(@BadSeps)]; 1303 } else { 1304 $new = $BadSeps[rand(@BadSeps)] . $new; 1305 } 1306 } 1307 return $new; 1308} 1309 1310## This used only for making the test script 1311sub MakePropTestScript() 1312{ 1313 ## this written directly -- it's huge. 1314 if (not open OUT, ">TestProp.pl") { 1315 die "$0: TestProp.pl: $!\n"; 1316 } 1317 print OUT <DATA>; 1318 1319 while (my ($Name, $Table) = each %ExactNameToTest) 1320 { 1321 GenTests(*OUT, $Name, $Table->ValidCode, $Table->InvalidCode); 1322 ExpectError(*OUT, uc $Name) if uc $Name ne $Name; 1323 ExpectError(*OUT, lc $Name) if lc $Name ne $Name; 1324 } 1325 1326 1327 while (my ($Name, $Table) = each %FuzzyNameToTest) 1328 { 1329 my $Orig = $CanonicalToOrig{$Name}; 1330 my %Names = ( 1331 $Name => 1, 1332 $Orig => 1, 1333 RandomlyFuzzifyName($Orig) => 1 1334 ); 1335 1336 for my $N (keys %Names) { 1337 GenTests(*OUT, $N, $Table->ValidCode, $Table->InvalidCode); 1338 } 1339 1340 ExpectError(*OUT, RandomlyFuzzifyName($Orig, 'ERROR')); 1341 } 1342 1343 print OUT "Finished();\n"; 1344 close OUT; 1345} 1346 1347 1348## 1349## These are used only in: 1350## RegisterFileForName() 1351## WriteAllMappings() 1352## 1353my %Exact; ## will become %utf8::Exact; 1354my %Canonical; ## will become %utf8::Canonical; 1355my %CaComment; ## Comment for %Canonical entry of same key 1356 1357## 1358## Given info about a name and a datafile that it should be associated with, 1359## register that assocation in %Exact and %Canonical. 1360sub RegisterFileForName($$$$) 1361{ 1362 my $Type = shift; 1363 my $Name = shift; 1364 my $IsFuzzy = shift; 1365 my $filename = shift; 1366 1367 ## 1368 ## Now in details for the mapping. $Type eq 'Is' has the 1369 ## Is removed, as it will be removed in utf8_heavy when this 1370 ## data is being checked. In keeps its "In", but a second 1371 ## sans-In record is written if it doesn't conflict with 1372 ## anything already there. 1373 ## 1374 if (not $IsFuzzy) 1375 { 1376 if ($Type eq 'Is') { 1377 die "oops[$Name]" if $Exact{$Name}; 1378 $Exact{$Name} = $filename; 1379 } else { 1380 die "oops[$Type$Name]" if $Exact{"$Type$Name"}; 1381 $Exact{"$Type$Name"} = $filename; 1382 $Exact{$Name} = $filename if not $Exact{$Name}; 1383 } 1384 } 1385 else 1386 { 1387 my $CName = lc $Name; 1388 if ($Type eq 'Is') { 1389 die "oops[$CName]" if $Canonical{$CName}; 1390 $Canonical{$CName} = $filename; 1391 $CaComment{$CName} = $Name if $Name =~ tr/A-Z// >= 2; 1392 } else { 1393 die "oops[$Type$CName]" if $Canonical{lc "$Type$CName"}; 1394 $Canonical{lc "$Type$CName"} = $filename; 1395 $CaComment{lc "$Type$CName"} = "$Type$Name"; 1396 if (not $Canonical{$CName}) { 1397 $Canonical{$CName} = $filename; 1398 $CaComment{$CName} = "$Type$Name"; 1399 } 1400 } 1401 } 1402} 1403 1404## 1405## Writes the info accumulated in 1406## 1407## %TableInfo; 1408## %FuzzyNames; 1409## %AliasInfo; 1410## 1411## 1412sub WriteAllMappings() 1413{ 1414 my @MAP; 1415 1416 my %BaseNames; ## Base names already used (for avoiding 8.3 conflicts) 1417 1418 ## 'Is' *MUST* come first, so its names have precidence over 'In's 1419 for my $Type ('Is', 'In') 1420 { 1421 my %RawNameToFile; ## a per-$Type cache 1422 1423 for my $Name (sort {length $a <=> length $b} keys %{$TableInfo{$Type}}) 1424 { 1425 ## Note: $Name is already canonical 1426 my $Table = $TableInfo{$Type}->{$Name}; 1427 my $IsFuzzy = $FuzzyNames{$Type}->{$Name}; 1428 1429 ## Need an 8.3 safe filename (which means "an 8 safe" $filename) 1430 my $filename; 1431 { 1432 ## 'Is' items lose 'Is' from the basename. 1433 $filename = $Type eq 'Is' ? $Name : "$Type$Name"; 1434 1435 $filename =~ s/[^\w_]+/_/g; # "L&" -> "L_" 1436 substr($filename, 8) = '' if length($filename) > 8; 1437 1438 ## 1439 ## Make sure the basename doesn't conflict with something we 1440 ## might have already written. If we have, say, 1441 ## InGreekExtended1 1442 ## InGreekExtended2 1443 ## they become 1444 ## InGreekE 1445 ## InGreek2 1446 ## 1447 while (my $num = $BaseNames{lc $filename}++) 1448 { 1449 $num++; ## so basenames with numbers start with '2', which 1450 ## just looks more natural. 1451 ## Want to append $num, but if it'll make the basename longer 1452 ## than 8 characters, pre-truncate $filename so that the result 1453 ## is acceptable. 1454 my $delta = length($filename) + length($num) - 8; 1455 if ($delta > 0) { 1456 substr($filename, -$delta) = $num; 1457 } else { 1458 $filename .= $num; 1459 } 1460 } 1461 }; 1462 1463 ## 1464 ## Construct a nice comment to add to the file, and build data 1465 ## for the "./Properties" file along the way. 1466 ## 1467 my $Comment; 1468 { 1469 my $Desc = $TableDesc{$Type}->{$Name} || ""; 1470 ## get list of names this table is reference by 1471 my @Supported = $Name; 1472 while (my ($Orig, $Alias) = each %{ $AliasInfo{$Type} }) 1473 { 1474 if ($Orig eq $Name) { 1475 push @Supported, $Alias; 1476 } 1477 } 1478 1479 my $TypeToShow = $Type eq 'Is' ? "" : $Type; 1480 my $OrigProp; 1481 1482 $Comment = "This file supports:\n"; 1483 for my $N (@Supported) 1484 { 1485 my $IsFuzzy = $FuzzyNames{$Type}->{$N}; 1486 my $Prop = "\\p{$TypeToShow$Name}"; 1487 $OrigProp = $Prop if not $OrigProp; #cache for aliases 1488 if ($IsFuzzy) { 1489 $Comment .= "\t$Prop (and fuzzy permutations)\n"; 1490 } else { 1491 $Comment .= "\t$Prop\n"; 1492 } 1493 my $MyDesc = ($N eq $Name) ? $Desc : "Alias for $OrigProp ($Desc)"; 1494 1495 push @MAP, sprintf("%s %-42s %s\n", 1496 $IsFuzzy ? '*' : ' ', $Prop, $MyDesc); 1497 } 1498 if ($Desc) { 1499 $Comment .= "\nMeaning: $Desc\n"; 1500 } 1501 1502 } 1503 ## 1504 ## Okay, write the file... 1505 ## 1506 $Table->Write("lib/$filename.pl", $Comment); 1507 1508 ## and register it 1509 $RawNameToFile{$Name} = $filename; 1510 RegisterFileForName($Type => $Name, $IsFuzzy, $filename); 1511 1512 if ($IsFuzzy) 1513 { 1514 my $CName = CanonicalName($Type . '_'. $Name); 1515 $FuzzyNameToTest{$Name} = $Table if !$FuzzyNameToTest{$Name}; 1516 $FuzzyNameToTest{$CName} = $Table if !$FuzzyNameToTest{$CName}; 1517 } else { 1518 $ExactNameToTest{$Name} = $Table; 1519 } 1520 1521 } 1522 1523 ## Register aliase info 1524 for my $Name (sort {length $a <=> length $b} keys %{$AliasInfo{$Type}}) 1525 { 1526 my $Alias = $AliasInfo{$Type}->{$Name}; 1527 my $IsFuzzy = $FuzzyNames{$Type}->{$Alias}; 1528 my $filename = $RawNameToFile{$Name}; 1529 die "oops [$Alias]->[$Name]" if not $filename; 1530 RegisterFileForName($Type => $Alias, $IsFuzzy, $filename); 1531 1532 my $Table = $TableInfo{$Type}->{$Name}; 1533 die "oops" if not $Table; 1534 if ($IsFuzzy) 1535 { 1536 my $CName = CanonicalName($Type .'_'. $Alias); 1537 $FuzzyNameToTest{$Alias} = $Table if !$FuzzyNameToTest{$Alias}; 1538 $FuzzyNameToTest{$CName} = $Table if !$FuzzyNameToTest{$CName}; 1539 } else { 1540 $ExactNameToTest{$Alias} = $Table; 1541 } 1542 } 1543 } 1544 1545 ## 1546 ## Write out the property list 1547 ## 1548 { 1549 my @OUT = ( 1550 "##\n", 1551 "## This file created by $0\n", 1552 "## List of built-in \\p{...}/\\P{...} properties.\n", 1553 "##\n", 1554 "## '*' means name may be 'fuzzy'\n", 1555 "##\n\n", 1556 sort { substr($a,2) cmp substr($b, 2) } @MAP, 1557 ); 1558 WriteIfChanged('Properties', @OUT); 1559 } 1560 1561 use Text::Tabs (); ## using this makes the files about half the size 1562 1563 ## Write Exact.pl 1564 { 1565 my @OUT = ( 1566 $HEADER, 1567 "##\n", 1568 "## Data in this file used by ../utf8_heavy.pl\n", 1569 "##\n\n", 1570 "## Mapping from name to filename in ./lib\n", 1571 "%utf8::Exact = (\n", 1572 ); 1573 1574 for my $Name (sort keys %Exact) 1575 { 1576 my $File = $Exact{$Name}; 1577 $Name = $Name =~ m/\W/ ? qq/'$Name'/ : " $Name "; 1578 my $Text = sprintf("%-15s => %s,\n", $Name, qq/'$File'/); 1579 push @OUT, Text::Tabs::unexpand($Text); 1580 } 1581 push @OUT, ");\n1;\n"; 1582 1583 WriteIfChanged('Exact.pl', @OUT); 1584 } 1585 1586 ## Write Canonical.pl 1587 { 1588 my @OUT = ( 1589 $HEADER, 1590 "##\n", 1591 "## Data in this file used by ../utf8_heavy.pl\n", 1592 "##\n\n", 1593 "## Mapping from lc(canonical name) to filename in ./lib\n", 1594 "%utf8::Canonical = (\n", 1595 ); 1596 my $Trail = ""; ## used just to keep the spacing pretty 1597 for my $Name (sort keys %Canonical) 1598 { 1599 my $File = $Canonical{$Name}; 1600 if ($CaComment{$Name}) { 1601 push @OUT, "\n" if not $Trail; 1602 push @OUT, " # $CaComment{$Name}\n"; 1603 $Trail = "\n"; 1604 } else { 1605 $Trail = ""; 1606 } 1607 $Name = $Name =~ m/\W/ ? qq/'$Name'/ : " $Name "; 1608 my $Text = sprintf(" %-41s => %s,\n$Trail", $Name, qq/'$File'/); 1609 push @OUT, Text::Tabs::unexpand($Text); 1610 } 1611 push @OUT, ");\n1\n"; 1612 WriteIfChanged('Canonical.pl', @OUT); 1613 } 1614 1615 MakePropTestScript() if $MakeTestScript; 1616} 1617 1618 1619sub SpecialCasing_txt() 1620{ 1621 # 1622 # Read in the special cases. 1623 # 1624 1625 my %CaseInfo; 1626 1627 if (not open IN, "SpecialCasing.txt") { 1628 die "$0: SpecialCasing.txt: $!\n"; 1629 } 1630 while (<IN>) { 1631 next unless /^[0-9A-Fa-f]+;/; 1632 s/\#.*//; 1633 s/\s+$//; 1634 1635 my ($code, $lower, $title, $upper, $condition) = split(/\s*;\s*/); 1636 1637 if ($condition) { # not implemented yet 1638 print "# SKIPPING $_\n" if $Verbose; 1639 next; 1640 } 1641 1642 # Wait until all the special cases have been read since 1643 # they are not listed in numeric order. 1644 my $ix = hex($code); 1645 push @{$CaseInfo{Lower}}, [ $ix, $code, $lower ] 1646 unless $code eq $lower; 1647 push @{$CaseInfo{Title}}, [ $ix, $code, $title ] 1648 unless $code eq $title; 1649 push @{$CaseInfo{Upper}}, [ $ix, $code, $upper ] 1650 unless $code eq $upper; 1651 } 1652 close IN; 1653 1654 # Now write out the special cases properties in their code point order. 1655 # Prepend them to the To/{Upper,Lower,Title}.pl. 1656 1657 for my $case (qw(Lower Title Upper)) 1658 { 1659 my $NormalCase = do "To/$case.pl" || die "$0: $@\n"; 1660 1661 my @OUT = 1662 ( 1663 $HEADER, "\n", 1664 "# The key UTF-8 _bytes_, the value UTF-8 (speed hack)\n", 1665 "%utf8::ToSpec$case =\n(\n", 1666 ); 1667 1668 for my $prop (sort { $a->[0] <=> $b->[0] } @{$CaseInfo{$case}}) { 1669 my ($ix, $code, $to) = @$prop; 1670 my $tostr = 1671 join "", map { sprintf "\\x{%s}", $_ } split ' ', $to; 1672 push @OUT, sprintf qq["%s" => "$tostr",\n], join("", map { sprintf "\\x%02X", $_ } unpack("U0C*", pack("U", $ix))); 1673 # Remove any single-character mappings for 1674 # the same character since we are going for 1675 # the special casing rules. 1676 $NormalCase =~ s/^$code\t\t\w+\n//m; 1677 } 1678 push @OUT, ( 1679 ");\n\n", 1680 "return <<'END';\n", 1681 $NormalCase, 1682 "END\n" 1683 ); 1684 WriteIfChanged("To/$case.pl", @OUT); 1685 } 1686} 1687 1688# 1689# Read in the case foldings. 1690# 1691# We will do full case folding, C + F + I (see CaseFolding.txt). 1692# 1693sub CaseFolding_txt() 1694{ 1695 if (not open IN, "CaseFolding.txt") { 1696 die "$0: CaseFolding.txt: $!\n"; 1697 } 1698 1699 my $Fold = Table->New(); 1700 my %Fold; 1701 1702 while (<IN>) { 1703 # Skip status 'S', simple case folding 1704 next unless /^([0-9A-Fa-f]+)\s*;\s*([CFI])\s*;\s*([0-9A-Fa-f]+(?: [0-9A-Fa-f]+)*)\s*;/; 1705 1706 my ($code, $status, $fold) = (hex($1), $2, $3); 1707 1708 if ($status eq 'C') { # Common: one-to-one folding 1709 # No append() since several codes may fold into one. 1710 $Fold->RawAppendRange($code, $code, $fold); 1711 } else { # F: full, or I: dotted uppercase I -> dotless lowercase I 1712 $Fold{$code} = $fold; 1713 } 1714 } 1715 close IN; 1716 1717 $Fold->Write("To/Fold.pl"); 1718 1719 # 1720 # Prepend the special foldings to the common foldings. 1721 # 1722 my $CommonFold = do "To/Fold.pl" || die "$0: To/Fold.pl: $!\n"; 1723 1724 my @OUT = 1725 ( 1726 $HEADER, "\n", 1727 "# The ke UTF-8 _bytes_, the value UTF-8 (speed hack)\n", 1728 "%utf8::ToSpecFold =\n(\n", 1729 ); 1730 for my $code (sort { $a <=> $b } keys %Fold) { 1731 my $foldstr = 1732 join "", map { sprintf "\\x{%s}", $_ } split ' ', $Fold{$code}; 1733 push @OUT, sprintf qq["%s" => "$foldstr",\n], join("", map { sprintf "\\x%02X", $_ } unpack("U0C*", pack("U", $code))); 1734 } 1735 push @OUT, ( 1736 ");\n\n", 1737 "return <<'END';\n", 1738 $CommonFold, 1739 "END\n", 1740 ); 1741 1742 WriteIfChanged("To/Fold.pl", @OUT); 1743} 1744 1745## Do it.... 1746 1747UnicodeData_Txt(); 1748Make_GC_Aliases(); 1749PropList_txt(); 1750 1751Scripts_txt(); 1752Blocks_txt(); 1753 1754WriteAllMappings(); 1755 1756LineBreak_Txt(); 1757ArabicShaping_txt(); 1758Jamo_txt(); 1759SpecialCasing_txt(); 1760CaseFolding_txt(); 1761 1762exit(0); 1763 1764## TRAILING CODE IS USED BY MakePropTestScript() 1765__DATA__ 1766use strict; 1767use warnings; 1768 1769my $Tests = 0; 1770my $Fails = 0; 1771 1772sub Expect($$$) 1773{ 1774 my $Expect = shift; 1775 my $String = shift; 1776 my $Regex = shift; 1777 my $Line = (caller)[2]; 1778 1779 $Tests++; 1780 my $RegObj; 1781 my $result = eval { 1782 $RegObj = qr/$Regex/; 1783 $String =~ $RegObj ? 1 : 0 1784 }; 1785 1786 if (not defined $result) { 1787 print "couldn't compile /$Regex/ on $0 line $Line: $@\n"; 1788 $Fails++; 1789 } elsif ($result ^ $Expect) { 1790 print "bad result (expected $Expect) on $0 line $Line: $@\n"; 1791 $Fails++; 1792 } 1793} 1794 1795sub Error($) 1796{ 1797 my $Regex = shift; 1798 $Tests++; 1799 if (eval { 'x' =~ qr/$Regex/; 1 }) { 1800 $Fails++; 1801 my $Line = (caller)[2]; 1802 print "expected error for /$Regex/ on $0 line $Line: $@\n"; 1803 } 1804} 1805 1806sub Finished() 1807{ 1808 if ($Fails == 0) { 1809 print "All $Tests tests passed.\n"; 1810 exit(0); 1811 } else { 1812 print "$Tests tests, $Fails failed!\n"; 1813 exit(-1); 1814 } 1815} 1816