1use v5.16.0; 2use strict; 3use warnings; 4no warnings 'experimental::regex_sets'; 5require './regen/regen_lib.pl'; 6require './regen/charset_translations.pl'; 7use Unicode::UCD qw(prop_invlist prop_invmap search_invlist); 8use charnames qw(:loose); 9binmode(STDERR, ":utf8"); 10 11# Set this to 1 temporarily to get on stderr the complete list of paired 12# string delimiters this generates. This list is suitable for plugging into a 13# pod. 14my $output_lists = 0; 15 16# Set this to 1 temporarily to get on stderr the complete list of punctuation 17# marks and symbols that look to be directional but we didn't include for some 18# reason. 19my $output_omitteds = 0; 20 21my $out_fh = open_new('unicode_constants.h', '>', 22 {style => '*', by => $0, 23 from => "Unicode data"}); 24 25print $out_fh <<END; 26 27#ifndef PERL_UNICODE_CONSTANTS_H_ /* Guard against nested #includes */ 28#define PERL_UNICODE_CONSTANTS_H_ 1 29 30/* This file contains #defines for the version of Unicode being used and 31 * various Unicode code points. The values the code point macros expand to 32 * are the native Unicode code point, or all or portions of the UTF-8 encoding 33 * for the code point. In the former case, the macro name has the suffix 34 * "_NATIVE"; otherwise, the suffix "_UTF8". 35 * 36 * The macros that have the suffix "_UTF8" may have further suffixes, as 37 * follows: 38 * "_FIRST_BYTE" if the value is just the first byte of the UTF-8 39 * representation; the value will be a numeric constant. 40 * "_FIRST_BYTEs" same, but the first byte is represented as a literal 41 * string 42 * "_TAIL" if instead it represents all but the first byte. This, 43 * and with no additional suffix are both string constants */ 44 45/* 46=for apidoc_section \$unicode 47 48=for apidoc AmnU|const char *|BOM_UTF8 49 50This is a macro that evaluates to a string constant of the UTF-8 bytes that 51define the Unicode BYTE ORDER MARK (U+FEFF) for the platform that perl 52is compiled on. This allows code to use a mnemonic for this character that 53works on both ASCII and EBCDIC platforms. 54S<C<sizeof(BOM_UTF8) - 1>> can be used to get its length in 55bytes. 56 57=for apidoc AmnU|const char *|REPLACEMENT_CHARACTER_UTF8 58 59This is a macro that evaluates to a string constant of the UTF-8 bytes that 60define the Unicode REPLACEMENT CHARACTER (U+FFFD) for the platform that perl 61is compiled on. This allows code to use a mnemonic for this character that 62works on both ASCII and EBCDIC platforms. 63S<C<sizeof(REPLACEMENT_CHARACTER_UTF8) - 1>> can be used to get its length in 64bytes. 65 66=cut 67*/ 68 69END 70 71sub backslash_x_form($$;$) { 72 # Output the code point represented by the byte string $bytes as a 73 # sequence of \x{} constants. $bytes should be the UTF-8 for the code 74 # point if the final parameter is absent or empty. Otherwise it should be 75 # the Latin1 code point itself. 76 # 77 # The output is translated into the character set '$charset'. 78 79 my ($bytes, $charset, $non_utf8) = @_; 80 if ($non_utf8) { 81 die "Must be utf8 if above 255" if $bytes > 255; 82 my $a2n = get_a2n($charset); 83 return sprintf "\\x%02X", $a2n->[$bytes]; 84 } 85 else { 86 return join "", map { sprintf "\\x%02X", ord $_ } 87 split //, cp_2_utfbytes($bytes, $charset); 88 } 89} 90 91# The most complicated thing this program does is generate paired string 92# delimiters from the Unicode database. Some of these come from the 93# Unicode Bidirectional (bidi) algorithm. 94 95# These all visually look like left and right delimiters 96my @bidi_strong_lefts = ( 'LESS-THAN', 97 'ELEMENT OF', 98 'PRECEDE', 99 'PRECEDES', 100 'SMALLER THAN', 101 'SUBSET', 102 ); 103my @bidi_strong_rights = ( 'GREATER-THAN', 104 'CONTAINS', 105 'SUCCEED', 106 'SUCCEEDS', 107 'LARGER THAN', 108 'SUPERSET', 109 ); 110 111# Create an array of hashes for these, so as to translate between them, and 112# avoid recompiling patterns in the loop. 113my @bidi_strong_directionals; 114for (my $i = 0; $i < @bidi_strong_lefts; $i++) { 115 push @bidi_strong_directionals, 116 { 117 LHS => $bidi_strong_lefts[$i], 118 RHS => $bidi_strong_rights[$i], 119 L_pattern => qr/\b$bidi_strong_lefts[$i]\b/, 120 R_pattern => qr/\b$bidi_strong_rights[$i]\b/, 121 }; 122} 123 124my @ok_bidi_symbols = ( 125 'TACK', 126 'TURNSTILE', 127 ); 128my $ok_bidi_symbols_re = join '|', @ok_bidi_symbols; 129$ok_bidi_symbols_re = qr/\b($ok_bidi_symbols_re)\b/n; 130 131 132# Many characters have mirrors that Unicode hasn't included in their Bidi 133# algorithm. This program uses their names to find them. The next few 134# definitions are towards that end. 135 136# Most horizontal directionality is based on LEFT vs RIGHT. But it's 137# complicated: 138# 1) a barb on one or the other side of a harpoon doesn't indicate 139# directionality of the character. (A HARPOON is the word Unicode uses 140# to indicate an arrow with a one-sided tip.) 141my $no_barb_re = qr/(*nlb:BARB )/; 142 143# 2) RIGHT-SHADED doesn't signify anything about direction of the character 144# itself. These are the suffixes Unicode uses to indicate this. /aa is 145# needed because the wildcard names feature currently requires it for names. 146my $shaded_re = qr/ [- ] (SHADED | SHADOWED) /naax; 147 148# 3a) there are a few anomalies caught here. 'LEFT LUGGAGE' would have been 149# better named UNCLAIMED, and doesn't indicate directionality. 150my $real_LEFT_re = qr/ \b $no_barb_re LEFT (*nla: $shaded_re) 151 (*nla: [ ] LUGGAGE \b) 152 /nx; 153# 3b) And in most cases,a RIGHT TRIANGLE also doesn't refer to 154# directionality, but indicates it contains a 90 degree angle. 155my $real_RIGHT_re = qr/ \b $no_barb_re RIGHT (*nla: $shaded_re) 156 (*nla: [ ] (TRI)? ANGLE \b) 157 /nx; 158# More items could be added to these as needed 159 160# 4) something that is pointing R goes on the left, so is different than 161# the character on the R. For example, a RIGHT BRACKET would be 162# different from a RIGHT-FACING bracket. These patterns capture the 163# typical ways that Unicode character names indicate the latter meaning 164# as a suffix to RIGHT or LEFT 165my $pointing_suffix_re = qr/ ( WARDS # e.g., RIGHTWARDS 166 | [ ] ARROW # A R arrow points to the R 167 | [ -] FACING 168 | [ -] POINTING 169 | [ ] PENCIL # Implies a direction of its 170 # point 171 ) \b /nx; 172# And correspondingly for a prefix for LEFT RIGHT 173my $pointing_prefix_re = qr/ \b ( # e.g. UP RIGHT implies a direction 174 UP ( [ ] AND)? 175 | DOWN ( [ ] AND)? 176 | CONVERGING 177 | POINTING [ ] (DIRECTLY)? 178 | TO [ ] THE 179 ) 180 [ ] 181 /nx; 182 183my @other_directionals = 184 { 185 LHS => 'LEFT', 186 RHS => 'RIGHT', 187 L_pattern => 188 # Something goes on the left if it contains LEFT and doesn't 189 # point left, or it contains RIGHT and does point right. 190 qr/ \b (*nlb: $pointing_prefix_re) $real_LEFT_re 191 (*nla: $pointing_suffix_re) 192 | \b (*plb: $pointing_prefix_re) $real_RIGHT_re \b 193 | \b $real_RIGHT_re (*pla: $pointing_suffix_re) 194 /nx, 195 R_pattern => 196 qr/ \b (*nlb: $pointing_prefix_re) $real_RIGHT_re 197 (*nla: $pointing_suffix_re) 198 | \b (*plb: $pointing_prefix_re) $real_LEFT_re \b 199 | \b $real_LEFT_re (*pla: $pointing_suffix_re) 200 /nx, 201 }; 202 203# Some horizontal directionality is based on EAST vs WEST. These words are 204# almost always used by Unicode to indicate the direction pointing to, without 205# the general consistency in phrasing in L/R above. There are a handful of 206# possible exceptions, with only WEST WIND ever at all possibly an issue 207push @other_directionals, 208 { 209 LHS => 'EAST', 210 RHS => 'WEST', 211 L_pattern => qr/ \b ( EAST (*nla: [ ] WIND) 212 | WEST (*pla: [ ] WIND)) \b /x, 213 R_pattern => qr/ \b ( WEST (*nla: [ ] WIND) 214 | EAST (*pla: [ ] WIND)) \b /x, 215 }; 216 217# The final way the Unicode signals mirroring is by using the words REVERSE or 218# REVERSED; 219my $reverse_re = qr/ \b REVERSE D? [- ] /x; 220 221# Create a mapping from each direction to its opposite one 222my %opposite_of; 223foreach my $directional (@bidi_strong_directionals, @other_directionals) { 224 $opposite_of{$directional->{LHS}} = $directional->{RHS}; 225 $opposite_of{$directional->{RHS}} = $directional->{LHS}; 226} 227 228# Join the two types of each direction as alternatives 229my $L_re = join "|", map { $_->{L_pattern} } @bidi_strong_directionals, 230 @other_directionals; 231my $R_re = join "|", map { $_->{R_pattern} } @bidi_strong_directionals, 232 @other_directionals; 233# And anything containing directionality will be either one of these two 234my $directional_re = join "|", $L_re, $R_re; 235 236# Now compile the strings that result from above 237$L_re = qr/$L_re/; 238$R_re = qr/$R_re/; 239$directional_re = qr/($directional_re)/; # Make sure to capture $1 240 241my @included_symbols = ( 242 0x2326, 0x232B, # ERASE 243 0x23E9 .. 0x23EA, # DOUBLE TRIANGLE 244 0x23ED .. 0x23EE, # DOUBLE TRIANGLE with BAR 245 0x269E .. 0x269F, # THREE LINES CONVERGING 246 0x1D102 .. 0x1D103, # MUSIC STAVES 247 0x1D106 .. 0x1D107, # MUSIC STAVES 248 0x1F57B, # TELEPHONE RECEIVER 249 0x1F57D, # TELEPHONE RECEIVER 250 0x1F508 .. 0x1F50A, # LOUD SPEAKER 251 0x1F568 .. 0x1F56A, # LOUD SPEAKER 252 0x1F5E6 .. 0x1F5E7, # THREE RAYS 253 ); 254my %included_symbols; 255$included_symbols{$_} = 1 for @included_symbols; 256 257sub format_pairs_line($;$) { 258 my ($from, $to) = @_; 259 260 # Format a line containing a character singleton or pair in preparation 261 # for output, suitable for pod. 262 263 my $lhs_name = charnames::viacode($from); 264 my $lhs_hex = sprintf "%04X", $from; 265 my $rhs_name; 266 my $rhs_hex; 267 my $name = $lhs_name; 268 269 my $hanging_indent = 26; 270 271 # Treat a trivial pair as a singleton 272 undef $to if defined $to && $to == $from; 273 274 if (defined $to) { 275 my $rhs_name = charnames::viacode($to); 276 $rhs_hex = sprintf "%04X", $to; 277 278 # Most of the names differ only in LEFT vs RIGHT; some in 279 # LESS-THAN vs GREATER-THAN. It takes less space, and is easier to 280 # understand if they are displayed combined. 281 if ($name =~ s/$directional_re/$opposite_of{$1}/gr eq $rhs_name) { 282 $name =~ s,$directional_re,$1/$opposite_of{$1},g; 283 } 284 else { # Otherwise, display them sequentially 285 $name .= ", " . $rhs_name; 286 } 287 } 288 289 # Handle double-width characters, based on the East Asian Width property. 290 # Add an extra space to non-wide ones so things stay vertically aligned. 291 my $extra = 0; 292 my $output_line = " " # Indent in case output being used for verbatim 293 # pod 294 . chr $from; 295 if (chr($from) =~ /[\p{EA=W}\p{EA=F}]/) { 296 $extra++; # The length() will be shorter than the displayed 297 # width 298 } 299 else { 300 $output_line .= " "; 301 } 302 if (defined $to) { 303 $output_line .= " " . chr $to; 304 if (chr($to) =~ /[\p{EA=W}\p{EA=F}]/) { 305 $extra++; 306 } 307 else { 308 $output_line .= " "; 309 } 310 } 311 else { 312 $output_line .= " "; 313 } 314 315 $output_line .= " U+$lhs_hex"; 316 $output_line .= ", U+$rhs_hex" if defined $to;; 317 my $cur_len = $extra + length $output_line; 318 $output_line .= " " x ($hanging_indent - $cur_len); 319 320 my $max_len = 74; # Pod formatter will indent 4 spaces 321 $cur_len = length $output_line; 322 323 if ($cur_len + length $name <= $max_len) { 324 $output_line .= $name; # It will fit 325 } 326 else { # It won't fit. Append a segment that is unbreakable until would 327 # exceed the available width; then start on a new line 328 # Doesn't handle the case where the whole segment doesn't fit; 329 # this just doesn't come up with the input data. 330 while ($name =~ / ( .+? ) \b{lb} /xg) { 331 my $segment = $1; 332 my $added_length = length $segment; 333 if ($cur_len + $added_length > $max_len) { 334 $output_line =~ s/ +$//; 335 $output_line .= "\n" . " " x $hanging_indent; 336 $cur_len = $hanging_indent; 337 } 338 339 $output_line .= $segment; 340 $cur_len += $added_length; 341 } 342 } 343 344 return $output_line . "\n"; 345} 346 347my $version = Unicode::UCD::UnicodeVersion(); 348my ($major, $dot, $dotdot) = $version =~ / (.*?) \. (.*?) (?: \. (.*) )? $ /x; 349$dotdot = 0 unless defined $dotdot; 350 351print $out_fh <<END; 352#define UNICODE_MAJOR_VERSION $major 353#define UNICODE_DOT_VERSION $dot 354#define UNICODE_DOT_DOT_VERSION $dotdot 355 356END 357 358# Gather the characters in Unicode that have left/right symmetry suitable for 359# paired string delimiters 360my %paireds; 361 362# So don't have to grep an array to determine if have already dealt with the 363# characters that are the keys 364my %inverted_paireds; 365 366# This property is the universe of all characters in Unicode which 367# are of some import to the Bidirectional Algorithm, and for which there is 368# another Unicode character that is a mirror of it. 369my ($bmg_invlist, $bmg_invmap, $format, $bmg_default) = 370 prop_invmap("Bidi_Mirroring_Glyph"); 371 372# Keep track of the characters we don't use, and why not. 373my %discards; 374my $non_directional = 'No perceived horizontal direction'; 375my $not_considered_directional_because = "Not considered directional because"; 376my $trailing_up_down = 'Vertical direction after all L/R direction'; 377my $unpaired = "Didn't find a mirror"; 378my $illegal = "Mirror illegal"; 379my $no_encoded_mate = "Mirrored, but Unicode has no encoded mirror"; 380my $bidirectional = "Bidirectional"; 381my $r2l = "Is in a Right to Left script"; 382 383my %unused_bidi_pairs; 384my %inverted_unused_bidi_pairs; 385my %unused_pairs; # 386my %inverted_unused_pairs; 387 388# Could be more explicit about allowing, e.g. ARROWS, ARROWHEAD, but this 389# suffices 390my $arrow_like_re = qr/\b(ARROW|HARPOON)/; 391 392# Go through the Unicode Punctuation and Symbol characters looking for ones 393# that have mirrors, suitable for being string delimiters. Some of these are 394# easily derivable from Unicode properties dealing with the bidirectional 395# algorithm. But the purpose of that algorithm isn't the same as ours, and 396# excludes many suitable ones. In particular, no arrows are included in it. 397# To find suitable ones, we also look at character names to see if there is a 398# character with that name, but the horizontal direction reversed. That will 399# almost certainly be a mirror. 400foreach my $list (qw(Punctuation Symbol)) { 401 my @invlist = prop_invlist($list); 402 die "Empty list $list" unless @invlist; 403 404 my $is_Symbol = $list eq 'Symbol'; 405 406 # Convert from an inversion list to an array containing everything that 407 # matches. (This uses the recipe given in Unicode::UCD.) 408 my @full_list; 409 for (my $i = 0; $i < @invlist; $i += 2) { 410 my $upper = ($i + 1) < @invlist 411 ? $invlist[$i+1] - 1 # In range 412 : $Unicode::UCD::MAX_CP; # To infinity. 413 for my $j ($invlist[$i] .. $upper) { 414 push @full_list, $j; 415 } 416 } 417 418 CODE_POINT: 419 foreach my $code_point (@full_list) { 420 #print STDERR __FILE__, ": ", __LINE__, ": ", sprintf("%04x ", $code_point), charnames::viacode($code_point), "\n"; 421 my $chr = chr $code_point; 422 423 # Don't reexamine something we've already determined. This happens 424 # when its mate was earlier processed and found this one. 425 foreach my $hash_ref (\%paireds, \%inverted_paireds, 426 \%unused_bidi_pairs, \%inverted_unused_bidi_pairs, 427 \%unused_pairs, \%inverted_unused_pairs) 428 { 429 next CODE_POINT if exists $hash_ref->{$code_point} 430 } 431 432 my $name = charnames::viacode($code_point); 433 my $original_had_REVERSE; 434 my $mirror; 435 my $mirror_code_point; 436 437 # If Unicode considers this to have a mirror, we don't have to go 438 # looking 439 if ($chr =~ /\p{Bidi_Mirrored}/) { 440 my $i = search_invlist($bmg_invlist, $code_point); 441 $mirror_code_point = $bmg_invmap->[$i]; 442 if ( $mirror_code_point eq $bmg_default) { 443 $discards{$code_point} = { reason => $no_encoded_mate, 444 mirror => undef 445 }; 446 next; 447 } 448 449 # Certain Unicode properties classify some mirrored characters as 450 # opening (left) vs closing (right). Skip the closing ones this 451 # iteration; they will be handled later when the opening mate 452 # comes along. 453 if ($chr =~ /(?[ \p{BPT=Close} 454 | \p{Gc=Close_Punctuation} 455 ])/) 456 { 457 next; # Get this when its opening mirror comes up. 458 } 459 elsif ($chr =~ /(?[ \p{BPT=Open} 460 | \p{Gc=Open_Punctuation} 461 | \p{Gc=Initial_Punctuation} 462 | \p{Gc=Final_Punctuation} 463 ])/) 464 { 465 # Here, it's a left delimiter. (The ones in Final Punctuation 466 # can be opening ones in some languages.) 467 $paireds{$code_point} = $mirror_code_point; 468 $inverted_paireds{$mirror_code_point} = $code_point; 469 470 # If the delimiter can be used on either side, add its 471 # complement 472 if ($chr =~ /(?[ \p{Gc=Initial_Punctuation} 473 | \p{Gc=Final_Punctuation} 474 ])/) 475 { 476 $paireds{$mirror_code_point} = $code_point; 477 $inverted_paireds{$code_point} = $mirror_code_point; 478 } 479 480 next; 481 } 482 483 # Unicode doesn't consider '< >' to be brackets, but Perl does. There are 484 # lots of variants of these in Unicode; easiest to accept all of 485 # them that aren't bidirectional (which would be visually 486 # confusing). 487 for (my $i = 0; $i < @bidi_strong_directionals; $i++) { 488 my $hash_ref = $bidi_strong_directionals[$i]; 489 490 next if $name !~ $hash_ref->{L_pattern}; 491 492 if ($name =~ $hash_ref->{R_pattern}) { 493 $discards{$code_point} = { reason => $bidirectional, 494 mirror => $mirror_code_point 495 }; 496 next CODE_POINT; 497 } 498 499 $paireds{$code_point} = $mirror_code_point; 500 $inverted_paireds{$mirror_code_point} = $code_point; 501 $original_had_REVERSE = $name =~ /$reverse_re/; 502 next CODE_POINT; 503 } 504 505 # The other paired symbols are more iffy as being desirable paired 506 # delimiters; we let the code below decide what to do with them. 507 $mirror = charnames::viacode($mirror_code_point); 508 } 509 else { # Here is not involved with the bidirectional algorithm. 510 511 # Get the mirror (if any) from reversing the directions in the 512 # name, and looking that up 513 $mirror = $name; 514 $mirror =~ s/$directional_re/$opposite_of{$1}/g; 515 $original_had_REVERSE = $mirror =~ s/$reverse_re//g; 516 $mirror_code_point = charnames::vianame($mirror); 517 } 518 519 # Letter-like symbols don't really stand on their own and don't look 520 # like traditional delimiters. 521 if ($chr =~ /\p{Sk}/) { 522 $discards{$code_point} 523 = { reason => "Letter-like symbols are not eligible", 524 mirror => $mirror_code_point 525 }; 526 next CODE_POINT; 527 } 528 529 # Certain names are always treated as non directional. 530 if ($name =~ m{ \b ( WITH [ ] (?:LEFT|RIGHT) [ ] HALF [ ] BLACK 531 | BLOCK 532 | BOX [ ] DRAWINGS 533 | CIRCLE [ ] WITH 534 | EXTENSION 535 | (?: UPPER | LOWER ) [ ] HOOK 536 537 # The VERTICAL marks these as not actually 538 # L/R mirrored. 539 | PRESENTATION [ ] FORM [ ] FOR [ ] VERTICAL 540 | QUADRANT 541 | SHADE 542 | SQUARE [ ] WITH 543 ) \b }x) 544 { 545 $discards{$code_point} 546 = { reason => "$not_considered_directional_because name" 547 . " contains '$1'", 548 mirror => $mirror_code_point 549 }; 550 next CODE_POINT; 551 } 552 553 # If these are equal, it means the original had no horizontal 554 # directioning 555 if ($name eq $mirror) { 556 $discards{$code_point} = { reason => $non_directional, 557 mirror => undef 558 }; 559 next CODE_POINT; 560 } 561 562 # If the name has both left and right directions, it is bidirectional, 563 # so not suited to be a paired delimiter. 564 if ($name =~ $L_re && $name =~ $R_re) { 565 $discards{$code_point} = { reason => $bidirectional, 566 mirror => $mirror_code_point 567 }; 568 next CODE_POINT; 569 } 570 571 # If no mate was found, it could be that it's like the case of 572 # SPEAKER vs RIGHT SPEAKER (which probably means the mirror was added 573 # in a later version than the original. Check by removing all 574 # directionality and trying to see if there is a character with that 575 # name. 576 if (! defined $mirror_code_point) { 577 $mirror =~ s/$directional_re //; 578 $mirror_code_point = charnames::vianame($mirror); 579 if (! defined $mirror_code_point) { 580 581 # Still no mate. 582 $discards{$code_point} = { reason => $unpaired, 583 mirror => undef 584 }; 585 next; 586 } 587 } 588 589 if ($code_point == $mirror_code_point) { 590 $discards{$code_point} = 591 { reason => "$unpaired - Single character, multiple" 592 . " names; Unicode name correction", 593 mirror => $mirror_code_point 594 }; 595 next; 596 } 597 598 if ($is_Symbol) { 599 600 # Skip if the direction is followed by a vertical motion 601 # (which defeats the left-right directionality). 602 if ( $name =~ / ^ .* $no_barb_re 603 \b (UP|DOWN|NORTH|SOUTH) /gx 604 and not $name =~ /$directional_re/g) 605 { 606 $discards{$code_point} = { reason => $trailing_up_down, 607 mirror => $mirror_code_point 608 }; 609 next; 610 } 611 } 612 613 # There are a few characters like REVERSED SEMICOLON that are mirrors, 614 # but have always commonly been used unmirrored. There is also the 615 # PILCROW SIGN and its mirror which might be considered to be 616 # legitimate mirrors, but maybe not. Additionally the current 617 # algorithm for finding the mirror depends on each member of a pair 618 # being respresented by the same number of bytes as its mate. By 619 # skipping these, we solve both problems 620 if ($code_point < 256 != $mirror_code_point < 256) { 621 $discards{$code_point} = { reason => $illegal, 622 mirror => $mirror_code_point 623 }; 624 next; 625 } 626 627 # And '/' and '\' are mirrors that we don't accept 628 if ( $name =~ /SOLIDUS/ 629 && $name =~ s/REVERSE SOLIDUS/SOLIDUS/r 630 eq $mirror =~ s/REVERSE SOLIDUS/SOLIDUS/r) 631 { 632 $discards{$code_point} = { reason => $illegal, 633 mirror => $mirror_code_point 634 }; 635 next; 636 } 637 638 # Exclude characters that are R to L ordering, as this can cause 639 # confusion. See GH #22228 640 if ($chr =~ / (?[ \p{Bidi_Class:R} + \p{Bidi_Class:AL} ]) /x) { 641 $discards{$code_point} = { reason => $r2l, 642 mirror => $mirror_code_point 643 }; 644 next; 645 } 646 647 # We enter the pair with the original code point on the left; if it 648 # should instead be on the R, swap. Most Symbols that contain the 649 # word REVERSE go on the rhs, except those whose names explicitly 650 # indicate lhs. FINAL in the name indicates stays on the rhs. 651 if ($name =~ $R_re || ( $original_had_REVERSE 652 && $is_Symbol 653 && $name !~ $L_re 654 && $name !~ /\bFINAL\b/ 655 )) 656 { 657 my $temp = $code_point; 658 $code_point = $mirror_code_point; 659 $mirror_code_point = $temp; 660 } 661 662 # Only a few symbols are currently used, determined by inspection, but 663 # all the (few) remaining paired punctuations. 664 if ( ! $is_Symbol 665 || defined $included_symbols{$code_point} 666 || ( $chr =~ /\p{BidiMirrored}/ 667 && ( $name =~ $ok_bidi_symbols_re 668 || $mirror =~ $ok_bidi_symbols_re)) 669 || $name =~ /\bINDEX\b/ # index FINGER pointing 670 671 # Also accept most arrows that don't have N/S in their 672 # names. (Those are almost all currently pointing at an 673 # angle, like SW anyway.) 674 || ( $name !~ /\bNORTH|SOUTH\b/ 675 && $name =~ $arrow_like_re 676 677 # Arguably bi-directional 678 && $name !~ /U-SHAPED/) 679 ) { 680 $paireds{$code_point} = $mirror_code_point; 681 $inverted_paireds{$mirror_code_point} = $code_point; 682 683 # Again, accept either one at either end for these ambiguous 684 # punctuation delimiters 685 if ($chr =~ /[\p{PI}\p{PF}]/x) { 686 $paireds{$mirror_code_point} = $code_point; 687 $inverted_paireds{$code_point} = $mirror_code_point; 688 } 689 } 690 elsif ( $chr =~ /\p{BidiMirrored}/ 691 && ! exists $inverted_unused_bidi_pairs{$code_point} 692 && ! defined $inverted_unused_bidi_pairs{$code_point}) 693 { 694 $unused_bidi_pairs{$code_point} = $mirror_code_point; 695 $inverted_unused_bidi_pairs{$mirror_code_point} = $code_point; 696 } 697 elsif ( ! exists $inverted_unused_pairs{$code_point} 698 && ! defined $inverted_unused_pairs{$code_point}) 699 { # A pair that we don't currently accept 700 $unused_pairs{$code_point} = $mirror_code_point; 701 $inverted_unused_pairs{$mirror_code_point} = $code_point; 702 } 703 } # End of loop through code points 704} # End of loop through properties 705 706# The rest of the data are at __DATA__ in this file. 707 708my @data = <DATA>; 709 710foreach my $charset (get_supported_code_pages()) { 711 print $out_fh "\n" . get_conditional_compile_line_start($charset); 712 713 my @a2n = @{get_a2n($charset)}; 714 715 for ( @data ) { 716 chomp; 717 718 # Convert any '#' comments to /* ... */; empty lines and comments are 719 # output as blank lines 720 if ($_ =~ m/ ^ \s* (?: \# ( .* ) )? $ /x) { 721 my $comment_body = $1 // ""; 722 if ($comment_body ne "") { 723 print $out_fh "/* $comment_body */\n"; 724 } 725 else { 726 print $out_fh "\n"; 727 } 728 next; 729 } 730 731 unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token 732 (?: [\ ]+ ( [^ ]* ) )? # optional flag 733 (?: [\ ]+ ( .* ) )? # name if unnamed; flag is required 734 /x) 735 { 736 die "Unexpected syntax at line $.: $_\n"; 737 } 738 739 my $name_or_cp = $1; 740 my $flag = $2; 741 my $desired_name = $3; 742 743 my $name; 744 my $cp; 745 my $U_cp; # code point in Unicode (not-native) terms 746 747 if ($name_or_cp =~ /^U\+(.*)/) { 748 $U_cp = hex $1; 749 $name = charnames::viacode($name_or_cp); 750 if (! defined $name) { 751 next if $flag =~ /skip_if_undef/; 752 die "Unknown code point '$name_or_cp' at line $.: $_\n" unless $desired_name; 753 $name = ""; 754 } 755 } 756 else { 757 $name = $name_or_cp; 758 die "Unknown name '$name' at line $.: $_\n" unless defined $name; 759 $U_cp = charnames::vianame($name =~ s/_/ /gr); 760 } 761 762 $cp = ($U_cp < 256) 763 ? $a2n[$U_cp] 764 : $U_cp; 765 766 $name = $desired_name if $name eq "" && $desired_name; 767 $name =~ s/[- ]/_/g; # The macro name can have no blanks nor dashes 768 769 my $str; 770 my $suffix; 771 if (defined $flag && $flag eq 'native') { 772 die "Are you sure you want to run this on an above-Latin1 code point?" if $cp > 0xff; 773 $suffix = '_NATIVE'; 774 $str = sprintf "0x%02X", $cp; # Is a numeric constant 775 } 776 else { 777 $str = backslash_x_form($U_cp, $charset); 778 779 $suffix = '_UTF8'; 780 if (! defined $flag || $flag =~ /^ string (_skip_if_undef)? $/x) { 781 $str = "\"$str\""; # Will be a string constant 782 } elsif ($flag eq 'tail') { 783 $str =~ s/\\x..//; # Remove the first byte 784 $suffix .= '_TAIL'; 785 $str = "\"$str\""; # Will be a string constant 786 } 787 elsif ($flag =~ / ^ first (_s)? $ /x) { 788 my $wants_string = defined $1; 789 $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte 790 $suffix .= '_FIRST_BYTE'; 791 if ($wants_string) { 792 $suffix .= '_s'; 793 $str = "\"\\x$str\""; 794 } 795 else { 796 $str = "0x$str"; # Is a numeric constant 797 } 798 } 799 else { 800 die "Unknown flag at line $.: $_\n"; 801 } 802 } 803 printf $out_fh "# define %s%s %s /* U+%04X */\n", $name, $suffix, $str, $U_cp; 804 } 805 806 # Now output the strings of opening/closing delimiters. The Unicode 807 # values were earlier entered into %paireds 808 my $utf8_opening = ""; 809 my $utf8_closing = ""; 810 my $non_utf8_opening = ""; 811 my $non_utf8_closing = ""; 812 my $deprecated_if_not_mirrored = ""; 813 my $non_utf8_deprecated_if_not_mirrored = ""; 814 815 for my $from (sort { $a <=> $b } keys %paireds) { 816 my $to = $paireds{$from}; 817 my $utf8_from_backslashed = backslash_x_form($from, $charset); 818 my $utf8_to_backslashed = backslash_x_form($to, $charset); 819 my $non_utf8_from_backslashed; 820 my $non_utf8_to_backslashed; 821 822 $utf8_opening .= $utf8_from_backslashed; 823 $utf8_closing .= $utf8_to_backslashed; 824 825 if ($from < 256) { 826 $non_utf8_from_backslashed = 827 backslash_x_form($from, $charset, 'not_utf8'); 828 $non_utf8_to_backslashed = 829 backslash_x_form($to, $charset, 'not_utf8'); 830 831 $non_utf8_opening .= $non_utf8_from_backslashed; 832 $non_utf8_closing .= $non_utf8_to_backslashed; 833 } 834 835 # Only the ASCII range paired delimiters have traditionally been 836 # accepted. Until the feature is considered standard, the non-ASCII 837 # opening ones must be deprecated when the feature isn't in effect, so 838 # as to warn about behavior that is planned to change. 839 if ($from > 127) { 840 $deprecated_if_not_mirrored .= $utf8_from_backslashed; 841 $non_utf8_deprecated_if_not_mirrored .= 842 $non_utf8_from_backslashed if $from < 256; 843 844 # We deprecate using any of these strongly directional characters 845 # at either end of the string, in part so we could allow them to 846 # be reversed. 847 $deprecated_if_not_mirrored .= $utf8_to_backslashed 848 if index ($deprecated_if_not_mirrored, 849 $utf8_to_backslashed) < 0; 850 } 851 852 # The implementing code in toke.c assumes that the byte length of each 853 # opening delimiter is the same as its mirrored closing one. This 854 # makes sure of that by checking upon each iteration of the loop. 855 if (length $utf8_opening != length $utf8_closing) { 856 die "Byte length of representation of '" 857 . charnames::viacode($from) 858 . " differs from its mapping '" 859 . charnames::viacode($to) 860 . "'"; 861 } 862 863 print STDERR format_pairs_line($from, $to) if $output_lists; 864 } 865 $output_lists = 0; # Only output in first iteration 866 867 print $out_fh <<~"EOT"; 868 869 # ifdef PERL_IN_TOKE_C 870 /* Paired characters for quote-like operators, in UTF-8 */ 871 # define EXTRA_OPENING_UTF8_BRACKETS "$utf8_opening" 872 # define EXTRA_CLOSING_UTF8_BRACKETS "$utf8_closing" 873 874 /* And not in UTF-8 */ 875 # define EXTRA_OPENING_NON_UTF8_BRACKETS "$non_utf8_opening" 876 # define EXTRA_CLOSING_NON_UTF8_BRACKETS "$non_utf8_closing" 877 878 /* And what's deprecated */ 879 # define DEPRECATED_OPENING_UTF8_BRACKETS "$deprecated_if_not_mirrored" 880 # define DEPRECATED_OPENING_NON_UTF8_BRACKETS "$non_utf8_deprecated_if_not_mirrored" 881 # endif 882 EOT 883 884 my $max_PRINT_A = 0; 885 for my $i (0x20 .. 0x7E) { 886 $max_PRINT_A = $a2n[$i] if $a2n[$i] > $max_PRINT_A; 887 } 888 $max_PRINT_A = sprintf "0x%02X", $max_PRINT_A; 889 print $out_fh <<"EOT"; 890 891# ifdef PERL_IN_REGCOMP_ANY 892# define MAX_PRINT_A $max_PRINT_A /* The max code point that isPRINT_A */ 893# endif 894EOT 895 896 print $out_fh get_conditional_compile_line_end(); 897 898} 899 900if ($output_omitteds) { 901 # We haven't bothered to delete things that later became used. 902 foreach my $which (\%paireds, 903 \%unused_bidi_pairs, 904 \%unused_pairs) 905 { 906 foreach my $lhs (keys $which->%*) { 907 delete $discards{$lhs}; 908 delete $discards{$which->{$lhs}}; 909 } 910 } 911 912 print STDERR "\nMirrored says Unicode, but not currently used as paired string delimiters\n"; 913 foreach my $from (sort { $a <=> $b } keys %unused_bidi_pairs) { 914 print STDERR format_pairs_line($from, $unused_bidi_pairs{$from}); 915 } 916 917 print STDERR "\nMirror found by name, but not currently used as paired string delimiters\n"; 918 foreach my $from (sort { $a <=> $b } keys %unused_pairs) { 919 print STDERR format_pairs_line($from, $unused_pairs{$from}); 920 } 921 922 # Invert %discards so that all the code points for a given reason are 923 # keyed by that reason. 924 my %inverted_discards; 925 foreach my $code_point (sort { $a <=> $b } keys %discards) { 926 my $type = $discards{$code_point}{reason}; 927 push $inverted_discards{$type}->@*, [ $code_point, 928 $discards{$code_point}{mirror} 929 ]; 930 } 931 932 # Then output each list 933 foreach my $type (sort keys %inverted_discards) { 934 print STDERR "\n$type\n" if $type ne ""; 935 foreach my $ref ($inverted_discards{$type}->@*) { 936 print STDERR format_pairs_line($ref->[0], $ref->[1]); 937 } 938 } 939} 940 941my $count = 0; 942my @other_invlist = prop_invlist("Other"); 943for (my $i = 0; $i < @other_invlist; $i += 2) { 944 $count += ((defined $other_invlist[$i+1]) 945 ? $other_invlist[$i+1] 946 : 0x110000) 947 - $other_invlist[$i]; 948} 949$count = 0x110000 - $count; 950print $out_fh <<~"EOT"; 951 952 /* The number of code points not matching \\pC */ 953 #ifdef PERL_IN_REGCOMP_ANY 954 # define NON_OTHER_COUNT $count 955 #endif 956 EOT 957 958# If this release has both the CWCM and CWCF properties, find the highest code 959# point which changes under any case change. We can use this to short-circuit 960# code 961my @cwcm = prop_invlist('CWCM'); 962if (@cwcm) { 963 my @cwcf = prop_invlist('CWCF'); 964 if (@cwcf) { 965 my $max = ($cwcm[-1] < $cwcf[-1]) 966 ? $cwcf[-1] 967 : $cwcm[-1]; 968 $max = sprintf "0x%X", $max - 1; 969 print $out_fh <<~"EOS"; 970 971 /* The highest code point that has any type of case change */ 972 #ifdef PERL_IN_UTF8_C 973 # define HIGHEST_CASE_CHANGING_CP $max 974 #endif 975 EOS 976 } 977} 978 979print $out_fh "\n#endif /* PERL_UNICODE_CONSTANTS_H_ */\n"; 980 981read_only_bottom_close_and_rename($out_fh); 982 983# DATA FORMAT 984# 985# Note that any apidoc comments you want in the file need to be added to one 986# of the prints above 987# 988# A blank line is output as-is. 989# Comments (lines whose first non-blank is a '#') are converted to C-style, 990# though empty comments are converted to blank lines. Otherwise, each line 991# represents one #define, and begins with either a Unicode character name with 992# the blanks and dashes in it squeezed out or replaced by underscores; or it 993# may be a hexadecimal Unicode code point of the form U+xxxx. In the latter 994# case, the name will be looked-up to use as the name of the macro. In either 995# case, the macro name will have suffixes as listed above, and all blanks and 996# dashes will be replaced by underscores. 997# 998# Each line may optionally have one of the following flags on it, separated by 999# white space from the initial token. 1000# string indicates that the output is to be of the string form 1001# described in the comments above that are placed in the file. 1002# string_skip_ifundef is the same as 'string', but instead of dying if the 1003# code point doesn't exist, the line is just skipped: no output is 1004# generated for it 1005# first indicates that the output is to be of the FIRST_BYTE form. 1006# first_s indicates that the output is to be of the FIRST_BYTEs form. 1007# tail indicates that the output is of the _TAIL form. 1008# native indicates that the output is the code point, converted to the 1009# platform's native character set if applicable 1010# 1011# If the code point has no official name, the desired name may be appended 1012# after the flag, which will be ignored if there is an official name. 1013# 1014# This program is used to make it convenient to create compile time constants 1015# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually 1016# having to figure things out. 1017 1018__DATA__ 1019U+017F string 1020 1021U+0300 string 1022U+0307 string 1023 1024U+1E9E string_skip_if_undef 1025 1026U+FB05 string 1027U+FB06 string 1028U+0130 string 1029U+0131 string 1030 1031U+2010 string 1032BOM first 1033BOM tail 1034 1035BOM string 1036 1037U+FFFD string 1038 1039U+10FFFF string MAX_UNICODE 1040 1041NBSP native 1042NBSP string 1043 1044DEL native 1045CR native 1046LF native 1047VT native 1048ESC native 1049U+00DF native 1050U+00DF string 1051U+00E5 native 1052U+00C5 native 1053U+00FF native 1054U+00B5 native 1055U+00B5 string 1056U+066B string 1057U+066B first 1058U+066B tail 1059U+066B first_s 1060