xref: /openbsd-src/gnu/usr.bin/perl/regen/unicode_constants.pl (revision 3d61058aa5c692477b6d18acfbbdb653a9930ff9)
1use v5.16.0;
2use strict;
3use warnings;
4no warnings 'experimental::regex_sets';
5require './regen/regen_lib.pl';
6require './regen/charset_translations.pl';
7use Unicode::UCD qw(prop_invlist prop_invmap search_invlist);
8use charnames qw(:loose);
9binmode(STDERR, ":utf8");
10
11# Set this to 1 temporarily to get on stderr the complete list of paired
12# string delimiters this generates.  This list is suitable for plugging into a
13# pod.
14my $output_lists = 0;
15
16# Set this to 1 temporarily to get on stderr the complete list of punctuation
17# marks and symbols that look to be directional but we didn't include for some
18# reason.
19my $output_omitteds = 0;
20
21my $out_fh = open_new('unicode_constants.h', '>',
22        {style => '*', by => $0,
23                      from => "Unicode data"});
24
25print $out_fh <<END;
26
27#ifndef PERL_UNICODE_CONSTANTS_H_   /* Guard against nested #includes */
28#define PERL_UNICODE_CONSTANTS_H_   1
29
30/* This file contains #defines for the version of Unicode being used and
31 * various Unicode code points.  The values the code point macros expand to
32 * are the native Unicode code point, or all or portions of the UTF-8 encoding
33 * for the code point.  In the former case, the macro name has the suffix
34 * "_NATIVE"; otherwise, the suffix "_UTF8".
35 *
36 * The macros that have the suffix "_UTF8" may have further suffixes, as
37 * follows:
38 *  "_FIRST_BYTE"   if the value is just the first byte of the UTF-8
39 *                  representation; the value will be a numeric constant.
40 *  "_FIRST_BYTEs"  same, but the first byte is represented as a literal
41 *                  string
42 *  "_TAIL"         if instead it represents all but the first byte.  This,
43 *                  and with no additional suffix are both string constants */
44
45/*
46=for apidoc_section \$unicode
47
48=for apidoc AmnU|const char *|BOM_UTF8
49
50This is a macro that evaluates to a string constant of the  UTF-8 bytes that
51define the Unicode BYTE ORDER MARK (U+FEFF) for the platform that perl
52is compiled on.  This allows code to use a mnemonic for this character that
53works on both ASCII and EBCDIC platforms.
54S<C<sizeof(BOM_UTF8) - 1>> can be used to get its length in
55bytes.
56
57=for apidoc AmnU|const char *|REPLACEMENT_CHARACTER_UTF8
58
59This is a macro that evaluates to a string constant of the  UTF-8 bytes that
60define the Unicode REPLACEMENT CHARACTER (U+FFFD) for the platform that perl
61is compiled on.  This allows code to use a mnemonic for this character that
62works on both ASCII and EBCDIC platforms.
63S<C<sizeof(REPLACEMENT_CHARACTER_UTF8) - 1>> can be used to get its length in
64bytes.
65
66=cut
67*/
68
69END
70
71sub backslash_x_form($$;$) {
72    # Output the code point represented by the byte string $bytes as a
73    # sequence of \x{} constants.  $bytes should be the UTF-8 for the code
74    # point if the final parameter is absent or empty.  Otherwise it should be
75    # the Latin1 code point itself.
76    #
77    # The output is translated into the character set '$charset'.
78
79    my ($bytes, $charset, $non_utf8) = @_;
80    if ($non_utf8) {
81        die "Must be utf8 if above 255" if $bytes > 255;
82        my $a2n = get_a2n($charset);
83        return sprintf "\\x%02X", $a2n->[$bytes];
84    }
85    else {
86        return join "", map { sprintf "\\x%02X", ord $_ }
87                        split //, cp_2_utfbytes($bytes, $charset);
88    }
89}
90
91# The most complicated thing this program does is generate paired string
92# delimiters from the Unicode database.  Some of these come from the
93# Unicode Bidirectional (bidi) algorithm.
94
95# These all visually look like left and right delimiters
96my @bidi_strong_lefts = (  'LESS-THAN',
97                           'ELEMENT OF',
98                           'PRECEDE',
99                           'PRECEDES',
100                           'SMALLER THAN',
101                           'SUBSET',
102                        );
103my @bidi_strong_rights = ( 'GREATER-THAN',
104                           'CONTAINS',
105                           'SUCCEED',
106                           'SUCCEEDS',
107                           'LARGER THAN',
108                           'SUPERSET',
109                         );
110
111# Create an array of hashes for these, so as to translate between them, and
112# avoid recompiling patterns in the loop.
113my @bidi_strong_directionals;
114for (my $i = 0; $i < @bidi_strong_lefts; $i++) {
115    push @bidi_strong_directionals,
116                {
117                   LHS => $bidi_strong_lefts[$i],
118                   RHS => $bidi_strong_rights[$i],
119                   L_pattern => qr/\b$bidi_strong_lefts[$i]\b/,
120                   R_pattern => qr/\b$bidi_strong_rights[$i]\b/,
121                };
122}
123
124my @ok_bidi_symbols = (
125                        'TACK',
126                        'TURNSTILE',
127                      );
128my $ok_bidi_symbols_re = join '|', @ok_bidi_symbols;
129$ok_bidi_symbols_re = qr/\b($ok_bidi_symbols_re)\b/n;
130
131
132# Many characters have mirrors that Unicode hasn't included in their Bidi
133# algorithm.  This program uses their names to find them.  The next few
134# definitions are towards that end.
135
136# Most horizontal directionality is based on LEFT vs RIGHT.  But it's
137# complicated:
138#   1)  a barb on one or the other side of a harpoon doesn't indicate
139#       directionality of the character.  (A HARPOON is the word Unicode uses
140#       to indicate an arrow with a one-sided tip.)
141my $no_barb_re =  qr/(*nlb:BARB )/;
142
143#   2)  RIGHT-SHADED doesn't signify anything about direction of the character
144#   itself.  These are the suffixes Unicode uses to indicate this.  /aa is
145#   needed because the wildcard names feature currently requires it for names.
146my $shaded_re = qr/ [- ] (SHADED | SHADOWED) /naax;
147
148#   3a) there are a few anomalies caught here.  'LEFT LUGGAGE' would have been
149#       better named UNCLAIMED, and doesn't indicate directionality.
150my $real_LEFT_re =  qr/ \b $no_barb_re LEFT  (*nla: $shaded_re)
151                                             (*nla: [ ] LUGGAGE \b)
152                    /nx;
153#   3b) And in most cases,a RIGHT TRIANGLE also doesn't refer to
154#       directionality, but indicates it contains a 90 degree angle.
155my $real_RIGHT_re = qr/ \b $no_barb_re RIGHT (*nla: $shaded_re)
156                                             (*nla: [ ] (TRI)? ANGLE \b)
157                    /nx;
158#       More items could be added to these as needed
159
160#   4)  something that is pointing R goes on the left, so is different than
161#       the character on the R.  For example,  a RIGHT BRACKET would be
162#       different from a RIGHT-FACING bracket.  These patterns capture the
163#       typical ways that Unicode character names indicate the latter meaning
164#       as a suffix to RIGHT or LEFT
165my $pointing_suffix_re = qr/ (       WARDS     # e.g., RIGHTWARDS
166                              | [ ]  ARROW     # A R arrow points to the R
167                              | [ -] FACING
168                              | [ -] POINTING
169                              | [ ]  PENCIL    # Implies a direction of its
170                                               # point
171                         ) \b /nx;
172# And correspondingly for a prefix for LEFT RIGHT
173my $pointing_prefix_re = qr/ \b (  # e.g. UP RIGHT implies a direction
174                                   UP   ( [ ] AND)?
175                                 | DOWN ( [ ] AND)?
176                                 | CONVERGING
177                                 | POINTING [ ] (DIRECTLY)?
178                                 | TO [ ] THE
179                               )
180                             [ ]
181                           /nx;
182
183my @other_directionals =
184    {
185        LHS => 'LEFT',
186        RHS => 'RIGHT',
187        L_pattern =>
188                  # Something goes on the left if it contains LEFT and doesn't
189                  # point left, or it contains RIGHT and does point right.
190            qr/   \b (*nlb: $pointing_prefix_re) $real_LEFT_re
191                                                    (*nla: $pointing_suffix_re)
192                | \b (*plb: $pointing_prefix_re)  $real_RIGHT_re \b
193                | \b $real_RIGHT_re (*pla: $pointing_suffix_re)
194            /nx,
195        R_pattern =>
196            qr/   \b (*nlb: $pointing_prefix_re) $real_RIGHT_re
197                                                    (*nla: $pointing_suffix_re)
198                | \b (*plb: $pointing_prefix_re)  $real_LEFT_re \b
199                | \b $real_LEFT_re (*pla: $pointing_suffix_re)
200            /nx,
201    };
202
203# Some horizontal directionality is based on EAST vs WEST.  These words are
204# almost always used by Unicode to indicate the direction pointing to, without
205# the general consistency in phrasing in L/R above.  There are a handful of
206# possible exceptions, with only WEST WIND ever at all possibly an issue
207push @other_directionals,
208    {
209        LHS => 'EAST',
210        RHS => 'WEST',
211        L_pattern => qr/ \b (  EAST (*nla: [ ] WIND)
212                             | WEST (*pla: [ ] WIND)) \b /x,
213        R_pattern => qr/ \b (  WEST (*nla: [ ] WIND)
214                             | EAST (*pla: [ ] WIND)) \b /x,
215    };
216
217# The final way the Unicode signals mirroring is by using the words REVERSE or
218# REVERSED;
219my $reverse_re = qr/ \b REVERSE D? [- ] /x;
220
221# Create a mapping from each direction to its opposite one
222my %opposite_of;
223foreach my $directional (@bidi_strong_directionals, @other_directionals) {
224    $opposite_of{$directional->{LHS}} = $directional->{RHS};
225    $opposite_of{$directional->{RHS}} = $directional->{LHS};
226}
227
228# Join the two types of each direction as alternatives
229my $L_re = join "|", map { $_->{L_pattern} } @bidi_strong_directionals,
230                                             @other_directionals;
231my $R_re = join "|", map { $_->{R_pattern} } @bidi_strong_directionals,
232                                             @other_directionals;
233# And anything containing directionality will be either one of these two
234my $directional_re = join "|", $L_re, $R_re;
235
236# Now compile the strings that result from above
237$L_re = qr/$L_re/;
238$R_re = qr/$R_re/;
239$directional_re = qr/($directional_re)/;    # Make sure to capture $1
240
241my @included_symbols = (
242                         0x2326,   0x232B,      # ERASE
243                         0x23E9 .. 0x23EA,      # DOUBLE TRIANGLE
244                         0x23ED .. 0x23EE,      # DOUBLE TRIANGLE with BAR
245                         0x269E .. 0x269F,      # THREE LINES CONVERGING
246                         0x1D102 .. 0x1D103,    # MUSIC STAVES
247                         0x1D106 .. 0x1D107,    # MUSIC STAVES
248                         0x1F57B,               # TELEPHONE RECEIVER
249                         0x1F57D,               # TELEPHONE RECEIVER
250                         0x1F508 .. 0x1F50A,    # LOUD SPEAKER
251                         0x1F568 .. 0x1F56A,    # LOUD SPEAKER
252                         0x1F5E6 .. 0x1F5E7,    # THREE RAYS
253                       );
254my %included_symbols;
255$included_symbols{$_} = 1 for @included_symbols;
256
257sub format_pairs_line($;$) {
258    my ($from, $to) = @_;
259
260    # Format a line containing a character singleton or pair in preparation
261    # for output, suitable for pod.
262
263    my $lhs_name = charnames::viacode($from);
264    my $lhs_hex = sprintf "%04X", $from;
265    my $rhs_name;
266    my $rhs_hex;
267    my $name = $lhs_name;
268
269    my $hanging_indent = 26;
270
271    # Treat a trivial pair as a singleton
272    undef $to if defined $to && $to == $from;
273
274    if (defined $to) {
275        my $rhs_name = charnames::viacode($to);
276        $rhs_hex = sprintf "%04X", $to;
277
278        # Most of the names differ only in LEFT vs RIGHT; some in
279        # LESS-THAN vs GREATER-THAN.  It takes less space, and is easier to
280        # understand if they are displayed combined.
281        if ($name =~ s/$directional_re/$opposite_of{$1}/gr eq $rhs_name) {
282            $name =~ s,$directional_re,$1/$opposite_of{$1},g;
283        }
284        else {  # Otherwise, display them sequentially
285            $name .= ",  " . $rhs_name;
286        }
287    }
288
289    # Handle double-width characters, based on the East Asian Width property.
290    # Add an extra space to non-wide ones so things stay vertically aligned.
291    my $extra = 0;
292    my $output_line = " "   # Indent in case output being used for verbatim
293                            # pod
294                    . chr $from;
295    if (chr($from) =~ /[\p{EA=W}\p{EA=F}]/) {
296        $extra++;       # The length() will be shorter than the displayed
297                        # width
298    }
299    else {
300        $output_line .= " ";
301    }
302    if (defined $to) {
303        $output_line .= " " . chr $to;
304        if (chr($to) =~ /[\p{EA=W}\p{EA=F}]/) {
305            $extra++;
306        }
307        else {
308            $output_line .= " ";
309        }
310    }
311    else {
312        $output_line .= "   ";
313    }
314
315    $output_line .= "   U+$lhs_hex";
316    $output_line .= ", U+$rhs_hex" if defined $to;;
317    my $cur_len = $extra + length $output_line;
318    $output_line .= " " x ($hanging_indent - $cur_len);
319
320    my $max_len = 74;   # Pod formatter will indent 4 spaces
321    $cur_len = length $output_line;
322
323    if ($cur_len + length $name <= $max_len) {
324        $output_line .= $name;  # It will fit
325    }
326    else {  # It won't fit.  Append a segment that is unbreakable until would
327            # exceed the available width; then start on a new line
328            # Doesn't handle the case where the whole segment doesn't fit;
329            # this just doesn't come up with the input data.
330        while ($name =~ / ( .+? ) \b{lb} /xg) {
331            my $segment = $1;
332            my $added_length = length $segment;
333            if ($cur_len + $added_length > $max_len) {
334                $output_line =~ s/ +$//;
335                $output_line .= "\n" . " " x $hanging_indent;
336                $cur_len = $hanging_indent;
337            }
338
339            $output_line .= $segment;
340            $cur_len += $added_length;
341        }
342    }
343
344    return $output_line . "\n";
345}
346
347my $version = Unicode::UCD::UnicodeVersion();
348my ($major, $dot, $dotdot) = $version =~ / (.*?) \. (.*?) (?: \. (.*) )? $ /x;
349$dotdot = 0 unless defined $dotdot;
350
351print $out_fh <<END;
352#define UNICODE_MAJOR_VERSION   $major
353#define UNICODE_DOT_VERSION     $dot
354#define UNICODE_DOT_DOT_VERSION $dotdot
355
356END
357
358# Gather the characters in Unicode that have left/right symmetry suitable for
359# paired string delimiters
360my %paireds;
361
362# So don't have to grep an array to determine if have already dealt with the
363# characters that are the keys
364my %inverted_paireds;
365
366# This property is the universe of all characters in Unicode which
367# are of some import to the Bidirectional Algorithm, and for which there is
368# another Unicode character that is a mirror of it.
369my ($bmg_invlist, $bmg_invmap, $format, $bmg_default) =
370                                            prop_invmap("Bidi_Mirroring_Glyph");
371
372# Keep track of the characters we don't use, and why not.
373my %discards;
374my $non_directional = 'No perceived horizontal direction';
375my $not_considered_directional_because = "Not considered directional because";
376my $trailing_up_down = 'Vertical direction after all L/R direction';
377my $unpaired = "Didn't find a mirror";
378my $illegal = "Mirror illegal";
379my $no_encoded_mate = "Mirrored, but Unicode has no encoded mirror";
380my $bidirectional = "Bidirectional";
381my $r2l = "Is in a Right to Left script";
382
383my %unused_bidi_pairs;
384my %inverted_unused_bidi_pairs;
385my %unused_pairs;   #
386my %inverted_unused_pairs;
387
388# Could be more explicit about allowing, e.g. ARROWS, ARROWHEAD, but this
389# suffices
390my $arrow_like_re = qr/\b(ARROW|HARPOON)/;
391
392# Go through the Unicode Punctuation and Symbol characters looking for ones
393# that have mirrors, suitable for being string delimiters.  Some of these are
394# easily derivable from Unicode properties dealing with the bidirectional
395# algorithm.  But the purpose of that algorithm isn't the same as ours, and
396# excludes many suitable ones.  In particular, no arrows are included in it.
397# To find suitable ones, we also look at character names to see if there is a
398# character with that name, but the horizontal direction reversed.  That will
399# almost certainly be a mirror.
400foreach my $list (qw(Punctuation Symbol)) {
401    my @invlist = prop_invlist($list);
402    die "Empty list $list" unless @invlist;
403
404    my $is_Symbol = $list eq 'Symbol';
405
406    # Convert from an inversion list to an array containing everything that
407    # matches.  (This uses the recipe given in Unicode::UCD.)
408    my @full_list;
409    for (my $i = 0; $i < @invlist; $i += 2) {
410       my $upper = ($i + 1) < @invlist
411                   ? $invlist[$i+1] - 1      # In range
412                   : $Unicode::UCD::MAX_CP;  # To infinity.
413       for my $j ($invlist[$i] .. $upper) {
414           push @full_list, $j;
415       }
416    }
417
418  CODE_POINT:
419    foreach my $code_point (@full_list) {
420        #print STDERR __FILE__, ": ", __LINE__, ": ", sprintf("%04x ", $code_point), charnames::viacode($code_point), "\n";
421        my $chr = chr $code_point;
422
423        # Don't reexamine something we've already determined.  This happens
424        # when its mate was earlier processed and found this one.
425        foreach my $hash_ref (\%paireds,           \%inverted_paireds,
426                              \%unused_bidi_pairs, \%inverted_unused_bidi_pairs,
427                              \%unused_pairs,      \%inverted_unused_pairs)
428        {
429            next CODE_POINT if exists $hash_ref->{$code_point}
430        }
431
432        my $name = charnames::viacode($code_point);
433        my $original_had_REVERSE;
434        my $mirror;
435        my $mirror_code_point;
436
437        # If Unicode considers this to have a mirror, we don't have to go
438        # looking
439        if ($chr =~ /\p{Bidi_Mirrored}/) {
440            my $i = search_invlist($bmg_invlist, $code_point);
441            $mirror_code_point = $bmg_invmap->[$i];
442            if ( $mirror_code_point eq $bmg_default) {
443                $discards{$code_point} = { reason => $no_encoded_mate,
444                                           mirror => undef
445                                         };
446                next;
447            }
448
449            # Certain Unicode properties classify some mirrored characters as
450            # opening (left) vs closing (right).  Skip the closing ones this
451            # iteration; they will be handled later when the opening mate
452            # comes along.
453            if ($chr =~ /(?[  \p{BPT=Close}
454                            | \p{Gc=Close_Punctuation}
455                         ])/)
456            {
457                next;   # Get this when its opening mirror comes up.
458            }
459            elsif ($chr =~ /(?[  \p{BPT=Open}
460                               | \p{Gc=Open_Punctuation}
461                               | \p{Gc=Initial_Punctuation}
462                               | \p{Gc=Final_Punctuation}
463                            ])/)
464            {
465                # Here, it's a left delimiter.  (The ones in Final Punctuation
466                # can be opening ones in some languages.)
467                $paireds{$code_point} = $mirror_code_point;
468                $inverted_paireds{$mirror_code_point} = $code_point;
469
470                # If the delimiter can be used on either side, add its
471                # complement
472                if ($chr =~ /(?[  \p{Gc=Initial_Punctuation}
473                                | \p{Gc=Final_Punctuation}
474                             ])/)
475                {
476                    $paireds{$mirror_code_point} = $code_point;
477                    $inverted_paireds{$code_point} = $mirror_code_point;
478                }
479
480                next;
481            }
482
483            # Unicode doesn't consider '< >' to be brackets, but Perl does.  There are
484            # lots of variants of these in Unicode; easiest to accept all of
485            # them that aren't bidirectional (which would be visually
486            # confusing).
487            for (my $i = 0; $i < @bidi_strong_directionals; $i++) {
488                my $hash_ref = $bidi_strong_directionals[$i];
489
490                next if $name !~ $hash_ref->{L_pattern};
491
492                if ($name =~ $hash_ref->{R_pattern}) {
493                    $discards{$code_point} = { reason => $bidirectional,
494                                               mirror => $mirror_code_point
495                                             };
496                    next CODE_POINT;
497                }
498
499                $paireds{$code_point} = $mirror_code_point;
500                $inverted_paireds{$mirror_code_point} = $code_point;
501                $original_had_REVERSE = $name =~ /$reverse_re/;
502                next CODE_POINT;
503            }
504
505            # The other paired symbols are more iffy as being desirable paired
506            # delimiters; we let the code below decide what to do with them.
507            $mirror = charnames::viacode($mirror_code_point);
508        }
509        else { # Here is not involved with the bidirectional algorithm.
510
511            # Get the mirror (if any) from reversing the directions in the
512            # name, and looking that up
513            $mirror = $name;
514            $mirror =~ s/$directional_re/$opposite_of{$1}/g;
515            $original_had_REVERSE = $mirror =~ s/$reverse_re//g;
516            $mirror_code_point = charnames::vianame($mirror);
517        }
518
519        # Letter-like symbols don't really stand on their own and don't look
520        # like traditional delimiters.
521        if ($chr =~ /\p{Sk}/) {
522            $discards{$code_point}
523                = { reason => "Letter-like symbols are not eligible",
524                    mirror => $mirror_code_point
525                  };
526            next CODE_POINT;
527        }
528
529        # Certain names are always treated as non directional.
530        if ($name =~ m{ \b (  WITH [ ] (?:LEFT|RIGHT) [ ] HALF [ ] BLACK
531                            | BLOCK
532                            | BOX [ ] DRAWINGS
533                            | CIRCLE [ ] WITH
534                            | EXTENSION
535                            | (?: UPPER | LOWER ) [ ] HOOK
536
537                              # The VERTICAL marks these as not actually
538                              # L/R mirrored.
539                            | PRESENTATION [ ] FORM [ ] FOR [ ] VERTICAL
540                            | QUADRANT
541                            | SHADE
542                            | SQUARE [ ] WITH
543                        ) \b }x)
544        {
545            $discards{$code_point}
546                = { reason => "$not_considered_directional_because name"
547                            . " contains '$1'",
548                    mirror => $mirror_code_point
549                  };
550            next CODE_POINT;
551        }
552
553        # If these are equal, it means the original had no horizontal
554        # directioning
555        if ($name eq $mirror) {
556            $discards{$code_point} = { reason => $non_directional,
557                                       mirror => undef
558                                     };
559            next CODE_POINT;
560        }
561
562        # If the name has both left and right directions, it is bidirectional,
563        # so not suited to be a paired delimiter.
564        if ($name =~ $L_re && $name =~ $R_re) {
565            $discards{$code_point} = { reason => $bidirectional,
566                                       mirror => $mirror_code_point
567                                     };
568            next CODE_POINT;
569        }
570
571        # If no mate was found, it could be that it's like the case of
572        # SPEAKER vs RIGHT SPEAKER (which probably means the mirror was added
573        # in a later version than the original.  Check by removing all
574        # directionality and trying to see if there is a character with that
575        # name.
576        if (! defined $mirror_code_point) {
577            $mirror =~ s/$directional_re //;
578            $mirror_code_point = charnames::vianame($mirror);
579            if (! defined $mirror_code_point) {
580
581                # Still no mate.
582                $discards{$code_point} = { reason => $unpaired,
583                                           mirror => undef
584                                         };
585                next;
586            }
587        }
588
589        if ($code_point == $mirror_code_point) {
590            $discards{$code_point} =
591                { reason => "$unpaired - Single character, multiple"
592                          . " names; Unicode name correction",
593                  mirror => $mirror_code_point
594                };
595            next;
596        }
597
598        if ($is_Symbol) {
599
600            # Skip if the direction is followed by a vertical motion
601            # (which defeats the left-right directionality).
602            if (        $name =~ / ^ .* $no_barb_re
603                                   \b (UP|DOWN|NORTH|SOUTH) /gx
604                and not $name =~ /$directional_re/g)
605            {
606                $discards{$code_point} = { reason => $trailing_up_down,
607                                           mirror => $mirror_code_point
608                                         };
609                next;
610            }
611        }
612
613        # There are a few characters like REVERSED SEMICOLON that are mirrors,
614        # but have always commonly been used unmirrored.  There is also the
615        # PILCROW SIGN and its mirror which might be considered to be
616        # legitimate mirrors, but maybe not.  Additionally the current
617        # algorithm for finding the mirror depends on each member of a pair
618        # being respresented by the same number of bytes as its mate.  By
619        # skipping these, we solve both problems
620        if ($code_point < 256 != $mirror_code_point < 256) {
621            $discards{$code_point} = { reason => $illegal,
622                                        mirror => $mirror_code_point
623                                     };
624            next;
625        }
626
627        # And '/' and '\' are mirrors that we don't accept
628        if (   $name =~ /SOLIDUS/
629            &&    $name   =~ s/REVERSE SOLIDUS/SOLIDUS/r
630               eq $mirror =~ s/REVERSE SOLIDUS/SOLIDUS/r)
631        {
632            $discards{$code_point} = { reason => $illegal,
633                                        mirror => $mirror_code_point
634                                     };
635            next;
636        }
637
638        # Exclude characters that are R to L ordering, as this can cause
639        # confusion.  See GH #22228
640        if ($chr =~ / (?[ \p{Bidi_Class:R} + \p{Bidi_Class:AL} ]) /x) {
641            $discards{$code_point} = { reason => $r2l,
642                                       mirror => $mirror_code_point
643                                     };
644            next;
645        }
646
647        # We enter the pair with the original code point on the left; if it
648        # should instead be on the R, swap.  Most Symbols that contain the
649        # word REVERSE go on the rhs, except those whose names explicitly
650        # indicate lhs.  FINAL in the name indicates stays on the rhs.
651        if ($name =~ $R_re || (   $original_had_REVERSE
652                               && $is_Symbol
653                               && $name !~ $L_re
654                               && $name !~ /\bFINAL\b/
655                               ))
656        {
657            my $temp = $code_point;
658            $code_point = $mirror_code_point;
659            $mirror_code_point = $temp;
660        }
661
662        # Only a few symbols are currently used, determined by inspection, but
663        # all the (few) remaining paired punctuations.
664        if ( ! $is_Symbol
665            || defined $included_symbols{$code_point}
666            || (    $chr =~ /\p{BidiMirrored}/
667                && (   $name   =~ $ok_bidi_symbols_re
668                    || $mirror =~ $ok_bidi_symbols_re))
669            || $name =~ /\bINDEX\b/         # index FINGER pointing
670
671                   # Also accept most arrows that don't have N/S in their
672                   # names.  (Those are almost all currently pointing at an
673                   # angle, like SW anyway.)
674            || (   $name !~ /\bNORTH|SOUTH\b/
675                && $name =~ $arrow_like_re
676
677                    # Arguably bi-directional
678                && $name !~ /U-SHAPED/)
679        ) {
680            $paireds{$code_point} = $mirror_code_point;
681            $inverted_paireds{$mirror_code_point} = $code_point;
682
683            # Again, accept either one at either end for these ambiguous
684            # punctuation delimiters
685            if ($chr =~ /[\p{PI}\p{PF}]/x) {
686                $paireds{$mirror_code_point} = $code_point;
687                $inverted_paireds{$code_point} = $mirror_code_point;
688            }
689        }
690        elsif (     $chr =~ /\p{BidiMirrored}/
691               && ! exists  $inverted_unused_bidi_pairs{$code_point}
692               && ! defined $inverted_unused_bidi_pairs{$code_point})
693            {
694                $unused_bidi_pairs{$code_point} = $mirror_code_point;
695                $inverted_unused_bidi_pairs{$mirror_code_point} = $code_point;
696            }
697        elsif (   ! exists  $inverted_unused_pairs{$code_point}
698                && ! defined $inverted_unused_pairs{$code_point})
699        {   # A pair that we don't currently accept
700            $unused_pairs{$code_point} = $mirror_code_point;
701            $inverted_unused_pairs{$mirror_code_point} = $code_point;
702        }
703    }   # End of loop through code points
704}   # End of loop through properties
705
706# The rest of the data are at __DATA__  in this file.
707
708my @data = <DATA>;
709
710foreach my $charset (get_supported_code_pages()) {
711    print $out_fh "\n" . get_conditional_compile_line_start($charset);
712
713    my @a2n = @{get_a2n($charset)};
714
715    for ( @data ) {
716        chomp;
717
718        # Convert any '#' comments to /* ... */; empty lines and comments are
719        # output as blank lines
720        if ($_ =~ m/ ^ \s* (?: \# ( .* ) )? $ /x) {
721            my $comment_body = $1 // "";
722            if ($comment_body ne "") {
723                print $out_fh "/* $comment_body */\n";
724            }
725            else {
726                print $out_fh "\n";
727            }
728            next;
729        }
730
731        unless ($_ =~ m/ ^ ( [^\ ]* )           # Name or code point token
732                        (?: [\ ]+ ( [^ ]* ) )?  # optional flag
733                        (?: [\ ]+ ( .* ) )?  # name if unnamed; flag is required
734                    /x)
735        {
736            die "Unexpected syntax at line $.: $_\n";
737        }
738
739        my $name_or_cp = $1;
740        my $flag = $2;
741        my $desired_name = $3;
742
743        my $name;
744        my $cp;
745        my $U_cp;   # code point in Unicode (not-native) terms
746
747        if ($name_or_cp =~ /^U\+(.*)/) {
748            $U_cp = hex $1;
749            $name = charnames::viacode($name_or_cp);
750            if (! defined $name) {
751                next if $flag =~ /skip_if_undef/;
752                die "Unknown code point '$name_or_cp' at line $.: $_\n" unless $desired_name;
753                $name = "";
754            }
755        }
756        else {
757            $name = $name_or_cp;
758            die "Unknown name '$name' at line $.: $_\n" unless defined $name;
759            $U_cp = charnames::vianame($name =~ s/_/ /gr);
760        }
761
762        $cp = ($U_cp < 256)
763            ? $a2n[$U_cp]
764            : $U_cp;
765
766        $name = $desired_name if $name eq "" && $desired_name;
767        $name =~ s/[- ]/_/g;   # The macro name can have no blanks nor dashes
768
769        my $str;
770        my $suffix;
771        if (defined $flag && $flag eq 'native') {
772            die "Are you sure you want to run this on an above-Latin1 code point?" if $cp > 0xff;
773            $suffix = '_NATIVE';
774            $str = sprintf "0x%02X", $cp;        # Is a numeric constant
775        }
776        else {
777            $str = backslash_x_form($U_cp, $charset);
778
779            $suffix = '_UTF8';
780            if (! defined $flag || $flag =~ /^ string (_skip_if_undef)? $/x) {
781                $str = "\"$str\"";  # Will be a string constant
782            } elsif ($flag eq 'tail') {
783                    $str =~ s/\\x..//;  # Remove the first byte
784                    $suffix .= '_TAIL';
785                    $str = "\"$str\"";  # Will be a string constant
786            }
787            elsif ($flag =~ / ^ first (_s)? $ /x) {
788                my $wants_string = defined $1;
789                $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
790                $suffix .= '_FIRST_BYTE';
791                if ($wants_string) {
792                    $suffix .= '_s';
793                    $str = "\"\\x$str\"";
794                }
795                else {
796                    $str = "0x$str";        # Is a numeric constant
797                }
798            }
799            else {
800                die "Unknown flag at line $.: $_\n";
801            }
802        }
803        printf $out_fh "#   define %s%s  %s    /* U+%04X */\n", $name, $suffix, $str, $U_cp;
804    }
805
806    # Now output the strings of opening/closing delimiters.  The Unicode
807    # values were earlier entered into %paireds
808    my $utf8_opening = "";
809    my $utf8_closing = "";
810    my $non_utf8_opening = "";
811    my $non_utf8_closing = "";
812    my $deprecated_if_not_mirrored = "";
813    my $non_utf8_deprecated_if_not_mirrored = "";
814
815    for my $from (sort { $a <=> $b } keys %paireds) {
816        my $to = $paireds{$from};
817        my $utf8_from_backslashed = backslash_x_form($from, $charset);
818        my $utf8_to_backslashed   = backslash_x_form($to, $charset);
819        my $non_utf8_from_backslashed;
820        my $non_utf8_to_backslashed;
821
822        $utf8_opening .= $utf8_from_backslashed;
823        $utf8_closing .= $utf8_to_backslashed;
824
825        if ($from < 256) {
826            $non_utf8_from_backslashed =
827                                  backslash_x_form($from, $charset, 'not_utf8');
828            $non_utf8_to_backslashed =
829                                  backslash_x_form($to, $charset, 'not_utf8');
830
831            $non_utf8_opening .= $non_utf8_from_backslashed;
832            $non_utf8_closing .= $non_utf8_to_backslashed;
833        }
834
835        # Only the ASCII range paired delimiters have traditionally been
836        # accepted.  Until the feature is considered standard, the non-ASCII
837        # opening ones must be deprecated when the feature isn't in effect, so
838        # as to warn about behavior that is planned to change.
839        if ($from > 127) {
840            $deprecated_if_not_mirrored .= $utf8_from_backslashed;
841            $non_utf8_deprecated_if_not_mirrored .=
842                                    $non_utf8_from_backslashed if $from < 256;
843
844            # We deprecate using any of these strongly directional characters
845            # at either end of the string, in part so we could allow them to
846            # be reversed.
847            $deprecated_if_not_mirrored .= $utf8_to_backslashed
848                                       if index ($deprecated_if_not_mirrored,
849                                                 $utf8_to_backslashed) < 0;
850        }
851
852        # The implementing code in toke.c assumes that the byte length of each
853        # opening delimiter is the same as its mirrored closing one.  This
854        # makes sure of that by checking upon each iteration of the loop.
855        if (length $utf8_opening != length $utf8_closing) {
856            die "Byte length of representation of '"
857              .  charnames::viacode($from)
858              . " differs from its mapping '"
859              .  charnames::viacode($to)
860              .  "'";
861        }
862
863        print STDERR format_pairs_line($from, $to) if $output_lists;
864    }
865    $output_lists = 0;  # Only output in first iteration
866
867    print $out_fh <<~"EOT";
868
869        #   ifdef PERL_IN_TOKE_C
870               /* Paired characters for quote-like operators, in UTF-8 */
871        #      define EXTRA_OPENING_UTF8_BRACKETS "$utf8_opening"
872        #      define EXTRA_CLOSING_UTF8_BRACKETS "$utf8_closing"
873
874               /* And not in UTF-8 */
875        #      define EXTRA_OPENING_NON_UTF8_BRACKETS "$non_utf8_opening"
876        #      define EXTRA_CLOSING_NON_UTF8_BRACKETS "$non_utf8_closing"
877
878               /* And what's deprecated */
879        #      define DEPRECATED_OPENING_UTF8_BRACKETS "$deprecated_if_not_mirrored"
880        #      define DEPRECATED_OPENING_NON_UTF8_BRACKETS "$non_utf8_deprecated_if_not_mirrored"
881        #   endif
882        EOT
883
884    my $max_PRINT_A = 0;
885    for my $i (0x20 .. 0x7E) {
886        $max_PRINT_A = $a2n[$i] if $a2n[$i] > $max_PRINT_A;
887    }
888    $max_PRINT_A = sprintf "0x%02X", $max_PRINT_A;
889    print $out_fh <<"EOT";
890
891#   ifdef PERL_IN_REGCOMP_ANY
892#     define MAX_PRINT_A  $max_PRINT_A   /* The max code point that isPRINT_A */
893#   endif
894EOT
895
896    print $out_fh get_conditional_compile_line_end();
897
898}
899
900if ($output_omitteds) {
901    # We haven't bothered to delete things that later became used.
902    foreach my $which (\%paireds,
903                       \%unused_bidi_pairs,
904                       \%unused_pairs)
905    {
906        foreach my $lhs (keys $which->%*) {
907            delete $discards{$lhs};
908            delete $discards{$which->{$lhs}};
909        }
910    }
911
912    print STDERR "\nMirrored says Unicode, but not currently used as paired string delimiters\n";
913    foreach my $from (sort { $a <=> $b } keys %unused_bidi_pairs) {
914        print STDERR format_pairs_line($from, $unused_bidi_pairs{$from});
915    }
916
917    print STDERR "\nMirror found by name, but not currently used as paired string delimiters\n";
918    foreach my $from (sort { $a <=> $b } keys %unused_pairs) {
919        print STDERR format_pairs_line($from, $unused_pairs{$from});
920    }
921
922    # Invert %discards so that all the code points for a given reason are
923    # keyed by that reason.
924    my %inverted_discards;
925    foreach my $code_point (sort { $a <=> $b } keys %discards) {
926        my $type = $discards{$code_point}{reason};
927        push $inverted_discards{$type}->@*, [ $code_point,
928                                              $discards{$code_point}{mirror}
929                                            ];
930    }
931
932    # Then output each list
933    foreach my $type (sort keys %inverted_discards) {
934        print STDERR "\n$type\n" if $type ne "";
935        foreach my $ref ($inverted_discards{$type}->@*) {
936            print STDERR format_pairs_line($ref->[0], $ref->[1]);
937        }
938    }
939}
940
941my $count = 0;
942my @other_invlist = prop_invlist("Other");
943for (my $i = 0; $i < @other_invlist; $i += 2) {
944    $count += ((defined $other_invlist[$i+1])
945              ? $other_invlist[$i+1]
946              : 0x110000)
947              - $other_invlist[$i];
948}
949$count = 0x110000 - $count;
950print $out_fh <<~"EOT";
951
952    /* The number of code points not matching \\pC */
953    #ifdef PERL_IN_REGCOMP_ANY
954    #  define NON_OTHER_COUNT  $count
955    #endif
956    EOT
957
958# If this release has both the CWCM and CWCF properties, find the highest code
959# point which changes under any case change.  We can use this to short-circuit
960# code
961my @cwcm = prop_invlist('CWCM');
962if (@cwcm) {
963    my @cwcf = prop_invlist('CWCF');
964    if (@cwcf) {
965        my $max = ($cwcm[-1] < $cwcf[-1])
966                  ? $cwcf[-1]
967                  : $cwcm[-1];
968        $max = sprintf "0x%X", $max - 1;
969        print $out_fh <<~"EOS";
970
971            /* The highest code point that has any type of case change */
972            #ifdef PERL_IN_UTF8_C
973            #  define HIGHEST_CASE_CHANGING_CP  $max
974            #endif
975            EOS
976    }
977}
978
979print $out_fh "\n#endif /* PERL_UNICODE_CONSTANTS_H_ */\n";
980
981read_only_bottom_close_and_rename($out_fh);
982
983# DATA FORMAT
984#
985# Note that any apidoc comments you want in the file need to be added to one
986# of the prints above
987#
988# A blank line is output as-is.
989# Comments (lines whose first non-blank is a '#') are converted to C-style,
990# though empty comments are converted to blank lines.  Otherwise, each line
991# represents one #define, and begins with either a Unicode character name with
992# the blanks and dashes in it squeezed out or replaced by underscores; or it
993# may be a hexadecimal Unicode code point of the form U+xxxx.  In the latter
994# case, the name will be looked-up to use as the name of the macro.  In either
995# case, the macro name will have suffixes as listed above, and all blanks and
996# dashes will be replaced by underscores.
997#
998# Each line may optionally have one of the following flags on it, separated by
999# white space from the initial token.
1000#   string  indicates that the output is to be of the string form
1001#           described in the comments above that are placed in the file.
1002#   string_skip_ifundef  is the same as 'string', but instead of dying if the
1003#           code point doesn't exist, the line is just skipped: no output is
1004#           generated for it
1005#   first   indicates that the output is to be of the FIRST_BYTE form.
1006#   first_s indicates that the output is to be of the FIRST_BYTEs form.
1007#   tail    indicates that the output is of the _TAIL form.
1008#   native  indicates that the output is the code point, converted to the
1009#           platform's native character set if applicable
1010#
1011# If the code point has no official name, the desired name may be appended
1012# after the flag, which will be ignored if there is an official name.
1013#
1014# This program is used to make it convenient to create compile time constants
1015# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
1016# having to figure things out.
1017
1018__DATA__
1019U+017F string
1020
1021U+0300 string
1022U+0307 string
1023
1024U+1E9E string_skip_if_undef
1025
1026U+FB05 string
1027U+FB06 string
1028U+0130 string
1029U+0131 string
1030
1031U+2010 string
1032BOM first
1033BOM tail
1034
1035BOM string
1036
1037U+FFFD string
1038
1039U+10FFFF string MAX_UNICODE
1040
1041NBSP native
1042NBSP string
1043
1044DEL native
1045CR  native
1046LF  native
1047VT  native
1048ESC native
1049U+00DF native
1050U+00DF string
1051U+00E5 native
1052U+00C5 native
1053U+00FF native
1054U+00B5 native
1055U+00B5 string
1056U+066B string
1057U+066B first
1058U+066B tail
1059U+066B first_s
1060