1use v5.16.0; 2use strict; 3use warnings; 4require 'regen/regen_lib.pl'; 5use charnames qw(:loose); 6 7my $out_fh = open_new('unicode_constants.h', '>', 8 {style => '*', by => $0, 9 from => "Unicode data"}); 10 11print $out_fh <<END; 12 13#ifndef H_UNICODE_CONSTANTS /* Guard against nested #includes */ 14#define H_UNICODE_CONSTANTS 1 15 16/* This file contains #defines for various Unicode code points. The values 17 * the macros expand to are the native Unicode code point, or all or portions 18 * of the UTF-8 encoding for the code point. In the former case, the macro 19 * name has the suffix "_NATIVE"; otherwise, the suffix "_UTF8". 20 * 21 * The macros that have the suffix "_UTF8" may have further suffixes, as 22 * follows: 23 * "_FIRST_BYTE" if the value is just the first byte of the UTF-8 24 * representation; the value will be a numeric constant. 25 * "_TAIL" if instead it represents all but the first byte. This, and 26 * with no additional suffix are both string constants */ 27 28END 29 30# The data are at the end of this file. A blank line is output as-is. 31# Comments (lines whose first non-blank is a '#') are converted to C-style, 32# though empty comments are converted to blank lines. Otherwise, each line 33# represents one #define, and begins with either a Unicode character name with 34# the blanks and dashes in it squeezed out or replaced by underscores; or it 35# may be a hexadecimal Unicode code point of the form U+xxxx. In the latter 36# case, the name will be looked-up to use as the name of the macro. In either 37# case, the macro name will have suffixes as listed above, and all blanks and 38# dashes will be replaced by underscores. 39# 40# Each line may optionally have one of the following flags on it, separated by 41# white space from the initial token. 42# string indicates that the output is to be of the string form 43# described in the comments above that are placed in the file. 44# string_skip_ifundef is the same as 'string', but instead of dying if the 45# code point doesn't exist, the line is just skipped: no output is 46# generated for it 47# first indicates that the output is to be of the FIRST_BYTE form. 48# tail indicates that the output is of the _TAIL form. 49# native indicates that the output is the code point, converted to the 50# platform's native character set if applicable 51# 52# If the code point has no official name, the desired name may be appended 53# after the flag, which will be ignored if there is an official name. 54# 55# This program is used to make it convenient to create compile time constants 56# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually 57# having to figure things out. 58 59while ( <DATA> ) { 60 chomp; 61 62 # Convert any '#' comments to /* ... */; empty lines and comments are 63 # output as blank lines 64 if ($_ =~ m/ ^ \s* (?: \# ( .* ) )? $ /x) { 65 my $comment_body = $1 // ""; 66 if ($comment_body ne "") { 67 print $out_fh "/* $comment_body */\n"; 68 } 69 else { 70 print $out_fh "\n"; 71 } 72 next; 73 } 74 75 unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token 76 (?: [\ ]+ ( [^ ]* ) )? # optional flag 77 (?: [\ ]+ ( .* ) )? # name if unnamed; flag is required 78 /x) 79 { 80 die "Unexpected syntax at line $.: $_\n"; 81 } 82 83 my $name_or_cp = $1; 84 my $flag = $2; 85 my $desired_name = $3; 86 87 my $name; 88 my $cp; 89 my $U_cp; # code point in Unicode (not-native) terms 90 my $undef_ok = $desired_name || $flag =~ /skip_if_undef/; 91 92 if ($name_or_cp =~ /^U\+(.*)/) { 93 $U_cp = hex $1; 94 $name = charnames::viacode($name_or_cp); 95 if (! defined $name) { 96 die "Unknown code point '$name_or_cp' at line $.: $_\n" unless $undef_ok; 97 $name = ""; 98 } 99 $cp = utf8::unicode_to_native($U_cp); 100 } 101 else { 102 $name = $name_or_cp; 103 $cp = charnames::vianame($name =~ s/_/ /gr); 104 $U_cp = utf8::native_to_unicode($cp); 105 die "Unknown name '$name' at line $.: $_\n" unless defined $name; 106 } 107 108 $name = $desired_name if $name eq "" && $desired_name; 109 $name =~ s/[- ]/_/g; # The macro name can have no blanks nor dashes 110 111 my $str = join "", map { sprintf "\\x%02X", $_ } 112 unpack("U0C*", pack("U", $cp)); 113 114 my $suffix = '_UTF8'; 115 if (! defined $flag || $flag =~ /^ string (_skip_if_undef)? $/x) { 116 $str = "\"$str\""; # Will be a string constant 117 } elsif ($flag eq 'tail') { 118 $str =~ s/\\x..//; # Remove the first byte 119 $suffix .= '_TAIL'; 120 $str = "\"$str\""; # Will be a string constant 121 } 122 elsif ($flag eq 'first') { 123 $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte 124 $suffix .= '_FIRST_BYTE'; 125 $str = "0x$str"; # Is a numeric constant 126 } 127 elsif ($flag eq 'native') { 128 die "Are you sure you want to run this on an above-Latin1 code point?" if $cp > 0xff; 129 $suffix = '_NATIVE'; 130 $str = sprintf "0x%02X", $cp; # Is a numeric constant 131 } 132 else { 133 die "Unknown flag at line $.: $_\n"; 134 } 135 printf $out_fh "#define %s%s %s /* U+%04X */\n", $name, $suffix, $str, $U_cp; 136} 137 138print $out_fh "\n#endif /* H_UNICODE_CONSTANTS */\n"; 139 140read_only_bottom_close_and_rename($out_fh); 141 142__DATA__ 143U+017F string 144 145U+0300 string 146 147U+0399 string 148U+03BC string 149 150U+1E9E string 151 152U+FB05 string 153U+FB06 string 154 155U+2010 string 156U+D800 first FIRST_SURROGATE 157BOM first 158BOM tail 159 160DEL native 161CR native 162LF native 163U+00DF native 164U+00E5 native 165U+00C5 native 166U+00FF native 167U+00B5 native 168