1#!./perl 2# 3# Tests that have to do with checking whether characters have (or not have) 4# certain Unicode properties; belong (or not belong) to blocks, scripts, etc. 5# 6 7use strict; 8use warnings; 9use 5.010; 10 11my $IS_EBCDIC = ord ('A') == 193; 12 13sub run_tests; 14 15# 16# This is the data to test. 17# 18# This is a hash; keys are the property to test. 19# Values are arrays containing characters to test. The characters can 20# have the following formats: 21# '\N{CHARACTER NAME}' - Use character with that name 22# '\x{1234}' - Use character with that hex escape 23# '0x1234' - Use chr() to get that character 24# "a" - Character to use 25# 26# If a character entry starts with ! the character does not belong to the class 27# 28# If the class is just single letter, we use both \pL and \p{L} 29# 30 31use charnames ':full'; 32 33my @CLASSES = ( 34 L => ["a", "A"], 35 Ll => ["b", "!B"], 36 Lu => ["!c", "C"], 37 IsLl => ["d", "!D"], 38 IsLu => ["!e", "E"], 39 LC => ["f", "!1"], 40 'L&' => ["g", "!2"], 41 'Lowercase Letter' => ["h", "!H"], 42 43 Common => ["!i", "3"], 44 Inherited => ["!j", '\x{300}'], 45 46 InBasicLatin => ['\N{LATIN CAPITAL LETTER A}'], 47 InLatin1Supplement => ['\N{LATIN CAPITAL LETTER A WITH GRAVE}'], 48 InLatinExtendedA => ['\N{LATIN CAPITAL LETTER A WITH MACRON}'], 49 InLatinExtendedB => ['\N{LATIN SMALL LETTER B WITH STROKE}'], 50 InKatakana => ['\N{KATAKANA LETTER SMALL A}'], 51 IsLatin => ["0x100", "0x212b"], 52 IsHebrew => ["0x5d0", "0xfb4f"], 53 IsGreek => ["0x37a", "0x386", "!0x387", "0x388", 54 "0x38a", "!0x38b", "0x38c"], 55 HangulSyllables => ['\x{AC00}'], 56 'Script=Latin' => ['\x{0100}'], 57 'Block=LatinExtendedA' => ['\x{0100}'], 58 'Category=UppercaseLetter' => ['\x{0100}'], 59 60 # 61 # It's ok to repeat class names. 62 # 63 InLatin1Supplement => 64 $IS_EBCDIC ? ['!\x{7f}', '\x{80}', '!\x{100}'] 65 : ['!\x{7f}', '\x{80}', '\x{ff}', '!\x{100}'], 66 InLatinExtendedA => 67 ['!\x{7f}', '!\x{80}', '!\x{ff}', '\x{100}'], 68 69 # 70 # Properties are case-insensitive, and may have whitespace, 71 # dashes and underscores. 72 # 73 'in-latin1_SUPPLEMENT' => ['\x{80}', 74 '\N{LATIN SMALL LETTER Y WITH DIAERESIS}'], 75 ' ^ In Latin 1 Supplement ' 76 => ['!\x{80}', '\N{COFFIN}'], 77 'latin-1 supplement' => ['\x{80}', "0xDF"], 78 79); 80 81my @USER_DEFINED_PROPERTIES = ( 82 # 83 # User defined properties 84 # 85 InKana1 => ['\x{3040}', '!\x{303F}'], 86 InKana2 => ['\x{3040}', '!\x{303F}'], 87 InKana3 => ['\x{3041}', '!\x{3040}'], 88 InNotKana => ['\x{3040}', '!\x{3041}'], 89 InConsonant => ['d', '!e'], 90 IsSyriac1 => ['\x{0712}', '!\x{072F}'], 91 Syriac1 => ['\x{0712}', '!\x{072F}'], 92 '# User-defined character properties my lack \n at the end', 93 InGreekSmall => ['\N{GREEK SMALL LETTER PI}', 94 '\N{GREEK SMALL LETTER FINAL SIGMA}'], 95 InGreekCapital => ['\N{GREEK CAPITAL LETTER PI}', '!\x{03A2}'], 96 Dash => ['-'], 97 ASCII_Hex_Digit => ['!-', 'A'], 98 AsciiHexAndDash => ['-', 'A'], 99); 100 101 102# 103# From the short properties we populate POSIX-like classes. 104# 105my %SHORT_PROPERTIES = ( 106 'Ll' => ['m', '\N{CYRILLIC SMALL LETTER A}'], 107 'Lu' => ['M', '\N{GREEK CAPITAL LETTER ALPHA}'], 108 'Lo' => ['\N{HIRAGANA LETTER SMALL A}'], 109 # is also in other alphabetic 110 'Mn' => ['\N{HEBREW POINT RAFE}'], 111 'Nd' => ["0", '\N{ARABIC-INDIC DIGIT ZERO}'], 112 'Pc' => ["_"], 113 'Po' => ["!"], 114 'Zs' => [" "], 115 'Cc' => ['\x{00}'], 116); 117 118# 119# Illegal properties 120# 121my @ILLEGAL_PROPERTIES = qw [q qrst]; 122 123my %d; 124 125while (my ($class, $chars) = each %SHORT_PROPERTIES) { 126 push @{$d {IsAlpha}} => map {$class =~ /^[LM]/ ? $_ : "!$_"} @$chars; 127 push @{$d {IsAlnum}} => map {$class =~ /^[LMN]./ ? $_ : "!$_"} @$chars; 128 push @{$d {IsASCII}} => map {length ($_) == 1 || $_ eq '\x{00}' 129 ? $_ : "!$_"} @$chars; 130 push @{$d {IsCntrl}} => map {$class =~ /^C/ ? $_ : "!$_"} @$chars; 131 push @{$d {IsBlank}} => map {$class =~ /^Z[lps]/ ? $_ : "!$_"} @$chars; 132 push @{$d {IsDigit}} => map {$class =~ /^Nd$/ ? $_ : "!$_"} @$chars; 133 push @{$d {IsGraph}} => map {$class =~ /^([LMNPS]|Co)/ 134 ? $_ : "!$_"} @$chars; 135 push @{$d {IsPrint}} => map {$class =~ /^([LMNPS]|Co|Zs)/ 136 ? $_ : "!$_"} @$chars; 137 push @{$d {IsLower}} => map {$class =~ /^Ll$/ ? $_ : "!$_"} @$chars; 138 push @{$d {IsUpper}} => map {$class =~ /^L[ut]/ ? $_ : "!$_"} @$chars; 139 push @{$d {IsPunct}} => map {$class =~ /^P/ ? $_ : "!$_"} @$chars; 140 push @{$d {IsWord}} => map {$class =~ /^[LMN]/ || $_ eq "_" 141 ? $_ : "!$_"} @$chars; 142 push @{$d {IsSpace}} => map {$class =~ /^Z/ || 143 length ($_) == 1 && ord ($_) >= 0x09 144 && ord ($_) <= 0x0D 145 ? $_ : "!$_"} @$chars; 146} 147 148delete $d {IsASCII} if $IS_EBCDIC; 149 150push @CLASSES => "# Short properties" => %SHORT_PROPERTIES, 151 "# POSIX like properties" => %d, 152 "# User defined properties" => @USER_DEFINED_PROPERTIES; 153 154 155# 156# Calculate the number of tests. 157# 158my $count = 0; 159for (my $i = 0; $i < @CLASSES; $i += 2) { 160 $i ++, redo if $CLASSES [$i] =~ /^\h*#\h*(.*)/; 161 $count += (length $CLASSES [$i] == 1 ? 4 : 2) * @{$CLASSES [$i + 1]}; 162} 163$count += 2 * @ILLEGAL_PROPERTIES; 164$count += 2 * grep {length $_ == 1} @ILLEGAL_PROPERTIES; 165 166my $tests = 0; 167 168say "1..$count"; 169 170run_tests unless caller (); 171 172sub match { 173 my ($char, $match, $nomatch) = @_; 174 175 my ($str, $name); 176 177 given ($char) { 178 when (/^\\/) { 179 $str = eval qq ["$char"]; 180 $name = qq ["$char"]; 181 } 182 when (/^0x([0-9A-Fa-f]+)$/) { 183 $str = chr hex $1; 184 $name = "chr ($char)"; 185 } 186 default { 187 $str = $char; 188 $name = qq ["$char"]; 189 } 190 } 191 192 print "not " unless $str =~ /$match/; 193 print "ok ", ++ $tests, " - $name =~ /$match/\n"; 194 print "not " unless $str !~ /$nomatch/; 195 print "ok ", ++ $tests, " - $name !~ /$nomatch/\n"; 196} 197 198sub run_tests { 199 200 while (@CLASSES) { 201 my $class = shift @CLASSES; 202 if ($class =~ /^\h*#\h*(.*)/) { 203 print "# $1\n"; 204 next; 205 } 206 last unless @CLASSES; 207 my $chars = shift @CLASSES; 208 my @in = grep {!/^!./} @$chars; 209 my @out = map {s/^!(?=.)//; $_} grep { /^!./} @$chars; 210 my $in_pat = eval qq ['\\p{$class}']; 211 my $out_pat = eval qq ['\\P{$class}']; 212 213 match $_, $in_pat, $out_pat for @in; 214 match $_, $out_pat, $in_pat for @out; 215 216 if (1 == length $class) { 217 my $in_pat = eval qq ['\\p$class']; 218 my $out_pat = eval qq ['\\P$class']; 219 220 match $_, $in_pat, $out_pat for @in; 221 match $_, $out_pat, $in_pat for @out; 222 } 223 } 224 225 226 my $pat = qr /^Can't find Unicode property definition/; 227 print "# Illegal properties\n"; 228 foreach my $p (@ILLEGAL_PROPERTIES) { 229 undef $@; 230 my $r = eval "'a' =~ /\\p{$p}/; 1"; 231 print "not " unless !$r && $@ && $@ =~ $pat; 232 print "ok ", ++ $tests, " - Unknown Unicode property \\p{$p}\n"; 233 undef $@; 234 my $s = eval "'a' =~ /\\P{$p}/; 1"; 235 print "not " unless !$s && $@ && $@ =~ $pat; 236 print "ok ", ++ $tests, " - Unknown Unicode property \\P{$p}\n"; 237 if (length $p == 1) { 238 undef $@; 239 my $r = eval "'a' =~ /\\p$p/; 1"; 240 print "not " unless !$r && $@ && $@ =~ $pat; 241 print "ok ", ++ $tests, " - Unknown Unicode property \\p$p\n"; 242 undef $@; 243 my $s = eval "'a' =~ /\\P$p/; 1"; 244 print "not " unless !$s && $@ && $@ =~ $pat; 245 print "ok ", ++ $tests, " - Unknown Unicode property \\P$p\n"; 246 } 247 } 248} 249 250 251# 252# User defined properties 253# 254 255sub InKana1 {<<'--'} 2563040 309F 25730A0 30FF 258-- 259 260sub InKana2 {<<'--'} 261+utf8::InHiragana 262+utf8::InKatakana 263-- 264 265sub InKana3 {<<'--'} 266+utf8::InHiragana 267+utf8::InKatakana 268-utf8::IsCn 269-- 270 271sub InNotKana {<<'--'} 272!utf8::InHiragana 273-utf8::InKatakana 274+utf8::IsCn 275-- 276 277sub InConsonant {<<'--'} # Not EBCDIC-aware. 2780061 007f 279-0061 280-0065 281-0069 282-006f 283-0075 284-- 285 286sub IsSyriac1 {<<'--'} 2870712 072C 2880730 074A 289-- 290 291sub Syriac1 {<<'--'} 2920712 072C 2930730 074A 294-- 295 296sub InGreekSmall {return "03B1\t03C9"} 297sub InGreekCapital {return "0391\t03A9\n-03A2"} 298 299sub AsciiHexAndDash {<<'--'} 300+utf8::ASCII_Hex_Digit 301+utf8::Dash 302-- 303 304__END__ 305