1#!./perl 2# 3# Tests that have to do with checking whether characters have (or not have) 4# certain Unicode properties; belong (or not belong) to blocks, scripts, etc. 5# 6 7use strict; 8use warnings; 9use 5.010; 10 11BEGIN { 12 require './test.pl'; 13 skip_all_if_miniperl("no dynamic loading on miniperl, no File::Spec (used by charnames)"); 14} 15 16sub run_tests; 17 18# 19# This is the data to test. 20# 21# This is a hash; keys are the property to test. 22# Values are arrays containing characters to test. The characters can 23# have the following formats: 24# '\N{CHARACTER NAME}' - Use character with that name 25# '\x{1234}' - Use character with that hex escape 26# '0x1234' - Use chr() to get that character 27# "a" - Character to use 28# 29# If a character entry starts with ! the character does not belong to the class 30# 31# If the class is just single letter, we use both \pL and \p{L} 32# 33 34use charnames ':full'; 35 36my @CLASSES = ( 37 L => ["a", "A"], 38 Ll => ["b", "!B"], 39 Lu => ["!c", "C"], 40 IsLl => ["d", "!D"], 41 IsLu => ["!e", "E"], 42 LC => ["f", "!1"], 43 'L&' => ["g", "!2"], 44 'Lowercase Letter' => ["h", "!H"], 45 46 Common => ["!i", "3"], 47 Inherited => ["!j", '\x{300}'], 48 49 InBasicLatin => ['\N{LATIN CAPITAL LETTER A}'], 50 InLatin1Supplement => ['\N{LATIN CAPITAL LETTER A WITH GRAVE}'], 51 InLatinExtendedA => ['\N{LATIN CAPITAL LETTER A WITH MACRON}'], 52 InLatinExtendedB => ['\N{LATIN SMALL LETTER B WITH STROKE}'], 53 InKatakana => ['\N{KATAKANA LETTER SMALL A}'], 54 IsLatin => ["0x100", "0x212b"], 55 IsHebrew => ["0x5d0", "0xfb4f"], 56 IsGreek => ["0x37a", "0x386", "!0x387", "0x388", 57 "0x38a", "!0x38b", "0x38c"], 58 HangulSyllables => ['\x{AC00}'], 59 'Script=Latin' => ['\x{0100}'], 60 'Block=LatinExtendedA' => ['\x{0100}'], 61 'Category=UppercaseLetter' => ['\x{0100}'], 62 63 # 64 # It's ok to repeat class names. 65 # 66 InLatin1Supplement => 67 $::IS_EBCDIC ? ['!\x{7f}', '\x{80}', '!\x{100}'] 68 : ['!\x{7f}', '\x{80}', '\x{ff}', '!\x{100}'], 69 InLatinExtendedA => 70 ['!\x{7f}', '!\x{80}', '!\x{ff}', '\x{100}'], 71 72 # 73 # Properties are case-insensitive, and may have whitespace, 74 # dashes and underscores. 75 # 76 'in-latin1_SUPPLEMENT' => ['\x{80}', 77 '\N{LATIN SMALL LETTER Y WITH DIAERESIS}'], 78 ' ^ In Latin 1 Supplement ' 79 => ['!\x{80}', '\N{COFFIN}'], 80 'latin-1 supplement' => ['\x{80}', "0xDF"], 81 82); 83 84my @USER_DEFINED_PROPERTIES = ( 85 # 86 # User defined properties 87 # 88 InKana1 => ['\x{3040}', '!\x{303F}'], 89 InKana2 => ['\x{3040}', '!\x{303F}'], 90 InKana3 => ['\x{3041}', '!\x{3040}'], 91 InNotKana => ['\x{3040}', '!\x{3041}'], 92 InConsonant => ['d', '!e'], 93 IsSyriac1 => ['\x{0712}', '!\x{072F}'], 94 IsSyriac1KanaMark => ['\x{309A}', '!\x{3090}'], 95 IsSyriac1KanaMark => ['\x{0730}', '!\x{0712}'], 96 '# User-defined character properties may lack \n at the end', 97 InGreekSmall => ['\N{GREEK SMALL LETTER PI}', 98 '\N{GREEK SMALL LETTER FINAL SIGMA}'], 99 InGreekCapital => ['\N{GREEK CAPITAL LETTER PI}', '!\x{03A2}'], 100 Dash => ['-'], 101 ASCII_Hex_Digit => ['!-', 'A'], 102 IsAsciiHexAndDash => ['-', 'A'], 103); 104 105my @USER_CASELESS_PROPERTIES = ( 106 # 107 # User defined properties which differ depending on /i. Second entry is 108 # false regularly, true under /i 109 # 110 'IsMyUpper' => ["M", "!m" ], 111); 112 113 114# 115# From the short properties we populate POSIX-like classes. 116# 117my %SHORT_PROPERTIES = ( 118 'Ll' => ['m', '\N{CYRILLIC SMALL LETTER A}'], 119 'Lu' => ['M', '\N{GREEK CAPITAL LETTER ALPHA}'], 120 'Lo' => ['\N{HIRAGANA LETTER SMALL A}'], 121 # is also in other alphabetic 122 'Mn' => ['\N{HEBREW POINT RAFE}'], 123 'Nd' => ["0", '\N{ARABIC-INDIC DIGIT ZERO}'], 124 'Pc' => ["_"], 125 'Po' => ["!"], 126 'Zs' => [" "], 127 'Cc' => ['\x{00}'], 128); 129 130# 131# Illegal properties 132# 133my @ILLEGAL_PROPERTIES = 134 qw[q qrst f foo isfoo infoo ISfoo INfoo Is::foo In::foo]; 135 136my %d; 137 138while (my ($class, $chars) = each %SHORT_PROPERTIES) { 139 push @{$d {IsAlpha}} => map {$class =~ /^[LM]/ ? $_ : "!$_"} @$chars; 140 push @{$d {IsAlnum}} => map {$class =~ /^[LMN]./ ? $_ : "!$_"} @$chars; 141 push @{$d {IsASCII}} => map {length ($_) == 1 || $_ eq '\x{00}' 142 ? $_ : "!$_"} @$chars; 143 push @{$d {IsCntrl}} => map {$class =~ /^C/ ? $_ : "!$_"} @$chars; 144 push @{$d {IsBlank}} => map {$class =~ /^Z[lps]/ ? $_ : "!$_"} @$chars; 145 push @{$d {IsDigit}} => map {$class =~ /^Nd$/ ? $_ : "!$_"} @$chars; 146 push @{$d {IsGraph}} => map {$class =~ /^([LMNPS]|Co)/ 147 ? $_ : "!$_"} @$chars; 148 push @{$d {IsPrint}} => map {$class =~ /^([LMNPS]|Co|Zs)/ 149 ? $_ : "!$_"} @$chars; 150 push @{$d {IsLower}} => map {$class =~ /^Ll$/ ? $_ : "!$_"} @$chars; 151 push @{$d {IsUpper}} => map {$class =~ /^L[ut]/ ? $_ : "!$_"} @$chars; 152 push @{$d {IsPunct}} => map {$class =~ /^P/ ? $_ : "!$_"} @$chars; 153 push @{$d {IsWord}} => map {$class =~ /^[LMN]/ || $_ eq "_" 154 ? $_ : "!$_"} @$chars; 155 push @{$d {IsSpace}} => map {$class =~ /^Z/ || 156 length ($_) == 1 && ord ($_) >= 0x09 157 && ord ($_) <= 0x0D 158 ? $_ : "!$_"} @$chars; 159} 160 161delete $d {IsASCII} if $::IS_EBCDIC; 162 163push @CLASSES => "# Short properties" => %SHORT_PROPERTIES, 164 "# POSIX like properties" => %d, 165 "# User defined properties" => @USER_DEFINED_PROPERTIES; 166 167 168# 169# Calculate the number of tests. 170# 171my $count = 0; 172for (my $i = 0; $i < @CLASSES; $i += 2) { 173 $i ++, redo if $CLASSES [$i] =~ /^\h*#\h*(.*)/; 174 $count += 2 * (length $CLASSES [$i] == 1 ? 4 : 2) * @{$CLASSES [$i + 1]}; 175} 176$count += 4 * @ILLEGAL_PROPERTIES; 177$count += 4 * grep {length $_ == 1} @ILLEGAL_PROPERTIES; 178$count += 8 * @USER_CASELESS_PROPERTIES; 179 180plan(tests => $count); 181 182run_tests unless caller (); 183 184sub match { 185 my ($char, $match, $nomatch, $caseless) = @_; 186 $caseless = "" unless defined $caseless; 187 $caseless = 'i' if $caseless; 188 189 my ($str, $name); 190 191 if ($char =~ /^\\/) { 192 $str = eval qq ["$char"]; 193 $name = qq ["$char"]; 194 } 195 elsif ($char =~ /^0x([0-9A-Fa-f]+)$/) { 196 $str = chr hex $1; 197 $name = "chr ($char)"; 198 } 199 else { 200 $str = $char; 201 $name = qq ["$char"]; 202 } 203 204 undef $@; 205 my $pat = "qr/$match/$caseless"; 206 my $match_pat = eval $pat; 207 is($@, '', "$pat compiled correctly to a regexp: $@"); 208 like($str, $match_pat, "$name correctly matched"); 209 210 undef $@; 211 $pat = "qr/$nomatch/$caseless"; 212 my $nomatch_pat = eval $pat; 213 is($@, '', "$pat compiled correctly to a regexp: $@"); 214 unlike($str, $nomatch_pat, "$name correctly did not match"); 215} 216 217sub run_tests { 218 219 while (@CLASSES) { 220 my $class = shift @CLASSES; 221 if ($class =~ /^\h*#\h*(.*)/) { 222 print "# $1\n"; 223 next; 224 } 225 last unless @CLASSES; 226 my $chars = shift @CLASSES; 227 my @in = grep {!/^!./} @$chars; 228 my @out = map {s/^!(?=.)//; $_} grep { /^!./} @$chars; 229 my $in_pat = eval qq ['\\p{$class}']; 230 my $out_pat = eval qq ['\\P{$class}']; 231 232 match $_, $in_pat, $out_pat for @in; 233 match $_, $out_pat, $in_pat for @out; 234 235 if (1 == length $class) { # Repeat without braces if name length 1 236 my $in_pat = eval qq ['\\p$class']; 237 my $out_pat = eval qq ['\\P$class']; 238 239 match $_, $in_pat, $out_pat for @in; 240 match $_, $out_pat, $in_pat for @out; 241 } 242 } 243 244 245 my $pat = qr /^Can't find Unicode property definition/; 246 print "# Illegal properties\n"; 247 foreach my $p (@ILLEGAL_PROPERTIES) { 248 undef $@; 249 my $r = eval "'a' =~ /\\p{$p}/; 1"; 250 is($r, undef, "Unknown Unicode property \\p{$p}"); 251 like($@, $pat, "Unknown Unicode property \\p{$p}"); 252 undef $@; 253 my $s = eval "'a' =~ /\\P{$p}/; 1"; 254 is($s, undef, "Unknown Unicode property \\p{$p}"); 255 like($@, $pat, "Unknown Unicode property \\p{$p}"); 256 if (length $p == 1) { 257 undef $@; 258 my $r = eval "'a' =~ /\\p$p/; 1"; 259 is($r, undef, "Unknown Unicode property \\p$p"); 260 like($@, $pat, "Unknown Unicode property \\p$p"); 261 undef $@; 262 my $s = eval "'a' =~ /\\P$p/; 1"; 263 is($r, undef, "Unknown Unicode property \\P$p"); 264 like($@, $pat, "Unknown Unicode property \\P$p"); 265 } 266 } 267 268 print "# User-defined properties with /i differences\n"; 269 foreach my $class (shift @USER_CASELESS_PROPERTIES) { 270 my $chars_ref = shift @USER_CASELESS_PROPERTIES; 271 my @in = grep {!/^!./} @$chars_ref; 272 my @out = map {s/^!(?=.)//; $_} grep { /^!./} @$chars_ref; 273 my $in_pat = eval qq ['\\p{$class}']; 274 my $out_pat = eval qq ['\\P{$class}']; 275 276 # Verify works as regularly for not /i 277 match $_, $in_pat, $out_pat for @in; 278 match $_, $out_pat, $in_pat for @out; 279 280 # Verify that adding /i doesn't change the in set. 281 match $_, $in_pat, $out_pat, 'i' for @in; 282 283 # Verify that adding /i does change the out set to match. 284 match $_, $in_pat, $out_pat, 'i' for @out; 285 } 286} 287 288 289# 290# User defined properties 291# 292 293sub InKana1 {<<'--'} 2943040 309F 29530A0 30FF 296-- 297 298sub InKana2 {<<'--'} 299+utf8::InHiragana 300+utf8::InKatakana 301-- 302 303sub InKana3 {<<'--'} 304+utf8::InHiragana 305+utf8::InKatakana 306-utf8::IsCn 307-- 308 309sub InNotKana {<<'--'} 310!utf8::InHiragana 311-utf8::InKatakana 312+utf8::IsCn 313-- 314 315sub InConsonant {<<'--'} # Not EBCDIC-aware. 3160061 007f 317-0061 318-0065 319-0069 320-006f 321-0075 322-- 323 324sub IsSyriac1 {<<'--'} 3250712 072C 3260730 074A 327-- 328 329sub InGreekSmall {return "03B1\t03C9"} 330sub InGreekCapital {return "0391\t03A9\n-03A2"} 331 332sub IsAsciiHexAndDash {<<'--'} 333+utf8::ASCII_Hex_Digit 334+utf8::Dash 335-- 336 337sub IsMyUpper { 338 my $caseless = shift; 339 if ($caseless) { 340 return "0041\t005A\n0061\t007A" 341 } 342 else { 343 return "0041\t005A" 344 } 345} 346 347# Verify that can use user-defined properties inside another one 348sub IsSyriac1KanaMark {<<'--'} 349+main::IsSyriac1 350+main::InKana3 351&utf8::IsMark 352-- 353 354# fake user-defined properties; these subs shouldn't be called, because 355# their names don't start with In or Is 356 357sub f { die } 358sub foo { die } 359sub isfoo { die } 360sub infoo { die } 361sub ISfoo { die } 362sub INfoo { die } 363sub Is::foo { die } 364sub In::foo { die } 365__END__ 366