1# Test the /a, /d, etc regex modifiers 2# For comprehensive tests, set $ENV{PERL_DEBUG_FULL_TEST} to some true value 3 4BEGIN { 5 chdir 't' if -d 't'; 6 require './test.pl'; 7 set_up_inc('../lib', '../dist/if'); 8 require './loc_tools.pl'; 9} 10 11use strict; 12use warnings; 13no warnings 'locale'; # Some /l tests use above-latin1 chars to make sure 14 # they work, even though they warn. 15use Config; 16 17plan('no_plan'); 18 19# Each case is a valid element of its hash key. Choose, where available, at 20# least one of each type: ASCII-range, non-ASCII range Latin-1, and above 21# Latin1 range code point. 22my %testcases = ( 23 '\w' => [ 0x16B ], 24 '\d' => [ ord("0"), ord("1"), ord("2"), ord("3"), ord("4"), ord("5"), 25 ord("6"), ord("7"), ord("8"), ord("9"), 26 0x0662, 27 ], 28 '[:blank:]' => [ ord("\t"), ord(" "), 29 0x1680 30 ], 31 '\s' => [ ord("\t"), ord("\n"), ord("\cK"), ord("\f"), ord("\r"), 32 ord(" "), 33 utf8::unicode_to_native(0x85), 34 utf8::unicode_to_native(0xA0), 35 0x2029, 36 ], 37 '[:graph:]' => [ 0x02C7 ], 38 '[:lower:]' => [ ord("a"), ord("b"), ord("c"), ord("d"), ord("e"), 39 ord("f"), ord("g"), ord("h"), ord("i"), ord("j"), 40 ord("k"), ord("l"), ord("m"), ord("n"), ord("o"), 41 ord("p"), ord("q"), ord("r"), ord("s"), ord("t"), 42 ord("u"), ord("v"), ord("w"), ord("x"), ord("y"), 43 ord("z"), 44 0x0127 ], 45 '[:punct:]' => [ ord('`'), ord('^'), ord('~'), ord('<'), ord('='), 46 ord('>'), ord('|'), ord('-'), ord(','), ord(';'), 47 ord(':'), ord('!'), ord('?'), ord('/'), ord('.'), 48 ord('"'), ord('('), ord(')'), ord('['), ord(']'), 49 ord('{'), ord('}'), ord('@'), ord('$'), ord('*'), 50 ord('\\'), ord('&'), ord('#'), ord('%'), ord('+'), 51 ord("'"), 52 0x055C 53 ], 54 '[:upper:]' => [ ord("A"), ord("B"), ord("C"), ord("D"), ord("E"), 55 ord("F"), ord("G"), ord("H"), ord("I"), ord("J"), 56 ord("K"), ord("L"), ord("M"), ord("N"), ord("O"), 57 ord("P"), ord("Q"), ord("R"), ord("S"), ord("T"), 58 ord("U"), ord("V"), ord("W"), ord("X"), ord("Y"), 59 ord("Z"), 60 0x0126 61 ], 62 '[:xdigit:]' => [ ord("0"), ord("1"), ord("2"), ord("3"), ord("4"), 63 ord("5"), ord("6"), ord("7"), ord("8"), ord("9"), 64 ord("A"), ord("B"), ord("C"), ord("D"), ord("E"), 65 ord("F"), ord("a"), ord("b"), ord("c"), ord("d"), 66 ord("e"), ord("f"), 67 0xFF15, 68 ], 69); 70 71if ($ENV{PERL_DEBUG_FULL_TEST}) { 72 push @{$testcases{'[:cntrl:]'}}, utf8::unicode_to_native($_) 73 for (0x00 .. 0x1F, 0x7F .. 0x9F); 74 push @{$testcases{'[:blank:]'}}, utf8::unicode_to_native(0xA0); 75 push @{$testcases{'[:punct:]'}}, utf8::unicode_to_native(0xA1); 76 push @{$testcases{'[:graph:]'}}, utf8::unicode_to_native($_) 77 for (0xA2 .. 0xA6); 78 push @{$testcases{'[:punct:]'}}, utf8::unicode_to_native(0xA7); 79 push @{$testcases{'[:graph:]'}}, utf8::unicode_to_native(0xA8); 80 push @{$testcases{'[:graph:]'}}, utf8::unicode_to_native(0xA9); 81 push @{$testcases{'[:lower:]'}}, utf8::unicode_to_native(0xAA); 82 push @{$testcases{'[:punct:]'}}, utf8::unicode_to_native(0xAB); 83 push @{$testcases{'[:graph:]'}}, utf8::unicode_to_native($_) 84 for (0xAC .. 0xB4); 85 push @{$testcases{'[:lower:]'}}, utf8::unicode_to_native(0xB5); 86 push @{$testcases{'[:punct:]'}}, utf8::unicode_to_native(0xB6); 87 push @{$testcases{'[:punct:]'}}, utf8::unicode_to_native(0xB7); 88 push @{$testcases{'[:graph:]'}}, utf8::unicode_to_native(0xB8); 89 push @{$testcases{'[:graph:]'}}, utf8::unicode_to_native(0xB9); 90 push @{$testcases{'[:lower:]'}}, utf8::unicode_to_native(0xBA); 91 push @{$testcases{'[:punct:]'}}, utf8::unicode_to_native(0xBB); 92 push @{$testcases{'[:graph:]'}}, utf8::unicode_to_native(0xBC); 93 push @{$testcases{'[:graph:]'}}, utf8::unicode_to_native(0xBD); 94 push @{$testcases{'[:graph:]'}}, utf8::unicode_to_native(0xBE); 95 push @{$testcases{'[:punct:]'}}, utf8::unicode_to_native(0xBF); 96 push @{$testcases{'[:upper:]'}}, utf8::unicode_to_native($_) 97 for (0xC0 .. 0xD6); 98 push @{$testcases{'[:graph:]'}}, utf8::unicode_to_native(0xD7); 99 push @{$testcases{'[:upper:]'}}, utf8::unicode_to_native($_) 100 for (0xD8 .. 0xDE); 101 push @{$testcases{'[:lower:]'}}, utf8::unicode_to_native($_) 102 for (0xDF .. 0xF6); 103 push @{$testcases{'[:graph:]'}}, utf8::unicode_to_native(0xF7); 104 push @{$testcases{'[:lower:]'}}, utf8::unicode_to_native($_) 105 for (0xF8 .. 0xF8); 106 107 push @{$testcases{'[:alpha:]'}}, @{$testcases{'[:lower:]'}}, 108 @{$testcases{'[:upper:]'}}; 109 push @{$testcases{'[:alnum:]'}}, @{$testcases{'[:alpha:]'}}, 110 @{$testcases{'\d'}}; 111 push @{$testcases{'\w'}}, @{$testcases{'[:alnum:]'}}, ord("_"); 112 push @{$testcases{'[:print:]'}}, @{$testcases{'[:graph:]'}}, 113 ord(" "), 114 utf8::unicode_to_native(0xA0); 115} 116 117@{$testcases{'[:digit:]'}} = @{$testcases{'\d'}}; 118@{$testcases{'[:space:]'}} = @{$testcases{'\s'}}; 119@{$testcases{'[:word:]'}} = @{$testcases{'\w'}}; 120 121#use Data::Dumper; 122#$Data::Dumper::Sortkeys = 1; 123#print STDERR Dumper \%testcases; 124 125my $utf8_locale; 126 127my @charsets = qw(a d u aa); 128my $locales_ok = locales_enabled([ 'LC_CTYPE', 'LC_ALL' ]); 129if (! is_miniperl() && $locales_ok) { 130 require POSIX; 131 my $current_locale = POSIX::setlocale( &POSIX::LC_ALL, "C") // ""; 132 if ($current_locale eq 'C') { 133 134 # test for d_setlocale is repeated here because this one is compile 135 # time, and the one above is run time 136 use if $Config{d_setlocale}, 'locale'; 137 138 # Some implementations don't have the 128-255 range characters all 139 # mean nothing under the C locale (an example being VMS). This is 140 # legal, but since we don't know what the right answers should be, 141 # skip the locale tests in that situation. 142 for my $i (128 .. 255) { 143 goto skip_adding_C_locale 144 if chr(utf8::unicode_to_native($i)) =~ /[[:print:]]/; 145 } 146 push @charsets, 'l'; 147 148 skip_adding_C_locale: 149 150 # Use a pseudo-modifier 'L' to indicate to use /l with a UTF-8 locale 151 $utf8_locale = find_utf8_ctype_locale(); 152 push @charsets, 'L' if defined $utf8_locale; 153 } 154} 155 156# For each possible character set... 157foreach my $charset (@charsets) { 158 my $locale; 159 my $charset_mod = lc $charset; 160 my $charset_display; 161 if ($charset_mod eq 'l') { 162 $locale = POSIX::setlocale(&POSIX::LC_ALL, ($charset eq 'l') 163 ? "C" 164 : $utf8_locale 165 ); 166 die "Couldn't change to locale " . (($charset eq 'l') ? "C" : $utf8_locale) unless $locale; 167 $charset_display = $charset_mod . " ($locale)"; 168 } 169 else { 170 $charset_display = $charset_mod; 171 } 172 173 # And in utf8 or not 174 foreach my $upgrade ("", 'utf8::upgrade($a); ') { 175 176 # reverse gets the, \w, \s, \d first. 177 for my $class (reverse sort keys %testcases) { 178 179 # The complement of \w is \W; of [:posix:] is [:^posix:] 180 my $complement = $class; 181 if ($complement !~ s/ ( \[: ) /$1^/x) { 182 $complement = uc($class); 183 } 184 185 # For each test case 186 foreach my $ord (@{$testcases{$class}}) { 187 my $char = chr($ord); 188 $char = ($char eq '$') ? '\$' : display($char); 189 190 # > 255 already implies upgraded. Skip the ones that don't 191 # have an explicit upgrade. This shows more clearly in the 192 # output which tests are in utf8, or not. 193 next if $ord > 255 && ! $upgrade; 194 195 my $reason = ""; # Explanation output with each test 196 my $neg_reason = ""; 197 my $match = 1; # Calculated whether test regex should 198 # match or not 199 200 # Everything always matches in ASCII, or under /u, or under /l 201 # with a UTF-8 locale 202 if (utf8::native_to_unicode($ord) < 128 203 || $charset eq 'u' 204 || $charset eq 'L') 205 { 206 $reason = "\"$char\" is a $class under /$charset_display"; 207 $neg_reason = "\"$char\" is not a $complement under /$charset_display"; 208 } 209 elsif ($charset eq "a" || $charset eq "aa") { 210 $match = 0; 211 $reason = "\"$char\" is non-ASCII, which can't be a $class under /$charset_display"; 212 $neg_reason = "\"$char\" is non-ASCII, which is a $complement under /$charset_display"; 213 } 214 elsif ($ord > 255) { 215 $reason = "\"$char\" is a $class under /$charset_display"; 216 $neg_reason = "\"$char\" is not a $complement under /$charset_display"; 217 } 218 elsif ($charset eq 'l') { 219 220 # We are using the C locale, which is essentially ASCII, 221 # but under utf8, the above-latin1 chars are treated as 222 # Unicode) 223 $reason = "\"$char\" is not a $class in the C locale under /$charset_mod"; 224 $neg_reason = "\"$char\" is a $complement in the C locale under /$charset_mod"; 225 $match = 0; 226 } 227 elsif ($upgrade) { 228 $reason = "\"$char\" is a $class in utf8 under /$charset_display"; 229 $neg_reason = "\"$char\" is not a $complement in utf8 under /$charset_display"; 230 } 231 else { 232 $reason = "\"$char\" is above-ASCII latin1, which requires utf8 to be a $class under /$charset_display"; 233 $neg_reason = "\"$char\" is above-ASCII latin1, which is a $complement under /$charset_display (unless in utf8)"; 234 $match = 0; 235 } 236 $reason = "; $reason" if $reason; 237 $neg_reason = "; $neg_reason" if $neg_reason; 238 239 my $op; 240 my $neg_op; 241 if ($match) { 242 $op = '=~'; 243 $neg_op = '!~'; 244 } 245 else { 246 $op = '!~'; 247 $neg_op = '=~'; 248 } 249 250 # In [...] or not 251 foreach my $bracketed (0, 1) { 252 my $lb = ""; 253 my $rb = ""; 254 if ($bracketed) { 255 256 # Adds an extra char to the character class to make sure 257 # that the class doesn't get optimized away. (Make 258 # sure to not use the character being tested.) 259 my $extra = ($char eq "_") ? ":" : "_"; 260 $lb = ($bracketed) ? "[$extra" : ""; 261 $rb = ($bracketed) ? ']' : ""; 262 } 263 else { # [:posix:] must be inside outer [ ] 264 next if $class =~ /\[/; 265 } 266 267 my $length = 10; # For regexec.c regrepeat() cases by 268 # matching more than one item 269 # Test both class and its complement, and with one or more 270 # than one item to match. 271 foreach my $eval ( 272 qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb ) /x], 273 qq[my \$a = "$char" x $length; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb\{$length} ) /x], 274 ) { 275 ok (eval $eval, $eval . $reason); 276 } 277 foreach my $eval ( 278 qq[my \$a = "$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb ) /x], 279 qq[my \$a = "$char" x $length; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb\{$length} ) /x], 280 ) { 281 ok (eval $eval, $eval . $neg_reason); 282 } 283 } 284 285 next if $class ne '\w'; 286 287 # Test \b, \B at beginning and end of string 288 foreach my $eval ( 289 qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: ^ \\b . ) /x], 290 qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b \$) /x], 291 ) { 292 ok (eval $eval, $eval . $reason); 293 } 294 foreach my $eval ( 295 qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: ^ \\B . ) /x], 296 qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: . \\B \$ ) /x], 297 ) { 298 ok (eval $eval, $eval . $neg_reason); 299 } 300 301 # Test \b, \B adjacent to a non-word char, both before it and 302 # after. We test with ASCII, Latin1 and Unicode non-word chars 303 foreach my $space_ord (@{$testcases{'\s'}}) { 304 305 # This is an anomalous character, so skip. 306 next if $space_ord == ord("\n"); 307 308 # Useless to try to test non-utf8 when the ord itself 309 # forces utf8 310 next if $space_ord > 255 && ! $upgrade; 311 312 my $space = display(chr $space_ord); 313 314 foreach my $eval ( 315 qq[my \$a = "$space$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x], 316 qq[my \$a = "$char$space"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x], 317 ) { 318 ok (eval $eval, $eval . $reason . "; \"$space\" is not a \\w"); 319 } 320 foreach my $eval ( 321 qq[my \$a = "$space$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x], 322 qq[my \$a = "$char$space"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x], 323 ) { 324 ok (eval $eval, $eval . $neg_reason . "; \"$space\" is not a \\w"); 325 } 326 } 327 328 # Test \b, \B in the middle of two nominally word chars, but 329 # one or both may be considered non-word depending on range 330 # and charset. 331 foreach my $other_ord (@{$testcases{'\w'}}) { 332 next if $other_ord > 255 && ! $upgrade; 333 my $other = display(chr $other_ord); 334 335 # Determine if the other char is a word char in current 336 # circumstances 337 my $other_is_word = 1; 338 my $other_reason = "\"$other\" is a $class under /$charset_display"; 339 my $other_neg_reason = "\"$other\" is not a $complement under /$charset_display"; 340 if (utf8::native_to_unicode($other_ord) > 127 341 && $charset ne 'u' && $charset ne 'L' 342 && (($charset eq "a" || $charset eq "aa") 343 || ($other_ord < 256 && ($charset eq 'l' || ! $upgrade)))) 344 { 345 $other_is_word = 0; 346 $other_reason = "\"$other\" is not a $class under /$charset_display"; 347 $other_neg_reason = "\"$other\" is a $complement under /$charset_display"; 348 } 349 my $both_reason = $reason; 350 $both_reason .= "; $other_reason" if $other_ord != $ord; 351 my $both_neg_reason = $neg_reason; 352 $both_neg_reason .= "; $other_neg_reason" if $other_ord != $ord; 353 354 # If both are the same wordness, then \b will fail; \B 355 # succeed 356 if ($match == $other_is_word) { 357 $op = '!~'; 358 $neg_op = '=~'; 359 } 360 else { 361 $op = '=~'; 362 $neg_op = '!~'; 363 } 364 365 foreach my $eval ( 366 qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: $other \\b $char ) /x], 367 qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: $char \\b $other ) /x], 368 ) { 369 ok (eval $eval, $eval . $both_reason); 370 } 371 foreach my $eval ( 372 qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $other \\B $char ) /x], 373 qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: $char \\B $other ) /x], 374 ) { 375 ok (eval $eval, $eval . $both_neg_reason); 376 } 377 378 next if $other_ord == $ord; 379 380 # These start with the \b or \B. They are included, based 381 # on source code analysis, to force the testing of the FBC 382 # (find_by_class) portions of regexec.c. 383 foreach my $eval ( 384 qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: \\b $char ) /x], 385 qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: \\b $other ) /x], 386 ) { 387 ok (eval $eval, $eval . $both_reason); 388 } 389 foreach my $eval ( 390 qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $char ) /x], 391 qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $other ) /x], 392 ) { 393 ok (eval $eval, $eval . $both_neg_reason); 394 } 395 } 396 } # End of each test case in a class 397 } # End of \w, \s, ... 398 } # End of utf8 upgraded or not 399} 400 401plan(curr_test() - 1); 402