1# Test the /a, /d, etc regex modifiers 2 3BEGIN { 4 chdir 't' if -d 't'; 5 @INC = '../lib'; 6 require './test.pl'; 7 require './loc_tools.pl'; 8} 9 10use strict; 11use warnings; 12use Config; 13 14plan('no_plan'); 15 16# Each case is a valid element of its hash key. Choose, where available, an 17# ASCII-range, Latin-1 non-ASCII range, and above Latin1 range code point. 18my %testcases = ( 19 '\w' => [ ord("A"), 0xE2, 0x16B ], # Below expects these to all be alpha 20 '\d' => [ ord("0"), 0x0662 ], 21 '\s' => [ ord("\t"), 0xA0, 0x1680 ], # Below expects these to be [:blank:] 22 '[:cntrl:]' => [ 0x00, 0x88 ], 23 '[:graph:]' => [ ord("&"), 0xF7, 0x02C7 ], # Below expects these to be 24 # [:print:] 25 '[:lower:]' => [ ord("g"), 0xE3, 0x0127 ], 26 '[:punct:]' => [ ord("!"), 0xBF, 0x055C ], 27 '[:upper:]' => [ ord("G"), 0xC3, 0x0126 ], 28 '[:xdigit:]' => [ ord("4"), 0xFF15 ], 29); 30 31$testcases{'[:digit:]'} = $testcases{'\d'}; 32$testcases{'[:alnum:]'} = $testcases{'\w'}; 33$testcases{'[:alpha:]'} = $testcases{'\w'}; 34$testcases{'[:blank:]'} = $testcases{'\s'}; 35$testcases{'[:print:]'} = $testcases{'[:graph:]'}; 36$testcases{'[:space:]'} = $testcases{'\s'}; 37$testcases{'[:word:]'} = $testcases{'\w'}; 38 39my $utf8_locale; 40 41my @charsets = qw(a d u aa); 42if (! is_miniperl() && $Config{d_setlocale}) { 43 require POSIX; 44 my $current_locale = POSIX::setlocale( &POSIX::LC_ALL, "C") // ""; 45 if ($current_locale eq 'C') { 46 47 # test for d_setlocale is repeated here because this one is compile 48 # time, and the one above is run time 49 use if $Config{d_setlocale}, 'locale'; 50 51 # Some implementations don't have the 128-255 range characters all 52 # mean nothing under the C locale (an example being VMS). This is 53 # legal, but since we don't know what the right answers should be, 54 # skip the locale tests in that situation. 55 for my $i (128 .. 255) { 56 goto skip_adding_C_locale if chr($i) =~ /[[:print:]]/; 57 } 58 push @charsets, 'l'; 59 60 skip_adding_C_locale: 61 62 # Use a pseudo-modifier 'L' to indicate to use /l with a UTF-8 locale 63 $utf8_locale = find_utf8_ctype_locale(); 64 push @charsets, 'L' if defined $utf8_locale; 65 } 66} 67 68# For each possible character set... 69foreach my $charset (@charsets) { 70 my $locale; 71 my $charset_mod = lc $charset; 72 my $charset_display; 73 if ($charset_mod eq 'l') { 74 $locale = POSIX::setlocale(&POSIX::LC_ALL, ($charset eq 'l') 75 ? "C" 76 : $utf8_locale 77 ); 78 die "Couldn't change locale" unless $locale; 79 $charset_display = $charset_mod . " ($locale)"; 80 } 81 else { 82 $charset_display = $charset_mod; 83 } 84 85 # And in utf8 or not 86 foreach my $upgrade ("", 'utf8::upgrade($a); ') { 87 88 # reverse gets the, \w, \s, \d first. 89 for my $class (reverse sort keys %testcases) { 90 91 # The complement of \w is \W; of [:posix:] is [:^posix:] 92 my $complement = $class; 93 if ($complement !~ s/ ( \[: ) /$1^/x) { 94 $complement = uc($class); 95 } 96 97 # For each test case 98 foreach my $ord (@{$testcases{$class}}) { 99 my $char = display(chr($ord)); 100 101 # > 255 already implies upgraded. Skip the ones that don't 102 # have an explicit upgrade. This shows more clearly in the 103 # output which tests are in utf8, or not. 104 next if $ord > 255 && ! $upgrade; 105 106 my $reason = ""; # Explanation output with each test 107 my $neg_reason = ""; 108 my $match = 1; # Calculated whether test regex should 109 # match or not 110 111 # Everything always matches in ASCII, or under /u 112 if ($ord < 128 || $charset eq 'u' || $charset eq 'L') { 113 $reason = "\"$char\" is a $class under /$charset_display"; 114 $neg_reason = "\"$char\" is not a $complement under /$charset_display"; 115 } 116 elsif ($charset eq "a" || $charset eq "aa") { 117 $match = 0; 118 $reason = "\"$char\" is non-ASCII, which can't be a $class under /a"; 119 $neg_reason = "\"$char\" is non-ASCII, which is a $complement under /a"; 120 } 121 elsif ($ord > 255) { 122 $reason = "\"$char\" is a $class under /$charset_display"; 123 $neg_reason = "\"$char\" is not a $complement under /$charset_display"; 124 } 125 elsif ($charset eq 'l') { 126 127 # We are using the C locale, which is essentially ASCII, 128 # but under utf8, the above-latin1 chars are treated as 129 # Unicode) 130 $reason = "\"$char\" is not a $class in the C locale under /l"; 131 $neg_reason = "\"$char\" is a $complement in the C locale under /l"; 132 $match = 0; 133 } 134 elsif ($upgrade) { 135 $reason = "\"$char\" is a $class in utf8 under /d"; 136 $neg_reason = "\"$char\" is not a $complement in utf8 under /d"; 137 } 138 else { 139 $reason = "\"$char\" is above-ASCII latin1, which requires utf8 to be a $class under /d"; 140 $neg_reason = "\"$char\" is above-ASCII latin1, which is a $complement under /d (unless in utf8)"; 141 $match = 0; 142 } 143 $reason = "; $reason" if $reason; 144 $neg_reason = "; $neg_reason" if $neg_reason; 145 146 my $op; 147 my $neg_op; 148 if ($match) { 149 $op = '=~'; 150 $neg_op = '!~'; 151 } 152 else { 153 $op = '!~'; 154 $neg_op = '=~'; 155 } 156 157 # In [...] or not 158 foreach my $bracketed (0, 1) { 159 my $lb = ""; 160 my $rb = ""; 161 if ($bracketed) { 162 163 # Adds an extra char to the character class to make sure 164 # that the class doesn't get optimized away. 165 $lb = ($bracketed) ? '[_' : ""; 166 $rb = ($bracketed) ? ']' : ""; 167 } 168 else { # [:posix:] must be inside outer [ ] 169 next if $class =~ /\[/; 170 } 171 172 my $length = 10; # For regexec.c regrepeat() cases by 173 # matching more than one item 174 # Test both class and its complement, and with one or more 175 # than one item to match. 176 foreach my $eval ( 177 qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb ) /x], 178 qq[my \$a = "$char" x $length; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb\{$length} ) /x], 179 ) { 180 ok (eval $eval, $eval . $reason); 181 } 182 foreach my $eval ( 183 qq[my \$a = "$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb ) /x], 184 qq[my \$a = "$char" x $length; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb\{$length} ) /x], 185 ) { 186 ok (eval $eval, $eval . $neg_reason); 187 } 188 } 189 190 next if $class ne '\w'; 191 192 # Test \b, \B at beginning and end of string 193 foreach my $eval ( 194 qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: ^ \\b . ) /x], 195 qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b \$) /x], 196 ) { 197 ok (eval $eval, $eval . $reason); 198 } 199 foreach my $eval ( 200 qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: ^ \\B . ) /x], 201 qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: . \\B \$ ) /x], 202 ) { 203 ok (eval $eval, $eval . $neg_reason); 204 } 205 206 # Test \b, \B adjacent to a non-word char, both before it and 207 # after. We test with ASCII, Latin1 and Unicode non-word chars 208 foreach my $space_ord (@{$testcases{'\s'}}) { 209 210 # Useless to try to test non-utf8 when the ord itself 211 # forces utf8 212 next if $space_ord > 255 && ! $upgrade; 213 214 my $space = display(chr $space_ord); 215 216 foreach my $eval ( 217 qq[my \$a = "$space$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x], 218 qq[my \$a = "$char$space"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x], 219 ) { 220 ok (eval $eval, $eval . $reason . "; \"$space\" is not a \\w"); 221 } 222 foreach my $eval ( 223 qq[my \$a = "$space$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x], 224 qq[my \$a = "$char$space"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x], 225 ) { 226 ok (eval $eval, $eval . $neg_reason . "; \"$space\" is not a \\w"); 227 } 228 } 229 230 # Test \b, \B in the middle of two nominally word chars, but 231 # one or both may be considered non-word depending on range 232 # and charset. 233 foreach my $other_ord (@{$testcases{'\w'}}) { 234 next if $other_ord > 255 && ! $upgrade; 235 my $other = display(chr $other_ord); 236 237 # Determine if the other char is a word char in current 238 # circumstances 239 my $other_is_word = 1; 240 my $other_reason = "\"$other\" is a $class under /$charset_display"; 241 my $other_neg_reason = "\"$other\" is not a $complement under /$charset_display"; 242 if ($other_ord > 127 243 && $charset ne 'u' && $charset ne 'L' 244 && (($charset eq "a" || $charset eq "aa") 245 || ($other_ord < 256 && ($charset eq 'l' || ! $upgrade)))) 246 { 247 $other_is_word = 0; 248 $other_reason = "\"$other\" is not a $class under /$charset_display"; 249 $other_neg_reason = "\"$other\" is a $complement under /$charset_display"; 250 } 251 my $both_reason = $reason; 252 $both_reason .= "; $other_reason" if $other_ord != $ord; 253 my $both_neg_reason = $neg_reason; 254 $both_neg_reason .= "; $other_neg_reason" if $other_ord != $ord; 255 256 # If both are the same wordness, then \b will fail; \B 257 # succeed 258 if ($match == $other_is_word) { 259 $op = '!~'; 260 $neg_op = '=~'; 261 } 262 else { 263 $op = '=~'; 264 $neg_op = '!~'; 265 } 266 267 foreach my $eval ( 268 qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: $other \\b $char ) /x], 269 qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: $char \\b $other ) /x], 270 ) { 271 ok (eval $eval, $eval . $both_reason); 272 } 273 foreach my $eval ( 274 qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $other \\B $char ) /x], 275 qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: $char \\B $other ) /x], 276 ) { 277 ok (eval $eval, $eval . $both_neg_reason); 278 } 279 280 next if $other_ord == $ord; 281 282 # These start with the \b or \B. They are included, based 283 # on source code analysis, to force the testing of the FBC 284 # (find_by_class) portions of regexec.c. 285 foreach my $eval ( 286 qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: \\b $char ) /x], 287 qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: \\b $other ) /x], 288 ) { 289 ok (eval $eval, $eval . $both_reason); 290 } 291 foreach my $eval ( 292 qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $char ) /x], 293 qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $other ) /x], 294 ) { 295 ok (eval $eval, $eval . $both_neg_reason); 296 } 297 } 298 } # End of each test case in a class 299 } # End of \w, \s, ... 300 } # End of utf8 upgraded or not 301} 302 303plan(curr_test() - 1); 304