1 2BEGIN { 3 unless ("A" eq pack('U', 0x41)) { 4 print "1..0 # Unicode::Collate " . 5 "cannot stringify a Unicode code point\n"; 6 exit 0; 7 } 8 if ($ENV{PERL_CORE}) { 9 chdir('t') if -d 't'; 10 @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); 11 } 12} 13 14use strict; 15use warnings; 16BEGIN { $| = 1; print "1..96\n"; } 17my $count = 0; 18sub ok ($;$) { 19 my $p = my $r = shift; 20 if (@_) { 21 my $x = shift; 22 $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x; 23 } 24 print $p ? "ok" : "not ok", ' ', ++$count, "\n"; 25} 26 27use Unicode::Collate; 28 29ok(1); 30 31######################### 32 33sub _pack_U { Unicode::Collate::pack_U(@_) } 34sub _unpack_U { Unicode::Collate::unpack_U(@_) } 35 36my $A_acute = _pack_U(0xC1); 37my $a_acute = _pack_U(0xE1); 38my $acute = _pack_U(0x0301); 39 40my $hiragana = "\x{3042}\x{3044}"; 41my $katakana = "\x{30A2}\x{30A4}"; 42 43# 1 44 45my $Collator = Unicode::Collate->new( 46 table => 'keys.txt', 47 normalization => undef, 48); 49 50ok(ref $Collator, "Unicode::Collate"); 51 52ok($Collator->cmp("", ""), 0); 53ok($Collator->eq("", "")); 54ok($Collator->cmp("", "perl"), -1); 55 56ok( 57 join(':', $Collator->sort( qw/ acha aca ada acia acka / ) ), 58 join(':', qw/ aca acha acia acka ada / ), 59); 60 61ok( 62 join(':', $Collator->sort( qw/ ACHA ACA ADA ACIA ACKA / ) ), 63 join(':', qw/ ACA ACHA ACIA ACKA ADA / ), 64); 65 66# 7 67 68ok($Collator->cmp("A$acute", $A_acute), 0); # @version 3.1.1 (prev: -1) 69ok($Collator->cmp($a_acute, $A_acute), -1); 70ok($Collator->eq("A\cA$acute", $A_acute)); # UCA v9. \cA is invariant. 71 72my %old_level = $Collator->change(level => 1); 73ok($Collator->eq("A$acute", $A_acute)); 74ok($Collator->eq("A", $A_acute)); 75 76ok($Collator->change(level => 2)->eq($a_acute, $A_acute)); 77ok($Collator->lt("A", $A_acute)); 78 79ok($Collator->change(%old_level)->lt("A", $A_acute)); 80ok($Collator->lt("A", $A_acute)); 81ok($Collator->lt("A", $a_acute)); 82ok($Collator->lt($a_acute, $A_acute)); 83 84# 18 85 86$Collator->change(level => 2); 87 88ok($Collator->{level}, 2); 89 90ok( $Collator->cmp("ABC","abc"), 0); 91ok( $Collator->eq("ABC","abc") ); 92ok( $Collator->le("ABC","abc") ); 93ok( $Collator->cmp($hiragana, $katakana), 0); 94ok( $Collator->eq($hiragana, $katakana) ); 95ok( $Collator->ge($hiragana, $katakana) ); 96 97# 25 98 99# hangul 100ok( $Collator->eq("a\x{AC00}b", "a\x{1100}\x{1161}b") ); 101ok( $Collator->eq("a\x{AE00}b", "a\x{1100}\x{1173}\x{11AF}b") ); 102ok( $Collator->gt("a\x{AE00}b", "a\x{1100}\x{1173}b\x{11AF}") ); 103ok( $Collator->lt("a\x{AC00}b", "a\x{AE00}b") ); 104ok( $Collator->gt("a\x{D7A3}b", "a\x{C544}b") ); 105ok( $Collator->lt("a\x{C544}b", "a\x{30A2}b") ); # hangul < hiragana 106 107# 31 108 109$Collator->change(%old_level, katakana_before_hiragana => 1); 110 111ok($Collator->{level}, 4); 112 113ok( $Collator->cmp("abc", "ABC"), -1); 114ok( $Collator->ne("abc", "ABC") ); 115ok( $Collator->lt("abc", "ABC") ); 116ok( $Collator->le("abc", "ABC") ); 117ok( $Collator->cmp($hiragana, $katakana), 1); 118ok( $Collator->ne($hiragana, $katakana) ); 119ok( $Collator->gt($hiragana, $katakana) ); 120ok( $Collator->ge($hiragana, $katakana) ); 121 122# 40 123 124$Collator->change(upper_before_lower => 1); 125 126ok( $Collator->cmp("abc", "ABC"), 1); 127ok( $Collator->ge("abc", "ABC"), 1); 128ok( $Collator->gt("abc", "ABC"), 1); 129ok( $Collator->cmp($hiragana, $katakana), 1); 130ok( $Collator->ge($hiragana, $katakana), 1); 131ok( $Collator->gt($hiragana, $katakana), 1); 132 133# 46 134 135$Collator->change(katakana_before_hiragana => 0); 136 137ok( $Collator->cmp("abc", "ABC"), 1); 138ok( $Collator->cmp($hiragana, $katakana), -1); 139 140# 48 141 142$Collator->change(upper_before_lower => 0); 143 144ok( $Collator->cmp("abc", "ABC"), -1); 145ok( $Collator->le("abc", "ABC") ); 146ok( $Collator->cmp($hiragana, $katakana), -1); 147ok( $Collator->lt($hiragana, $katakana) ); 148 149# 52 150 151{ 152 my $ignoreAE = Unicode::Collate->new( 153 table => 'keys.txt', 154 normalization => undef, 155 ignoreChar => qr/^[aAeE]$/, 156 ); 157 ok($ignoreAE->eq("element","lament")); 158 ok($ignoreAE->eq("Perl","ePrl")); 159} 160 161# 54 162 163{ 164 my $undefAE = Unicode::Collate->new( 165 table => 'keys.txt', 166 normalization => undef, 167 undefChar => qr/^[aAeE]$/, 168 ); 169 ok($undefAE ->gt("edge","fog")); 170 ok($Collator->lt("edge","fog")); 171 ok($undefAE ->gt("lake","like")); 172 ok($Collator->lt("lake","like")); 173} 174 175# 58 176 177{ 178 my $dropArticles = Unicode::Collate->new( 179 table => "keys.txt", 180 normalization => undef, 181 preprocess => sub { 182 my $string = shift; 183 $string =~ s/\b(?:an?|the)\s+//ig; 184 $string; 185 }, 186 ); 187 ok($dropArticles->eq("camel", "a camel")); 188 ok($dropArticles->eq("Perl", "The Perl")); 189 ok($dropArticles->lt("the pen", "a pencil")); 190 ok($Collator->lt("Perl", "The Perl")); 191 ok($Collator->gt("the pen", "a pencil")); 192} 193 194# 63 195 196{ 197 my $undefName = Unicode::Collate->new( 198 table => "keys.txt", 199 normalization => undef, 200 undefName => qr/HANGUL|HIRAGANA|KATAKANA|BOPOMOFO/, 201 ); 202 # HIRAGANA and KATAKANA are made undefined via undefName. 203 # So they are after CJK Unified Ideographs. 204 205 ok($undefName->lt("\x{4E00}", $hiragana)); 206 ok($undefName->lt("\x{4E03}", $katakana)); 207 ok($Collator ->gt("\x{4E00}", $hiragana)); 208 ok($Collator ->gt("\x{4E03}", $katakana)); 209} 210 211# 67 212 213{ 214 my $O_str = Unicode::Collate->new( 215 table => "keys.txt", 216 normalization => undef, 217 entry => <<'ENTRIES', 2180008 ; [*0008.0000.0000.0000] # BACKSPACE (need to be non-ignorable) 219004F 0337 ; [.0B53.0020.0008.004F] # capital O WITH SHORT SOLIDUS OVERLAY 220006F 0008 002F ; [.0B53.0020.0002.006F] # LATIN SMALL LETTER O WITH STROKE 221004F 0008 002F ; [.0B53.0020.0008.004F] # LATIN CAPITAL LETTER O WITH STROKE 222006F 0337 ; [.0B53.0020.0002.004F] # small O WITH SHORT SOLIDUS OVERLAY 223200B ; [.2000.0000.0000.0000] # ZERO WIDTH SPACE (may be non-sense but ...) 224#00F8 ; [.0B53.0020.0002.00F8] # LATIN SMALL LETTER O WITH STROKE 225#00D8 ; [.0B53.0020.0008.00D8] # LATIN CAPITAL LETTER O WITH STROKE 226ENTRIES 227 ); 228 229 my $o_BS_slash = _pack_U(0x006F, 0x0008, 0x002F); 230 my $O_BS_slash = _pack_U(0x004F, 0x0008, 0x002F); 231 my $o_sol = _pack_U(0x006F, 0x0337); 232 my $O_sol = _pack_U(0x004F, 0x0337); 233 my $o_stroke = _pack_U(0x00F8); 234 my $O_stroke = _pack_U(0x00D8); 235 236 ok($O_str->eq($o_stroke, $o_BS_slash)); 237 ok($O_str->eq($O_stroke, $O_BS_slash)); 238 239 ok($O_str->eq($o_stroke, $o_sol)); 240 ok($O_str->eq($O_stroke, $O_sol)); 241 242 ok($Collator->eq("\x{200B}", "\0")); 243 ok($O_str ->gt("\x{200B}", "\0")); 244 ok($O_str ->gt("\x{200B}", "A")); 245} 246 247# 74 248 249my %origVer = $Collator->change(UCA_Version => 8); 250 251$Collator->change(level => 3); 252 253ok($Collator->gt("!\x{300}", "")); 254ok($Collator->gt("!\x{300}", "!")); 255ok($Collator->eq("!\x{300}", "\x{300}")); 256 257$Collator->change(level => 2); 258 259ok($Collator->eq("!\x{300}", "\x{300}")); 260 261$Collator->change(level => 4); 262 263ok($Collator->gt("!\x{300}", "!")); 264ok($Collator->lt("!\x{300}", "\x{300}")); 265 266$Collator->change(%origVer, level => 3); 267 268ok($Collator->eq("!\x{300}", "")); 269ok($Collator->eq("!\x{300}", "!")); 270ok($Collator->lt("!\x{300}", "\x{300}")); 271 272$Collator->change(level => 4); 273 274ok($Collator->gt("!\x{300}", "")); 275ok($Collator->eq("!\x{300}", "!")); 276 277# 85 278 279$_ = 'Foo'; 280 281my $c = Unicode::Collate->new( 282 table => 'keys.txt', 283 normalization => undef, 284 upper_before_lower => 1, 285); 286 287ok($_, 'Foo'); # fixed at v. 0.52; no longer clobber $_ 288 289my($temp, @temp); # Not the result but the side effect matters. 290 291$_ = 'Foo'; 292$temp = $c->getSortKey("abc"); 293ok($_, 'Foo'); 294 295$_ = 'Foo'; 296$temp = $c->viewSortKey("abc"); 297ok($_, 'Foo'); 298 299$_ = 'Foo'; 300@temp = $c->sort("abc", "xyz", "def"); 301ok($_, 'Foo'); 302 303$_ = 'Foo'; 304@temp = $c->index("perl5", "RL"); 305ok($_, 'Foo'); 306 307$_ = 'Foo'; 308@temp = $c->index("perl5", "LR"); 309ok($_, 'Foo'); 310 311# 91 312 313{ 314 my $caseless = Unicode::Collate->new( 315 table => "keys.txt", 316 normalization => undef, 317 preprocess => sub { uc shift }, 318 ); 319 ok( $Collator->gt("ABC","abc") ); 320 ok( $caseless->eq("ABC","abc") ); 321} 322 323# 93 324 325{ 326 eval { require Unicode::Normalize; }; 327 if ($@) { 328 eval { my $n1 = Unicode::Collate->new(table => "keys.txt"); }; 329 ok($@ =~ /Unicode::Normalize is required/); 330 331 eval { my $n2 = Unicode::Collate->new 332 (table => "keys.txt", normalization => undef); }; 333 ok(!$@); 334 335 eval { my $n3 = Unicode::Collate->new 336 (table => "keys.txt", normalization => 'prenormalized'); }; 337 ok($@ =~ /Unicode::Normalize is required/); 338 } else { 339 ok(1) for 1..3; 340 } 341} 342 343# 96 344 345