1package Unicode::Collate; 2 3BEGIN { 4 unless ("A" eq pack('U', 0x41)) { 5 die "Unicode::Collate cannot stringify a Unicode code point\n"; 6 } 7 unless (0x41 == unpack('U', 'A')) { 8 die "Unicode::Collate cannot get a Unicode code point\n"; 9 } 10} 11 12use 5.006; 13use strict; 14use warnings; 15use Carp; 16use File::Spec; 17 18no warnings 'utf8'; 19 20our $VERSION = '1.14'; 21our $PACKAGE = __PACKAGE__; 22 23### begin XS only ### 24require DynaLoader; 25our @ISA = qw(DynaLoader); 26bootstrap Unicode::Collate $VERSION; 27### end XS only ### 28 29my @Path = qw(Unicode Collate); 30my $KeyFile = "allkeys.txt"; 31 32# Perl's boolean 33use constant TRUE => 1; 34use constant FALSE => ""; 35use constant NOMATCHPOS => -1; 36 37# A coderef to get combining class imported from Unicode::Normalize 38# (i.e. \&Unicode::Normalize::getCombinClass). 39# This is also used as a HAS_UNICODE_NORMALIZE flag. 40my $CVgetCombinClass; 41 42# Supported Levels 43use constant MinLevel => 1; 44use constant MaxLevel => 4; 45 46# Minimum weights at level 2 and 3, respectively 47use constant Min2Wt => 0x20; 48use constant Min3Wt => 0x02; 49 50# Shifted weight at 4th level 51use constant Shift4Wt => 0xFFFF; 52 53# A boolean for Variable and 16-bit weights at 4 levels of Collation Element 54use constant VCE_TEMPLATE => 'Cn4'; 55 56# A sort key: 16-bit weights 57use constant KEY_TEMPLATE => 'n*'; 58 59# The tie-breaking: 32-bit weights 60use constant TIE_TEMPLATE => 'N*'; 61 62# Level separator in a sort key: 63# i.e. pack(KEY_TEMPLATE, 0) 64use constant LEVEL_SEP => "\0\0"; 65 66# As Unicode code point separator for hash keys. 67# A joined code point string (denoted by JCPS below) 68# like "65;768" is used for internal processing 69# instead of Perl's Unicode string like "\x41\x{300}", 70# as the native code point is different from the Unicode code point 71# on EBCDIC platform. 72# This character must not be included in any stringified 73# representation of an integer. 74use constant CODE_SEP => ';'; 75 # NOTE: in regex /;/ is used for $jcps! 76 77# boolean values of variable weights 78use constant NON_VAR => 0; # Non-Variable character 79use constant VAR => 1; # Variable character 80 81# specific code points 82use constant Hangul_SIni => 0xAC00; 83use constant Hangul_SFin => 0xD7A3; 84 85# Logical_Order_Exception in PropList.txt 86my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ]; 87 88# for highestFFFF and minimalFFFE 89my $HighestVCE = pack(VCE_TEMPLATE, 0, 0xFFFE, 0x20, 0x5, 0xFFFF); 90my $minimalVCE = pack(VCE_TEMPLATE, 0, 1, 0x20, 0x5, 0xFFFE); 91 92sub UCA_Version { "30" } 93 94sub Base_Unicode_Version { "7.0.0" } 95 96###### 97 98sub pack_U { 99 return pack('U*', @_); 100} 101 102sub unpack_U { 103 return unpack('U*', shift(@_).pack('U*')); 104} 105 106###### 107 108my (%VariableOK); 109@VariableOK{ qw/ 110 blanked non-ignorable shifted shift-trimmed 111 / } = (); # keys lowercased 112 113our @ChangeOK = qw/ 114 alternate backwards level normalization rearrange 115 katakana_before_hiragana upper_before_lower ignore_level2 116 overrideCJK overrideHangul overrideOut preprocess UCA_Version 117 hangul_terminator variable identical highestFFFF minimalFFFE 118 long_contraction 119 /; 120 121our @ChangeNG = qw/ 122 entry mapping table maxlength contraction 123 ignoreChar ignoreName undefChar undefName rewrite 124 versionTable alternateTable backwardsTable forwardsTable 125 rearrangeTable variableTable 126 derivCode normCode rearrangeHash backwardsFlag 127 suppress suppressHash 128 __useXS /; ### XS only 129# The hash key 'ignored' was deleted at v 0.21. 130# The hash key 'isShift' was deleted at v 0.23. 131# The hash key 'combining' was deleted at v 0.24. 132# The hash key 'entries' was deleted at v 0.30. 133# The hash key 'L3_ignorable' was deleted at v 0.40. 134 135sub version { 136 my $self = shift; 137 return $self->{versionTable} || 'unknown'; 138} 139 140my (%ChangeOK, %ChangeNG); 141@ChangeOK{ @ChangeOK } = (); 142@ChangeNG{ @ChangeNG } = (); 143 144sub change { 145 my $self = shift; 146 my %hash = @_; 147 my %old; 148 if (exists $hash{alternate}) { 149 if (exists $hash{variable}) { 150 delete $hash{alternate}; 151 } else { 152 $hash{variable} = $hash{alternate}; 153 } 154 } 155 foreach my $k (keys %hash) { 156 if (exists $ChangeOK{$k}) { 157 $old{$k} = $self->{$k}; 158 $self->{$k} = $hash{$k}; 159 } elsif (exists $ChangeNG{$k}) { 160 croak "change of $k via change() is not allowed!"; 161 } 162 # else => ignored 163 } 164 $self->checkCollator(); 165 return wantarray ? %old : $self; 166} 167 168sub _checkLevel { 169 my $level = shift; 170 my $key = shift; # 'level' or 'backwards' 171 MinLevel <= $level or croak sprintf 172 "Illegal level %d (in value for key '%s') lower than %d.", 173 $level, $key, MinLevel; 174 $level <= MaxLevel or croak sprintf 175 "Unsupported level %d (in value for key '%s') higher than %d.", 176 $level, $key, MaxLevel; 177} 178 179my %DerivCode = ( 180 8 => \&_derivCE_8, 181 9 => \&_derivCE_9, 182 11 => \&_derivCE_9, # 11 == 9 183 14 => \&_derivCE_14, 184 16 => \&_derivCE_14, # 16 == 14 185 18 => \&_derivCE_18, 186 20 => \&_derivCE_20, 187 22 => \&_derivCE_22, 188 24 => \&_derivCE_24, 189 26 => \&_derivCE_24, # 26 == 24 190 28 => \&_derivCE_24, # 28 == 24 191 30 => \&_derivCE_24, # 30 == 24 192); 193 194sub checkCollator { 195 my $self = shift; 196 _checkLevel($self->{level}, "level"); 197 198 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} } 199 or croak "Illegal UCA version (passed $self->{UCA_Version})."; 200 201 $self->{variable} ||= $self->{alternate} || $self->{variableTable} || 202 $self->{alternateTable} || 'shifted'; 203 $self->{variable} = $self->{alternate} = lc($self->{variable}); 204 exists $VariableOK{ $self->{variable} } 205 or croak "$PACKAGE unknown variable parameter name: $self->{variable}"; 206 207 if (! defined $self->{backwards}) { 208 $self->{backwardsFlag} = 0; 209 } elsif (! ref $self->{backwards}) { 210 _checkLevel($self->{backwards}, "backwards"); 211 $self->{backwardsFlag} = 1 << $self->{backwards}; 212 } else { 213 my %level; 214 $self->{backwardsFlag} = 0; 215 for my $b (@{ $self->{backwards} }) { 216 _checkLevel($b, "backwards"); 217 $level{$b} = 1; 218 } 219 for my $v (sort keys %level) { 220 $self->{backwardsFlag} += 1 << $v; 221 } 222 } 223 224 defined $self->{rearrange} or $self->{rearrange} = []; 225 ref $self->{rearrange} 226 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF"; 227 228 # keys of $self->{rearrangeHash} are $self->{rearrange}. 229 $self->{rearrangeHash} = undef; 230 231 if (@{ $self->{rearrange} }) { 232 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = (); 233 } 234 235 $self->{normCode} = undef; 236 237 if (defined $self->{normalization}) { 238 eval { require Unicode::Normalize }; 239 $@ and croak "Unicode::Normalize is required to normalize strings"; 240 241 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass; 242 243 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default 244 $self->{normCode} = \&Unicode::Normalize::NFD; 245 } 246 elsif ($self->{normalization} ne 'prenormalized') { 247 my $norm = $self->{normalization}; 248 $self->{normCode} = sub { 249 Unicode::Normalize::normalize($norm, shift); 250 }; 251 eval { $self->{normCode}->("") }; # try 252 $@ and croak "$PACKAGE unknown normalization form name: $norm"; 253 } 254 } 255 return; 256} 257 258sub new 259{ 260 my $class = shift; 261 my $self = bless { @_ }, $class; 262 263### begin XS only ### 264 if (! exists $self->{table} && !defined $self->{rewrite} && 265 !defined $self->{undefName} && !defined $self->{ignoreName} && 266 !defined $self->{undefChar} && !defined $self->{ignoreChar}) { 267 $self->{__useXS} = \&_fetch_simple; 268 } else { 269 $self->{__useXS} = undef; 270 } 271### end XS only ### 272 273 # keys of $self->{suppressHash} are $self->{suppress}. 274 if ($self->{suppress} && @{ $self->{suppress} }) { 275 @{ $self->{suppressHash} }{ @{ $self->{suppress} } } = (); 276 } # before read_table() 277 278 # If undef is passed explicitly, no file is read. 279 $self->{table} = $KeyFile if ! exists $self->{table}; 280 $self->read_table() if defined $self->{table}; 281 282 if ($self->{entry}) { 283 while ($self->{entry} =~ /([^\n]+)/g) { 284 $self->parseEntry($1, TRUE); 285 } 286 } 287 288 # only in new(), not in change() 289 $self->{level} ||= MaxLevel; 290 $self->{UCA_Version} ||= UCA_Version(); 291 292 $self->{overrideHangul} = FALSE 293 if ! exists $self->{overrideHangul}; 294 $self->{overrideCJK} = FALSE 295 if ! exists $self->{overrideCJK}; 296 $self->{normalization} = 'NFD' 297 if ! exists $self->{normalization}; 298 $self->{rearrange} = $self->{rearrangeTable} || 299 ($self->{UCA_Version} <= 11 ? $DefaultRearrange : []) 300 if ! exists $self->{rearrange}; 301 $self->{backwards} = $self->{backwardsTable} 302 if ! exists $self->{backwards}; 303 exists $self->{long_contraction} or $self->{long_contraction} 304 = 22 <= $self->{UCA_Version} && $self->{UCA_Version} <= 24; 305 306 # checkCollator() will be called in change() 307 $self->checkCollator(); 308 309 return $self; 310} 311 312sub parseAtmark { 313 my $self = shift; 314 my $line = shift; # after s/^\s*\@// 315 316 if ($line =~ /^version\s*(\S*)/) { 317 $self->{versionTable} ||= $1; 318 } 319 elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9 320 $self->{variableTable} ||= $1; 321 } 322 elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8 323 $self->{alternateTable} ||= $1; 324 } 325 elsif ($line =~ /^backwards\s+(\S*)/) { 326 push @{ $self->{backwardsTable} }, $1; 327 } 328 elsif ($line =~ /^forwards\s+(\S*)/) { # perhaps no use 329 push @{ $self->{forwardsTable} }, $1; 330 } 331 elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG 332 push @{ $self->{rearrangeTable} }, _getHexArray($1); 333 } 334} 335 336sub read_table { 337 my $self = shift; 338 339### begin XS only ### 340 if ($self->{__useXS}) { 341 my @rest = _fetch_rest(); # complex matter need to parse 342 for my $line (@rest) { 343 next if $line =~ /^\s*#/; 344 345 if ($line =~ s/^\s*\@//) { 346 $self->parseAtmark($line); 347 } else { 348 $self->parseEntry($line); 349 } 350 } 351 return; 352 } 353### end XS only ### 354 355 my($f, $fh); 356 foreach my $d (@INC) { 357 $f = File::Spec->catfile($d, @Path, $self->{table}); 358 last if open($fh, $f); 359 $f = undef; 360 } 361 if (!defined $f) { 362 $f = File::Spec->catfile(@Path, $self->{table}); 363 croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)"); 364 } 365 366 while (my $line = <$fh>) { 367 next if $line =~ /^\s*#/; 368 369 if ($line =~ s/^\s*\@//) { 370 $self->parseAtmark($line); 371 } else { 372 $self->parseEntry($line); 373 } 374 } 375 close $fh; 376} 377 378 379## 380## get $line, parse it, and write an entry in $self 381## 382sub parseEntry 383{ 384 my $self = shift; 385 my $line = shift; 386 my $tailoring = shift; 387 my($name, $entry, @uv, @key); 388 389 if (defined $self->{rewrite}) { 390 $line = $self->{rewrite}->($line); 391 } 392 393 return if $line !~ /^\s*[0-9A-Fa-f]/; 394 395 # removes comment and gets name 396 $name = $1 397 if $line =~ s/[#%]\s*(.*)//; 398 return if defined $self->{undefName} && $name =~ /$self->{undefName}/; 399 400 # gets element 401 my($e, $k) = split /;/, $line; 402 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>" 403 if ! $k; 404 405 @uv = _getHexArray($e); 406 return if !@uv; 407 return if @uv > 1 && $self->{suppressHash} && !$tailoring && 408 exists $self->{suppressHash}{$uv[0]}; 409 $entry = join(CODE_SEP, @uv); # in JCPS 410 411 if (defined $self->{undefChar} || defined $self->{ignoreChar}) { 412 my $ele = pack_U(@uv); 413 414 # regarded as if it were not stored in the table 415 return 416 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/; 417 418 # replaced as completely ignorable 419 $k = '[.0000.0000.0000.0000]' 420 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/; 421 } 422 423 # replaced as completely ignorable 424 $k = '[.0000.0000.0000.0000]' 425 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/; 426 427 my $is_L3_ignorable = TRUE; 428 429 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed 430 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient. 431 my @wt = _getHexArray($arr); 432 push @key, pack(VCE_TEMPLATE, $var, @wt); 433 $is_L3_ignorable = FALSE 434 if $wt[0] || $wt[1] || $wt[2]; 435 # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable 436 # is completely ignorable. 437 # For expansion, an entry $is_L3_ignorable 438 # if and only if "all" CEs are [.0000.0000.0000]. 439 } 440 441 $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key; 442 443 if (@uv > 1) { 444 if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) { 445 $self->{maxlength}{$uv[0]} = @uv; 446 } 447 } 448 while (@uv > 2) { 449 pop @uv; 450 my $fake_entry = join(CODE_SEP, @uv); # in JCPS 451 $self->{contraction}{$fake_entry} = 1; 452 } 453} 454 455 456sub viewSortKey 457{ 458 my $self = shift; 459 my $str = shift; 460 $self->visualizeSortKey($self->getSortKey($str)); 461} 462 463 464sub process 465{ 466 my $self = shift; 467 my $str = shift; 468 my $prep = $self->{preprocess}; 469 my $norm = $self->{normCode}; 470 471 $str = &$prep($str) if ref $prep; 472 $str = &$norm($str) if ref $norm; 473 return $str; 474} 475 476## 477## arrayref of JCPS = splitEnt(string to be collated) 478## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE) 479## 480sub splitEnt 481{ 482 my $self = shift; 483 my $str = shift; 484 my $wLen = shift; # with Length 485 486 my $map = $self->{mapping}; 487 my $max = $self->{maxlength}; 488 my $reH = $self->{rearrangeHash}; 489 my $vers = $self->{UCA_Version}; 490 my $ver9 = $vers >= 9 && $vers <= 11; 491 my $long = $self->{long_contraction}; 492 my $uXS = $self->{__useXS}; ### XS only 493 494 my @buf; 495 496 # get array of Unicode code point of string. 497 my @src = unpack_U($str); 498 499 # rearrangement: 500 # Character positions are not kept if rearranged, 501 # then neglected if $wLen is true. 502 if ($reH && ! $wLen) { 503 for (my $i = 0; $i < @src; $i++) { 504 if (exists $reH->{ $src[$i] } && $i + 1 < @src) { 505 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]); 506 $i++; 507 } 508 } 509 } 510 511 # remove a code point marked as a completely ignorable. 512 for (my $i = 0; $i < @src; $i++) { 513 if ($vers <= 20 && _isIllegal($src[$i])) { 514 $src[$i] = undef; 515 } elsif ($ver9) { 516 $src[$i] = undef if $map->{ $src[$i] } 517 ? @{ $map->{ $src[$i] } } == 0 518 : $uXS && _ignorable_simple($src[$i]); ### XS only 519 } 520 } 521 522 for (my $i = 0; $i < @src; $i++) { 523 my $jcps = $src[$i]; 524 525 # skip removed code point 526 if (! defined $jcps) { 527 if ($wLen && @buf) { 528 $buf[-1][2] = $i + 1; 529 } 530 next; 531 } 532 533 my $i_orig = $i; 534 535 # find contraction 536 if ($max->{$jcps}) { 537 my $temp_jcps = $jcps; 538 my $jcpsLen = 1; 539 my $maxLen = $max->{$jcps}; 540 541 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) { 542 next if ! defined $src[$p]; 543 $temp_jcps .= CODE_SEP . $src[$p]; 544 $jcpsLen++; 545 if ($map->{$temp_jcps}) { 546 $jcps = $temp_jcps; 547 $i = $p; 548 } 549 } 550 551 # discontiguous contraction with Combining Char (cf. UTS#10, S2.1). 552 # This process requires Unicode::Normalize. 553 # If "normalization" is undef, here should be skipped *always* 554 # (in spite of bool value of $CVgetCombinClass), 555 # since canonical ordering cannot be expected. 556 # Blocked combining character should not be contracted. 557 558 # $self->{normCode} is false in the case of "prenormalized". 559 if ($self->{normalization}) { 560 my $cont = $self->{contraction}; 561 my $preCC = 0; 562 my $preCC_uc = 0; 563 my $jcps_uc = $jcps; 564 my(@out, @out_uc); 565 566 for (my $p = $i + 1; $p < @src; $p++) { 567 next if ! defined $src[$p]; 568 my $curCC = $CVgetCombinClass->($src[$p]); 569 last unless $curCC; 570 my $tail = CODE_SEP . $src[$p]; 571 572 if ($preCC != $curCC && $map->{$jcps.$tail}) { 573 $jcps .= $tail; 574 push @out, $p; 575 } else { 576 $preCC = $curCC; 577 } 578 579 next if !$long; 580 581 if ($preCC_uc != $curCC && ($map->{$jcps_uc.$tail} || 582 $cont->{$jcps_uc.$tail})) { 583 $jcps_uc .= $tail; 584 push @out_uc, $p; 585 } else { 586 $preCC_uc = $curCC; 587 } 588 } 589 590 if (@out_uc && $map->{$jcps_uc}) { 591 $jcps = $jcps_uc; 592 $src[$_] = undef for @out_uc; 593 } else { 594 $src[$_] = undef for @out; 595 } 596 } 597 } 598 599 # skip completely ignorable 600 if ($map->{$jcps} ? @{ $map->{$jcps} } == 0 : 601 $uXS && $jcps !~ /;/ && _ignorable_simple($jcps)) { ### XS only 602 if ($wLen && @buf) { 603 $buf[-1][2] = $i + 1; 604 } 605 next; 606 } 607 608 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps; 609 } 610 return \@buf; 611} 612 613## 614## VCE = _pack_override(input, codepoint, derivCode) 615## 616sub _pack_override ($$$) { 617 my $r = shift; 618 my $u = shift; 619 my $der = shift; 620 621 if (ref $r) { 622 return pack(VCE_TEMPLATE, NON_VAR, @$r); 623 } elsif (defined $r) { 624 return pack(VCE_TEMPLATE, NON_VAR, $r, Min2Wt, Min3Wt, $u); 625 } else { 626 $u = 0xFFFD if 0x10FFFF < $u; 627 return $der->($u); 628 } 629} 630 631## 632## list of VCE = getWt(JCPS) 633## 634sub getWt 635{ 636 my $self = shift; 637 my $u = shift; 638 my $map = $self->{mapping}; 639 my $der = $self->{derivCode}; 640 my $out = $self->{overrideOut}; 641 my $uXS = $self->{__useXS}; ### XS only 642 643 return if !defined $u; 644 return $self->varCE($HighestVCE) if $u eq 0xFFFF && $self->{highestFFFF}; 645 return $self->varCE($minimalVCE) if $u eq 0xFFFE && $self->{minimalFFFE}; 646 $u = 0xFFFD if $u !~ /;/ && 0x10FFFF < $u && !$out; 647 648 my @ce; 649 if ($map->{$u}) { 650 @ce = @{ $map->{$u} }; # $u may be a contraction 651### begin XS only ### 652 } elsif ($uXS && _exists_simple($u)) { 653 @ce = _fetch_simple($u); 654### end XS only ### 655 } elsif (Hangul_SIni <= $u && $u <= Hangul_SFin) { 656 my $hang = $self->{overrideHangul}; 657 if ($hang) { 658 @ce = map _pack_override($_, $u, $der), $hang->($u); 659 } elsif (!defined $hang) { 660 @ce = $der->($u); 661 } else { 662 my $max = $self->{maxlength}; 663 my @decH = _decompHangul($u); 664 665 if (@decH == 2) { 666 my $contract = join(CODE_SEP, @decH); 667 @decH = ($contract) if $map->{$contract}; 668 } else { # must be <@decH == 3> 669 if ($max->{$decH[0]}) { 670 my $contract = join(CODE_SEP, @decH); 671 if ($map->{$contract}) { 672 @decH = ($contract); 673 } else { 674 $contract = join(CODE_SEP, @decH[0,1]); 675 $map->{$contract} and @decH = ($contract, $decH[2]); 676 } 677 # even if V's ignorable, LT contraction is not supported. 678 # If such a situation were required, NFD should be used. 679 } 680 if (@decH == 3 && $max->{$decH[1]}) { 681 my $contract = join(CODE_SEP, @decH[1,2]); 682 $map->{$contract} and @decH = ($decH[0], $contract); 683 } 684 } 685 686 @ce = map({ 687 $map->{$_} ? @{ $map->{$_} } : 688 $uXS && _exists_simple($_) ? _fetch_simple($_) : ### XS only 689 $der->($_); 690 } @decH); 691 } 692 } elsif ($out && 0x10FFFF < $u) { 693 @ce = map _pack_override($_, $u, $der), $out->($u); 694 } else { 695 my $cjk = $self->{overrideCJK}; 696 my $vers = $self->{UCA_Version}; 697 if ($cjk && _isUIdeo($u, $vers)) { 698 @ce = map _pack_override($_, $u, $der), $cjk->($u); 699 } elsif ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) { 700 @ce = _uideoCE_8($u); 701 } else { 702 @ce = $der->($u); 703 } 704 } 705 return map $self->varCE($_), @ce; 706} 707 708 709## 710## string sortkey = getSortKey(string arg) 711## 712sub getSortKey 713{ 714 my $self = shift; 715 my $orig = shift; 716 my $str = $self->process($orig); 717 my $rEnt = $self->splitEnt($str); # get an arrayref of JCPS 718 my $vers = $self->{UCA_Version}; 719 my $term = $self->{hangul_terminator}; 720 my $lev = $self->{level}; 721 my $iden = $self->{identical}; 722 723 my @buf; # weight arrays 724 if ($term) { 725 my $preHST = ''; 726 my $termCE = $self->varCE(pack(VCE_TEMPLATE, NON_VAR, $term, 0,0,0)); 727 foreach my $jcps (@$rEnt) { 728 # weird things like VL, TL-contraction are not considered! 729 my $curHST = join '', map getHST($_, $vers), split /;/, $jcps; 730 if ($preHST && !$curHST || # hangul before non-hangul 731 $preHST =~ /L\z/ && $curHST =~ /^T/ || 732 $preHST =~ /V\z/ && $curHST =~ /^L/ || 733 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) { 734 push @buf, $termCE; 735 } 736 $preHST = $curHST; 737 push @buf, $self->getWt($jcps); 738 } 739 push @buf, $termCE if $preHST; # end at hangul 740 } else { 741 foreach my $jcps (@$rEnt) { 742 push @buf, $self->getWt($jcps); 743 } 744 } 745 746 my $rkey = $self->mk_SortKey(\@buf); ### XS only 747 748 if ($iden || $vers >= 26 && $lev == MaxLevel) { 749 $rkey .= LEVEL_SEP; 750 $rkey .= pack(TIE_TEMPLATE, unpack_U($str)) if $iden; 751 } 752 return $rkey; 753} 754 755 756## 757## int compare = cmp(string a, string b) 758## 759sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) } 760sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) } 761sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) } 762sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) } 763sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) } 764sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) } 765sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) } 766 767## 768## list[strings] sorted = sort(list[strings] arg) 769## 770sub sort { 771 my $obj = shift; 772 return 773 map { $_->[1] } 774 sort{ $a->[0] cmp $b->[0] } 775 map [ $obj->getSortKey($_), $_ ], @_; 776} 777 778 779## 780## bool _nonIgnorAtLevel(arrayref weights, int level) 781## 782sub _nonIgnorAtLevel($$) 783{ 784 my $wt = shift; 785 return if ! defined $wt; 786 my $lv = shift; 787 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE; 788} 789 790## 791## bool _eqArray( 792## arrayref of arrayref[weights] source, 793## arrayref of arrayref[weights] substr, 794## int level) 795## * comparison of graphemes vs graphemes. 796## @$source >= @$substr must be true (check it before call this); 797## 798sub _eqArray($$$) 799{ 800 my $source = shift; 801 my $substr = shift; 802 my $lev = shift; 803 804 for my $g (0..@$substr-1){ 805 # Do the $g'th graphemes have the same number of AV weights? 806 return if @{ $source->[$g] } != @{ $substr->[$g] }; 807 808 for my $w (0..@{ $substr->[$g] }-1) { 809 for my $v (0..$lev-1) { 810 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v]; 811 } 812 } 813 } 814 return 1; 815} 816 817## 818## (int position, int length) 819## int position = index(string, substring, position, [undoc'ed global]) 820## 821## With "global" (only for the list context), 822## returns list of arrayref[position, length]. 823## 824sub index 825{ 826 my $self = shift; 827 $self->{preprocess} and 828 croak "Don't use Preprocess with index(), match(), etc."; 829 $self->{normCode} and 830 croak "Don't use Normalization with index(), match(), etc."; 831 832 my $str = shift; 833 my $len = length($str); 834 my $sub = shift; 835 my $subE = $self->splitEnt($sub); 836 my $pos = @_ ? shift : 0; 837 $pos = 0 if $pos < 0; 838 my $glob = shift; 839 840 my $lev = $self->{level}; 841 my $v2i = $self->{UCA_Version} >= 9 && 842 $self->{variable} ne 'non-ignorable'; 843 844 if (! @$subE) { 845 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos; 846 return $glob 847 ? map([$_, 0], $temp..$len) 848 : wantarray ? ($temp,0) : $temp; 849 } 850 $len < $pos 851 and return wantarray ? () : NOMATCHPOS; 852 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE); 853 @$strE 854 or return wantarray ? () : NOMATCHPOS; 855 856 my(@strWt, @iniPos, @finPos, @subWt, @g_ret); 857 858 my $last_is_variable; 859 for my $vwt (map $self->getWt($_), @$subE) { 860 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); 861 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev); 862 863 # "Ignorable (L1, L2) after Variable" since track. v. 9 864 if ($v2i) { 865 if ($var) { 866 $last_is_variable = TRUE; 867 } 868 elsif (!$wt[0]) { # ignorable 869 $to_be_pushed = FALSE if $last_is_variable; 870 } 871 else { 872 $last_is_variable = FALSE; 873 } 874 } 875 876 if (@subWt && !$var && !$wt[0]) { 877 push @{ $subWt[-1] }, \@wt if $to_be_pushed; 878 } elsif ($to_be_pushed) { 879 push @subWt, [ \@wt ]; 880 } 881 # else ===> skipped 882 } 883 884 my $count = 0; 885 my $end = @$strE - 1; 886 887 $last_is_variable = FALSE; # reuse 888 for (my $i = 0; $i <= $end; ) { # no $i++ 889 my $found_base = 0; 890 891 # fetch a grapheme 892 while ($i <= $end && $found_base == 0) { 893 for my $vwt ($self->getWt($strE->[$i][0])) { 894 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); 895 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev); 896 897 # "Ignorable (L1, L2) after Variable" since track. v. 9 898 if ($v2i) { 899 if ($var) { 900 $last_is_variable = TRUE; 901 } 902 elsif (!$wt[0]) { # ignorable 903 $to_be_pushed = FALSE if $last_is_variable; 904 } 905 else { 906 $last_is_variable = FALSE; 907 } 908 } 909 910 if (@strWt && !$var && !$wt[0]) { 911 push @{ $strWt[-1] }, \@wt if $to_be_pushed; 912 $finPos[-1] = $strE->[$i][2]; 913 } elsif ($to_be_pushed) { 914 push @strWt, [ \@wt ]; 915 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1]; 916 $finPos[-1] = NOMATCHPOS if $found_base; 917 push @finPos, $strE->[$i][2]; 918 $found_base++; 919 } 920 # else ===> no-op 921 } 922 $i++; 923 } 924 925 # try to match 926 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) { 927 if ($iniPos[0] != NOMATCHPOS && 928 $finPos[$#subWt] != NOMATCHPOS && 929 _eqArray(\@strWt, \@subWt, $lev)) { 930 my $temp = $iniPos[0] + $pos; 931 932 if ($glob) { 933 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]]; 934 splice @strWt, 0, $#subWt; 935 splice @iniPos, 0, $#subWt; 936 splice @finPos, 0, $#subWt; 937 } 938 else { 939 return wantarray 940 ? ($temp, $finPos[$#subWt] - $iniPos[0]) 941 : $temp; 942 } 943 } 944 shift @strWt; 945 shift @iniPos; 946 shift @finPos; 947 } 948 } 949 950 return $glob 951 ? @g_ret 952 : wantarray ? () : NOMATCHPOS; 953} 954 955## 956## scalarref to matching part = match(string, substring) 957## 958sub match 959{ 960 my $self = shift; 961 if (my($pos,$len) = $self->index($_[0], $_[1])) { 962 my $temp = substr($_[0], $pos, $len); 963 return wantarray ? $temp : \$temp; 964 # An lvalue ref \substr should be avoided, 965 # since its value is affected by modification of its referent. 966 } 967 else { 968 return; 969 } 970} 971 972## 973## arrayref matching parts = gmatch(string, substring) 974## 975sub gmatch 976{ 977 my $self = shift; 978 my $str = shift; 979 my $sub = shift; 980 return map substr($str, $_->[0], $_->[1]), 981 $self->index($str, $sub, 0, 'g'); 982} 983 984## 985## bool subst'ed = subst(string, substring, replace) 986## 987sub subst 988{ 989 my $self = shift; 990 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE; 991 992 if (my($pos,$len) = $self->index($_[0], $_[1])) { 993 if ($code) { 994 my $mat = substr($_[0], $pos, $len); 995 substr($_[0], $pos, $len, $code->($mat)); 996 } else { 997 substr($_[0], $pos, $len, $_[2]); 998 } 999 return TRUE; 1000 } 1001 else { 1002 return FALSE; 1003 } 1004} 1005 1006## 1007## int count = gsubst(string, substring, replace) 1008## 1009sub gsubst 1010{ 1011 my $self = shift; 1012 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE; 1013 my $cnt = 0; 1014 1015 # Replacement is carried out from the end, then use reverse. 1016 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) { 1017 if ($code) { 1018 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]); 1019 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat)); 1020 } else { 1021 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]); 1022 } 1023 $cnt++; 1024 } 1025 return $cnt; 1026} 1027 10281; 1029__END__ 1030 1031=head1 NAME 1032 1033Unicode::Collate - Unicode Collation Algorithm 1034 1035=head1 SYNOPSIS 1036 1037 use Unicode::Collate; 1038 1039 #construct 1040 $Collator = Unicode::Collate->new(%tailoring); 1041 1042 #sort 1043 @sorted = $Collator->sort(@not_sorted); 1044 1045 #compare 1046 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1. 1047 1048B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted 1049according to Perl's Unicode support. See L<perlunicode>, 1050L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>. 1051Otherwise you can use C<preprocess> or should decode them before. 1052 1053=head1 DESCRIPTION 1054 1055This module is an implementation of Unicode Technical Standard #10 1056(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA). 1057 1058=head2 Constructor and Tailoring 1059 1060The C<new> method returns a collator object. If new() is called 1061with no parameters, the collator should do the default collation. 1062 1063 $Collator = Unicode::Collate->new( 1064 UCA_Version => $UCA_Version, 1065 alternate => $alternate, # alias for 'variable' 1066 backwards => $levelNumber, # or \@levelNumbers 1067 entry => $element, 1068 hangul_terminator => $term_primary_weight, 1069 highestFFFF => $bool, 1070 identical => $bool, 1071 ignoreName => qr/$ignoreName/, 1072 ignoreChar => qr/$ignoreChar/, 1073 ignore_level2 => $bool, 1074 katakana_before_hiragana => $bool, 1075 level => $collationLevel, 1076 long_contraction => $bool, 1077 minimalFFFE => $bool, 1078 normalization => $normalization_form, 1079 overrideCJK => \&overrideCJK, 1080 overrideHangul => \&overrideHangul, 1081 preprocess => \&preprocess, 1082 rearrange => \@charList, 1083 rewrite => \&rewrite, 1084 suppress => \@charList, 1085 table => $filename, 1086 undefName => qr/$undefName/, 1087 undefChar => qr/$undefChar/, 1088 upper_before_lower => $bool, 1089 variable => $variable, 1090 ); 1091 1092=over 4 1093 1094=item UCA_Version 1095 1096If the revision (previously "tracking version") number of UCA is given, 1097behavior of that revision is emulated on collating. 1098If omitted, the return value of C<UCA_Version()> is used. 1099 1100The following revisions are supported. The default is 30. 1101 1102 UCA Unicode Standard DUCET (@version) 1103 ------------------------------------------------------- 1104 8 3.1 3.0.1 (3.0.1d9) 1105 9 3.1 with Corrigendum 3 3.1.1 (3.1.1) 1106 11 4.0 4.0.0 (4.0.0) 1107 14 4.1.0 4.1.0 (4.1.0) 1108 16 5.0 5.0.0 (5.0.0) 1109 18 5.1.0 5.1.0 (5.1.0) 1110 20 5.2.0 5.2.0 (5.2.0) 1111 22 6.0.0 6.0.0 (6.0.0) 1112 24 6.1.0 6.1.0 (6.1.0) 1113 26 6.2.0 6.2.0 (6.2.0) 1114 28 6.3.0 6.3.0 (6.3.0) 1115 30 7.0.0 7.0.0 (7.0.0) 1116 1117* See below C<long_contraction> with C<UCA_Version> 22 and 24. 1118 1119* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden 1120since C<UCA_Version> 22. 1121 1122* Out-of-range codepoints (greater than U+10FFFF) are not ignored, 1123and can be overridden since C<UCA_Version> 22. 1124 1125* Fully ignorable characters were ignored, and would not interrupt 1126contractions with C<UCA_Version> 9 and 11. 1127 1128* Treatment of ignorables after variables and some behaviors 1129were changed at C<UCA_Version> 9. 1130 1131* Characters regarded as CJK unified ideographs (cf. C<overrideCJK>) 1132depend on C<UCA_Version>. 1133 1134* Many hangul jamo are assigned at C<UCA_Version> 20, that will affect 1135C<hangul_terminator>. 1136 1137=item alternate 1138 1139-- see 3.2.2 Alternate Weighting, version 8 of UTS #10 1140 1141For backward compatibility, C<alternate> (old name) can be used 1142as an alias for C<variable>. 1143 1144=item backwards 1145 1146-- see 3.4 Backward Accents, UTS #10. 1147 1148 backwards => $levelNumber or \@levelNumbers 1149 1150Weights in reverse order; ex. level 2 (diacritic ordering) in French. 1151If omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>), 1152forwards at all the levels. 1153 1154=item entry 1155 1156-- see 5 Tailoring; 9.1 Allkeys File Format, UTS #10. 1157 1158If the same character (or a sequence of characters) exists 1159in the collation element table through C<table>, 1160mapping to collation elements is overridden. 1161If it does not exist, the mapping is defined additionally. 1162 1163 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt) 11640063 0068 ; [.0E6A.0020.0002.0063] # ch 11650043 0068 ; [.0E6A.0020.0007.0043] # Ch 11660043 0048 ; [.0E6A.0020.0008.0043] # CH 1167006C 006C ; [.0F4C.0020.0002.006C] # ll 1168004C 006C ; [.0F4C.0020.0007.004C] # Ll 1169004C 004C ; [.0F4C.0020.0008.004C] # LL 117000F1 ; [.0F7B.0020.0002.00F1] # n-tilde 1171006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde 117200D1 ; [.0F7B.0020.0008.00D1] # N-tilde 1173004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde 1174ENTRY 1175 1176 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt) 117700E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e> 117800C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E> 1179ENTRY 1180 1181B<NOTE:> The code point in the UCA file format (before C<';'>) 1182B<must> be a Unicode code point (defined as hexadecimal), 1183but not a native code point. 1184So C<0063> must always denote C<U+0063>, 1185but not a character of C<"\x63">. 1186 1187Weighting may vary depending on collation element table. 1188So ensure the weights defined in C<entry> will be consistent with 1189those in the collation element table loaded via C<table>. 1190 1191In DUCET v4.0.0, primary weight of C<C> is C<0E60> 1192and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A> 1193(as a value between C<0E60> and C<0E6D>) 1194makes ordering as C<C E<lt> CH E<lt> D>. 1195Exactly speaking DUCET already has some characters between C<C> and C<D>: 1196C<small capital C> (C<U+1D04>) with primary weight C<0E64>, 1197C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>, 1198and C<c-curl> (C<U+0255>) with C<0E69>. 1199Then primary weight C<0E6A> for C<CH> makes C<CH> 1200ordered between C<c-curl> and C<D>. 1201 1202=item hangul_terminator 1203 1204-- see 7.1.4 Trailing Weights, UTS #10. 1205 1206If a true value is given (non-zero but should be positive), 1207it will be added as a terminator primary weight to the end of 1208every standard Hangul syllable. Secondary and any higher weights 1209for terminator are set to zero. 1210If the value is false or C<hangul_terminator> key does not exist, 1211insertion of terminator weights will not be performed. 1212 1213Boundaries of Hangul syllables are determined 1214according to conjoining Jamo behavior in F<the Unicode Standard> 1215and F<HangulSyllableType.txt>. 1216 1217B<Implementation Note:> 1218(1) For expansion mapping (Unicode character mapped 1219to a sequence of collation elements), a terminator will not be added 1220between collation elements, even if Hangul syllable boundary exists there. 1221Addition of terminator is restricted to the next position 1222to the last collation element. 1223 1224(2) Non-conjoining Hangul letters 1225(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not 1226automatically terminated with a terminator primary weight. 1227These characters may need terminator included in a collation element 1228table beforehand. 1229 1230=item highestFFFF 1231 1232-- see 5.14 Collation Elements, UTS #35. 1233 1234If the parameter is made true, C<U+FFFF> has a highest primary weight. 1235When a boolean of C<$coll-E<gt>ge($str, "abc")> and 1236C<$coll-E<gt>le($str, "abc\x{FFFF}")> is true, it is expected that C<$str> 1237begins with C<"abc">, or another primary equivalent. 1238C<$str> may be C<"abcd">, C<"abc012">, but should not include C<U+FFFF> 1239such as C<"abc\x{FFFF}xyz">. 1240 1241C<$coll-E<gt>le($str, "abc\x{FFFF}")> works like C<$coll-E<gt>lt($str, "abd")> 1242almost, but the latter has a problem that you should know which letter is 1243next to C<c>. For a certain language where C<ch> as the next letter, 1244C<"abch"> is greater than C<"abc\x{FFFF}">, but less than C<"abd">. 1245 1246Note: 1247This is equivalent to C<(entry =E<gt> 'FFFF ; [.FFFE.0020.0005.FFFF]')>. 1248Any other character than C<U+FFFF> can be tailored by C<entry>. 1249 1250=item identical 1251 1252-- see A.3 Deterministic Comparison, UTS #10. 1253 1254By default, strings whose weights are equal should be equal, 1255even though their code points are not equal. 1256Completely ignorable characters are ignored. 1257 1258If the parameter is made true, a final, tie-breaking level is used. 1259If no difference of weights is found after the comparison through 1260all the level specified by C<level>, the comparison with code points 1261will be performed. 1262For the tie-breaking comparison, the sort key has code points 1263of the original string appended. 1264Completely ignorable characters are not ignored. 1265 1266If C<preprocess> and/or C<normalization> is applied, the code points 1267of the string after them (in NFD by default) are used. 1268 1269=item ignoreChar 1270 1271=item ignoreName 1272 1273-- see 3.6 Variable Weighting, UTS #10. 1274 1275Makes the entry in the table completely ignorable; 1276i.e. as if the weights were zero at all level. 1277 1278Through C<ignoreChar>, any character matching C<qr/$ignoreChar/> 1279will be ignored. Through C<ignoreName>, any character whose name 1280(given in the C<table> file as a comment) matches C<qr/$ignoreName/> 1281will be ignored. 1282 1283E.g. when 'a' and 'e' are ignorable, 1284'element' is equal to 'lament' (or 'lmnt'). 1285 1286=item ignore_level2 1287 1288-- see 5.1 Parametric Tailoring, UTS #10. 1289 1290By default, case-sensitive comparison (that is level 3 difference) 1291won't ignore accents (that is level 2 difference). 1292 1293If the parameter is made true, accents (and other primary ignorable 1294characters) are ignored, even though cases are taken into account. 1295 1296B<NOTE>: C<level> should be 3 or greater. 1297 1298=item katakana_before_hiragana 1299 1300-- see 7.2 Tertiary Weight Table, UTS #10. 1301 1302By default, hiragana is before katakana. 1303If the parameter is made true, this is reversed. 1304 1305B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana 1306distinctions must occur in level 3, and their weights at level 3 must be 1307same as those mentioned in 7.3.1, UTS #10. 1308If you define your collation elements which violate this requirement, 1309this parameter does not work validly. 1310 1311=item level 1312 1313-- see 4.3 Form Sort Key, UTS #10. 1314 1315Set the maximum level. 1316Any higher levels than the specified one are ignored. 1317 1318 Level 1: alphabetic ordering 1319 Level 2: diacritic ordering 1320 Level 3: case ordering 1321 Level 4: tie-breaking (e.g. in the case when variable is 'shifted') 1322 1323 ex.level => 2, 1324 1325If omitted, the maximum is the 4th. 1326 1327B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level. 1328But this module only uses weights within 0xFFFF. 1329When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted' 1330and 'shift-trimmed'), the level 4 may be unreliable. 1331 1332See also C<identical>. 1333 1334=item long_contraction 1335 1336-- see 3.8.2 Well-Formedness of the DUCET, 4.2 Produce Array, UTS #10. 1337 1338If the parameter is made true, for a contraction with three or more 1339characters (here nicknamed "long contraction"), initial substrings 1340will be handled. 1341For example, a contraction ABC, where A is a starter, and B and C 1342are non-starters (character with non-zero combining character class), 1343will be detected even if there is not AB as a contraction. 1344 1345B<Default:> Usually false. 1346If C<UCA_Version> is 22 or 24, and the value of C<long_contraction> 1347is not specified in C<new()>, a true value is set implicitly. 1348This is a workaround to pass Conformance Tests for Unicode 6.0.0 and 6.1.0. 1349 1350C<change()> handles C<long_contraction> explicitly only. 1351If C<long_contraction> is not specified in C<change()>, even though 1352C<UCA_Version> is changed, C<long_contraction> will not be changed. 1353 1354B<Limitation:> Scanning non-starters is one-way (no back tracking). 1355If AB is found but not ABC is not found, other long contraction where 1356the first character is A and the second is not B may not be found. 1357 1358Under C<(normalization =E<gt> undef)>, detection step of discontiguous 1359contractions will be skipped. 1360 1361B<Note:> The following contractions in DUCET are not considered 1362in steps S2.1.1 to S2.1.3, where they are discontiguous. 1363 1364 0FB2 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC RR) 1365 0FB3 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC LL) 1366 1367For example C<TIBETAN VOWEL SIGN VOCALIC RR> with C<COMBINING TILDE OVERLAY> 1368(C<U+0344>) is C<0FB2 0344 0F71 0F80> in NFD. 1369In this case C<0FB2 0F80> (C<TIBETAN VOWEL SIGN VOCALIC R>) is detected, 1370instead of C<0FB2 0F71 0F80>. 1371Inserted C<0344> makes C<0FB2 0F71 0F80> discontiguous and lack of 1372contraction C<0FB2 0F71> prohibits C<0FB2 0F71 0F80> from being detected. 1373 1374=item minimalFFFE 1375 1376-- see 5.14 Collation Elements, UTS #35. 1377 1378If the parameter is made true, C<U+FFFE> has a minimal primary weight. 1379The comparison between C<"$a1\x{FFFE}$a2"> and C<"$b1\x{FFFE}$b2"> 1380first compares C<$a1> and C<$b1> at level 1, and 1381then C<$a2> and C<$b2> at level 1, as followed. 1382 1383 "ab\x{FFFE}a" 1384 "Ab\x{FFFE}a" 1385 "ab\x{FFFE}c" 1386 "Ab\x{FFFE}c" 1387 "ab\x{FFFE}xyz" 1388 "abc\x{FFFE}def" 1389 "abc\x{FFFE}xYz" 1390 "aBc\x{FFFE}xyz" 1391 "abcX\x{FFFE}def" 1392 "abcx\x{FFFE}xyz" 1393 "b\x{FFFE}aaa" 1394 "bbb\x{FFFE}a" 1395 1396Note: 1397This is equivalent to C<(entry =E<gt> 'FFFE ; [.0001.0020.0005.FFFE]')>. 1398Any other character than C<U+FFFE> can be tailored by C<entry>. 1399 1400=item normalization 1401 1402-- see 4.1 Normalize, UTS #10. 1403 1404If specified, strings are normalized before preparation of sort keys 1405(the normalization is executed after preprocess). 1406 1407A form name C<Unicode::Normalize::normalize()> accepts will be applied 1408as C<$normalization_form>. 1409Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>. 1410See C<Unicode::Normalize::normalize()> for detail. 1411If omitted, C<'NFD'> is used. 1412 1413C<normalization> is performed after C<preprocess> (if defined). 1414 1415Furthermore, special values, C<undef> and C<"prenormalized">, can be used, 1416though they are not concerned with C<Unicode::Normalize::normalize()>. 1417 1418If C<undef> (not a string C<"undef">) is passed explicitly 1419as the value for this key, 1420any normalization is not carried out (this may make tailoring easier 1421if any normalization is not desired). Under C<(normalization =E<gt> undef)>, 1422only contiguous contractions are resolved; 1423e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>, 1424C<A-cedilla-ring> would be primary equal to C<A>. 1425In this point, 1426C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })> 1427B<is not> equivalent to C<(normalization =E<gt> 'NFD')>. 1428 1429In the case of C<(normalization =E<gt> "prenormalized")>, 1430any normalization is not performed, but 1431discontiguous contractions with combining characters are performed. 1432Therefore 1433C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })> 1434B<is> equivalent to C<(normalization =E<gt> 'NFD')>. 1435If source strings are finely prenormalized, 1436C<(normalization =E<gt> 'prenormalized')> may save time for normalization. 1437 1438Except C<(normalization =E<gt> undef)>, 1439B<Unicode::Normalize> is required (see also B<CAVEAT>). 1440 1441=item overrideCJK 1442 1443-- see 7.1 Derived Collation Elements, UTS #10. 1444 1445By default, CJK unified ideographs are ordered in Unicode codepoint 1446order, but those in the CJK Unified Ideographs block are less than 1447those in the CJK Unified Ideographs Extension A etc. 1448 1449 In the CJK Unified Ideographs block: 1450 U+4E00..U+9FA5 if UCA_Version is 8, 9 or 11. 1451 U+4E00..U+9FBB if UCA_Version is 14 or 16. 1452 U+4E00..U+9FC3 if UCA_Version is 18. 1453 U+4E00..U+9FCB if UCA_Version is 20 or 22. 1454 U+4E00..U+9FCC if UCA_Version is 24 or later. 1455 1456 In the CJK Unified Ideographs Extension blocks: 1457 Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version. 1458 Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or later. 1459 Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or later. 1460 1461Through C<overrideCJK>, ordering of CJK unified ideographs (including 1462extensions) can be overridden. 1463 1464ex. CJK unified ideographs in the JIS code point order. 1465 1466 overrideCJK => sub { 1467 my $u = shift; # get a Unicode codepoint 1468 my $b = pack('n', $u); # to UTF-16BE 1469 my $s = your_unicode_to_sjis_converter($b); # convert 1470 my $n = unpack('n', $s); # convert sjis to short 1471 [ $n, 0x20, 0x2, $u ]; # return the collation element 1472 }, 1473 1474The return value may be an arrayref of 1st to 4th weights as shown 1475above. The return value may be an integer as the primary weight 1476as shown below. If C<undef> is returned, the default derived 1477collation element will be used. 1478 1479 overrideCJK => sub { 1480 my $u = shift; # get a Unicode codepoint 1481 my $b = pack('n', $u); # to UTF-16BE 1482 my $s = your_unicode_to_sjis_converter($b); # convert 1483 my $n = unpack('n', $s); # convert sjis to short 1484 return $n; # return the primary weight 1485 }, 1486 1487The return value may be a list containing zero or more of 1488an arrayref, an integer, or C<undef>. 1489 1490ex. ignores all CJK unified ideographs. 1491 1492 overrideCJK => sub {()}, # CODEREF returning empty list 1493 1494 # where ->eq("Pe\x{4E00}rl", "Perl") is true 1495 # as U+4E00 is a CJK unified ideograph and to be ignorable. 1496 1497If a false value (including C<undef>) is passed, C<overrideCJK> 1498has no effect. 1499C<$Collator-E<gt>change(overrideCJK =E<gt> 0)> resets the old one. 1500 1501But assignment of weight for CJK unified ideographs 1502in C<table> or C<entry> is still valid. 1503If C<undef> is passed explicitly as the value for this key, 1504weights for CJK unified ideographs are treated as undefined. 1505However when C<UCA_Version> E<gt> 8, C<(overrideCJK =E<gt> undef)> 1506has no special meaning. 1507 1508B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>, 1509C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>, 1510C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified 1511ideographs. But they can't be overridden via C<overrideCJK> when you use 1512DUCET, as the table includes weights for them. C<table> or C<entry> has 1513priority over C<overrideCJK>. 1514 1515=item overrideHangul 1516 1517-- see 7.1 Derived Collation Elements, UTS #10. 1518 1519By default, Hangul syllables are decomposed into Hangul Jamo, 1520even if C<(normalization =E<gt> undef)>. 1521But the mapping of Hangul syllables may be overridden. 1522 1523This parameter works like C<overrideCJK>, so see there for examples. 1524 1525If you want to override the mapping of Hangul syllables, 1526NFD and NFKD are not appropriate, since NFD and NFKD will decompose 1527Hangul syllables before overriding. FCD may decompose Hangul syllables 1528as the case may be. 1529 1530If a false value (but not C<undef>) is passed, C<overrideHangul> 1531has no effect. 1532C<$Collator-E<gt>change(overrideHangul =E<gt> 0)> resets the old one. 1533 1534If C<undef> is passed explicitly as the value for this key, 1535weight for Hangul syllables is treated as undefined 1536without decomposition into Hangul Jamo. 1537But definition of weight for Hangul syllables 1538in C<table> or C<entry> is still valid. 1539 1540=item overrideOut 1541 1542-- see 7.1.1 Handling Ill-Formed Code Unit Sequences, UTS #10. 1543 1544Perl seems to allow out-of-range values (greater than 0x10FFFF). 1545By default, out-of-range values are replaced with C<U+FFFD> 1546(REPLACEMENT CHARACTER) when C<UCA_Version> E<gt>= 22, 1547or ignored when C<UCA_Version> E<lt>= 20. 1548 1549When C<UCA_Version> E<gt>= 22, the weights of out-of-range values 1550can be overridden. Though C<table> or C<entry> are available for them, 1551out-of-range values are too many. 1552 1553C<overrideOut> can perform it algorithmically. 1554This parameter works like C<overrideCJK>, so see there for examples. 1555 1556ex. ignores all out-of-range values. 1557 1558 overrideOut => sub {()}, # CODEREF returning empty list 1559 1560If a false value (including C<undef>) is passed, C<overrideOut> 1561has no effect. 1562C<$Collator-E<gt>change(overrideOut =E<gt> 0)> resets the old one. 1563 1564B<NOTE ABOUT U+FFFD:> 1565 1566UCA recommends that out-of-range values should not be ignored for security 1567reasons. Say, C<"pe\x{110000}rl"> should not be equal to C<"perl">. 1568However, C<U+FFFD> is wrongly mapped to a variable collation element 1569in DUCET for Unicode 6.0.0 to 6.2.0, that means out-of-range values will be 1570ignored when C<variable> isn't C<Non-ignorable>. 1571 1572The mapping of C<U+FFFD> is corrected in Unicode 6.3.0. 1573see L<http://www.unicode.org/reports/tr10/tr10-28.html#Trailing_Weights> 1574(7.1.4 Trailing Weights). Such a correction is reproduced by this. 1575 1576 overrideOut => sub { 0xFFFD }, # CODEREF returning a very large integer 1577 1578This workaround is unnecessary since Unicode 6.3.0. 1579 1580=item preprocess 1581 1582-- see 5.4 Preprocessing, UTS #10. 1583 1584If specified, the coderef is used to preprocess each string 1585before the formation of sort keys. 1586 1587ex. dropping English articles, such as "a" or "the". 1588Then, "the pen" is before "a pencil". 1589 1590 preprocess => sub { 1591 my $str = shift; 1592 $str =~ s/\b(?:an?|the)\s+//gi; 1593 return $str; 1594 }, 1595 1596C<preprocess> is performed before C<normalization> (if defined). 1597 1598ex. decoding strings in a legacy encoding such as shift-jis: 1599 1600 $sjis_collator = Unicode::Collate->new( 1601 preprocess => \&your_shiftjis_to_unicode_decoder, 1602 ); 1603 @result = $sjis_collator->sort(@shiftjis_strings); 1604 1605B<Note:> Strings returned from the coderef will be interpreted 1606according to Perl's Unicode support. See L<perlunicode>, 1607L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>. 1608 1609=item rearrange 1610 1611-- see 3.5 Rearrangement, UTS #10. 1612 1613Characters that are not coded in logical order and to be rearranged. 1614If C<UCA_Version> is equal to or less than 11, default is: 1615 1616 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ], 1617 1618If you want to disallow any rearrangement, pass C<undef> or C<[]> 1619(a reference to empty list) as the value for this key. 1620 1621If C<UCA_Version> is equal to or greater than 14, default is C<[]> 1622(i.e. no rearrangement). 1623 1624B<According to the version 9 of UCA, this parameter shall not be used; 1625but it is not warned at present.> 1626 1627=item rewrite 1628 1629If specified, the coderef is used to rewrite lines in C<table> or C<entry>. 1630The coderef will get each line, and then should return a rewritten line 1631according to the UCA file format. 1632If the coderef returns an empty line, the line will be skipped. 1633 1634e.g. any primary ignorable characters into tertiary ignorable: 1635 1636 rewrite => sub { 1637 my $line = shift; 1638 $line =~ s/\[\.0000\..{4}\..{4}\./[.0000.0000.0000./g; 1639 return $line; 1640 }, 1641 1642This example shows rewriting weights. C<rewrite> is allowed to 1643affect code points, weights, and the name. 1644 1645B<NOTE>: C<table> is available to use another table file; 1646preparing a modified table once would be more efficient than 1647rewriting lines on reading an unmodified table every time. 1648 1649=item suppress 1650 1651-- see suppress contractions in 5.14.11 Special-Purpose Commands, 1652UTS #35 (LDML). 1653 1654Contractions beginning with the specified characters are suppressed, 1655even if those contractions are defined in C<table>. 1656 1657An example for Russian and some languages using the Cyrillic script: 1658 1659 suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F], 1660 1661where 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE. 1662 1663B<NOTE>: Contractions via C<entry> are not be suppressed. 1664 1665=item table 1666 1667-- see 3.8 Default Unicode Collation Element Table, UTS #10. 1668 1669You can use another collation element table if desired. 1670 1671The table file should locate in the F<Unicode/Collate> directory 1672on C<@INC>. Say, if the filename is F<Foo.txt>, 1673the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>. 1674 1675By default, F<allkeys.txt> (as the filename of DUCET) is used. 1676If you will prepare your own table file, any name other than F<allkeys.txt> 1677may be better to avoid namespace conflict. 1678 1679B<NOTE>: When XSUB is used, the DUCET is compiled on building this 1680module, and it may save time at the run time. 1681Explicit saying C<(table =E<gt> 'allkeys.txt')>, or using another table, 1682or using C<ignoreChar>, C<ignoreName>, C<undefChar>, C<undefName> or 1683C<rewrite> will prevent this module from using the compiled DUCET. 1684 1685If C<undef> is passed explicitly as the value for this key, 1686no file is read (but you can define collation elements via C<entry>). 1687 1688A typical way to define a collation element table 1689without any file of table: 1690 1691 $onlyABC = Unicode::Collate->new( 1692 table => undef, 1693 entry => << 'ENTRIES', 16940061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A 16950041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A 16960062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B 16970042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B 16980063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C 16990043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C 1700ENTRIES 1701 ); 1702 1703If C<ignoreName> or C<undefName> is used, character names should be 1704specified as a comment (following C<#>) on each line. 1705 1706=item undefChar 1707 1708=item undefName 1709 1710-- see 6.3.4 Reducing the Repertoire, UTS #10. 1711 1712Undefines the collation element as if it were unassigned in the C<table>. 1713This reduces the size of the table. 1714If an unassigned character appears in the string to be collated, 1715the sort key is made from its codepoint 1716as a single-character collation element, 1717as it is greater than any other assigned collation elements 1718(in the codepoint order among the unassigned characters). 1719But, it'd be better to ignore characters 1720unfamiliar to you and maybe never used. 1721 1722Through C<undefChar>, any character matching C<qr/$undefChar/> 1723will be undefined. Through C<undefName>, any character whose name 1724(given in the C<table> file as a comment) matches C<qr/$undefName/> 1725will be undefined. 1726 1727ex. Collation weights for beyond-BMP characters are not stored in object: 1728 1729 undefChar => qr/[^\0-\x{fffd}]/, 1730 1731=item upper_before_lower 1732 1733-- see 6.6 Case Comparisons, UTS #10. 1734 1735By default, lowercase is before uppercase. 1736If the parameter is made true, this is reversed. 1737 1738B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase 1739distinctions must occur in level 3, and their weights at level 3 must be 1740same as those mentioned in 7.3.1, UTS #10. 1741If you define your collation elements which differs from this requirement, 1742this parameter doesn't work validly. 1743 1744=item variable 1745 1746-- see 3.6 Variable Weighting, UTS #10. 1747 1748This key allows for variable weighting of variable collation elements, 1749which are marked with an ASTERISK in the table 1750(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>). 1751 1752 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'. 1753 1754These names are case-insensitive. 1755By default (if specification is omitted), 'shifted' is adopted. 1756 1757 'Blanked' Variable elements are made ignorable at levels 1 through 3; 1758 considered at the 4th level. 1759 1760 'Non-Ignorable' Variable elements are not reset to ignorable. 1761 1762 'Shifted' Variable elements are made ignorable at levels 1 through 3 1763 their level 4 weight is replaced by the old level 1 weight. 1764 Level 4 weight for Non-Variable elements is 0xFFFF. 1765 1766 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level 1767 are trimmed. 1768 1769=back 1770 1771=head2 Methods for Collation 1772 1773=over 4 1774 1775=item C<@sorted = $Collator-E<gt>sort(@not_sorted)> 1776 1777Sorts a list of strings. 1778 1779=item C<$result = $Collator-E<gt>cmp($a, $b)> 1780 1781Returns 1 (when C<$a> is greater than C<$b>) 1782or 0 (when C<$a> is equal to C<$b>) 1783or -1 (when C<$a> is less than C<$b>). 1784 1785=item C<$result = $Collator-E<gt>eq($a, $b)> 1786 1787=item C<$result = $Collator-E<gt>ne($a, $b)> 1788 1789=item C<$result = $Collator-E<gt>lt($a, $b)> 1790 1791=item C<$result = $Collator-E<gt>le($a, $b)> 1792 1793=item C<$result = $Collator-E<gt>gt($a, $b)> 1794 1795=item C<$result = $Collator-E<gt>ge($a, $b)> 1796 1797They works like the same name operators as theirs. 1798 1799 eq : whether $a is equal to $b. 1800 ne : whether $a is not equal to $b. 1801 lt : whether $a is less than $b. 1802 le : whether $a is less than $b or equal to $b. 1803 gt : whether $a is greater than $b. 1804 ge : whether $a is greater than $b or equal to $b. 1805 1806=item C<$sortKey = $Collator-E<gt>getSortKey($string)> 1807 1808-- see 4.3 Form Sort Key, UTS #10. 1809 1810Returns a sort key. 1811 1812You compare the sort keys using a binary comparison 1813and get the result of the comparison of the strings using UCA. 1814 1815 $Collator->getSortKey($a) cmp $Collator->getSortKey($b) 1816 1817 is equivalent to 1818 1819 $Collator->cmp($a, $b) 1820 1821=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)> 1822 1823Converts a sorting key into its representation form. 1824If C<UCA_Version> is 8, the output is slightly different. 1825 1826 use Unicode::Collate; 1827 my $c = Unicode::Collate->new(); 1828 print $c->viewSortKey("Perl"),"\n"; 1829 1830 # output: 1831 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF] 1832 # Level 1 Level 2 Level 3 Level 4 1833 1834=back 1835 1836=head2 Methods for Searching 1837 1838The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work 1839like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively, 1840but they are not aware of any pattern, but only a literal substring. 1841 1842B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true 1843for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>, 1844C<subst>, C<gsubst>) is croaked, as the position and the length might 1845differ from those on the specified string. 1846 1847C<rearrange> and C<hangul_terminator> parameters are neglected. 1848C<katakana_before_hiragana> and C<upper_before_lower> don't affect 1849matching and searching, as it doesn't matter whether greater or less. 1850 1851=over 4 1852 1853=item C<$position = $Collator-E<gt>index($string, $substring[, $position])> 1854 1855=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])> 1856 1857If C<$substring> matches a part of C<$string>, returns 1858the position of the first occurrence of the matching part in scalar context; 1859in list context, returns a two-element list of 1860the position and the length of the matching part. 1861 1862If C<$substring> does not match any part of C<$string>, 1863returns C<-1> in scalar context and 1864an empty list in list context. 1865 1866e.g. when the content of C<$str> is C<"Ich mu>E<szlig>C< studieren Perl.">, 1867you say the following where C<$sub> is C<"M>E<uuml>C<SS">, 1868 1869 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 ); 1870 # (normalization => undef) is REQUIRED. 1871 my $match; 1872 if (my($pos,$len) = $Collator->index($str, $sub)) { 1873 $match = substr($str, $pos, $len); 1874 } 1875 1876and get C<"mu>E<szlig>C<"> in C<$match>, since C<"mu>E<szlig>C<"> 1877is primary equal to C<"M>E<uuml>C<SS">. 1878 1879=item C<$match_ref = $Collator-E<gt>match($string, $substring)> 1880 1881=item C<($match) = $Collator-E<gt>match($string, $substring)> 1882 1883If C<$substring> matches a part of C<$string>, in scalar context, returns 1884B<a reference to> the first occurrence of the matching part 1885(C<$match_ref> is always true if matches, 1886since every reference is B<true>); 1887in list context, returns the first occurrence of the matching part. 1888 1889If C<$substring> does not match any part of C<$string>, 1890returns C<undef> in scalar context and 1891an empty list in list context. 1892 1893e.g. 1894 1895 if ($match_ref = $Collator->match($str, $sub)) { # scalar context 1896 print "matches [$$match_ref].\n"; 1897 } else { 1898 print "doesn't match.\n"; 1899 } 1900 1901 or 1902 1903 if (($match) = $Collator->match($str, $sub)) { # list context 1904 print "matches [$match].\n"; 1905 } else { 1906 print "doesn't match.\n"; 1907 } 1908 1909=item C<@match = $Collator-E<gt>gmatch($string, $substring)> 1910 1911If C<$substring> matches a part of C<$string>, returns 1912all the matching parts (or matching count in scalar context). 1913 1914If C<$substring> does not match any part of C<$string>, 1915returns an empty list. 1916 1917=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)> 1918 1919If C<$substring> matches a part of C<$string>, 1920the first occurrence of the matching part is replaced by C<$replacement> 1921(C<$string> is modified) and C<$count> (always equals to C<1>) is returned. 1922 1923C<$replacement> can be a C<CODEREF>, 1924taking the matching part as an argument, 1925and returning a string to replace the matching part 1926(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>). 1927 1928=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)> 1929 1930If C<$substring> matches a part of C<$string>, 1931all the occurrences of the matching part are replaced by C<$replacement> 1932(C<$string> is modified) and C<$count> is returned. 1933 1934C<$replacement> can be a C<CODEREF>, 1935taking the matching part as an argument, 1936and returning a string to replace the matching part 1937(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>). 1938 1939e.g. 1940 1941 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 ); 1942 # (normalization => undef) is REQUIRED. 1943 my $str = "Camel donkey zebra came\x{301}l CAMEL horse cam\0e\0l..."; 1944 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" }); 1945 1946 # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cam\0e\0l</b>..."; 1947 # i.e., all the camels are made bold-faced. 1948 1949 Examples: levels and ignore_level2 - what does camel match? 1950 --------------------------------------------------------------------------- 1951 level ignore_level2 | camel Camel came\x{301}l c-a-m-e-l cam\0e\0l 1952 -----------------------|--------------------------------------------------- 1953 1 false | yes yes yes yes yes 1954 2 false | yes yes no yes yes 1955 3 false | yes no no yes yes 1956 4 false | yes no no no yes 1957 -----------------------|--------------------------------------------------- 1958 1 true | yes yes yes yes yes 1959 2 true | yes yes yes yes yes 1960 3 true | yes no yes yes yes 1961 4 true | yes no yes no yes 1962 --------------------------------------------------------------------------- 1963 note: if variable => non-ignorable, camel doesn't match c-a-m-e-l 1964 at any level. 1965 1966=back 1967 1968=head2 Other Methods 1969 1970=over 4 1971 1972=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)> 1973 1974=item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)> 1975 1976Changes the value of specified keys and returns the changed part. 1977 1978 $Collator = Unicode::Collate->new(level => 4); 1979 1980 $Collator->eq("perl", "PERL"); # false 1981 1982 %old = $Collator->change(level => 2); # returns (level => 4). 1983 1984 $Collator->eq("perl", "PERL"); # true 1985 1986 $Collator->change(%old); # returns (level => 2). 1987 1988 $Collator->eq("perl", "PERL"); # false 1989 1990Not all C<(key,value)>s are allowed to be changed. 1991See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>. 1992 1993In the scalar context, returns the modified collator 1994(but it is B<not> a clone from the original). 1995 1996 $Collator->change(level => 2)->eq("perl", "PERL"); # true 1997 1998 $Collator->eq("perl", "PERL"); # true; now max level is 2nd. 1999 2000 $Collator->change(level => 4)->eq("perl", "PERL"); # false 2001 2002=item C<$version = $Collator-E<gt>version()> 2003 2004Returns the version number (a string) of the Unicode Standard 2005which the C<table> file used by the collator object is based on. 2006If the table does not include a version line (starting with C<@version>), 2007returns C<"unknown">. 2008 2009=item C<UCA_Version()> 2010 2011Returns the revision number of UTS #10 this module consults, 2012that should correspond with the DUCET incorporated. 2013 2014=item C<Base_Unicode_Version()> 2015 2016Returns the version number of UTS #10 this module consults, 2017that should correspond with the DUCET incorporated. 2018 2019=back 2020 2021=head1 EXPORT 2022 2023No method will be exported. 2024 2025=head1 INSTALL 2026 2027Though this module can be used without any C<table> file, 2028to use this module easily, it is recommended to install a table file 2029in the UCA format, by copying it under the directory 2030<a place in @INC>/Unicode/Collate. 2031 2032The most preferable one is "The Default Unicode Collation Element Table" 2033(aka DUCET), available from the Unicode Consortium's website: 2034 2035 http://www.unicode.org/Public/UCA/ 2036 2037 http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version) 2038 2039If DUCET is not installed, it is recommended to copy the file 2040from http://www.unicode.org/Public/UCA/latest/allkeys.txt 2041to <a place in @INC>/Unicode/Collate/allkeys.txt 2042manually. 2043 2044=head1 CAVEATS 2045 2046=over 4 2047 2048=item Normalization 2049 2050Use of the C<normalization> parameter requires the B<Unicode::Normalize> 2051module (see L<Unicode::Normalize>). 2052 2053If you need not it (say, in the case when you need not 2054handle any combining characters), 2055assign C<(normalization =E<gt> undef)> explicitly. 2056 2057-- see 6.5 Avoiding Normalization, UTS #10. 2058 2059=item Conformance Test 2060 2061The Conformance Test for the UCA is available 2062under L<http://www.unicode.org/Public/UCA/>. 2063 2064For F<CollationTest_SHIFTED.txt>, 2065a collator via C<Unicode::Collate-E<gt>new( )> should be used; 2066for F<CollationTest_NON_IGNORABLE.txt>, a collator via 2067C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>. 2068 2069If C<UCA_Version> is 26 or later, the C<identical> level is preferred; 2070C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and 2071C<Unicode::Collate-E<gt>new(identical =E<gt> 1,> 2072C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used. 2073 2074B<Unicode::Normalize is required to try The Conformance Test.> 2075 2076=back 2077 2078=head1 AUTHOR, COPYRIGHT AND LICENSE 2079 2080The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki, 2081<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2014, 2082SADAHIRO Tomoyuki. Japan. All rights reserved. 2083 2084This module is free software; you can redistribute it and/or 2085modify it under the same terms as Perl itself. 2086 2087The file Unicode/Collate/allkeys.txt was copied verbatim 2088from L<http://www.unicode.org/Public/UCA/6.3.0/allkeys.txt>. 2089For this file, Copyright (c) 2001-2012 Unicode, Inc. 2090Distributed under the Terms of Use in L<http://www.unicode.org/copyright.html>. 2091 2092=head1 SEE ALSO 2093 2094=over 4 2095 2096=item Unicode Collation Algorithm - UTS #10 2097 2098L<http://www.unicode.org/reports/tr10/> 2099 2100=item The Default Unicode Collation Element Table (DUCET) 2101 2102L<http://www.unicode.org/Public/UCA/latest/allkeys.txt> 2103 2104=item The conformance test for the UCA 2105 2106L<http://www.unicode.org/Public/UCA/latest/CollationTest.html> 2107 2108L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip> 2109 2110=item Hangul Syllable Type 2111 2112L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt> 2113 2114=item Unicode Normalization Forms - UAX #15 2115 2116L<http://www.unicode.org/reports/tr15/> 2117 2118=item Unicode Locale Data Markup Language (LDML) - UTS #35 2119 2120L<http://www.unicode.org/reports/tr35/> 2121 2122=back 2123 2124=cut 2125