1package Unicode::Collate; 2 3BEGIN { 4 unless ("A" eq pack('U', 0x41)) { 5 die "Unicode::Collate cannot stringify a Unicode code point\n"; 6 } 7} 8 9use 5.006; 10use strict; 11use warnings; 12use Carp; 13use File::Spec; 14 15no warnings 'utf8'; 16 17require Exporter; 18 19our $VERSION = '0.33'; 20our $PACKAGE = __PACKAGE__; 21 22our @ISA = qw(Exporter); 23 24our %EXPORT_TAGS = (); 25our @EXPORT_OK = (); 26our @EXPORT = (); 27 28(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//; 29our $KeyFile = "allkeys.txt"; 30 31# Perl's boolean 32use constant TRUE => 1; 33use constant FALSE => ""; 34use constant NOMATCHPOS => -1; 35 36# A coderef to get combining class imported from Unicode::Normalize 37# (i.e. \&Unicode::Normalize::getCombinClass). 38# This is also used as a HAS_UNICODE_NORMALIZE flag. 39our $CVgetCombinClass; 40 41# Supported Levels 42use constant MinLevel => 1; 43use constant MaxLevel => 4; 44 45# Minimum weights at level 2 and 3, respectively 46use constant Min2Wt => 0x20; 47use constant Min3Wt => 0x02; 48 49# Shifted weight at 4th level 50use constant Shift4Wt => 0xFFFF; 51 52# A boolean for Variable and 16-bit weights at 4 levels of Collation Element 53# PROBLEM: The Default Unicode Collation Element Table 54# has weights over 0xFFFF at the 4th level. 55# The tie-breaking in the variable weights 56# other than "shift" (as well as "shift-trimmed") is unreliable. 57use constant VCE_TEMPLATE => 'Cn4'; 58 59# A sort key: 16-bit weights 60# See also the PROBLEM on VCE_TEMPLATE above. 61use constant KEY_TEMPLATE => 'n*'; 62 63# Level separator in a sort key: 64# i.e. pack(KEY_TEMPLATE, 0) 65use constant LEVEL_SEP => "\0\0"; 66 67# As Unicode code point separator for hash keys. 68# A joined code point string (denoted by JCPS below) 69# like "65;768" is used for internal processing 70# instead of Perl's Unicode string like "\x41\x{300}", 71# as the native code point is different from the Unicode code point 72# on EBCDIC platform. 73# This character must not be included in any stringified 74# representation of an integer. 75use constant CODE_SEP => ';'; 76 77# boolean values of variable weights 78use constant NON_VAR => 0; # Non-Variable character 79use constant VAR => 1; # Variable character 80 81# specific code points 82use constant Hangul_LBase => 0x1100; 83use constant Hangul_LIni => 0x1100; 84use constant Hangul_LFin => 0x1159; 85use constant Hangul_LFill => 0x115F; 86use constant Hangul_VBase => 0x1161; 87use constant Hangul_VIni => 0x1160; 88use constant Hangul_VFin => 0x11A2; 89use constant Hangul_TBase => 0x11A7; 90use constant Hangul_TIni => 0x11A8; 91use constant Hangul_TFin => 0x11F9; 92use constant Hangul_TCount => 28; 93use constant Hangul_NCount => 588; 94use constant Hangul_SBase => 0xAC00; 95use constant Hangul_SIni => 0xAC00; 96use constant Hangul_SFin => 0xD7A3; 97use constant CJK_UidIni => 0x4E00; 98use constant CJK_UidFin => 0x9FA5; 99use constant CJK_ExtAIni => 0x3400; 100use constant CJK_ExtAFin => 0x4DB5; 101use constant CJK_ExtBIni => 0x20000; 102use constant CJK_ExtBFin => 0x2A6D6; 103use constant BMP_Max => 0xFFFF; 104 105# Logical_Order_Exception in PropList.txt 106# TODO: synchronization with change of PropList.txt. 107our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ]; 108 109sub UCA_Version { "11" } 110 111sub Base_Unicode_Version { "4.0" } 112 113###### 114 115sub pack_U { 116 return pack('U*', @_); 117} 118 119sub unpack_U { 120 return unpack('U*', pack('U*').shift); 121} 122 123###### 124 125my (%VariableOK); 126@VariableOK{ qw/ 127 blanked non-ignorable shifted shift-trimmed 128 / } = (); # keys lowercased 129 130our @ChangeOK = qw/ 131 alternate backwards level normalization rearrange 132 katakana_before_hiragana upper_before_lower 133 overrideHangul overrideCJK preprocess UCA_Version 134 hangul_terminator variable 135 /; 136 137our @ChangeNG = qw/ 138 entry mapping table maxlength 139 ignoreChar ignoreName undefChar undefName variableTable 140 versionTable alternateTable backwardsTable forwardsTable rearrangeTable 141 derivCode normCode rearrangeHash L3_ignorable 142 backwardsFlag 143 /; 144# The hash key 'ignored' is deleted at v 0.21. 145# The hash key 'isShift' is deleted at v 0.23. 146# The hash key 'combining' is deleted at v 0.24. 147# The hash key 'entries' is deleted at v 0.30. 148 149sub version { 150 my $self = shift; 151 return $self->{versionTable} || 'unknown'; 152} 153 154my (%ChangeOK, %ChangeNG); 155@ChangeOK{ @ChangeOK } = (); 156@ChangeNG{ @ChangeNG } = (); 157 158sub change { 159 my $self = shift; 160 my %hash = @_; 161 my %old; 162 if (exists $hash{variable} && exists $hash{alternate}) { 163 delete $hash{alternate}; 164 } 165 elsif (!exists $hash{variable} && exists $hash{alternate}) { 166 $hash{variable} = $hash{alternate}; 167 } 168 foreach my $k (keys %hash) { 169 if (exists $ChangeOK{$k}) { 170 $old{$k} = $self->{$k}; 171 $self->{$k} = $hash{$k}; 172 } 173 elsif (exists $ChangeNG{$k}) { 174 croak "change of $k via change() is not allowed!"; 175 } 176 # else => ignored 177 } 178 $self->checkCollator; 179 return wantarray ? %old : $self; 180} 181 182sub _checkLevel { 183 my $level = shift; 184 my $key = shift; # 'level' or 'backwards' 185 MinLevel <= $level or croak sprintf 186 "Illegal level %d (in value for key '%s') lower than %d.", 187 $level, $key, MinLevel; 188 $level <= MaxLevel or croak sprintf 189 "Unsupported level %d (in value for key '%s') higher than %d.", 190 $level, $key, MaxLevel; 191} 192 193my %DerivCode = ( 194 8 => \&_derivCE_8, 195 9 => \&_derivCE_9, 196 11 => \&_derivCE_9, # 11 == 9 197); 198 199sub checkCollator { 200 my $self = shift; 201 _checkLevel($self->{level}, "level"); 202 203 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} } 204 or croak "Illegal UCA version (passed $self->{UCA_Version})."; 205 206 $self->{variable} ||= $self->{alternate} || $self->{variableTable} || 207 $self->{alternateTable} || 'shifted'; 208 $self->{variable} = $self->{alternate} = lc($self->{variable}); 209 exists $VariableOK{ $self->{variable} } 210 or croak "$PACKAGE unknown variable tag name: $self->{variable}"; 211 212 if (! defined $self->{backwards}) { 213 $self->{backwardsFlag} = 0; 214 } 215 elsif (! ref $self->{backwards}) { 216 _checkLevel($self->{backwards}, "backwards"); 217 $self->{backwardsFlag} = 1 << $self->{backwards}; 218 } 219 else { 220 my %level; 221 $self->{backwardsFlag} = 0; 222 for my $b (@{ $self->{backwards} }) { 223 _checkLevel($b, "backwards"); 224 $level{$b} = 1; 225 } 226 for my $v (sort keys %level) { 227 $self->{backwardsFlag} += 1 << $v; 228 } 229 } 230 231 defined $self->{rearrange} or $self->{rearrange} = []; 232 ref $self->{rearrange} 233 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF"; 234 235 # keys of $self->{rearrangeHash} are $self->{rearrange}. 236 $self->{rearrangeHash} = undef; 237 238 if (@{ $self->{rearrange} }) { 239 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = (); 240 } 241 242 $self->{normCode} = undef; 243 244 if (defined $self->{normalization}) { 245 eval { require Unicode::Normalize }; 246 $@ and croak "Unicode::Normalize is required to normalize strings"; 247 248 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass; 249 250 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default 251 $self->{normCode} = \&Unicode::Normalize::NFD; 252 } 253 elsif ($self->{normalization} ne 'prenormalized') { 254 my $norm = $self->{normalization}; 255 $self->{normCode} = sub { 256 Unicode::Normalize::normalize($norm, shift); 257 }; 258 eval { $self->{normCode}->("") }; # try 259 $@ and croak "$PACKAGE unknown normalization form name: $norm"; 260 } 261 } 262 return; 263} 264 265sub new 266{ 267 my $class = shift; 268 my $self = bless { @_ }, $class; 269 270 # If undef is passed explicitly, no file is read. 271 $self->{table} = $KeyFile if ! exists $self->{table}; 272 $self->read_table if defined $self->{table}; 273 274 if ($self->{entry}) { 275 $self->parseEntry($_) foreach split /\n/, $self->{entry}; 276 } 277 278 $self->{level} ||= MaxLevel; 279 $self->{UCA_Version} ||= UCA_Version(); 280 281 $self->{overrideHangul} = FALSE 282 if ! exists $self->{overrideHangul}; 283 $self->{overrideCJK} = FALSE 284 if ! exists $self->{overrideCJK}; 285 $self->{normalization} = 'NFD' 286 if ! exists $self->{normalization}; 287 $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange 288 if ! exists $self->{rearrange}; 289 $self->{backwards} = $self->{backwardsTable} 290 if ! exists $self->{backwards}; 291 292 $self->checkCollator; 293 294 return $self; 295} 296 297sub read_table { 298 my $self = shift; 299 300 my $filepath = File::Spec->catfile($Path, $self->{table}); 301 open my $fk, "<$filepath" 302 or croak "File does not exist at $filepath"; 303 304 while (<$fk>) { 305 next if /^\s*#/; 306 unless (s/^\s*\@//) { 307 $self->parseEntry($_); 308 next; 309 } 310 311 if (/^version\s*(\S*)/) { 312 $self->{versionTable} ||= $1; 313 } 314 elsif (/^variable\s+(\S*)/) { # since UTS #10-9 315 $self->{variableTable} ||= $1; 316 } 317 elsif (/^alternate\s+(\S*)/) { # till UTS #10-8 318 $self->{alternateTable} ||= $1; 319 } 320 elsif (/^backwards\s+(\S*)/) { 321 push @{ $self->{backwardsTable} }, $1; 322 } 323 elsif (/^forwards\s+(\S*)/) { # parhaps no use 324 push @{ $self->{forwardsTable} }, $1; 325 } 326 elsif (/^rearrange\s+(.*)/) { # (\S*) is NG 327 push @{ $self->{rearrangeTable} }, _getHexArray($1); 328 } 329 } 330 close $fk; 331} 332 333 334## 335## get $line, parse it, and write an entry in $self 336## 337sub parseEntry 338{ 339 my $self = shift; 340 my $line = shift; 341 my($name, $entry, @uv, @key); 342 343 return if $line !~ /^\s*[0-9A-Fa-f]/; 344 345 # removes comment and gets name 346 $name = $1 347 if $line =~ s/[#%]\s*(.*)//; 348 return if defined $self->{undefName} && $name =~ /$self->{undefName}/; 349 350 # gets element 351 my($e, $k) = split /;/, $line; 352 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>" 353 if ! $k; 354 355 @uv = _getHexArray($e); 356 return if !@uv; 357 358 $entry = join(CODE_SEP, @uv); # in JCPS 359 360 if (defined $self->{undefChar} || defined $self->{ignoreChar}) { 361 my $ele = pack_U(@uv); 362 363 # regarded as if it were not entried in the table 364 return 365 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/; 366 367 # replaced as completely ignorable 368 $k = '[.0000.0000.0000.0000]' 369 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/; 370 } 371 372 # replaced as completely ignorable 373 $k = '[.0000.0000.0000.0000]' 374 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/; 375 376 my $is_L3_ignorable = TRUE; 377 378 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed 379 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient. 380 my @wt = _getHexArray($arr); 381 push @key, pack(VCE_TEMPLATE, $var, @wt); 382 $is_L3_ignorable = FALSE 383 if $wt[0] + $wt[1] + $wt[2] != 0; 384 # if $arr !~ /[1-9A-Fa-f]/; NG 385 # Conformance Test shows L3-ignorable is completely ignorable. 386 # For expansion, an entry $is_L3_ignorable 387 # if and only if "all" CEs are [.0000.0000.0000]. 388 } 389 390 $self->{mapping}{$entry} = \@key; 391 392 if (@uv > 1) { 393 (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) 394 and $self->{maxlength}{$uv[0]} = @uv; 395 } 396 else { 397 $is_L3_ignorable 398 ? ($self->{L3_ignorable}{$uv[0]} = TRUE) 399 : ($self->{L3_ignorable}{$uv[0]} and 400 $self->{L3_ignorable}{$uv[0]} = FALSE); # &&= stores key. 401 } 402} 403 404 405## 406## VCE = _varCE(variable term, VCE) 407## 408sub _varCE 409{ 410 my $vbl = shift; 411 my $vce = shift; 412 if ($vbl eq 'non-ignorable') { 413 return $vce; 414 } 415 my ($var, @wt) = unpack VCE_TEMPLATE, $vce; 416 417 if ($var) { 418 return pack(VCE_TEMPLATE, $var, 0, 0, 0, 419 $vbl eq 'blanked' ? $wt[3] : $wt[0]); 420 } 421 elsif ($vbl eq 'blanked') { 422 return $vce; 423 } 424 else { 425 return pack(VCE_TEMPLATE, $var, @wt[0..2], 426 $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0); 427 } 428} 429 430sub viewSortKey 431{ 432 my $self = shift; 433 $self->visualizeSortKey($self->getSortKey(@_)); 434} 435 436sub visualizeSortKey 437{ 438 my $self = shift; 439 my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift); 440 441 if ($self->{UCA_Version} <= 8) { 442 $view =~ s/ ?0000 ?/|/g; 443 } else { 444 $view =~ s/\b0000\b/|/g; 445 } 446 return "[$view]"; 447} 448 449 450## 451## arrayref of JCPS = splitEnt(string to be collated) 452## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true) 453## 454sub splitEnt 455{ 456 my $self = shift; 457 my $wLen = $_[1]; 458 459 my $code = $self->{preprocess}; 460 my $norm = $self->{normCode}; 461 my $map = $self->{mapping}; 462 my $max = $self->{maxlength}; 463 my $reH = $self->{rearrangeHash}; 464 my $ign = $self->{L3_ignorable}; 465 my $ver9 = $self->{UCA_Version} >= 9; 466 467 my ($str, @buf); 468 469 if ($wLen) { 470 $code and croak "Preprocess breaks character positions. " 471 . "Don't use with index(), match(), etc."; 472 $norm and croak "Normalization breaks character positions. " 473 . "Don't use with index(), match(), etc."; 474 $str = $_[0]; 475 } 476 else { 477 $str = $_[0]; 478 $str = &$code($str) if ref $code; 479 $str = &$norm($str) if ref $norm; 480 } 481 482 # get array of Unicode code point of string. 483 my @src = unpack_U($str); 484 485 # rearrangement: 486 # Character positions are not kept if rearranged, 487 # then neglected if $wLen is true. 488 if ($reH && ! $wLen) { 489 for (my $i = 0; $i < @src; $i++) { 490 if (exists $reH->{ $src[$i] } && $i + 1 < @src) { 491 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]); 492 $i++; 493 } 494 } 495 } 496 497 # To remove a character marked as a completely ignorable. 498 for (my $i = 0; $i < @src; $i++) { 499 $src[$i] = undef 500 if _isIllegal($src[$i]) || ($ver9 && $ign->{ $src[$i] }); 501 } 502 503 for (my $i = 0; $i < @src; $i++) { 504 my $jcps = $src[$i]; 505 next if ! defined $jcps; 506 my $i_orig = $i; 507 508 if ($max->{$jcps}) { # contract 509 my $temp_jcps = $jcps; 510 my $jcpsLen = 1; 511 my $maxLen = $max->{$jcps}; 512 513 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) { 514 next if ! defined $src[$p]; 515 $temp_jcps .= CODE_SEP . $src[$p]; 516 $jcpsLen++; 517 if ($map->{$temp_jcps}) { 518 $jcps = $temp_jcps; 519 $i = $p; 520 } 521 } 522 523 # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1). 524 # This process requires Unicode::Normalize. 525 # If "normalization" is undef, here should be skipped *always* 526 # (in spite of bool value of $CVgetCombinClass), 527 # since canonical ordering cannot be expected. 528 # Blocked combining character should not be contracted. 529 530 if ($self->{normalization}) 531 # $self->{normCode} is false in the case of "prenormalized". 532 { 533 my $preCC = 0; 534 my $curCC = 0; 535 536 for (my $p = $i + 1; $p < @src; $p++) { 537 next if ! defined $src[$p]; 538 $curCC = $CVgetCombinClass->($src[$p]); 539 last unless $curCC; 540 my $tail = CODE_SEP . $src[$p]; 541 if ($preCC != $curCC && $map->{$jcps.$tail}) { 542 $jcps .= $tail; 543 $src[$p] = undef; 544 } else { 545 $preCC = $curCC; 546 } 547 } 548 } 549 } 550 551 if ($wLen) { 552 for (; $i + 1 < @src; $i++) { 553 last if defined $src[$i + 1]; 554 } 555 } 556 557 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps; 558 } 559 return \@buf; 560} 561 562 563## 564## list of VCE = getWt(JCPS) 565## 566sub getWt 567{ 568 my $self = shift; 569 my $u = shift; 570 my $vbl = $self->{variable}; 571 my $map = $self->{mapping}; 572 my $der = $self->{derivCode}; 573 574 return if !defined $u; 575 return map(_varCE($vbl, $_), @{ $map->{$u} }) 576 if $map->{$u}; 577 578 # JCPS must not be a contraction, then it's a code point. 579 if (Hangul_SIni <= $u && $u <= Hangul_SFin) { 580 my $hang = $self->{overrideHangul}; 581 my @hangulCE; 582 if ($hang) { 583 @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u)); 584 } 585 elsif (!defined $hang) { 586 @hangulCE = $der->($u); 587 } 588 else { 589 my $max = $self->{maxlength}; 590 my @decH = _decompHangul($u); 591 592 if (@decH == 2) { 593 my $contract = join(CODE_SEP, @decH); 594 @decH = ($contract) if $map->{$contract}; 595 } else { # must be <@decH == 3> 596 if ($max->{$decH[0]}) { 597 my $contract = join(CODE_SEP, @decH); 598 if ($map->{$contract}) { 599 @decH = ($contract); 600 } else { 601 $contract = join(CODE_SEP, @decH[0,1]); 602 $map->{$contract} and @decH = ($contract, $decH[2]); 603 } 604 # even if V's ignorable, LT contraction is not supported. 605 # If such a situatution were required, NFD should be used. 606 } 607 if (@decH == 3 && $max->{$decH[1]}) { 608 my $contract = join(CODE_SEP, @decH[1,2]); 609 $map->{$contract} and @decH = ($decH[0], $contract); 610 } 611 } 612 613 @hangulCE = map({ 614 $map->{$_} ? @{ $map->{$_} } : $der->($_); 615 } @decH); 616 } 617 return map _varCE($vbl, $_), @hangulCE; 618 } 619 elsif (CJK_UidIni <= $u && $u <= CJK_UidFin || 620 CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || 621 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) { 622 my $cjk = $self->{overrideCJK}; 623 return map _varCE($vbl, $_), 624 $cjk 625 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u)) 626 : defined $cjk && $self->{UCA_Version} <= 8 && $u <= BMP_Max 627 ? pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u) 628 : $der->($u); 629 } 630 else { 631 return map _varCE($vbl, $_), $der->($u); 632 } 633} 634 635 636## 637## string sortkey = getSortKey(string arg) 638## 639sub getSortKey 640{ 641 my $self = shift; 642 my $lev = $self->{level}; 643 my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS 644 my $ver9 = $self->{UCA_Version} >= 9; 645 my $v2i = $ver9 && $self->{variable} ne 'non-ignorable'; 646 647 my @buf; # weight arrays 648 if ($self->{hangul_terminator}) { 649 my $preHST = ''; 650 foreach my $jcps (@$rEnt) { 651 # weird things like VL, TL-contraction are not considered! 652 my $curHST = ''; 653 foreach my $u (split /;/, $jcps) { 654 $curHST .= getHST($u); 655 } 656 if ($preHST && !$curHST || # hangul before non-hangul 657 $preHST =~ /L\z/ && $curHST =~ /^T/ || 658 $preHST =~ /V\z/ && $curHST =~ /^L/ || 659 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) { 660 661 push @buf, $self->getWtHangulTerm(); 662 } 663 $preHST = $curHST; 664 665 push @buf, $self->getWt($jcps); 666 } 667 $preHST # end at hangul 668 and push @buf, $self->getWtHangulTerm(); 669 } 670 else { 671 foreach my $jcps (@$rEnt) { 672 push @buf, $self->getWt($jcps); 673 } 674 } 675 676 # make sort key 677 my @ret = ([],[],[],[]); 678 my $last_is_variable; 679 680 foreach my $vwt (@buf) { 681 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); 682 if ($v2i) { 683 if ($var) { 684 $last_is_variable = TRUE; 685 } 686 elsif (!$wt[0]) { # ignorable 687 next if $last_is_variable; 688 } 689 else { 690 $last_is_variable = FALSE; 691 } 692 } 693 foreach my $v (0..$lev-1) { 694 0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v]; 695 } 696 } 697 698 # modification of tertiary weights 699 if ($self->{upper_before_lower}) { 700 foreach (@{ $ret[2] }) { 701 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower 702 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper 703 elsif ($_ == 0x1C) { $_ += 1 } # square upper 704 elsif ($_ == 0x1D) { $_ -= 1 } # square lower 705 } 706 } 707 if ($self->{katakana_before_hiragana}) { 708 foreach (@{ $ret[2] }) { 709 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana 710 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana 711 } 712 } 713 714 if ($self->{backwardsFlag}) { 715 for (my $v = MinLevel; $v <= MaxLevel; $v++) { 716 if ($self->{backwardsFlag} & (1 << $v)) { 717 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] }; 718 } 719 } 720 } 721 722 join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret; 723} 724 725 726## 727## int compare = cmp(string a, string b) 728## 729sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) } 730sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) } 731sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) } 732sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) } 733sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) } 734sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) } 735sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) } 736 737## 738## list[strings] sorted = sort(list[strings] arg) 739## 740sub sort { 741 my $obj = shift; 742 return 743 map { $_->[1] } 744 sort{ $a->[0] cmp $b->[0] } 745 map [ $obj->getSortKey($_), $_ ], @_; 746} 747 748 749sub _derivCE_9 { 750 my $u = shift; 751 my $base = 752 (CJK_UidIni <= $u && $u <= CJK_UidFin) 753 ? 0xFB40 : # CJK 754 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || 755 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) 756 ? 0xFB80 # CJK ext. 757 : 0xFBC0; # others 758 759 my $aaaa = $base + ($u >> 15); 760 my $bbbb = ($u & 0x7FFF) | 0x8000; 761 return 762 pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), 763 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u); 764} 765 766sub _derivCE_8 { 767 my $code = shift; 768 my $aaaa = 0xFF80 + ($code >> 15); 769 my $bbbb = ($code & 0x7FFF) | 0x8000; 770 return 771 pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code), 772 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code); 773} 774 775 776sub getWtHangulTerm { 777 my $self = shift; 778 return _varCE($self->{variable}, 779 pack(VCE_TEMPLATE, NON_VAR, $self->{hangul_terminator}, 0,0,0)); 780} 781 782 783## 784## "hhhh hhhh hhhh" to (dddd, dddd, dddd) 785## 786sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g } 787 788# 789# $code *must* be in Hangul syllable. 790# Check it before you enter here. 791# 792sub _decompHangul { 793 my $code = shift; 794 my $SIndex = $code - Hangul_SBase; 795 my $LIndex = int( $SIndex / Hangul_NCount); 796 my $VIndex = int(($SIndex % Hangul_NCount) / Hangul_TCount); 797 my $TIndex = $SIndex % Hangul_TCount; 798 return ( 799 Hangul_LBase + $LIndex, 800 Hangul_VBase + $VIndex, 801 $TIndex ? (Hangul_TBase + $TIndex) : (), 802 ); 803} 804 805sub _isIllegal { 806 my $code = shift; 807 return ! defined $code # removed 808 || ($code < 0 || 0x10FFFF < $code) # out of range 809 || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c) 810 || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates 811 || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters 812 ; 813} 814 815# Hangul Syllable Type 816sub getHST { 817 my $u = shift; 818 return 819 Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill ? "L" : 820 Hangul_VIni <= $u && $u <= Hangul_VFin ? "V" : 821 Hangul_TIni <= $u && $u <= Hangul_TFin ? "T" : 822 Hangul_SIni <= $u && $u <= Hangul_SFin ? 823 ($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV" : ""; 824} 825 826 827## 828## bool _nonIgnorAtLevel(arrayref weights, int level) 829## 830sub _nonIgnorAtLevel($$) 831{ 832 my $wt = shift; 833 return if ! defined $wt; 834 my $lv = shift; 835 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE; 836} 837 838## 839## bool _eqArray( 840## arrayref of arrayref[weights] source, 841## arrayref of arrayref[weights] substr, 842## int level) 843## * comparison of graphemes vs graphemes. 844## @$source >= @$substr must be true (check it before call this); 845## 846sub _eqArray($$$) 847{ 848 my $source = shift; 849 my $substr = shift; 850 my $lev = shift; 851 852 for my $g (0..@$substr-1){ 853 # Do the $g'th graphemes have the same number of AV weigths? 854 return if @{ $source->[$g] } != @{ $substr->[$g] }; 855 856 for my $w (0..@{ $substr->[$g] }-1) { 857 for my $v (0..$lev-1) { 858 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v]; 859 } 860 } 861 } 862 return 1; 863} 864 865## 866## (int position, int length) 867## int position = index(string, substring, position, [undoc'ed grobal]) 868## 869## With "grobal" (only for the list context), 870## returns list of arrayref[position, length]. 871## 872sub index 873{ 874 my $self = shift; 875 my $str = shift; 876 my $len = length($str); 877 my $subE = $self->splitEnt(shift); 878 my $pos = @_ ? shift : 0; 879 $pos = 0 if $pos < 0; 880 my $grob = shift; 881 882 my $lev = $self->{level}; 883 my $ver9 = $self->{UCA_Version} >= 9; 884 my $v2i = $self->{variable} ne 'non-ignorable'; 885 886 if (! @$subE) { 887 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos; 888 return $grob 889 ? map([$_, 0], $temp..$len) 890 : wantarray ? ($temp,0) : $temp; 891 } 892 $len < $pos 893 and return wantarray ? () : NOMATCHPOS; 894 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE); 895 @$strE 896 or return wantarray ? () : NOMATCHPOS; 897 898 my(@strWt, @iniPos, @finPos, @subWt, @g_ret); 899 900 my $last_is_variable; 901 for my $vwt (map $self->getWt($_), @$subE) { 902 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); 903 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev); 904 905 if ($v2i && $ver9) { 906 if ($var) { 907 $last_is_variable = TRUE; 908 } 909 elsif (!$wt[0]) { # ignorable 910 $to_be_pushed = FALSE if $last_is_variable; 911 } 912 else { 913 $last_is_variable = FALSE; 914 } 915 } 916 917 if (@subWt && !$var && !$wt[0]) { 918 push @{ $subWt[-1] }, \@wt if $to_be_pushed; 919 } else { 920 push @subWt, [ \@wt ]; 921 } 922 } 923 924 my $count = 0; 925 my $end = @$strE - 1; 926 927 $last_is_variable = FALSE; # reuse 928 for (my $i = 0; $i <= $end; ) { # no $i++ 929 my $found_base = 0; 930 931 # fetch a grapheme 932 while ($i <= $end && $found_base == 0) { 933 for my $vwt ($self->getWt($strE->[$i][0])) { 934 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); 935 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev); 936 937 if ($v2i && $ver9) { 938 if ($var) { 939 $last_is_variable = TRUE; 940 } 941 elsif (!$wt[0]) { # ignorable 942 $to_be_pushed = FALSE if $last_is_variable; 943 } 944 else { 945 $last_is_variable = FALSE; 946 } 947 } 948 949 if (@strWt && !$var && !$wt[0]) { 950 push @{ $strWt[-1] }, \@wt if $to_be_pushed; 951 $finPos[-1] = $strE->[$i][2]; 952 } elsif ($to_be_pushed) { 953 push @strWt, [ \@wt ]; 954 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1]; 955 $finPos[-1] = NOMATCHPOS if $found_base; 956 push @finPos, $strE->[$i][2]; 957 $found_base++; 958 } 959 # else ===> no-op 960 } 961 $i++; 962 } 963 964 # try to match 965 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) { 966 if ($iniPos[0] != NOMATCHPOS && 967 $finPos[$#subWt] != NOMATCHPOS && 968 _eqArray(\@strWt, \@subWt, $lev)) { 969 my $temp = $iniPos[0] + $pos; 970 971 if ($grob) { 972 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]]; 973 splice @strWt, 0, $#subWt; 974 splice @iniPos, 0, $#subWt; 975 splice @finPos, 0, $#subWt; 976 } 977 else { 978 return wantarray 979 ? ($temp, $finPos[$#subWt] - $iniPos[0]) 980 : $temp; 981 } 982 } 983 shift @strWt; 984 shift @iniPos; 985 shift @finPos; 986 } 987 } 988 989 return $grob 990 ? @g_ret 991 : wantarray ? () : NOMATCHPOS; 992} 993 994## 995## scalarref to matching part = match(string, substring) 996## 997sub match 998{ 999 my $self = shift; 1000 if (my($pos,$len) = $self->index($_[0], $_[1])) { 1001 my $temp = substr($_[0], $pos, $len); 1002 return wantarray ? $temp : \$temp; 1003 # An lvalue ref \substr should be avoided, 1004 # since its value is affected by modification of its referent. 1005 } 1006 else { 1007 return; 1008 } 1009} 1010 1011## 1012## arrayref matching parts = gmatch(string, substring) 1013## 1014sub gmatch 1015{ 1016 my $self = shift; 1017 my $str = shift; 1018 my $sub = shift; 1019 return map substr($str, $_->[0], $_->[1]), 1020 $self->index($str, $sub, 0, 'g'); 1021} 1022 1023## 1024## bool subst'ed = subst(string, substring, replace) 1025## 1026sub subst 1027{ 1028 my $self = shift; 1029 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE; 1030 1031 if (my($pos,$len) = $self->index($_[0], $_[1])) { 1032 if ($code) { 1033 my $mat = substr($_[0], $pos, $len); 1034 substr($_[0], $pos, $len, $code->($mat)); 1035 } else { 1036 substr($_[0], $pos, $len, $_[2]); 1037 } 1038 return TRUE; 1039 } 1040 else { 1041 return FALSE; 1042 } 1043} 1044 1045## 1046## int count = gsubst(string, substring, replace) 1047## 1048sub gsubst 1049{ 1050 my $self = shift; 1051 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE; 1052 my $cnt = 0; 1053 1054 # Replacement is carried out from the end, then use reverse. 1055 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) { 1056 if ($code) { 1057 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]); 1058 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat)); 1059 } else { 1060 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]); 1061 } 1062 $cnt++; 1063 } 1064 return $cnt; 1065} 1066 10671; 1068__END__ 1069 1070=head1 NAME 1071 1072Unicode::Collate - Unicode Collation Algorithm 1073 1074=head1 SYNOPSIS 1075 1076 use Unicode::Collate; 1077 1078 #construct 1079 $Collator = Unicode::Collate->new(%tailoring); 1080 1081 #sort 1082 @sorted = $Collator->sort(@not_sorted); 1083 1084 #compare 1085 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1. 1086 1087 # If %tailoring is false (i.e. empty), 1088 # $Collator should do the default collation. 1089 1090=head1 DESCRIPTION 1091 1092This module is an implementation 1093of Unicode Technical Standard #10 (UTS #10) 1094"Unicode Collation Algorithm." 1095 1096=head2 Constructor and Tailoring 1097 1098The C<new> method returns a collator object. 1099 1100 $Collator = Unicode::Collate->new( 1101 UCA_Version => $UCA_Version, 1102 alternate => $alternate, # deprecated: use of 'variable' is recommended. 1103 backwards => $levelNumber, # or \@levelNumbers 1104 entry => $element, 1105 hangul_terminator => $term_primary_weight, 1106 ignoreName => qr/$ignoreName/, 1107 ignoreChar => qr/$ignoreChar/, 1108 katakana_before_hiragana => $bool, 1109 level => $collationLevel, 1110 normalization => $normalization_form, 1111 overrideCJK => \&overrideCJK, 1112 overrideHangul => \&overrideHangul, 1113 preprocess => \&preprocess, 1114 rearrange => \@charList, 1115 table => $filename, 1116 undefName => qr/$undefName/, 1117 undefChar => qr/$undefChar/, 1118 upper_before_lower => $bool, 1119 variable => $variable, 1120 ); 1121 1122=over 4 1123 1124=item UCA_Version 1125 1126If the tracking version number of the older UCA is given, 1127the older behavior of that tracking version is emulated on collating. 1128If omitted, the return value of C<UCA_Version()> is used. 1129 1130The supported tracking version: 8, 9, or 11. 1131 1132B<This parameter may be removed in the future version, 1133as switching the algorithm would affect the performance.> 1134 1135=item backwards 1136 1137-- see 3.1.2 French Accents, UTS #10. 1138 1139 backwards => $levelNumber or \@levelNumbers 1140 1141Weights in reverse order; ex. level 2 (diacritic ordering) in French. 1142If omitted, forwards at all the levels. 1143 1144=item entry 1145 1146-- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10. 1147 1148If the same character (or a sequence of characters) exists 1149in the collation element table through C<table>, 1150mapping to collation elements is overrided. 1151If it does not exist, the mapping is defined additionally. 1152 1153 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt) 11540063 0068 ; [.0E6A.0020.0002.0063] # ch 11550043 0068 ; [.0E6A.0020.0007.0043] # Ch 11560043 0048 ; [.0E6A.0020.0008.0043] # CH 1157006C 006C ; [.0F4C.0020.0002.006C] # ll 1158004C 006C ; [.0F4C.0020.0007.004C] # Ll 1159004C 004C ; [.0F4C.0020.0008.004C] # LL 1160006E 0303 ; [.0F7B.0020.0002.006E] # n-tilde 1161004E 0303 ; [.0F7B.0020.0008.004E] # N-tilde 1162ENTRY 1163 1164 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt) 116500E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e> 116600C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E> 1167ENTRY 1168 1169B<NOTE:> The code point in the UCA file format (before C<';'>) 1170B<must> be a Unicode code point (defined as hexadecimal), 1171but not a native code point. 1172So C<0063> must always denote C<U+0063>, 1173but not a character of C<"\x63">. 1174 1175Weighting may vary depending on collation element table. 1176So ensure the weights defined in C<entry> will be consistent with 1177those in the collation element table loaded via C<table>. 1178 1179In DUCET v4.0.0, primary weight of C<C> is C<0E60> 1180and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A> 1181(as a value between C<0E60> and C<0E6D>) 1182makes ordering as C<C E<lt> CH E<lt> D>. 1183Exactly speaking DUCET already has some characters between C<C> and C<D>: 1184C<small capital C> (C<U+1D04>) with primary weight C<0E64>, 1185C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>, 1186and C<c-curl> (C<U+0255>) with C<0E69>. 1187Then primary weight C<0E6A> for C<CH> makes C<CH> 1188ordered between C<c-curl> and C<D>. 1189 1190=item hangul_terminator 1191 1192-- see Condition B.2. in 7.1.4 Trailing Weights, UTS #10. 1193 1194If a true value is given (non-zero but should be positive), 1195it will be added as a terminator primary weight to the end of 1196every standard Hangul syllable. Secondary and any higher weights 1197for terminator are set to zero. 1198If the value is false or C<hangul_terminator> key does not exist, 1199insertion of terminator weights will not be performed. 1200 1201Boundaries of Hangul syllables are determined 1202according to conjoining Jamo behavior in F<the Unicode Standard> 1203and F<HangulSyllableType.txt>. 1204 1205B<Implementation Note:> 1206(1) For expansion mapping (Unicode character mapped 1207to a sequence of collation elements), a terminator will not be added 1208between collation elements, even if Hangul syllable boundary exists there. 1209Addition of terminator is restricted to the next position 1210to the last collation element. 1211 1212(2) Non-conjoining Hangul letters 1213(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not 1214automatically terminated with a terminator primary weight. 1215These characters may need terminator included in a collation element 1216table beforehand. 1217 1218=item ignoreName 1219 1220=item ignoreChar 1221 1222-- see Completely Ignorable, 3.2.2 Variable Weighting, UTS #10. 1223 1224Makes the entry in the table completely ignorable; 1225i.e. as if the weights were zero at all level. 1226 1227E.g. when 'a' and 'e' are ignorable, 1228'element' is equal to 'lament' (or 'lmnt'). 1229 1230=item level 1231 1232-- see 4.3 Form a sort key for each string, UTS #10. 1233 1234Set the maximum level. 1235Any higher levels than the specified one are ignored. 1236 1237 Level 1: alphabetic ordering 1238 Level 2: diacritic ordering 1239 Level 3: case ordering 1240 Level 4: tie-breaking (e.g. in the case when variable is 'shifted') 1241 1242 ex.level => 2, 1243 1244If omitted, the maximum is the 4th. 1245 1246=item normalization 1247 1248-- see 4.1 Normalize each input string, UTS #10. 1249 1250If specified, strings are normalized before preparation of sort keys 1251(the normalization is executed after preprocess). 1252 1253A form name C<Unicode::Normalize::normalize()> accepts will be applied 1254as C<$normalization_form>. 1255Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>. 1256See C<Unicode::Normalize::normalize()> for detail. 1257If omitted, C<'NFD'> is used. 1258 1259C<normalization> is performed after C<preprocess> (if defined). 1260 1261Furthermore, special values, C<undef> and C<"prenormalized">, can be used, 1262though they are not concerned with C<Unicode::Normalize::normalize()>. 1263 1264If C<undef> (not a string C<"undef">) is passed explicitly 1265as the value for this key, 1266any normalization is not carried out (this may make tailoring easier 1267if any normalization is not desired). Under C<(normalization =E<gt> undef)>, 1268only contiguous contractions are resolved; 1269e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>, 1270C<A-cedilla-ring> would be primary equal to C<A>. 1271In this point, 1272C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })> 1273B<is not> equivalent to C<(normalization =E<gt> 'NFD')>. 1274 1275In the case of C<(normalization =E<gt> "prenormalized")>, 1276any normalization is not performed, but 1277non-contiguous contractions with combining characters are performed. 1278Therefore 1279C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })> 1280B<is> equivalent to C<(normalization =E<gt> 'NFD')>. 1281If source strings are finely prenormalized, 1282C<(normalization =E<gt> 'prenormalized')> may save time for normalization. 1283 1284Except C<(normalization =E<gt> undef)>, 1285B<Unicode::Normalize> is required (see also B<CAVEAT>). 1286 1287=item overrideCJK 1288 1289-- see 7.1 Derived Collation Elements, UTS #10. 1290 1291By default, CJK Unified Ideographs are ordered in Unicode codepoint order 1292(but C<CJK Unified Ideographs> [C<U+4E00> to C<U+9FA5>] are lesser than 1293C<CJK Unified Ideographs Extension> [C<U+3400> to C<U+4DB5> and 1294C<U+20000> to C<U+2A6D6>]. 1295 1296Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided. 1297 1298ex. CJK Unified Ideographs in the JIS code point order. 1299 1300 overrideCJK => sub { 1301 my $u = shift; # get a Unicode codepoint 1302 my $b = pack('n', $u); # to UTF-16BE 1303 my $s = your_unicode_to_sjis_converter($b); # convert 1304 my $n = unpack('n', $s); # convert sjis to short 1305 [ $n, 0x20, 0x2, $u ]; # return the collation element 1306 }, 1307 1308ex. ignores all CJK Unified Ideographs. 1309 1310 overrideCJK => sub {()}, # CODEREF returning empty list 1311 1312 # where ->eq("Pe\x{4E00}rl", "Perl") is true 1313 # as U+4E00 is a CJK Unified Ideograph and to be ignorable. 1314 1315If C<undef> is passed explicitly as the value for this key, 1316weights for CJK Unified Ideographs are treated as undefined. 1317But assignment of weight for CJK Unified Ideographs 1318in table or C<entry> is still valid. 1319 1320=item overrideHangul 1321 1322-- see 7.1 Derived Collation Elements, UTS #10. 1323 1324By default, Hangul Syllables are decomposed into Hangul Jamo, 1325even if C<(normalization =E<gt> undef)>. 1326But the mapping of Hangul Syllables may be overrided. 1327 1328This tag works like C<overrideCJK>, so see there for examples. 1329 1330If you want to override the mapping of Hangul Syllables, 1331NFD, NFKD, and FCD are not appropriate, 1332since they will decompose Hangul Syllables before overriding. 1333 1334If C<undef> is passed explicitly as the value for this key, 1335weight for Hangul Syllables is treated as undefined 1336without decomposition into Hangul Jamo. 1337But definition of weight for Hangul Syllables 1338in table or C<entry> is still valid. 1339 1340=item preprocess 1341 1342-- see 5.1 Preprocessing, UTS #10. 1343 1344If specified, the coderef is used to preprocess 1345before the formation of sort keys. 1346 1347ex. dropping English articles, such as "a" or "the". 1348Then, "the pen" is before "a pencil". 1349 1350 preprocess => sub { 1351 my $str = shift; 1352 $str =~ s/\b(?:an?|the)\s+//gi; 1353 return $str; 1354 }, 1355 1356C<preprocess> is performed before C<normalization> (if defined). 1357 1358=item rearrange 1359 1360-- see 3.1.3 Rearrangement, UTS #10. 1361 1362Characters that are not coded in logical order and to be rearranged. 1363By default, 1364 1365 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ], 1366 1367If you want to disallow any rearrangement, 1368pass C<undef> or C<[]> (a reference to an empty list) 1369as the value for this key. 1370 1371B<According to the version 9 of UCA, this parameter shall not be used; 1372but it is not warned at present.> 1373 1374=item table 1375 1376-- see 3.2 Default Unicode Collation Element Table, UTS #10. 1377 1378You can use another collation element table if desired. 1379The table file must be put into a directory 1380where F<Unicode/Collate.pm> is installed; e.g. into 1381F<perl/lib/Unicode/Collate/> if you have F<perl/lib/Unicode/Collate.pm>. 1382 1383By default, the filename F<allkeys.txt> is used. 1384 1385If C<undef> is passed explicitly as the value for this key, 1386no file is read (but you can define collation elements via C<entry>). 1387 1388A typical way to define a collation element table 1389without any file of table: 1390 1391 $onlyABC = Unicode::Collate->new( 1392 table => undef, 1393 entry => << 'ENTRIES', 13940061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A 13950041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A 13960062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B 13970042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B 13980063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C 13990043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C 1400ENTRIES 1401 ); 1402 1403=item undefName 1404 1405=item undefChar 1406 1407-- see 6.3.4 Reducing the Repertoire, UTS #10. 1408 1409Undefines the collation element as if it were unassigned in the table. 1410This reduces the size of the table. 1411If an unassigned character appears in the string to be collated, 1412the sort key is made from its codepoint 1413as a single-character collation element, 1414as it is greater than any other assigned collation elements 1415(in the codepoint order among the unassigned characters). 1416But, it'd be better to ignore characters 1417unfamiliar to you and maybe never used. 1418 1419=item katakana_before_hiragana 1420 1421=item upper_before_lower 1422 1423-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTS #10. 1424 1425By default, lowercase is before uppercase 1426and hiragana is before katakana. 1427 1428If the tag is made true, this is reversed. 1429 1430B<NOTE>: These tags simplemindedly assume 1431any lowercase/uppercase or hiragana/katakana distinctions 1432must occur in level 3, and their weights at level 3 1433must be same as those mentioned in 7.3.1, UTS #10. 1434If you define your collation elements which violate this requirement, 1435these tags don't work validly. 1436 1437=item variable 1438 1439=item alternate 1440 1441-- see 3.2.2 Variable Weighting, UTS #10. 1442 1443(the title in UCA version 8: Alternate Weighting) 1444 1445This key allows to variable weighting for variable collation elements, 1446which are marked with an ASTERISK in the table 1447(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>). 1448 1449 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'. 1450 1451These names are case-insensitive. 1452By default (if specification is omitted), 'shifted' is adopted. 1453 1454 'Blanked' Variable elements are made ignorable at levels 1 through 3; 1455 considered at the 4th level. 1456 1457 'Non-Ignorable' Variable elements are not reset to ignorable. 1458 1459 'Shifted' Variable elements are made ignorable at levels 1 through 3 1460 their level 4 weight is replaced by the old level 1 weight. 1461 Level 4 weight for Non-Variable elements is 0xFFFF. 1462 1463 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level 1464 are trimmed. 1465 1466For backward compatibility, C<alternate> can be used as an alias 1467for C<variable>. 1468 1469=back 1470 1471=head2 Methods for Collation 1472 1473=over 4 1474 1475=item C<@sorted = $Collator-E<gt>sort(@not_sorted)> 1476 1477Sorts a list of strings. 1478 1479=item C<$result = $Collator-E<gt>cmp($a, $b)> 1480 1481Returns 1 (when C<$a> is greater than C<$b>) 1482or 0 (when C<$a> is equal to C<$b>) 1483or -1 (when C<$a> is lesser than C<$b>). 1484 1485=item C<$result = $Collator-E<gt>eq($a, $b)> 1486 1487=item C<$result = $Collator-E<gt>ne($a, $b)> 1488 1489=item C<$result = $Collator-E<gt>lt($a, $b)> 1490 1491=item C<$result = $Collator-E<gt>le($a, $b)> 1492 1493=item C<$result = $Collator-E<gt>gt($a, $b)> 1494 1495=item C<$result = $Collator-E<gt>ge($a, $b)> 1496 1497They works like the same name operators as theirs. 1498 1499 eq : whether $a is equal to $b. 1500 ne : whether $a is not equal to $b. 1501 lt : whether $a is lesser than $b. 1502 le : whether $a is lesser than $b or equal to $b. 1503 gt : whether $a is greater than $b. 1504 ge : whether $a is greater than $b or equal to $b. 1505 1506=item C<$sortKey = $Collator-E<gt>getSortKey($string)> 1507 1508-- see 4.3 Form a sort key for each string, UTS #10. 1509 1510Returns a sort key. 1511 1512You compare the sort keys using a binary comparison 1513and get the result of the comparison of the strings using UCA. 1514 1515 $Collator->getSortKey($a) cmp $Collator->getSortKey($b) 1516 1517 is equivalent to 1518 1519 $Collator->cmp($a, $b) 1520 1521=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)> 1522 1523 use Unicode::Collate; 1524 my $c = Unicode::Collate->new(); 1525 print $c->viewSortKey("Perl"),"\n"; 1526 1527 # output: 1528 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF] 1529 # Level 1 Level 2 Level 3 Level 4 1530 1531 (If C<UCA_Version> is 8, the output is slightly different.) 1532 1533=back 1534 1535=head2 Methods for Searching 1536 1537B<DISCLAIMER:> If C<preprocess> or C<normalization> tag is true 1538for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>, 1539C<subst>, C<gsubst>) is croaked, 1540as the position and the length might differ 1541from those on the specified string. 1542(And C<rearrange> and C<hangul_terminator> tags are neglected.) 1543 1544The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work 1545like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively, 1546but they are not aware of any pattern, but only a literal substring. 1547 1548=over 4 1549 1550=item C<$position = $Collator-E<gt>index($string, $substring[, $position])> 1551 1552=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])> 1553 1554If C<$substring> matches a part of C<$string>, returns 1555the position of the first occurrence of the matching part in scalar context; 1556in list context, returns a two-element list of 1557the position and the length of the matching part. 1558 1559If C<$substring> does not match any part of C<$string>, 1560returns C<-1> in scalar context and 1561an empty list in list context. 1562 1563e.g. you say 1564 1565 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 ); 1566 # (normalization => undef) is REQUIRED. 1567 my $str = "Ich mu� studieren Perl."; 1568 my $sub = "M�SS"; 1569 my $match; 1570 if (my($pos,$len) = $Collator->index($str, $sub)) { 1571 $match = substr($str, $pos, $len); 1572 } 1573 1574and get C<"mu�"> in C<$match> since C<"mu�"> 1575is primary equal to C<"M�SS">. 1576 1577=item C<$match_ref = $Collator-E<gt>match($string, $substring)> 1578 1579=item C<($match) = $Collator-E<gt>match($string, $substring)> 1580 1581If C<$substring> matches a part of C<$string>, in scalar context, returns 1582B<a reference to> the first occurrence of the matching part 1583(C<$match_ref> is always true if matches, 1584since every reference is B<true>); 1585in list context, returns the first occurrence of the matching part. 1586 1587If C<$substring> does not match any part of C<$string>, 1588returns C<undef> in scalar context and 1589an empty list in list context. 1590 1591e.g. 1592 1593 if ($match_ref = $Collator->match($str, $sub)) { # scalar context 1594 print "matches [$$match_ref].\n"; 1595 } else { 1596 print "doesn't match.\n"; 1597 } 1598 1599 or 1600 1601 if (($match) = $Collator->match($str, $sub)) { # list context 1602 print "matches [$match].\n"; 1603 } else { 1604 print "doesn't match.\n"; 1605 } 1606 1607=item C<@match = $Collator-E<gt>gmatch($string, $substring)> 1608 1609If C<$substring> matches a part of C<$string>, returns 1610all the matching parts (or matching count in scalar context). 1611 1612If C<$substring> does not match any part of C<$string>, 1613returns an empty list. 1614 1615=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)> 1616 1617If C<$substring> matches a part of C<$string>, 1618the first occurrence of the matching part is replaced by C<$replacement> 1619(C<$string> is modified) and return C<$count> (always equals to C<1>). 1620 1621C<$replacement> can be a C<CODEREF>, 1622taking the matching part as an argument, 1623and returning a string to replace the matching part 1624(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>). 1625 1626=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)> 1627 1628If C<$substring> matches a part of C<$string>, 1629all the occurrences of the matching part is replaced by C<$replacement> 1630(C<$string> is modified) and return C<$count>. 1631 1632C<$replacement> can be a C<CODEREF>, 1633taking the matching part as an argument, 1634and returning a string to replace the matching part 1635(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>). 1636 1637e.g. 1638 1639 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 ); 1640 # (normalization => undef) is REQUIRED. 1641 my $str = "Camel ass came\x{301}l CAMEL horse cAm\0E\0L..."; 1642 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" }); 1643 1644 # now $str is "<b>Camel</b> ass <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>..."; 1645 # i.e., all the camels are made bold-faced. 1646 1647=back 1648 1649=head2 Other Methods 1650 1651=over 4 1652 1653=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)> 1654 1655Change the value of specified keys and returns the changed part. 1656 1657 $Collator = Unicode::Collate->new(level => 4); 1658 1659 $Collator->eq("perl", "PERL"); # false 1660 1661 %old = $Collator->change(level => 2); # returns (level => 4). 1662 1663 $Collator->eq("perl", "PERL"); # true 1664 1665 $Collator->change(%old); # returns (level => 2). 1666 1667 $Collator->eq("perl", "PERL"); # false 1668 1669Not all C<(key,value)>s are allowed to be changed. 1670See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>. 1671 1672In the scalar context, returns the modified collator 1673(but it is B<not> a clone from the original). 1674 1675 $Collator->change(level => 2)->eq("perl", "PERL"); # true 1676 1677 $Collator->eq("perl", "PERL"); # true; now max level is 2nd. 1678 1679 $Collator->change(level => 4)->eq("perl", "PERL"); # false 1680 1681=item C<$version = $Collator-E<gt>version()> 1682 1683Returns the version number (a string) of the Unicode Standard 1684which the C<table> file used by the collator object is based on. 1685If the table does not include a version line (starting with C<@version>), 1686returns C<"unknown">. 1687 1688=item C<UCA_Version()> 1689 1690Returns the tracking version number of UTS #10 this module consults. 1691 1692=item C<Base_Unicode_Version()> 1693 1694Returns the version number of UTS #10 this module consults. 1695 1696=back 1697 1698=head2 EXPORT 1699 1700None by default. 1701 1702=head2 CAVEAT 1703 1704Use of the C<normalization> parameter requires 1705the B<Unicode::Normalize> module. 1706 1707If you need not it (say, in the case when you need not 1708handle any combining characters), 1709assign C<normalization =E<gt> undef> explicitly. 1710 1711-- see 6.5 Avoiding Normalization, UTS #10. 1712 1713=head2 Conformance Test 1714 1715The Conformance Test for the UCA is available 1716under L<http://www.unicode.org/Public/UCA/>. 1717 1718For F<CollationTest_SHIFTED.txt>, 1719a collator via C<Unicode::Collate-E<gt>new( )> should be used; 1720for F<CollationTest_NON_IGNORABLE.txt>, a collator via 1721C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>. 1722 1723B<Unicode::Normalize is required to try The Conformance Test.> 1724 1725=head1 AUTHOR 1726 1727SADAHIRO Tomoyuki <SADAHIRO@cpan.org> 1728 1729 http://homepage1.nifty.com/nomenclator/perl/ 1730 1731 Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved. 1732 1733 This library is free software; you can redistribute it 1734 and/or modify it under the same terms as Perl itself. 1735 1736=head1 SEE ALSO 1737 1738=over 4 1739 1740=item Unicode Collation Algorithm - UTS #10 1741 1742L<http://www.unicode.org/reports/tr10/> 1743 1744=item The Default Unicode Collation Element Table (DUCET) 1745 1746L<http://www.unicode.org/Public/UCA/latest/allkeys.txt> 1747 1748=item The conformance test for the UCA 1749 1750L<http://www.unicode.org/Public/UCA/latest/CollationTest.html> 1751 1752L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip> 1753 1754=item Hangul Syllable Type 1755 1756L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt> 1757 1758=item Unicode Normalization Forms - UAX #15 1759 1760L<http://www.unicode.org/reports/tr15/> 1761 1762=item L<Unicode::Normalize> 1763 1764=back 1765 1766=cut 1767