package Unicode::Collate;

# Refuse to load on a perl whose pack/unpack 'U' cannot convert between
# the character "A" and its Unicode code point 0x41.
BEGIN {
    die "Unicode::Collate cannot stringify a Unicode code point\n"
        unless "A" eq pack('U', 0x41);
    die "Unicode::Collate cannot get a Unicode code point\n"
        unless 0x41 == unpack('U', 'A');
}

use 5.006;
use strict;
use warnings;
use Carp;
use File::Spec;

no warnings 'utf8';

our $VERSION = '1.04';
our $PACKAGE = __PACKAGE__;

### begin XS only ###
require DynaLoader;
our @ISA = qw(DynaLoader);
bootstrap Unicode::Collate $VERSION;
### end XS only ###

# Directory components (below each @INC entry) and the default file name
# used by read_table() to locate the collation element table.
my @Path = qw(Unicode Collate);
my $KeyFile = "allkeys.txt";

# Perl's boolean
use constant TRUE  => 1;
use constant FALSE => "";

# Returned by index() in scalar context when nothing matches.
use constant NOMATCHPOS => -1;

# A coderef to get combining class imported from Unicode::Normalize
# (i.e. \&Unicode::Normalize::getCombinClass).
# This is also used as a HAS_UNICODE_NORMALIZE flag.
my $CVgetCombinClass;

# Supported Levels
use constant MinLevel => 1;
use constant MaxLevel => 4;

# Minimum weights at level 2 and 3, respectively
use constant Min2Wt => 0x20;
use constant Min3Wt => 0x02;

# Shifted weight at 4th level
use constant Shift4Wt => 0xFFFF;

# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
use constant VCE_TEMPLATE => 'Cn4';

# A sort key: 16-bit weights
use constant KEY_TEMPLATE => 'n*';

# The tie-breaking: 32-bit weights
use constant TIE_TEMPLATE => 'N*';

# Level separator in a sort key:
# i.e. pack(KEY_TEMPLATE, 0)
use constant LEVEL_SEP => "\0\0";

# As Unicode code point separator for hash keys.
# A joined code point string (denoted by JCPS below)
# like "65;768" is used for internal processing
# instead of Perl's Unicode string like "\x41\x{300}",
# as the native code point is different from the Unicode code point
# on EBCDIC platform.
# This character must not be included in any stringified
# representation of an integer.
use constant CODE_SEP => ';';
    # NOTE: in regex /;/ is used for $jcps!

# boolean values of variable weights
use constant NON_VAR => 0; # Non-Variable character
use constant VAR     => 1; # Variable character

# specific code points
use constant Hangul_SIni => 0xAC00;
use constant Hangul_SFin => 0xD7A3;

# Logical_Order_Exception in PropList.txt
my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];

# for highestFFFF and minimalFFFE
my $HighestVCE = pack(VCE_TEMPLATE, 0, 0xFFFE, 0x20, 0x5, 0xFFFF);
my $minimalVCE = pack(VCE_TEMPLATE, 0,      1, 0x20, 0x5, 0xFFFE);

# Default UCA revision; new() falls back to it when UCA_Version is not given.
sub UCA_Version { "28" }

sub Base_Unicode_Version { "6.3.0" }

######

# Packs a list of Unicode code points into a string.
sub pack_U {
    return pack('U*', @_);
}

### begin XS only ###
# unpack_U: string -> list of code points.  unpackUfor56 is chosen only
# on perls older than 5.8 where it exists and yields 0x41 for 'A';
# otherwise plain unpack('U*') is used.
*unpack_U = exists &Unicode::Collate::bootstrap &&
        $] < 5.008 && \&unpackUfor56 && 0x41 == unpackUfor56('A')
    ? \&unpackUfor56 : sub { return unpack('U*', shift(@_).pack('U*')) };
### end XS only ###

######

# Accepted (lowercased) values of the 'variable' parameter;
# only the existence of a key is tested.
my %VariableOK = map { $_ => undef } qw/
    blanked  non-ignorable  shifted  shift-trimmed
/; # keys lowercased

# Parameters that change() is allowed to modify.
our @ChangeOK = qw/
    alternate backwards level normalization rearrange
    katakana_before_hiragana upper_before_lower ignore_level2
    overrideCJK overrideHangul overrideOut preprocess UCA_Version
    hangul_terminator variable identical highestFFFF minimalFFFE
  /;

# Parameters and internal keys that change() refuses to modify.
our @ChangeNG = qw/
    entry mapping table maxlength contraction
    ignoreChar ignoreName undefChar undefName rewrite
    versionTable alternateTable backwardsTable forwardsTable
    rearrangeTable variableTable
    derivCode normCode rearrangeHash backwardsFlag
    suppress suppressHash
    __useXS /; ### XS only
# The hash key 'ignored' was deleted at v 0.21.
# The hash key 'isShift' was deleted at v 0.23.
# The hash key 'combining' was deleted at v 0.24.
# The hash key 'entries' was deleted at v 0.30.
# The hash key 'L3_ignorable' was deleted at v 0.40.

##
## string version = version()
## returns the "@version" value read from the table, or 'unknown'.
##
sub version {
    my $self = shift;
    return $self->{versionTable} || 'unknown';
}

# Lookup sets built from @ChangeOK / @ChangeNG; only key existence is used.
my (%ChangeOK, %ChangeNG);
@ChangeOK{ @ChangeOK } = ();
@ChangeNG{ @ChangeNG } = ();

##
## %old = change(%new)   (list context)
## $self = change(%new)  (otherwise)
##
## Modifies the parameters listed in @ChangeOK, croaks for those listed
## in @ChangeNG, silently ignores any other key, and then re-validates
## the collator with checkCollator().
##
sub change {
    my $self = shift;
    my %hash = @_;
    my %old;
    # 'alternate' is an alias of 'variable'; 'variable' wins if both given.
    if (exists $hash{alternate}) {
        if (exists $hash{variable}) {
            delete $hash{alternate};
        } else {
            $hash{variable} = $hash{alternate};
        }
    }
    foreach my $k (keys %hash) {
        if (exists $ChangeOK{$k}) {
            $old{$k} = $self->{$k};
            $self->{$k} = $hash{$k};
        } elsif (exists $ChangeNG{$k}) {
            croak "change of $k via change() is not allowed!";
        }
        # else => ignored
    }
    $self->checkCollator();
    return wantarray ? %old : $self;
}

##
## _checkLevel(int level, string key)
## croaks unless MinLevel <= level <= MaxLevel.
##
sub _checkLevel {
    my $level = shift;
    my $key   = shift; # 'level' or 'backwards'
    MinLevel <= $level or croak sprintf
        "Illegal level %d (in value for key '%s') lower than %d.",
            $level, $key, MinLevel;
    $level <= MaxLevel or croak sprintf
        "Unsupported level %d (in value for key '%s') higher than %d.",
            $level, $key, MaxLevel;
}

# UCA_Version => coderef deriving collation elements for code points
# that have no entry in the table.
my %DerivCode = (
    8 => \&_derivCE_8,
    9 => \&_derivCE_9,
   11 => \&_derivCE_9, # 11 == 9
   14 => \&_derivCE_14,
   16 => \&_derivCE_14, # 16 == 14
   18 => \&_derivCE_18,
   20 => \&_derivCE_20,
   22 => \&_derivCE_22,
   24 => \&_derivCE_24,
   26 => \&_derivCE_24, # 26 == 24
   28 => \&_derivCE_24, # 28 == 24
);

##
## checkCollator()
## Validates the parameters stored in $self and (re)computes the derived
## keys: derivCode, variable/alternate, backwardsFlag, rearrangeHash
## and normCode.  Croaks on an illegal value.
##
sub checkCollator {
    my $self = shift;
    _checkLevel($self->{level}, "level");

    $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
        or croak "Illegal UCA version (passed $self->{UCA_Version}).";

    $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
                                $self->{alternateTable} || 'shifted';
    $self->{variable} = $self->{alternate} = lc($self->{variable});
    exists $VariableOK{ $self->{variable} }
        or croak "$PACKAGE unknown variable parameter name: $self->{variable}";

    # backwardsFlag is a bit mask: bit N is set when level N is backwards.
    if (! defined $self->{backwards}) {
        $self->{backwardsFlag} = 0;
    } elsif (! ref $self->{backwards}) {
        _checkLevel($self->{backwards}, "backwards");
        $self->{backwardsFlag} = 1 << $self->{backwards};
    } else {
        my %level;
        $self->{backwardsFlag} = 0;
        for my $b (@{ $self->{backwards} }) {
            _checkLevel($b, "backwards");
            $level{$b} = 1; # a level given twice is counted once
        }
        for my $v (sort keys %level) {
            $self->{backwardsFlag} += 1 << $v;
        }
    }

    defined $self->{rearrange} or $self->{rearrange} = [];
    ref $self->{rearrange}
        or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";

    # keys of $self->{rearrangeHash} are $self->{rearrange}.
    $self->{rearrangeHash} = undef;

    if (@{ $self->{rearrange} }) {
        @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
    }

    $self->{normCode} = undef;

    if (defined $self->{normalization}) {
        eval { require Unicode::Normalize };
        $@ and croak "Unicode::Normalize is required to normalize strings";

        $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;

        if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
            $self->{normCode} = \&Unicode::Normalize::NFD;
        }
        elsif ($self->{normalization} ne 'prenormalized') {
            my $norm = $self->{normalization};
            $self->{normCode} = sub {
                Unicode::Normalize::normalize($norm, shift);
            };
            eval { $self->{normCode}->("") }; # try
            $@ and croak "$PACKAGE unknown normalization form name: $norm";
        }
    }
    return;
}

##
## collator = new(%tailoring)
## Builds a collator object from key/value parameters, reads the table
## (unless table => undef was passed), applies 'entry' tailoring, fills in
## defaults and validates everything with checkCollator().
##
sub new
{
    my $class = shift;
    my $self = bless { @_ }, $class;

### begin XS only ###
    # The XS fast path is used only when neither a table nor any option
    # that needs per-line parsing was given.
    if (! exists $self->{table}     && !defined $self->{rewrite} &&
        !defined $self->{undefName} && !defined $self->{ignoreName} &&
        !defined $self->{undefChar} && !defined $self->{ignoreChar}) {
        $self->{__useXS} = \&_fetch_simple;
    } else {
        $self->{__useXS} = undef;
    }
### end XS only ###

    # keys of $self->{suppressHash} are $self->{suppress}.
    if ($self->{suppress} && @{ $self->{suppress} }) {
        @{ $self->{suppressHash} }{ @{ $self->{suppress} } } = ();
    } # before read_table()

    # If undef is passed explicitly, no file is read.
    $self->{table} = $KeyFile if ! exists $self->{table};
    $self->read_table() if defined $self->{table};

    if ($self->{entry}) {
        while ($self->{entry} =~ /([^\n]+)/g) {
            $self->parseEntry($1, TRUE);
        }
    }

    $self->{level} ||= MaxLevel;
    $self->{UCA_Version} ||= UCA_Version();

    $self->{overrideHangul} = FALSE
        if ! exists $self->{overrideHangul};
    $self->{overrideCJK} = FALSE
        if ! exists $self->{overrideCJK};
    $self->{normalization} = 'NFD'
        if ! exists $self->{normalization};
    $self->{rearrange} = $self->{rearrangeTable} ||
        ($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
        if ! exists $self->{rearrange};
    $self->{backwards} = $self->{backwardsTable}
        if ! exists $self->{backwards};

    $self->checkCollator();

    return $self;
}

##
## parseAtmark(string line)
## Handles one "@..." directive of a table; the leading '@' has already
## been removed by the caller.  Unknown directives are ignored.
##
sub parseAtmark {
    my $self = shift;
    my $line = shift; # after s/^\s*\@//

    if ($line =~ /^version\s*(\S*)/) {
        $self->{versionTable} ||= $1;
    }
    elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
        $self->{variableTable} ||= $1;
    }
    elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
        $self->{alternateTable} ||= $1;
    }
    elsif ($line =~ /^backwards\s+(\S*)/) {
        push @{ $self->{backwardsTable} }, $1;
    }
    elsif ($line =~ /^forwards\s+(\S*)/) { # perhaps no use
        push @{ $self->{forwardsTable} }, $1;
    }
    elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
        push @{ $self->{rearrangeTable} }, _getHexArray($1);
    }
}

##
## read_table()
## Feeds every line of the collation element table to parseAtmark() or
## parseEntry().  With the XS fast path the lines come from _fetch_rest();
## otherwise the file $self->{table} is searched for under
## Unicode/Collate in each directory of @INC.  Croaks if it is not found.
##
sub read_table {
    my $self = shift;

### begin XS only ###
    if ($self->{__useXS}) {
        my @rest = _fetch_rest(); # complex matter need to parse
        for my $line (@rest) {
            next if $line =~ /^\s*#/;

            if ($line =~ s/^\s*\@//) {
                $self->parseAtmark($line);
            } else {
                $self->parseEntry($line);
            }
        }
        return;
    }
### end XS only ###

    my($f, $fh);
    foreach my $d (@INC) {
        $f = File::Spec->catfile($d, @Path, $self->{table});
        # Three-argument open: the table name is always taken as a plain
        # file name, never as a mode or a pipe specification.
        last if open($fh, '<', $f);
        $f = undef;
    }
    if (!defined $f) {
        $f = File::Spec->catfile(@Path, $self->{table});
        croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
    }

    while (my $line = <$fh>) {
        next if $line =~ /^\s*#/;

        if ($line =~ s/^\s*\@//) {
            $self->parseAtmark($line);
        } else {
            $self->parseEntry($line);
        }
    }
    close $fh;
}


##
## get $line, parse it, and write an entry in $self
##
## parseEntry(string line, bool tailoring)
## Stores the collation elements of one "<charList> ; <collElement>" line
## in $self->{mapping}, and updates $self->{maxlength} and
## $self->{contraction} for multi-code-point entries.
## $tailoring is true for lines coming from the 'entry' parameter;
## such lines are not subject to 'suppress'.
##
sub parseEntry
{
    my $self = shift;
    my $line = shift;
    my $tailoring = shift;
    my($entry, @uv, @key);

    if (defined $self->{rewrite}) {
        $line = $self->{rewrite}->($line);
    }

    return if $line !~ /^\s*[0-9A-Fa-f]/;

    # removes comment and gets name
    # ($name is '' rather than undef when the line carries no comment, so
    # the undefName/ignoreName matches below do not warn about an
    # uninitialized value; an undef matched exactly like '' anyway.)
    my $name = $line =~ s/[#%]\s*(.*)// ? $1 : '';
    return if defined $self->{undefName} && $name =~ /$self->{undefName}/;

    # gets element
    my($e, $k) = split /;/, $line;
    croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
        if ! $k;

    @uv = _getHexArray($e);
    return if !@uv;
    return if @uv > 1 && $self->{suppressHash} && !$tailoring &&
                  exists $self->{suppressHash}{$uv[0]};
    $entry = join(CODE_SEP, @uv); # in JCPS

    if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
        my $ele = pack_U(@uv);

        # regarded as if it were not stored in the table
        return
            if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;

        # replaced as completely ignorable
        $k = '[.0000.0000.0000.0000]'
            if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
    }

    # replaced as completely ignorable
    $k = '[.0000.0000.0000.0000]'
        if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;

    my $is_L3_ignorable = TRUE;

    foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
        my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
        my @wt = _getHexArray($arr);
        push @key, pack(VCE_TEMPLATE, $var, @wt);
        $is_L3_ignorable = FALSE
            if $wt[0] || $wt[1] || $wt[2];
        # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
        # is completely ignorable.
        # For expansion, an entry $is_L3_ignorable
        # if and only if "all" CEs are [.0000.0000.0000].
    }

    $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;

    if (@uv > 1) {
        if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) {
            $self->{maxlength}{$uv[0]} = @uv;
        }
    }
    if (@uv > 2) {
        # record every proper prefix of the sequence
        while (@uv) {
            pop @uv;
            my $fake_entry = join(CODE_SEP, @uv); # in JCPS
            $self->{contraction}{$fake_entry} = 1;
        }
    }
}


##
## string visualized = viewSortKey(string arg)
##
sub viewSortKey
{
    my $self = shift;
    my $str  = shift;
    $self->visualizeSortKey($self->getSortKey($str));
}


##
## string processed = process(string arg)
## applies 'preprocess' and then the normalization code, if they are set.
##
sub process
{
    my $self = shift;
    my $str  = shift;
    my $prep = $self->{preprocess};
    my $norm = $self->{normCode};

    $str = &$prep($str) if ref $prep;
    $str = &$norm($str) if ref $norm;
    return $str;
}

##
## arrayref of JCPS   = splitEnt(string to be collated)
## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE)
##
sub splitEnt
{
    my $self = shift;
    my $str  = shift;
    my $wLen = shift; # with Length

    my $map  = $self->{mapping};
    my $max  = $self->{maxlength};
    my $reH  = $self->{rearrangeHash};
    my $vers = $self->{UCA_Version};
    my $ver9 = $vers >= 9 && $vers <= 11;
    my $uXS  = $self->{__useXS}; ### XS only

    my @buf;

    # get array of Unicode code point of string.
    my @src = unpack_U($str);

    # rearrangement:
    # Character positions are not kept if rearranged,
    # then neglected if $wLen is true.
    if ($reH && ! $wLen) {
        for (my $i = 0; $i < @src; $i++) {
            if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
                ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
                $i++;
            }
        }
    }

    # remove a code point marked as a completely ignorable.
    for (my $i = 0; $i < @src; $i++) {
        if ($vers <= 20 && _isIllegal($src[$i])) {
            $src[$i] = undef;
        } elsif ($ver9) {
            $src[$i] = undef if $map->{ $src[$i] }
                           ? @{ $map->{ $src[$i] } } == 0
                           : $uXS && _ignorable_simple($src[$i]); ### XS only
        }
    }

    for (my $i = 0; $i < @src; $i++) {
        my $jcps = $src[$i];

        # skip removed code point
        if (! defined $jcps) {
            if ($wLen && @buf) {
                $buf[-1][2] = $i + 1;
            }
            next;
        }

        my $i_orig = $i;

        # find contraction
        if ($max->{$jcps}) {
            my $temp_jcps = $jcps;
            my $jcpsLen = 1;
            my $maxLen = $max->{$jcps};

            for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
                next if ! defined $src[$p];
                $temp_jcps .= CODE_SEP . $src[$p];
                $jcpsLen++;
                if ($map->{$temp_jcps}) {
                    $jcps = $temp_jcps;
                    $i = $p;
                }
            }

        # discontiguous contraction with Combining Char (cf. UTS#10, S2.1).
        # This process requires Unicode::Normalize.
        # If "normalization" is undef, here should be skipped *always*
        # (in spite of bool value of $CVgetCombinClass),
        # since canonical ordering cannot be expected.
        # Blocked combining character should not be contracted.

            # $self->{normCode} is false in the case of "prenormalized".
            if ($self->{normalization}) {
                my $cont = $self->{contraction};
                my $preCC = 0;
                my $preCC_uc = 0;
                my $jcps_uc = $jcps;
                my(@out, @out_uc);

                for (my $p = $i + 1; $p < @src; $p++) {
                    next if ! defined $src[$p];
                    my $curCC = $CVgetCombinClass->($src[$p]);
                    last unless $curCC;
                    my $tail = CODE_SEP . $src[$p];

                    if ($preCC_uc != $curCC && ($map->{$jcps_uc.$tail} ||
                                               $cont->{$jcps_uc.$tail})) {
                        $jcps_uc .= $tail;
                        push @out_uc, $p;
                    } else {
                        $preCC_uc = $curCC;
                    }

                    if ($preCC != $curCC && $map->{$jcps.$tail}) {
                        $jcps .= $tail;
                        push @out, $p;
                    } else {
                        $preCC = $curCC;
                    }
                }

                if ($map->{$jcps_uc}) {
                    $jcps = $jcps_uc;
                    $src[$_] = undef for @out_uc;
                } else {
                    $src[$_] = undef for @out;
                }
            }
        }

        # skip completely ignorable
        if ($map->{$jcps} ? @{ $map->{$jcps} } == 0 :
            $uXS && $jcps !~ /;/ && _ignorable_simple($jcps)) { ### XS only
            if ($wLen && @buf) {
                $buf[-1][2] = $i + 1;
            }
            next;
        }

        push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
    }
    return \@buf;
}

##
## VCE = _pack_override(input, codepoint, derivCode)
##
## input may be an arrayref of weights, a defined scalar used as the
## primary weight, or undef (then derivCode is called for the code point).
##
sub _pack_override ($$$) {
    my $r = shift;
    my $u = shift;
    my $der = shift;

    if (ref $r) {
        return pack(VCE_TEMPLATE, NON_VAR, @$r);
    } elsif (defined $r) {
        return pack(VCE_TEMPLATE, NON_VAR, $r, Min2Wt, Min3Wt, $u);
    } else {
        $u = 0xFFFD if 0x10FFFF < $u;
        return $der->($u);
    }
}

##
## list of VCE = getWt(JCPS)
##
sub getWt
{
    my $self = shift;
    my $u    = shift;
    my $map  = $self->{mapping};
    my $der  = $self->{derivCode};
    my $out  = $self->{overrideOut};
    my $uXS  = $self->{__useXS}; ### XS only

    return if !defined $u;
    return $self->varCE($HighestVCE) if $u eq 0xFFFF && $self->{highestFFFF};
    return $self->varCE($minimalVCE) if $u eq 0xFFFE && $self->{minimalFFFE};
    $u = 0xFFFD if $u !~ /;/ && 0x10FFFF < $u && !$out;

    my @ce;
    if ($map->{$u}) {
        @ce = @{ $map->{$u} }; # $u may be a contraction
### begin XS only ###
    } elsif ($uXS && _exists_simple($u)) {
        @ce = _fetch_simple($u);
### end XS only ###
    } elsif (Hangul_SIni <= $u && $u <= Hangul_SFin) {
        my $hang = $self->{overrideHangul};
        if ($hang) {
            @ce = map _pack_override($_, $u, $der), $hang->($u);
        } elsif (!defined $hang) {
            @ce = $der->($u);
        } else {
            my $max  = $self->{maxlength};
            my @decH = _decompHangul($u);

            if (@decH == 2) {
                my $contract = join(CODE_SEP, @decH);
                @decH = ($contract) if $map->{$contract};
            } else { # must be <@decH == 3>
                if ($max->{$decH[0]}) {
                    my $contract = join(CODE_SEP, @decH);
                    if ($map->{$contract}) {
                        @decH = ($contract);
                    } else {
                        $contract = join(CODE_SEP, @decH[0,1]);
                        $map->{$contract} and @decH = ($contract, $decH[2]);
                    }
                    # even if V's ignorable, LT contraction is not supported.
                    # If such a situation were required, NFD should be used.
                }
                if (@decH == 3 && $max->{$decH[1]}) {
                    my $contract = join(CODE_SEP, @decH[1,2]);
                    $map->{$contract} and @decH = ($decH[0], $contract);
                }
            }

            @ce = map({
                    $map->{$_} ? @{ $map->{$_} } :
                $uXS && _exists_simple($_) ? _fetch_simple($_) : ### XS only
                    $der->($_);
                } @decH);
        }
    } elsif ($out && 0x10FFFF < $u) {
        @ce = map _pack_override($_, $u, $der), $out->($u);
    } else {
        my $cjk  = $self->{overrideCJK};
        my $vers = $self->{UCA_Version};
        if ($cjk && _isUIdeo($u, $vers)) {
            @ce = map _pack_override($_, $u, $der), $cjk->($u);
        } elsif ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) {
            @ce = _uideoCE_8($u);
        } else {
            @ce = $der->($u);
        }
    }
    return map $self->varCE($_), @ce;
}


##
## string sortkey = getSortKey(string arg)
##
sub getSortKey
{
    my $self = shift;
    my $orig = shift;
    my $str  = $self->process($orig);
    my $rEnt = $self->splitEnt($str); # get an arrayref of JCPS
    my $vers = $self->{UCA_Version};
    my $term = $self->{hangul_terminator};
    my $lev  = $self->{level};
    my $iden = $self->{identical};

    my @buf; # weight arrays
    if ($term) {
        my $preHST = '';
        my $termCE = $self->varCE(pack(VCE_TEMPLATE, NON_VAR, $term, 0,0,0));
        foreach my $jcps (@$rEnt) {
            # weird things like VL, TL-contraction are not considered!
            my $curHST = join '', map getHST($_, $vers), split /;/, $jcps;
            if ($preHST && !$curHST || # hangul before non-hangul
                $preHST =~ /L\z/ && $curHST =~ /^T/ ||
                $preHST =~ /V\z/ && $curHST =~ /^L/ ||
                $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
                push @buf, $termCE;
            }
            $preHST = $curHST;
            push @buf, $self->getWt($jcps);
        }
        push @buf, $termCE if $preHST; # end at hangul
    } else {
        foreach my $jcps (@$rEnt) {
            push @buf, $self->getWt($jcps);
        }
    }

    my $rkey = $self->mk_SortKey(\@buf); ### XS only

    if ($iden || $vers >= 26 && $lev == MaxLevel) {
        $rkey .= LEVEL_SEP;
        $rkey .= pack(TIE_TEMPLATE, unpack_U($str)) if $iden;
    }
    return $rkey;
}


##
## int compare = cmp(string a, string b)
##
sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }

##
## list[strings] sorted = sort(list[strings] arg)
##
## The sort key of each string is computed only once.
##
sub sort {
    my $obj = shift;
    return
        map { $_->[1] }
            sort{ $a->[0] cmp $b->[0] }
                map [ $obj->getSortKey($_), $_ ], @_;
}


##
## bool _nonIgnorAtLevel(arrayref weights, int level)
##
## true if any weight from level 1 up to the given level is non-zero.
##
sub _nonIgnorAtLevel($$)
{
    my $wt = shift;
    return if ! defined $wt;
    my $lv = shift;
    return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
}

##
## bool _eqArray(
##    arrayref of arrayref[weights] source,
##    arrayref of arrayref[weights] substr,
##    int level)
## * comparison of graphemes vs graphemes.
##   @$source >= @$substr must be true (check it before call this);
##
## Returns 1 if the first @$substr graphemes of source have the same
## weights as substr at levels 1..level; an empty return otherwise.
##
sub _eqArray($$$)
{
    my $source = shift;
    my $substr = shift;
    my $lev = shift;

    for my $g (0..@$substr-1){
        # Do the $g'th graphemes have the same number of AV weights?
        return if @{ $source->[$g] } != @{ $substr->[$g] };

        for my $w (0..@{ $substr->[$g] }-1) {
            for my $v (0..$lev-1) {
                return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
            }
        }
    }
    return 1;
}

##
## (int position, int length)
## int position = index(string, substring, position, [undoc'ed global])
##
## With "global" (only for the list context),
##  returns list of arrayref[position, length].
##
## Without a match: an empty list in list context, NOMATCHPOS otherwise.
## Croaks if 'preprocess' or a normalization code is in effect.
##
sub index
{
    my $self = shift;
    $self->{preprocess} and
        croak "Don't use Preprocess with index(), match(), etc.";
    $self->{normCode} and
        croak "Don't use Normalization with index(), match(), etc.";

    my $str  = shift;
    my $len  = length($str);
    my $sub  = shift;
    my $subE = $self->splitEnt($sub);
    my $pos  = @_ ? shift : 0;
    # An explicit undef is treated as 0 (the value it numified to before),
    # without triggering "uninitialized" warnings.
       $pos  = 0 if !defined $pos || $pos < 0;
    my $glob = shift;

    my $lev = $self->{level};
    my $v2i = $self->{UCA_Version} >= 9 &&
                $self->{variable} ne 'non-ignorable';

    # A substring without any collation entity matches with length 0.
    if (! @$subE) {
        my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
        return $glob
            ? map([$_, 0], $temp..$len)
            : wantarray ? ($temp,0) : $temp;
    }
    $len < $pos
        and return wantarray ? () : NOMATCHPOS;
    my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
    @$strE
        or return wantarray ? () : NOMATCHPOS;

    my(@strWt, @iniPos, @finPos, @subWt, @g_ret);

    # weights of the substring, grouped by grapheme
    my $last_is_variable;
    for my $vwt (map $self->getWt($_), @$subE) {
        my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
        my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);

        # "Ignorable (L1, L2) after Variable" since track. v. 9
        if ($v2i) {
            if ($var) {
                $last_is_variable = TRUE;
            }
            elsif (!$wt[0]) { # ignorable
                $to_be_pushed = FALSE if $last_is_variable;
            }
            else {
                $last_is_variable = FALSE;
            }
        }

        if (@subWt && !$var && !$wt[0]) {
            push @{ $subWt[-1] }, \@wt if $to_be_pushed;
        } elsif ($to_be_pushed) {
            push @subWt, [ \@wt ];
        }
        # else ===> skipped
    }

    my $end = @$strE - 1;

    $last_is_variable = FALSE; # reuse
    for (my $i = 0; $i <= $end; ) { # no $i++
        my $found_base = 0;

        # fetch a grapheme
        while ($i <= $end && $found_base == 0) {
            for my $vwt ($self->getWt($strE->[$i][0])) {
                my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
                my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);

                # "Ignorable (L1, L2) after Variable" since track. v. 9
                if ($v2i) {
                    if ($var) {
                        $last_is_variable = TRUE;
                    }
                    elsif (!$wt[0]) { # ignorable
                        $to_be_pushed = FALSE if $last_is_variable;
                    }
                    else {
                        $last_is_variable = FALSE;
                    }
                }

                if (@strWt && !$var && !$wt[0]) {
                    push @{ $strWt[-1] }, \@wt if $to_be_pushed;
                    $finPos[-1] = $strE->[$i][2];
                } elsif ($to_be_pushed) {
                    push @strWt, [ \@wt ];
                    push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
                    $finPos[-1] = NOMATCHPOS if $found_base;
                    push @finPos, $strE->[$i][2];
                    $found_base++;
                }
                # else ===> no-op
            }
            $i++;
        }

        # try to match
        while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
            if ($iniPos[0] != NOMATCHPOS &&
                    $finPos[$#subWt] != NOMATCHPOS &&
                        _eqArray(\@strWt, \@subWt, $lev)) {
                my $temp = $iniPos[0] + $pos;

                if ($glob) {
                    push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
                    splice @strWt,  0, $#subWt;
                    splice @iniPos, 0, $#subWt;
                    splice @finPos, 0, $#subWt;
                }
                else {
                    return wantarray
                        ? ($temp, $finPos[$#subWt] - $iniPos[0])
                        :  $temp;
                }
            }
            shift @strWt;
            shift @iniPos;
            shift @finPos;
        }
    }

    return $glob
        ? @g_ret
        : wantarray ? () : NOMATCHPOS;
}

##
## scalarref to matching part = match(string, substring)
##
## In list context the matching part itself is returned.
##
sub match
{
    my $self = shift;
    if (my($pos,$len) = $self->index($_[0], $_[1])) {
        my $temp = substr($_[0], $pos, $len);
        return wantarray ? $temp : \$temp;
        # An lvalue ref \substr should be avoided,
        # since its value is affected by modification of its referent.
    }
    else {
        return;
    }
}

##
## arrayref matching parts = gmatch(string, substring)
##
sub gmatch
{
    my $self = shift;
    my $str  = shift;
    my $sub  = shift;
    return map substr($str, $_->[0], $_->[1]),
                $self->index($str, $sub, 0, 'g');
}

##
## bool subst'ed = subst(string, substring, replace)
##
## The string is modified in place (through $_[0]); "replace" may be
## a coderef, which is called with the matching part.
##
sub subst
{
    my $self = shift;
    my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;

    if (my($pos,$len) = $self->index($_[0], $_[1])) {
        if ($code) {
            my $mat = substr($_[0], $pos, $len);
            substr($_[0], $pos, $len, $code->($mat));
        } else {
            substr($_[0], $pos, $len, $_[2]);
        }
        return TRUE;
    }
    else {
        return FALSE;
    }
}

##
## int count = gsubst(string, substring, replace)
##
## The string is modified in place (through $_[0]); "replace" may be
## a coderef, which is called with each matching part.
##
sub gsubst
{
    my $self = shift;
    my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
    my $cnt = 0;

    # Replacement is carried out from the end, then use reverse.
    for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
        if ($code) {
            my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
            substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
        } else {
            substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
        }
        $cnt++;
    }
    return $cnt;
}

1;
__END__

=head1 NAME

Unicode::Collate - Unicode Collation Algorithm

=head1 SYNOPSIS

  use Unicode::Collate;

  #construct
  $Collator = Unicode::Collate->new(%tailoring);

  #sort
  @sorted = $Collator->sort(@not_sorted);

  #compare
  $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.

B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
according to Perl's Unicode support. See L<perlunicode>,
L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
Otherwise you can use C<preprocess> or should decode them before.

=head1 DESCRIPTION

This module is an implementation of Unicode Technical Standard #10
(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).

=head2 Constructor and Tailoring

The C<new> method returns a collator object. If new() is called
with no parameters, the collator should do the default collation.
1057 1058 $Collator = Unicode::Collate->new( 1059 UCA_Version => $UCA_Version, 1060 alternate => $alternate, # alias for 'variable' 1061 backwards => $levelNumber, # or \@levelNumbers 1062 entry => $element, 1063 hangul_terminator => $term_primary_weight, 1064 highestFFFF => $bool, 1065 identical => $bool, 1066 ignoreName => qr/$ignoreName/, 1067 ignoreChar => qr/$ignoreChar/, 1068 ignore_level2 => $bool, 1069 katakana_before_hiragana => $bool, 1070 level => $collationLevel, 1071 minimalFFFE => $bool, 1072 normalization => $normalization_form, 1073 overrideCJK => \&overrideCJK, 1074 overrideHangul => \&overrideHangul, 1075 preprocess => \&preprocess, 1076 rearrange => \@charList, 1077 rewrite => \&rewrite, 1078 suppress => \@charList, 1079 table => $filename, 1080 undefName => qr/$undefName/, 1081 undefChar => qr/$undefChar/, 1082 upper_before_lower => $bool, 1083 variable => $variable, 1084 ); 1085 1086=over 4 1087 1088=item UCA_Version 1089 1090If the revision (previously "tracking version") number of UCA is given, 1091behavior of that revision is emulated on collating. 1092If omitted, the return value of C<UCA_Version()> is used. 1093 1094The following revisions are supported. The default is 28. 1095 1096 UCA Unicode Standard DUCET (@version) 1097 ------------------------------------------------------- 1098 8 3.1 3.0.1 (3.0.1d9) 1099 9 3.1 with Corrigendum 3 3.1.1 (3.1.1) 1100 11 4.0 4.0.0 (4.0.0) 1101 14 4.1.0 4.1.0 (4.1.0) 1102 16 5.0 5.0.0 (5.0.0) 1103 18 5.1.0 5.1.0 (5.1.0) 1104 20 5.2.0 5.2.0 (5.2.0) 1105 22 6.0.0 6.0.0 (6.0.0) 1106 24 6.1.0 6.1.0 (6.1.0) 1107 26 6.2.0 6.2.0 (6.2.0) 1108 28 6.3.0 6.3.0 (6.3.0) 1109 1110* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden 1111since C<UCA_Version> 22. 1112 1113* Out-of-range codepoints (greater than U+10FFFF) are not ignored, 1114and can be overridden since C<UCA_Version> 22. 
1115 1116* Fully ignorable characters were ignored, and would not interrupt 1117contractions with C<UCA_Version> 9 and 11. 1118 1119* Treatment of ignorables after variables and some behaviors 1120were changed at C<UCA_Version> 9. 1121 1122* Characters regarded as CJK unified ideographs (cf. C<overrideCJK>) 1123depend on C<UCA_Version>. 1124 1125* Many hangul jamo are assigned at C<UCA_Version> 20, that will affect 1126C<hangul_terminator>. 1127 1128=item alternate 1129 1130-- see 3.2.2 Alternate Weighting, version 8 of UTS #10 1131 1132For backward compatibility, C<alternate> (old name) can be used 1133as an alias for C<variable>. 1134 1135=item backwards 1136 1137-- see 3.4 Backward Accents, UTS #10. 1138 1139 backwards => $levelNumber or \@levelNumbers 1140 1141Weights in reverse order; ex. level 2 (diacritic ordering) in French. 1142If omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>), 1143forwards at all the levels. 1144 1145=item entry 1146 1147-- see 5 Tailoring; 3.6.1 File Format, UTS #10. 1148 1149If the same character (or a sequence of characters) exists 1150in the collation element table through C<table>, 1151mapping to collation elements is overridden. 1152If it does not exist, the mapping is defined additionally. 
1153 1154 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt) 11550063 0068 ; [.0E6A.0020.0002.0063] # ch 11560043 0068 ; [.0E6A.0020.0007.0043] # Ch 11570043 0048 ; [.0E6A.0020.0008.0043] # CH 1158006C 006C ; [.0F4C.0020.0002.006C] # ll 1159004C 006C ; [.0F4C.0020.0007.004C] # Ll 1160004C 004C ; [.0F4C.0020.0008.004C] # LL 116100F1 ; [.0F7B.0020.0002.00F1] # n-tilde 1162006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde 116300D1 ; [.0F7B.0020.0008.00D1] # N-tilde 1164004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde 1165ENTRY 1166 1167 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt) 116800E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e> 116900C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E> 1170ENTRY 1171 1172B<NOTE:> The code point in the UCA file format (before C<';'>) 1173B<must> be a Unicode code point (defined as hexadecimal), 1174but not a native code point. 1175So C<0063> must always denote C<U+0063>, 1176but not a character of C<"\x63">. 1177 1178Weighting may vary depending on collation element table. 1179So ensure the weights defined in C<entry> will be consistent with 1180those in the collation element table loaded via C<table>. 1181 1182In DUCET v4.0.0, primary weight of C<C> is C<0E60> 1183and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A> 1184(as a value between C<0E60> and C<0E6D>) 1185makes ordering as C<C E<lt> CH E<lt> D>. 1186Exactly speaking DUCET already has some characters between C<C> and C<D>: 1187C<small capital C> (C<U+1D04>) with primary weight C<0E64>, 1188C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>, 1189and C<c-curl> (C<U+0255>) with C<0E69>. 1190Then primary weight C<0E6A> for C<CH> makes C<CH> 1191ordered between C<c-curl> and C<D>. 1192 1193=item hangul_terminator 1194 1195-- see 7.1.4 Trailing Weights, UTS #10. 
1196 1197If a true value is given (non-zero but should be positive), 1198it will be added as a terminator primary weight to the end of 1199every standard Hangul syllable. Secondary and any higher weights 1200for terminator are set to zero. 1201If the value is false or C<hangul_terminator> key does not exist, 1202insertion of terminator weights will not be performed. 1203 1204Boundaries of Hangul syllables are determined 1205according to conjoining Jamo behavior in F<the Unicode Standard> 1206and F<HangulSyllableType.txt>. 1207 1208B<Implementation Note:> 1209(1) For expansion mapping (Unicode character mapped 1210to a sequence of collation elements), a terminator will not be added 1211between collation elements, even if Hangul syllable boundary exists there. 1212Addition of terminator is restricted to the next position 1213to the last collation element. 1214 1215(2) Non-conjoining Hangul letters 1216(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not 1217automatically terminated with a terminator primary weight. 1218These characters may need terminator included in a collation element 1219table beforehand. 1220 1221=item highestFFFF 1222 1223-- see 5.14 Collation Elements, UTS #35. 1224 1225If the parameter is made true, C<U+FFFF> has a highest primary weight. 1226When a boolean of C<$coll-E<gt>ge($str, "abc")> and 1227C<$coll-E<gt>le($str, "abc\x{FFFF}")> is true, it is expected that C<$str> 1228begins with C<"abc">, or another primary equivalent. 1229C<$str> may be C<"abcd">, C<"abc012">, but should not include C<U+FFFF> 1230such as C<"abc\x{FFFF}xyz">. 1231 1232C<$coll-E<gt>le($str, "abc\x{FFFF}")> works like C<$coll-E<gt>lt($str, "abd")> 1233almost, but the latter has a problem that you should know which letter is 1234next to C<c>. For a certain language where C<ch> as the next letter, 1235C<"abch"> is greater than C<"abc\x{FFFF}">, but less than C<"abd">. 1236 1237Note: 1238This is equivalent to C<(entry =E<gt> 'FFFF ; [.FFFE.0020.0005.FFFF]')>. 
1239Any character other than C<U+FFFF> can be tailored by C<entry>. 1240 1241=item identical 1242 1243-- see A.3 Deterministic Comparison, UTS #10. 1244 1245By default, strings whose weights are equal should be equal, 1246even though their code points are not equal. 1247Completely ignorable characters are ignored. 1248 1249If the parameter is made true, a final, tie-breaking level is used. 1250If no difference of weights is found after the comparison through 1251all the levels specified by C<level>, the comparison with code points 1252will be performed. 1253For the tie-breaking comparison, the sort key has code points 1254of the original string appended. 1255Completely ignorable characters are not ignored. 1256 1257If C<preprocess> and/or C<normalization> is applied, the code points 1258of the string after them (in NFD by default) are used. 1259 1260=item ignoreChar 1261 1262=item ignoreName 1263 1264-- see 3.6.2 Variable Weighting, UTS #10. 1265 1266Makes the entry in the table completely ignorable; 1267i.e. as if the weights were zero at all levels. 1268 1269Through C<ignoreChar>, any character matching C<qr/$ignoreChar/> 1270will be ignored. Through C<ignoreName>, any character whose name 1271(given in the C<table> file as a comment) matches C<qr/$ignoreName/> 1272will be ignored. 1273 1274E.g. when 'a' and 'e' are ignorable, 1275'element' is equal to 'lament' (or 'lmnt'). 1276 1277=item ignore_level2 1278 1279-- see 5.1 Parametric Tailoring, UTS #10. 1280 1281By default, case-sensitive comparison (that is level 3 difference) 1282won't ignore accents (that is level 2 difference). 1283 1284If the parameter is made true, accents (and other primary ignorable 1285characters) are ignored, even though cases are taken into account. 1286 1287B<NOTE>: C<level> should be 3 or greater. 1288 1289=item katakana_before_hiragana 1290 1291-- see 7.2 Tertiary Weight Table, UTS #10. 1292 1293By default, hiragana is before katakana.
1294If the parameter is made true, this is reversed. 1295 1296B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana 1297distinctions must occur in level 3, and their weights at level 3 must be 1298the same as those mentioned in 7.3.1, UTS #10. 1299If you define collation elements that violate this requirement, 1300this parameter does not work correctly. 1301 1302=item level 1303 1304-- see 4.3 Form Sort Key, UTS #10. 1305 1306Set the maximum level. 1307Any higher levels than the specified one are ignored. 1308 1309 Level 1: alphabetic ordering 1310 Level 2: diacritic ordering 1311 Level 3: case ordering 1312 Level 4: tie-breaking (e.g. in the case when variable is 'shifted') 1313 1314 ex.level => 2, 1315 1316If omitted, the maximum is the 4th. 1317 1318B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level. 1319But this module only uses weights within 0xFFFF. 1320When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted' 1321and 'shift-trimmed'), the level 4 may be unreliable. 1322 1323See also C<identical>. 1324 1325=item minimalFFFE 1326 1327-- see 5.14 Collation Elements, UTS #35. 1328 1329If the parameter is made true, C<U+FFFE> has a minimal primary weight. 1330The comparison between C<"$a1\x{FFFE}$a2"> and C<"$b1\x{FFFE}$b2"> 1331first compares C<$a1> and C<$b1> at level 1, and 1332then C<$a2> and C<$b2> at level 1, as follows. 1333 1334 "ab\x{FFFE}a" 1335 "Ab\x{FFFE}a" 1336 "ab\x{FFFE}c" 1337 "Ab\x{FFFE}c" 1338 "ab\x{FFFE}xyz" 1339 "abc\x{FFFE}def" 1340 "abc\x{FFFE}xYz" 1341 "aBc\x{FFFE}xyz" 1342 "abcX\x{FFFE}def" 1343 "abcx\x{FFFE}xyz" 1344 "b\x{FFFE}aaa" 1345 "bbb\x{FFFE}a" 1346 1347Note: 1348This is equivalent to C<(entry =E<gt> 'FFFE ; [.0001.0020.0005.FFFE]')>. 1349Any character other than C<U+FFFE> can be tailored by C<entry>. 1350 1351=item normalization 1352 1353-- see 4.1 Normalize, UTS #10.
1354 1355If specified, strings are normalized before preparation of sort keys 1356(the normalization is executed after preprocess). 1357 1358A form name C<Unicode::Normalize::normalize()> accepts will be applied 1359as C<$normalization_form>. 1360Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>. 1361See C<Unicode::Normalize::normalize()> for detail. 1362If omitted, C<'NFD'> is used. 1363 1364C<normalization> is performed after C<preprocess> (if defined). 1365 1366Furthermore, special values, C<undef> and C<"prenormalized">, can be used, 1367though they are not concerned with C<Unicode::Normalize::normalize()>. 1368 1369If C<undef> (not a string C<"undef">) is passed explicitly 1370as the value for this key, 1371any normalization is not carried out (this may make tailoring easier 1372if any normalization is not desired). Under C<(normalization =E<gt> undef)>, 1373only contiguous contractions are resolved; 1374e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>, 1375C<A-cedilla-ring> would be primary equal to C<A>. 1376In this point, 1377C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })> 1378B<is not> equivalent to C<(normalization =E<gt> 'NFD')>. 1379 1380In the case of C<(normalization =E<gt> "prenormalized")>, 1381any normalization is not performed, but 1382discontiguous contractions with combining characters are performed. 1383Therefore 1384C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })> 1385B<is> equivalent to C<(normalization =E<gt> 'NFD')>. 1386If source strings are finely prenormalized, 1387C<(normalization =E<gt> 'prenormalized')> may save time for normalization. 1388 1389Except C<(normalization =E<gt> undef)>, 1390B<Unicode::Normalize> is required (see also B<CAVEAT>). 1391 1392=item overrideCJK 1393 1394-- see 7.1 Derived Collation Elements, UTS #10. 
1395 1396By default, CJK unified ideographs are ordered in Unicode codepoint 1397order, but those in the CJK Unified Ideographs block are less than 1398those in the CJK Unified Ideographs Extension A etc. 1399 1400 In the CJK Unified Ideographs block: 1401 U+4E00..U+9FA5 if UCA_Version is 8, 9 or 11. 1402 U+4E00..U+9FBB if UCA_Version is 14 or 16. 1403 U+4E00..U+9FC3 if UCA_Version is 18. 1404 U+4E00..U+9FCB if UCA_Version is 20 or 22. 1405 U+4E00..U+9FCC if UCA_Version is 24 or later. 1406 1407 In the CJK Unified Ideographs Extension blocks: 1408 Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version. 1409 Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or later. 1410 Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or later. 1411 1412Through C<overrideCJK>, ordering of CJK unified ideographs (including 1413extensions) can be overridden. 1414 1415ex. CJK unified ideographs in the JIS code point order. 1416 1417 overrideCJK => sub { 1418 my $u = shift; # get a Unicode codepoint 1419 my $b = pack('n', $u); # to UTF-16BE 1420 my $s = your_unicode_to_sjis_converter($b); # convert 1421 my $n = unpack('n', $s); # convert sjis to short 1422 [ $n, 0x20, 0x2, $u ]; # return the collation element 1423 }, 1424 1425The return value may be an arrayref of 1st to 4th weights as shown 1426above. The return value may be an integer as the primary weight 1427as shown below. If C<undef> is returned, the default derived 1428collation element will be used. 1429 1430 overrideCJK => sub { 1431 my $u = shift; # get a Unicode codepoint 1432 my $b = pack('n', $u); # to UTF-16BE 1433 my $s = your_unicode_to_sjis_converter($b); # convert 1434 my $n = unpack('n', $s); # convert sjis to short 1435 return $n; # return the primary weight 1436 }, 1437 1438The return value may be a list containing zero or more of 1439an arrayref, an integer, or C<undef>. 1440 1441ex. ignores all CJK unified ideographs. 
1442 1443 overrideCJK => sub {()}, # CODEREF returning empty list 1444 1445 # where ->eq("Pe\x{4E00}rl", "Perl") is true 1446 # as U+4E00 is a CJK unified ideograph and to be ignorable. 1447 1448If a false value (including C<undef>) is passed, C<overrideCJK> 1449has no effect. 1450C<$Collator-E<gt>change(overrideCJK =E<gt> 0)> resets the old one. 1451 1452But assignment of weight for CJK unified ideographs 1453in C<table> or C<entry> is still valid. 1454If C<undef> is passed explicitly as the value for this key, 1455weights for CJK unified ideographs are treated as undefined. 1456However when C<UCA_Version> E<gt> 8, C<(overrideCJK =E<gt> undef)> 1457has no special meaning. 1458 1459B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>, 1460C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>, 1461C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified 1462ideographs. But they can't be overridden via C<overrideCJK> when you use 1463DUCET, as the table includes weights for them. C<table> or C<entry> has 1464priority over C<overrideCJK>. 1465 1466=item overrideHangul 1467 1468-- see 7.1 Derived Collation Elements, UTS #10. 1469 1470By default, Hangul syllables are decomposed into Hangul Jamo, 1471even if C<(normalization =E<gt> undef)>. 1472But the mapping of Hangul syllables may be overridden. 1473 1474This parameter works like C<overrideCJK>, so see there for examples. 1475 1476If you want to override the mapping of Hangul syllables, 1477NFD and NFKD are not appropriate, since NFD and NFKD will decompose 1478Hangul syllables before overriding. FCD may decompose Hangul syllables 1479as the case may be. 1480 1481If a false value (but not C<undef>) is passed, C<overrideHangul> 1482has no effect. 1483C<$Collator-E<gt>change(overrideHangul =E<gt> 0)> resets the old one. 
1484 1485If C<undef> is passed explicitly as the value for this key, 1486weight for Hangul syllables is treated as undefined 1487without decomposition into Hangul Jamo. 1488But definition of weight for Hangul syllables 1489in C<table> or C<entry> is still valid. 1490 1491=item overrideOut 1492 1493-- see 7.1.1 Handling Ill-Formed Code Unit Sequences, UTS #10. 1494 1495Perl seems to allow out-of-range values (greater than 0x10FFFF). 1496By default, out-of-range values are replaced with C<U+FFFD> 1497(REPLACEMENT CHARACTER) when C<UCA_Version> E<gt>= 22, 1498or ignored when C<UCA_Version> E<lt>= 20. 1499 1500When C<UCA_Version> E<gt>= 22, the weights of out-of-range values 1501can be overridden. Though C<table> or C<entry> are available for them, 1502out-of-range values are too many. 1503 1504C<overrideOut> can perform it algorithmically. 1505This parameter works like C<overrideCJK>, so see there for examples. 1506 1507ex. ignores all out-of-range values. 1508 1509 overrideOut => sub {()}, # CODEREF returning empty list 1510 1511If a false value (including C<undef>) is passed, C<overrideOut> 1512has no effect. 1513C<$Collator-E<gt>change(overrideOut =E<gt> 0)> resets the old one. 1514 1515B<NOTE ABOUT U+FFFD:> 1516 1517UCA recommends that out-of-range values should not be ignored for security 1518reasons. Say, C<"pe\x{110000}rl"> should not be equal to C<"perl">. 1519However, C<U+FFFD> is wrongly mapped to a variable collation element 1520in DUCET for Unicode 6.0.0 to 6.2.0, that means out-of-range values will be 1521ignored when C<variable> isn't C<Non-ignorable>. 1522 1523The mapping of C<U+FFFD> is corrected in Unicode 6.3.0. 1524see L<http://www.unicode.org/reports/tr10/tr10-28.html#Trailing_Weights> 1525(7.1.4 Trailing Weights). Such a correction is reproduced by this. 1526 1527 overrideOut => sub { 0xFFFD }, # CODEREF returning a very large integer 1528 1529This workaround is unnecessary since Unicode 6.3.0. 
1530 1531=item preprocess 1532 1533-- see 5.4 Preprocessing, UTS #10. 1534 1535If specified, the coderef is used to preprocess each string 1536before the formation of sort keys. 1537 1538ex. dropping English articles, such as "a" or "the". 1539Then, "the pen" is before "a pencil". 1540 1541 preprocess => sub { 1542 my $str = shift; 1543 $str =~ s/\b(?:an?|the)\s+//gi; 1544 return $str; 1545 }, 1546 1547C<preprocess> is performed before C<normalization> (if defined). 1548 1549ex. decoding strings in a legacy encoding such as shift-jis: 1550 1551 $sjis_collator = Unicode::Collate->new( 1552 preprocess => \&your_shiftjis_to_unicode_decoder, 1553 ); 1554 @result = $sjis_collator->sort(@shiftjis_strings); 1555 1556B<Note:> Strings returned from the coderef will be interpreted 1557according to Perl's Unicode support. See L<perlunicode>, 1558L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>. 1559 1560=item rearrange 1561 1562-- see 3.5 Rearrangement, UTS #10. 1563 1564Characters that are not coded in logical order and to be rearranged. 1565If C<UCA_Version> is equal to or less than 11, default is: 1566 1567 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ], 1568 1569If you want to disallow any rearrangement, pass C<undef> or C<[]> 1570(a reference to empty list) as the value for this key. 1571 1572If C<UCA_Version> is equal to or greater than 14, default is C<[]> 1573(i.e. no rearrangement). 1574 1575B<According to the version 9 of UCA, this parameter shall not be used; 1576but it is not warned at present.> 1577 1578=item rewrite 1579 1580If specified, the coderef is used to rewrite lines in C<table> or C<entry>. 1581The coderef will get each line, and then should return a rewritten line 1582according to the UCA file format. 1583If the coderef returns an empty line, the line will be skipped. 1584 1585e.g. 
any primary ignorable characters into tertiary ignorable: 1586 1587 rewrite => sub { 1588 my $line = shift; 1589 $line =~ s/\[\.0000\..{4}\..{4}\./[.0000.0000.0000./g; 1590 return $line; 1591 }, 1592 1593This example shows rewriting weights. C<rewrite> is allowed to 1594affect code points, weights, and the name. 1595 1596B<NOTE>: C<table> is available to use another table file; 1597preparing a modified table once would be more efficient than 1598rewriting lines on reading an unmodified table every time. 1599 1600=item suppress 1601 1602-- see suppress contractions in 5.14.11 Special-Purpose Commands, 1603UTS #35 (LDML). 1604 1605Contractions beginning with the specified characters are suppressed, 1606even if those contractions are defined in C<table>. 1607 1608An example for Russian and some languages using the Cyrillic script: 1609 1610 suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F], 1611 1612where 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE. 1613 1614B<NOTE>: Contractions via C<entry> are not suppressed. 1615 1616=item table 1617 1618-- see 3.6 Default Unicode Collation Element Table, UTS #10. 1619 1620You can use another collation element table if desired. 1621 1622The table file should be located in the F<Unicode/Collate> directory 1623on C<@INC>. Say, if the filename is F<Foo.txt>, 1624the table file is searched for as F<Unicode/Collate/Foo.txt> in C<@INC>. 1625 1626By default, F<allkeys.txt> (as the filename of DUCET) is used. 1627If you prepare your own table file, any name other than F<allkeys.txt> 1628may be better to avoid namespace conflict. 1629 1630B<NOTE>: When XSUB is used, the DUCET is compiled on building this 1631module, and it may save time at the run time. 1632Explicitly saying C<(table =E<gt> 'allkeys.txt')>, or using another table, 1633or using C<ignoreChar>, C<ignoreName>, C<undefChar>, C<undefName> or 1634C<rewrite> will prevent this module from using the compiled DUCET.
1635 1636If C<undef> is passed explicitly as the value for this key, 1637no file is read (but you can define collation elements via C<entry>). 1638 1639A typical way to define a collation element table 1640without any file of table: 1641 1642 $onlyABC = Unicode::Collate->new( 1643 table => undef, 1644 entry => << 'ENTRIES', 16450061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A 16460041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A 16470062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B 16480042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B 16490063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C 16500043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C 1651ENTRIES 1652 ); 1653 1654If C<ignoreName> or C<undefName> is used, character names should be 1655specified as a comment (following C<#>) on each line. 1656 1657=item undefChar 1658 1659=item undefName 1660 1661-- see 6.3.4 Reducing the Repertoire, UTS #10. 1662 1663Undefines the collation element as if it were unassigned in the C<table>. 1664This reduces the size of the table. 1665If an unassigned character appears in the string to be collated, 1666the sort key is made from its codepoint 1667as a single-character collation element, 1668as it is greater than any other assigned collation elements 1669(in the codepoint order among the unassigned characters). 1670But, it'd be better to ignore characters 1671unfamiliar to you and maybe never used. 1672 1673Through C<undefChar>, any character matching C<qr/$undefChar/> 1674will be undefined. Through C<undefName>, any character whose name 1675(given in the C<table> file as a comment) matches C<qr/$undefName/> 1676will be undefined. 1677 1678ex. Collation weights for beyond-BMP characters are not stored in object: 1679 1680 undefChar => qr/[^\0-\x{fffd}]/, 1681 1682=item upper_before_lower 1683 1684-- see 6.6 Case Comparisons, UTS #10. 1685 1686By default, lowercase is before uppercase. 1687If the parameter is made true, this is reversed. 
1688 1689B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase 1690distinctions must occur in level 3, and their weights at level 3 must be 1691the same as those mentioned in 7.3.1, UTS #10. 1692If you define collation elements that differ from this requirement, 1693this parameter doesn't work correctly. 1694 1695=item variable 1696 1697-- see 3.6.2 Variable Weighting, UTS #10. 1698 1699This key allows for variable weighting of variable collation elements, 1700which are marked with an ASTERISK in the table 1701(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>). 1702 1703 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'. 1704 1705These names are case-insensitive. 1706By default (if specification is omitted), 'shifted' is adopted. 1707 1708 'Blanked' Variable elements are made ignorable at levels 1 through 3; 1709 considered at the 4th level. 1710 1711 'Non-Ignorable' Variable elements are not reset to ignorable. 1712 1713 'Shifted' Variable elements are made ignorable at levels 1 through 3; 1714 their level 4 weight is replaced by the old level 1 weight. 1715 Level 4 weight for Non-Variable elements is 0xFFFF. 1716 1717 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level 1718 are trimmed. 1719 1720=back 1721 1722=head2 Methods for Collation 1723 1724=over 4 1725 1726=item C<@sorted = $Collator-E<gt>sort(@not_sorted)> 1727 1728Sorts a list of strings. 1729 1730=item C<$result = $Collator-E<gt>cmp($a, $b)> 1731 1732Returns 1 (when C<$a> is greater than C<$b>) 1733or 0 (when C<$a> is equal to C<$b>) 1734or -1 (when C<$a> is less than C<$b>).
1735 1736=item C<$result = $Collator-E<gt>eq($a, $b)> 1737 1738=item C<$result = $Collator-E<gt>ne($a, $b)> 1739 1740=item C<$result = $Collator-E<gt>lt($a, $b)> 1741 1742=item C<$result = $Collator-E<gt>le($a, $b)> 1743 1744=item C<$result = $Collator-E<gt>gt($a, $b)> 1745 1746=item C<$result = $Collator-E<gt>ge($a, $b)> 1747 1748They work like the operators of the same names. 1749 1750 eq : whether $a is equal to $b. 1751 ne : whether $a is not equal to $b. 1752 lt : whether $a is less than $b. 1753 le : whether $a is less than $b or equal to $b. 1754 gt : whether $a is greater than $b. 1755 ge : whether $a is greater than $b or equal to $b. 1756 1757=item C<$sortKey = $Collator-E<gt>getSortKey($string)> 1758 1759-- see 4.3 Form Sort Key, UTS #10. 1760 1761Returns a sort key. 1762 1763You compare the sort keys using a binary comparison 1764and get the result of the comparison of the strings using UCA. 1765 1766 $Collator->getSortKey($a) cmp $Collator->getSortKey($b) 1767 1768 is equivalent to 1769 1770 $Collator->cmp($a, $b) 1771 1772=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)> 1773 1774Converts a sorting key into its representation form. 1775If C<UCA_Version> is 8, the output is slightly different. 1776 1777 use Unicode::Collate; 1778 my $c = Unicode::Collate->new(); 1779 print $c->viewSortKey("Perl"),"\n"; 1780 1781 # output: 1782 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF] 1783 # Level 1 Level 2 Level 3 Level 4 1784 1785=back 1786 1787=head2 Methods for Searching 1788 1789The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work 1790like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively, 1791but they are not aware of any pattern, but only a literal substring.
1792 1793B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true 1794for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>, 1795C<subst>, C<gsubst>) is croaked, as the position and the length might 1796differ from those on the specified string. 1797 1798C<rearrange> and C<hangul_terminator> parameters are neglected. 1799C<katakana_before_hiragana> and C<upper_before_lower> don't affect 1800matching and searching, as it doesn't matter whether greater or less. 1801 1802=over 4 1803 1804=item C<$position = $Collator-E<gt>index($string, $substring[, $position])> 1805 1806=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])> 1807 1808If C<$substring> matches a part of C<$string>, returns 1809the position of the first occurrence of the matching part in scalar context; 1810in list context, returns a two-element list of 1811the position and the length of the matching part. 1812 1813If C<$substring> does not match any part of C<$string>, 1814returns C<-1> in scalar context and 1815an empty list in list context. 1816 1817e.g. when the content of C<$str> is C<"Ich mu>E<szlig>C< studieren Perl.">, 1818you say the following where C<$sub> is C<"M>E<uuml>C<SS">, 1819 1820 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 ); 1821 # (normalization => undef) is REQUIRED. 1822 my $match; 1823 if (my($pos,$len) = $Collator->index($str, $sub)) { 1824 $match = substr($str, $pos, $len); 1825 } 1826 1827and get C<"mu>E<szlig>C<"> in C<$match>, since C<"mu>E<szlig>C<"> 1828is primary equal to C<"M>E<uuml>C<SS">. 
1829 1830=item C<$match_ref = $Collator-E<gt>match($string, $substring)> 1831 1832=item C<($match) = $Collator-E<gt>match($string, $substring)> 1833 1834If C<$substring> matches a part of C<$string>, in scalar context, returns 1835B<a reference to> the first occurrence of the matching part 1836(C<$match_ref> is always true if matches, 1837since every reference is B<true>); 1838in list context, returns the first occurrence of the matching part. 1839 1840If C<$substring> does not match any part of C<$string>, 1841returns C<undef> in scalar context and 1842an empty list in list context. 1843 1844e.g. 1845 1846 if ($match_ref = $Collator->match($str, $sub)) { # scalar context 1847 print "matches [$$match_ref].\n"; 1848 } else { 1849 print "doesn't match.\n"; 1850 } 1851 1852 or 1853 1854 if (($match) = $Collator->match($str, $sub)) { # list context 1855 print "matches [$match].\n"; 1856 } else { 1857 print "doesn't match.\n"; 1858 } 1859 1860=item C<@match = $Collator-E<gt>gmatch($string, $substring)> 1861 1862If C<$substring> matches a part of C<$string>, returns 1863all the matching parts (or matching count in scalar context). 1864 1865If C<$substring> does not match any part of C<$string>, 1866returns an empty list. 1867 1868=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)> 1869 1870If C<$substring> matches a part of C<$string>, 1871the first occurrence of the matching part is replaced by C<$replacement> 1872(C<$string> is modified) and C<$count> (always equals to C<1>) is returned. 1873 1874C<$replacement> can be a C<CODEREF>, 1875taking the matching part as an argument, 1876and returning a string to replace the matching part 1877(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>). 
1878 1879=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)> 1880 1881If C<$substring> matches a part of C<$string>, 1882all the occurrences of the matching part are replaced by C<$replacement> 1883(C<$string> is modified) and C<$count> is returned. 1884 1885C<$replacement> can be a C<CODEREF>, 1886taking the matching part as an argument, 1887and returning a string to replace the matching part 1888(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>). 1889 1890e.g. 1891 1892 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 ); 1893 # (normalization => undef) is REQUIRED. 1894 my $str = "Camel donkey zebra came\x{301}l CAMEL horse cam\0e\0l..."; 1895 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" }); 1896 1897 # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cam\0e\0l</b>..."; 1898 # i.e., all the camels are made bold-faced. 1899 1900 Examples: levels and ignore_level2 - what does camel match? 1901 --------------------------------------------------------------------------- 1902 level ignore_level2 | camel Camel came\x{301}l c-a-m-e-l cam\0e\0l 1903 -----------------------|--------------------------------------------------- 1904 1 false | yes yes yes yes yes 1905 2 false | yes yes no yes yes 1906 3 false | yes no no yes yes 1907 4 false | yes no no no yes 1908 -----------------------|--------------------------------------------------- 1909 1 true | yes yes yes yes yes 1910 2 true | yes yes yes yes yes 1911 3 true | yes no yes yes yes 1912 4 true | yes no yes no yes 1913 --------------------------------------------------------------------------- 1914 note: if variable => non-ignorable, camel doesn't match c-a-m-e-l 1915 at any level. 
1916 1917=back 1918 1919=head2 Other Methods 1920 1921=over 4 1922 1923=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)> 1924 1925=item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)> 1926 1927Changes the value of specified keys and returns the changed part. 1928 1929 $Collator = Unicode::Collate->new(level => 4); 1930 1931 $Collator->eq("perl", "PERL"); # false 1932 1933 %old = $Collator->change(level => 2); # returns (level => 4). 1934 1935 $Collator->eq("perl", "PERL"); # true 1936 1937 $Collator->change(%old); # returns (level => 2). 1938 1939 $Collator->eq("perl", "PERL"); # false 1940 1941Not all C<(key,value)>s are allowed to be changed. 1942See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>. 1943 1944In the scalar context, returns the modified collator 1945(but it is B<not> a clone from the original). 1946 1947 $Collator->change(level => 2)->eq("perl", "PERL"); # true 1948 1949 $Collator->eq("perl", "PERL"); # true; now max level is 2nd. 1950 1951 $Collator->change(level => 4)->eq("perl", "PERL"); # false 1952 1953=item C<$version = $Collator-E<gt>version()> 1954 1955Returns the version number (a string) of the Unicode Standard 1956which the C<table> file used by the collator object is based on. 1957If the table does not include a version line (starting with C<@version>), 1958returns C<"unknown">. 1959 1960=item C<UCA_Version()> 1961 1962Returns the revision number of UTS #10 this module consults, 1963that should correspond with the DUCET incorporated. 1964 1965=item C<Base_Unicode_Version()> 1966 1967Returns the version number of UTS #10 this module consults, 1968that should correspond with the DUCET incorporated. 1969 1970=back 1971 1972=head1 EXPORT 1973 1974No method will be exported. 
1975 1976=head1 INSTALL 1977 1978Though this module can be used without any C<table> file, 1979to use this module easily, it is recommended to install a table file 1980in the UCA format, by copying it under the directory 1981<a place in @INC>/Unicode/Collate. 1982 1983The most preferable one is "The Default Unicode Collation Element Table" 1984(aka DUCET), available from the Unicode Consortium's website: 1985 1986 http://www.unicode.org/Public/UCA/ 1987 1988 http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version) 1989 1990If DUCET is not installed, it is recommended to copy the file 1991from http://www.unicode.org/Public/UCA/latest/allkeys.txt 1992to <a place in @INC>/Unicode/Collate/allkeys.txt 1993manually. 1994 1995=head1 CAVEATS 1996 1997=over 4 1998 1999=item Normalization 2000 2001Use of the C<normalization> parameter requires the B<Unicode::Normalize> 2002module (see L<Unicode::Normalize>). 2003 2004If you do not need it (say, when you do not need to 2005handle any combining characters), 2006assign C<(normalization =E<gt> undef)> explicitly. 2007 2008-- see 6.5 Avoiding Normalization, UTS #10. 2009 2010=item Conformance Test 2011 2012The Conformance Test for the UCA is available 2013under L<http://www.unicode.org/Public/UCA/>. 2014 2015For F<CollationTest_SHIFTED.txt>, 2016a collator via C<Unicode::Collate-E<gt>new( )> should be used; 2017for F<CollationTest_NON_IGNORABLE.txt>, a collator via 2018C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>. 2019 2020If C<UCA_Version> is 26 or later, the C<identical> level is preferred; 2021C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and 2022C<Unicode::Collate-E<gt>new(identical =E<gt> 1,> 2023C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used.
2024 2025B<Unicode::Normalize is required to try The Conformance Test.> 2026 2027=back 2028 2029=head1 AUTHOR, COPYRIGHT AND LICENSE 2030 2031The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki, 2032<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2013, 2033SADAHIRO Tomoyuki. Japan. All rights reserved. 2034 2035This module is free software; you can redistribute it and/or 2036modify it under the same terms as Perl itself. 2037 2038The file Unicode/Collate/allkeys.txt was copied verbatim 2039from L<http://www.unicode.org/Public/UCA/6.3.0/allkeys.txt>. 2040For this file, Copyright (c) 2001-2012 Unicode, Inc. 2041Distributed under the Terms of Use in L<http://www.unicode.org/copyright.html>. 2042 2043=head1 SEE ALSO 2044 2045=over 4 2046 2047=item Unicode Collation Algorithm - UTS #10 2048 2049L<http://www.unicode.org/reports/tr10/> 2050 2051=item The Default Unicode Collation Element Table (DUCET) 2052 2053L<http://www.unicode.org/Public/UCA/latest/allkeys.txt> 2054 2055=item The conformance test for the UCA 2056 2057L<http://www.unicode.org/Public/UCA/latest/CollationTest.html> 2058 2059L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip> 2060 2061=item Hangul Syllable Type 2062 2063L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt> 2064 2065=item Unicode Normalization Forms - UAX #15 2066 2067L<http://www.unicode.org/reports/tr15/> 2068 2069=item Unicode Locale Data Markup Language (LDML) - UTS #35 2070 2071L<http://www.unicode.org/reports/tr35/> 2072 2073=back 2074 2075=cut 2076