package Unicode::Collate;

# Sanity check at compile time: refuse to load when pack('U', 0x41) does not
# produce the character "A".
BEGIN {
    unless ("A" eq pack('U', 0x41)) {
        die "Unicode::Collate cannot stringify a Unicode code point\n";
    }
}

use 5.006;
use strict;
use warnings;
use Carp;
use File::Spec;

no warnings 'utf8';

our $VERSION = '0.97';
our $PACKAGE = __PACKAGE__;

# NOTE(review): the "XS only" markers below (and the trailing "### XS only"
# comments elsewhere in the file) look like they are consumed line-by-line by
# build tooling -- keep them on their own lines; confirm before reformatting.
### begin XS only ###
require DynaLoader;
our @ISA = qw(DynaLoader);
bootstrap Unicode::Collate $VERSION;
### end XS only ###

# Sub-directory (relative to an @INC entry) and default file name of the
# collation element table loaded by read_table().
my @Path = qw(Unicode Collate);
my $KeyFile = "allkeys.txt";

# Perl's boolean
use constant TRUE  => 1;
use constant FALSE => "";
# Value returned by index() in scalar context when nothing matches.
use constant NOMATCHPOS => -1;

# A coderef to get combining class imported from Unicode::Normalize
# (i.e. \&Unicode::Normalize::getCombinClass).
# This is also used as a HAS_UNICODE_NORMALIZE flag.
my $CVgetCombinClass;

# Supported Levels
use constant MinLevel => 1;
use constant MaxLevel => 4;

# Minimum weights at level 2 and 3, respectively
use constant Min2Wt => 0x20;
use constant Min3Wt => 0x02;

# Shifted weight at 4th level
use constant Shift4Wt => 0xFFFF;

# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
# (pack template: one unsigned char followed by four big-endian 16-bit values)
use constant VCE_TEMPLATE => 'Cn4';

# A sort key: 16-bit weights
use constant KEY_TEMPLATE => 'n*';

# The tie-breaking: 32-bit weights
use constant TIE_TEMPLATE => 'N*';

# Level separator in a sort key:
# i.e. pack(KEY_TEMPLATE, 0)
use constant LEVEL_SEP => "\0\0";

# As Unicode code point separator for hash keys.
# A joined code point string (denoted by JCPS below)
# like "65;768" is used for internal processing
# instead of Perl's Unicode string like "\x41\x{300}",
# as the native code point is different from the Unicode code point
# on EBCDIC platform.
# This character must not be included in any stringified
# representation of an integer.
use constant CODE_SEP => ';';
        # NOTE: in regex /;/ is used for $jcps!
# boolean values of variable weights
# (stored as the first field of a packed collation element, see VCE_TEMPLATE)
use constant NON_VAR => 0; # Non-Variable character
use constant VAR     => 1; # Variable character

# specific code points: first and last code points of the range that
# getWt() treats as Hangul syllables
use constant Hangul_SIni   => 0xAC00;
use constant Hangul_SFin   => 0xD7A3;

# Logical_Order_Exception in PropList.txt
# (default value of the 'rearrange' parameter for UCA_Version <= 11, see new())
my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];

# for highestFFFF and minimalFFFE
# (packed collation elements returned by getWt() for U+FFFF and U+FFFE
# when the respective option is true)
my $HighestVCE = pack(VCE_TEMPLATE, 0, 0xFFFE, 0x20, 0x5, 0xFFFF);
my $minimalVCE = pack(VCE_TEMPLATE, 0,      1, 0x20, 0x5, 0xFFFE);

# Default UCA revision; used by new() when 'UCA_Version' is not given.
sub UCA_Version { "26" }

# Returns the constant version string "6.2.0".
sub Base_Unicode_Version { "6.2.0" }

######

# Builds a string from a list of code points using pack('U*', ...).
sub pack_U {
    return pack('U*', @_);
}

######

# Set of accepted (lowercased) values of the 'variable' parameter;
# only the keys matter, see checkCollator().
my (%VariableOK);
@VariableOK{ qw/
    blanked  non-ignorable  shifted  shift-trimmed
  / } = (); # keys lowercased

# Attributes that may be modified through change().
our @ChangeOK = qw/
    alternate backwards level normalization rearrange
    katakana_before_hiragana upper_before_lower ignore_level2
    overrideHangul overrideCJK preprocess UCA_Version
    hangul_terminator variable identical highestFFFF minimalFFFE
  /;

# Attributes for which change() croaks.
our @ChangeNG = qw/
    entry mapping table maxlength contraction
    ignoreChar ignoreName undefChar undefName rewrite
    versionTable alternateTable backwardsTable forwardsTable
    rearrangeTable variableTable
    derivCode normCode rearrangeHash backwardsFlag
    suppress suppressHash
    __useXS /; ### XS only
# The hash key 'ignored' is deleted at v 0.21.
# The hash key 'isShift' is deleted at v 0.23.
# The hash key 'combining' is deleted at v 0.24.
# The hash key 'entries' is deleted at v 0.30.
# The hash key 'L3_ignorable' is deleted at v 0.40.
##
## string version = version()
##   The '@version' value found in the loaded table, or 'unknown'.
##
sub version {
    my $self = shift;
    return $self->{versionTable} || 'unknown';
}

# Lookup sets built from @ChangeOK / @ChangeNG; only the keys matter.
my (%ChangeOK, %ChangeNG);
@ChangeOK{ @ChangeOK } = ();
@ChangeNG{ @ChangeNG } = ();

##
## %old or $self = change(key => value, ...)
##   Modifies attributes listed in @ChangeOK, croaks for those listed in
##   @ChangeNG and silently ignores any other key.  'alternate' is treated
##   as an alias for 'variable' ('variable' wins when both are given).
##   The collator is re-validated with checkCollator() afterwards.
##   Returns the previous values in list context, $self otherwise.
##
sub change {
    my $self = shift;
    my %hash = @_;
    my %old;
    if (exists $hash{alternate}) {
        if (exists $hash{variable}) {
            delete $hash{alternate};
        } else {
            $hash{variable} = $hash{alternate};
        }
    }
    foreach my $k (keys %hash) {
        if (exists $ChangeOK{$k}) {
            $old{$k} = $self->{$k};
            $self->{$k} = $hash{$k};
        } elsif (exists $ChangeNG{$k}) {
            croak "change of $k via change() is not allowed!";
        }
        # else => ignored
    }
    $self->checkCollator();
    return wantarray ? %old : $self;
}

##
## _checkLevel(int level, string key)
##   Croaks unless MinLevel <= level <= MaxLevel; key names the parameter
##   being checked and only appears in the error message.
##
sub _checkLevel {
    my $level = shift;
    my $key   = shift; # 'level' or 'backwards'
    MinLevel <= $level or croak sprintf
        "Illegal level %d (in value for key '%s') lower than %d.",
            $level, $key, MinLevel;
    $level <= MaxLevel or croak sprintf
        "Unsupported level %d (in value for key '%s') higher than %d.",
            $level, $key, MaxLevel;
}

# UCA_Version => coderef deriving collation elements for code points
# that are not found in the table (stored as $self->{derivCode}).
my %DerivCode = (
    8 => \&_derivCE_8,
    9 => \&_derivCE_9,
   11 => \&_derivCE_9, # 11 == 9
   14 => \&_derivCE_14,
   16 => \&_derivCE_14, # 16 == 14
   18 => \&_derivCE_18,
   20 => \&_derivCE_20,
   22 => \&_derivCE_22,
   24 => \&_derivCE_24,
   26 => \&_derivCE_24, # 26 == 24
);

##
## checkCollator()
##   Validates the user-visible parameters and (re)computes the derived
##   fields: derivCode, variable/alternate, backwardsFlag, rearrangeHash
##   and normCode.  Croaks on any invalid parameter.
##
sub checkCollator {
    my $self = shift;
    _checkLevel($self->{level}, "level");

    $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
        or croak "Illegal UCA version (passed $self->{UCA_Version}).";

    $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
                                $self->{alternateTable} || 'shifted';
    $self->{variable} = $self->{alternate} = lc($self->{variable});
    exists $VariableOK{ $self->{variable} }
        or croak "$PACKAGE unknown variable parameter name: $self->{variable}";

    # backwardsFlag is a bit mask: bit N is set when level N is backwards.
    if (! defined $self->{backwards}) {
        $self->{backwardsFlag} = 0;
    } elsif (! ref $self->{backwards}) {
        _checkLevel($self->{backwards}, "backwards");
        $self->{backwardsFlag} = 1 << $self->{backwards};
    } else {
        my %level;
        $self->{backwardsFlag} = 0;
        for my $b (@{ $self->{backwards} }) {
            _checkLevel($b, "backwards");
            $level{$b} = 1;
        }
        # %level de-duplicates, so each bit is added at most once.
        for my $v (sort keys %level) {
            $self->{backwardsFlag} += 1 << $v;
        }
    }

    defined $self->{rearrange} or $self->{rearrange} = [];
    ref $self->{rearrange}
        or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";

    # keys of $self->{rearrangeHash} are $self->{rearrange}.
    $self->{rearrangeHash} = undef;

    if (@{ $self->{rearrange} }) {
        @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
    }

    $self->{normCode} = undef;

    if (defined $self->{normalization}) {
        eval { require Unicode::Normalize };
        $@ and croak "Unicode::Normalize is required to normalize strings";

        $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;

        if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
            $self->{normCode} = \&Unicode::Normalize::NFD;
        }
        elsif ($self->{normalization} ne 'prenormalized') {
            my $norm = $self->{normalization};
            $self->{normCode} = sub {
                Unicode::Normalize::normalize($norm, shift);
            };
            # Call it once on an empty string so that an unknown form name
            # is reported here rather than at the first collation.
            eval { $self->{normCode}->("") }; # try
            $@ and croak "$PACKAGE unknown normalization form name: $norm";
        }
    }
    return;
}

##
## collator = new(key => value, ...)
##   Blesses the given parameters, loads the table (unless 'table' is
##   explicitly undef), applies 'entry' tailoring, fills in defaults and
##   validates everything through checkCollator().
##
sub new
{
    my $class = shift;
    my $self = bless { @_ }, $class;

### begin XS only ###
    if (! exists $self->{table}     && !defined $self->{rewrite} &&
        !defined $self->{undefName} && !defined $self->{ignoreName} &&
        !defined $self->{undefChar} && !defined $self->{ignoreChar}) {
        $self->{__useXS} = \&_fetch_simple;
    } else {
        $self->{__useXS} = undef;
    }
### end XS only ###

    # keys of $self->{suppressHash} are $self->{suppress}.
    if ($self->{suppress} && @{ $self->{suppress} }) {
        @{ $self->{suppressHash} }{ @{ $self->{suppress} } } = ();
    } # before read_table()

    # If undef is passed explicitly, no file is read.
    $self->{table} = $KeyFile if ! exists $self->{table};
    $self->read_table() if defined $self->{table};

    if ($self->{entry}) {
        while ($self->{entry} =~ /([^\n]+)/g) {
            $self->parseEntry($1, TRUE);
        }
    }

    $self->{level} ||= MaxLevel;
    $self->{UCA_Version} ||= UCA_Version();

    $self->{overrideHangul} = FALSE
        if ! exists $self->{overrideHangul};
    $self->{overrideCJK} = FALSE
        if ! exists $self->{overrideCJK};
    $self->{normalization} = 'NFD'
        if ! exists $self->{normalization};
    $self->{rearrange} = $self->{rearrangeTable} ||
        ($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
        if ! exists $self->{rearrange};
    $self->{backwards} = $self->{backwardsTable}
        if ! exists $self->{backwards};

    $self->checkCollator();

    return $self;
}

##
## parseAtmark(string line)
##   Handles one '@' directive of the table; the leading '@' must already
##   have been removed.  Unrecognized directives are ignored.
##
sub parseAtmark {
    my $self = shift;
    my $line = shift; # after s/^\s*\@//

    if ($line =~ /^version\s*(\S*)/) {
        $self->{versionTable} ||= $1;
    }
    elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
        $self->{variableTable} ||= $1;
    }
    elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
        $self->{alternateTable} ||= $1;
    }
    elsif ($line =~ /^backwards\s+(\S*)/) {
        push @{ $self->{backwardsTable} }, $1;
    }
    elsif ($line =~ /^forwards\s+(\S*)/) { # perhaps no use
        push @{ $self->{forwardsTable} }, $1;
    }
    elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
        push @{ $self->{rearrangeTable} }, _getHexArray($1);
    }
}

##
## read_table()
##   Loads the collation element table.  With XS data in use, only the
##   lines returned by _fetch_rest() are parsed; otherwise the file named
##   by $self->{table} is searched for under Unicode/Collate in @INC.
##   Croaks when the file cannot be opened anywhere in @INC.
##
sub read_table {
    my $self = shift;

### begin XS only ###
    if ($self->{__useXS}) {
        my @rest = _fetch_rest(); # complex matter need to parse
        for my $line (@rest) {
            next if $line =~ /^\s*#/;

            if ($line =~ s/^\s*\@//) {
                $self->parseAtmark($line);
            } else {
                $self->parseEntry($line);
            }
        }
        return;
    }
### end XS only ###

    my($f, $fh);
    foreach my $d (@INC) {
        $f = File::Spec->catfile($d, @Path, $self->{table});
        # Three-argument open: the table name is always taken as a plain
        # file name, never as a mode or a pipe specification.
        last if open($fh, '<', $f);
        $f = undef;
    }
    if (!defined $f) {
        $f = File::Spec->catfile(@Path, $self->{table});
        croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
    }

    while (my $line = <$fh>) {
        next if $line =~ /^\s*#/;

        if ($line =~ s/^\s*\@//) {
            $self->parseAtmark($line);
        } else {
            $self->parseEntry($line);
        }
    }
    close $fh;
}


##
## get $line, parse it, and write an entry in $self
##
## parseEntry(string line, bool tailoring)
##   Updates $self->{mapping}, {maxlength} and {contraction}.
##   $tailoring is true for lines coming from the 'entry' parameter;
##   such lines are not subject to 'suppress'.
##
sub parseEntry
{
    my $self = shift;
    my $line = shift;
    my $tailoring = shift;
    my($name, $entry, @uv, @key);

    if (defined $self->{rewrite}) {
        $line = $self->{rewrite}->($line);
    }

    return if $line !~ /^\s*[0-9A-Fa-f]/;

    # removes comment and gets name
    $name = $1
        if $line =~ s/[#%]\s*(.*)//;
    # A line without a trailing comment has no name: use an empty string
    # so that the undefName/ignoreName matches below do not warn.
    $name = '' if !defined $name;
    return if defined $self->{undefName} && $name =~ /$self->{undefName}/;

    # gets element
    my($e, $k) = split /;/, $line;
    croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
        if ! $k;

    @uv = _getHexArray($e);
    return if !@uv;
    return if @uv > 1 && $self->{suppressHash} && !$tailoring &&
                  exists $self->{suppressHash}{$uv[0]};
    $entry = join(CODE_SEP, @uv); # in JCPS

    if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
        my $ele = pack_U(@uv);

        # regarded as if it were not entried in the table
        return
            if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;

        # replaced as completely ignorable
        $k = '[.0000.0000.0000.0000]'
            if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
    }

    # replaced as completely ignorable
    $k = '[.0000.0000.0000.0000]'
        if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;

    my $is_L3_ignorable = TRUE;

    foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
        my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
        my @wt = _getHexArray($arr);
        push @key, pack(VCE_TEMPLATE, $var, @wt);
        $is_L3_ignorable = FALSE
            if $wt[0] || $wt[1] || $wt[2];
        # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
        # is completely ignorable.
        # For expansion, an entry $is_L3_ignorable
        # if and only if "all" CEs are [.0000.0000.0000].
    }

    $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;

    # Longest contraction starting with this code point.
    if (@uv > 1) {
        if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) {
            $self->{maxlength}{$uv[0]} = @uv;
        }
    }
    # Record every proper prefix of a contraction of 3 or more code points
    # (used by splitEnt() for discontiguous contractions).
    if (@uv > 2) {
        while (@uv) {
            pop @uv;
            my $fake_entry = join(CODE_SEP, @uv); # in JCPS
            $self->{contraction}{$fake_entry} = 1;
        }
    }
}


##
## viewSortKey(string)
##   Passes the sort key of the string to visualizeSortKey().
##
sub viewSortKey
{
    my $self = shift;
    my $str  = shift;
    $self->visualizeSortKey($self->getSortKey($str));
}


##
## string = process(string)
##   Applies 'preprocess' and then the normalization code, when set.
##
sub process
{
    my $self = shift;
    my $str  = shift;
    my $prep = $self->{preprocess};
    my $norm = $self->{normCode};

    $str = $prep->($str) if ref $prep;
    $str = $norm->($str) if ref $norm;
    return $str;
}

##
## arrayref of JCPS   = splitEnt(string to be collated)
## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE)
##
sub splitEnt
{
    my $self = shift;
    my $str  = shift;
    my $wLen = shift; # with Length

    my $map  = $self->{mapping};
    my $max  = $self->{maxlength};
    my $reH  = $self->{rearrangeHash};
    my $vers = $self->{UCA_Version};
    my $ver9 = $vers >= 9 && $vers <= 11;
    my $uXS  = $self->{__useXS}; ### XS only

    my @buf;

    # get array of Unicode code point of string.
    my @src = unpack_U($str);

    # rearrangement:
    # Character positions are not kept if rearranged,
    # then neglected if $wLen is true.
    if ($reH && ! $wLen) {
        for (my $i = 0; $i < @src; $i++) {
            if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
                ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
                $i++;
            }
        }
    }

    # remove a code point marked as a completely ignorable.
    for (my $i = 0; $i < @src; $i++) {
        if (_isIllegal($src[$i]) || $vers <= 20 && _isNonchar($src[$i])) {
            $src[$i] = undef;
        } elsif ($ver9) {
            $src[$i] = undef if $map->{ $src[$i] }
                           ? @{ $map->{ $src[$i] } } == 0
                           : $uXS && _ignorable_simple($src[$i]); ### XS only
        }
    }

    for (my $i = 0; $i < @src; $i++) {
        my $jcps = $src[$i];

        # skip removed code point
        if (! defined $jcps) {
            if ($wLen && @buf) {
                $buf[-1][2] = $i + 1;
            }
            next;
        }

        my $i_orig = $i;

        # find contraction
        if ($max->{$jcps}) {
            my $temp_jcps = $jcps;
            my $jcpsLen = 1;
            my $maxLen = $max->{$jcps};

            for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
                next if ! defined $src[$p];
                $temp_jcps .= CODE_SEP . $src[$p];
                $jcpsLen++;
                if ($map->{$temp_jcps}) {
                    $jcps = $temp_jcps;
                    $i = $p;
                }
            }

        # discontiguous contraction with Combining Char (cf. UTS#10, S2.1).
        # This process requires Unicode::Normalize.
        # If "normalization" is undef, here should be skipped *always*
        # (in spite of bool value of $CVgetCombinClass),
        # since canonical ordering cannot be expected.
        # Blocked combining character should not be contracted.

            # $self->{normCode} is false in the case of "prenormalized".
            if ($self->{normalization}) {
                my $cont = $self->{contraction};
                my $preCC = 0;
                my $preCC_uc = 0;
                my $jcps_uc = $jcps;
                my(@out, @out_uc);

                for (my $p = $i + 1; $p < @src; $p++) {
                    next if ! defined $src[$p];
                    my $curCC = $CVgetCombinClass->($src[$p]);
                    last unless $curCC;
                    my $tail = CODE_SEP . $src[$p];

                    if ($preCC_uc != $curCC && ($map->{$jcps_uc.$tail} ||
                                               $cont->{$jcps_uc.$tail})) {
                        $jcps_uc .= $tail;
                        push @out_uc, $p;
                    } else {
                        $preCC_uc = $curCC;
                    }

                    if ($preCC != $curCC && $map->{$jcps.$tail}) {
                        $jcps .= $tail;
                        push @out, $p;
                    } else {
                        $preCC = $curCC;
                    }
                }

                if ($map->{$jcps_uc}) {
                    $jcps = $jcps_uc;
                    $src[$_] = undef for @out_uc;
                } else {
                    $src[$_] = undef for @out;
                }
            }
        }

        # skip completely ignorable
        if ($map->{$jcps} ? @{ $map->{$jcps} } == 0 :
            $uXS && $jcps !~ /;/ && _ignorable_simple($jcps)) { ### XS only
            if ($wLen && @buf) {
                $buf[-1][2] = $i + 1;
            }
            next;
        }

        push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
    }
    return \@buf;
}

##
## VCE = _pack_override(input, codepoint, derivCode)
##   input is one value returned by an overrideCJK/overrideHangul callback:
##   an arrayref of weights, a primary weight, or undef (=> derived CE).
##
sub _pack_override ($$$) {
    my $r = shift;
    my $u = shift;
    my $der = shift;

    if (ref $r) {
        return pack(VCE_TEMPLATE, NON_VAR, @$r);
    } elsif (defined $r) {
        return pack(VCE_TEMPLATE, NON_VAR, $r, Min2Wt, Min3Wt, $u);
    } else {
        return $der->($u);
    }
}

##
## list of VCE = getWt(JCPS)
##
sub getWt
{
    my $self = shift;
    my $u    = shift;
    my $map  = $self->{mapping};
    my $der  = $self->{derivCode};
    my $uXS  = $self->{__useXS}; ### XS only

    return if !defined $u;
    # 'eq' is deliberate: $u may be a JCPS string such as "65;768".
    return $self->varCE($HighestVCE) if $u eq 0xFFFF && $self->{highestFFFF};
    return $self->varCE($minimalVCE) if $u eq 0xFFFE && $self->{minimalFFFE};
    return map($self->varCE($_), @{ $map->{$u} }) if $map->{$u};
### begin XS only ###
    return map($self->varCE($_), _fetch_simple($u))
        if $uXS && _exists_simple($u);
### end XS only ###

    # JCPS must not be a contraction, then it's a code point.
    if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
        my $hang = $self->{overrideHangul};
        my @hangulCE;
        if ($hang) {
            @hangulCE = map _pack_override($_, $u, $der), $hang->($u);
        } elsif (!defined $hang) {
            @hangulCE = $der->($u);
        } else {
            my $max  = $self->{maxlength};
            my @decH = _decompHangul($u);

            if (@decH == 2) {
                my $contract = join(CODE_SEP, @decH);
                @decH = ($contract) if $map->{$contract};
            } else { # must be <@decH == 3>
                if ($max->{$decH[0]}) {
                    my $contract = join(CODE_SEP, @decH);
                    if ($map->{$contract}) {
                        @decH = ($contract);
                    } else {
                        $contract = join(CODE_SEP, @decH[0,1]);
                        $map->{$contract} and @decH = ($contract, $decH[2]);
                    }
                    # even if V's ignorable, LT contraction is not supported.
                    # If such a situation were required, NFD should be used.
                }
                if (@decH == 3 && $max->{$decH[1]}) {
                    my $contract = join(CODE_SEP, @decH[1,2]);
                    $map->{$contract} and @decH = ($decH[0], $contract);
                }
            }

            @hangulCE = map({
                    $map->{$_} ? @{ $map->{$_} } :
                    $uXS && _exists_simple($_) ? _fetch_simple($_) : ### XS only
                    $der->($_);
                } @decH);
        }
        return map $self->varCE($_), @hangulCE;
    } else {
        my $cjk  = $self->{overrideCJK};
        my $vers = $self->{UCA_Version};
        if ($cjk && _isUIdeo($u, $vers)) {
            my @cjkCE = map _pack_override($_, $u, $der), $cjk->($u);
            return map $self->varCE($_), @cjkCE;
        }
        if ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) {
            return map $self->varCE($_), _uideoCE_8($u);
        }
        return map $self->varCE($_), $der->($u);
    }
}


##
## string sortkey = getSortKey(string arg)
##
sub getSortKey
{
    my $self = shift;
    my $orig = shift;
    my $str  = $self->process($orig);
    my $rEnt = $self->splitEnt($str); # get an arrayref of JCPS
    my $vers = $self->{UCA_Version};
    my $term = $self->{hangul_terminator};
    my $lev  = $self->{level};
    my $iden = $self->{identical};

    my @buf; # weight arrays
    if ($term) {
        my $preHST = '';
        my $termCE = $self->varCE(pack(VCE_TEMPLATE, NON_VAR, $term, 0,0,0));
        foreach my $jcps (@$rEnt) {
            # weird things like VL, TL-contraction are not considered!
            my $curHST = join '', map getHST($_, $vers), split /;/, $jcps;
            if ($preHST && !$curHST || # hangul before non-hangul
                $preHST =~ /L\z/ && $curHST =~ /^T/ ||
                $preHST =~ /V\z/ && $curHST =~ /^L/ ||
                $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
                push @buf, $termCE;
            }
            $preHST = $curHST;
            push @buf, $self->getWt($jcps);
        }
        push @buf, $termCE if $preHST; # end at hangul
    } else {
        foreach my $jcps (@$rEnt) {
            push @buf, $self->getWt($jcps);
        }
    }

    my $rkey = $self->mk_SortKey(\@buf); ### XS only

    # Append a level separator, followed by the code points of the
    # processed string when the 'identical' tie-breaking level is enabled.
    if ($iden || $vers >= 26 && $lev == MaxLevel) {
        $rkey .= LEVEL_SEP;
        $rkey .= pack(TIE_TEMPLATE, unpack_U($str)) if $iden;
    }
    return $rkey;
}


##
## int compare = cmp(string a, string b)
##
## All comparison methods compare the sort keys of their two arguments.
##
sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }

##
## list[strings] sorted = sort(list[strings] arg)
##
## Each sort key is computed only once per string.
##
sub sort {
    my $obj = shift;
    return
        map { $_->[1] }
            sort{ $a->[0] cmp $b->[0] }
                map [ $obj->getSortKey($_), $_ ], @_;
}


##
## bool _nonIgnorAtLevel(arrayref weights, int level)
##   True when any weight from level 1 up to the given level is non-zero.
##
sub _nonIgnorAtLevel($$)
{
    my $wt = shift;
    return if ! defined $wt;
    my $lv = shift;
    return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
}

##
## bool _eqArray(
##    arrayref of arrayref[weights] source,
##    arrayref of arrayref[weights] substr,
##    int level)
## * comparison of graphemes vs graphemes.
##   @$source >= @$substr must be true (check it before call this);
##
## Only the first @$substr graphemes of source are compared, at the
## first $lev levels of each weight array.
##
sub _eqArray($$$)
{
    my $source = shift;
    my $substr = shift;
    my $lev = shift;

    for my $g (0..@$substr-1){
        # Do the $g'th graphemes have the same number of AV weights?
        return if @{ $source->[$g] } != @{ $substr->[$g] };

        for my $w (0..@{ $substr->[$g] }-1) {
            for my $v (0..$lev-1) {
                return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
            }
        }
    }
    return 1;
}

##
## (int position, int length)
## int position = index(string, substring, position, [undoc'ed global])
##
## With "global" (only for the list context),
##  returns list of arrayref[position, length].
##
## Without a match, returns an empty list in list context and
## NOMATCHPOS in scalar context.  Croaks when 'preprocess' or a
## normalization code is active, since positions would not map back
## to the original string.
##
sub index
{
    my $self = shift;
    $self->{preprocess} and
        croak "Don't use Preprocess with index(), match(), etc.";
    $self->{normCode} and
        croak "Don't use Normalization with index(), match(), etc.";

    my $str  = shift;
    my $len  = length($str);
    my $sub  = shift;
    my $subE = $self->splitEnt($sub);
    my $pos  = @_ ? shift : 0;
    # An explicit undef is treated like 0 (as before, but without warning).
    $pos = 0 if !defined $pos || $pos < 0;
    my $glob = shift;

    my $lev = $self->{level};
    my $v2i = $self->{UCA_Version} >= 9 &&
                $self->{variable} ne 'non-ignorable';

    # An empty (or completely ignorable) substring matches with length 0.
    if (! @$subE) {
        my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
        return $glob
            ? map([$_, 0], $temp..$len)
            : wantarray ? ($temp,0) : $temp;
    }
    $len < $pos
        and return wantarray ? () : NOMATCHPOS;
    my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
    @$strE
        or return wantarray ? () : NOMATCHPOS;

    my(@strWt, @iniPos, @finPos, @subWt, @g_ret);

    # Build the weight arrays of the substring, grouped into graphemes.
    my $last_is_variable;
    for my $vwt (map $self->getWt($_), @$subE) {
        my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
        my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);

        # "Ignorable (L1, L2) after Variable" since track. v. 9
        if ($v2i) {
            if ($var) {
                $last_is_variable = TRUE;
            }
            elsif (!$wt[0]) { # ignorable
                $to_be_pushed = FALSE if $last_is_variable;
            }
            else {
                $last_is_variable = FALSE;
            }
        }

        if (@subWt && !$var && !$wt[0]) {
            push @{ $subWt[-1] }, \@wt if $to_be_pushed;
        } elsif ($to_be_pushed) {
            push @subWt, [ \@wt ];
        }
        # else ===> skipped
    }

    my $end = @$strE - 1;

    $last_is_variable = FALSE; # reuse
    for (my $i = 0; $i <= $end; ) { # no $i++
        my $found_base = 0;

        # fetch a grapheme
        while ($i <= $end && $found_base == 0) {
            for my $vwt ($self->getWt($strE->[$i][0])) {
                my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
                my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);

                # "Ignorable (L1, L2) after Variable" since track. v. 9
                if ($v2i) {
                    if ($var) {
                        $last_is_variable = TRUE;
                    }
                    elsif (!$wt[0]) { # ignorable
                        $to_be_pushed = FALSE if $last_is_variable;
                    }
                    else {
                        $last_is_variable = FALSE;
                    }
                }

                if (@strWt && !$var && !$wt[0]) {
                    push @{ $strWt[-1] }, \@wt if $to_be_pushed;
                    $finPos[-1] = $strE->[$i][2];
                } elsif ($to_be_pushed) {
                    push @strWt, [ \@wt ];
                    push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
                    $finPos[-1] = NOMATCHPOS if $found_base;
                    push @finPos, $strE->[$i][2];
                    $found_base++;
                }
                # else ===> no-op
            }
            $i++;
        }

        # try to match
        while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
            if ($iniPos[0] != NOMATCHPOS &&
                    $finPos[$#subWt] != NOMATCHPOS &&
                        _eqArray(\@strWt, \@subWt, $lev)) {
                my $temp = $iniPos[0] + $pos;

                if ($glob) {
                    push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
                    splice @strWt,  0, $#subWt;
                    splice @iniPos, 0, $#subWt;
                    splice @finPos, 0, $#subWt;
                }
                else {
                    return wantarray
                        ? ($temp, $finPos[$#subWt] - $iniPos[0])
                        :  $temp;
                }
            }
            shift @strWt;
            shift @iniPos;
            shift @finPos;
        }
    }

    return $glob
        ? @g_ret
        : wantarray ? () : NOMATCHPOS;
}

##
## scalarref to matching part = match(string, substring)
##
## In list context the matching part itself is returned.
## Returns nothing when there is no match.
##
sub match
{
    my $self = shift;
    if (my($pos,$len) = $self->index($_[0], $_[1])) {
        my $temp = substr($_[0], $pos, $len);
        return wantarray ? $temp : \$temp;
        # An lvalue ref \substr should be avoided,
        # since its value is affected by modification of its referent.
    }
    else {
        return;
    }
}

##
## arrayref matching parts = gmatch(string, substring)
##
sub gmatch
{
    my $self = shift;
    my $str  = shift;
    my $sub  = shift;
    return map substr($str, $_->[0], $_->[1]),
                $self->index($str, $sub, 0, 'g');
}

##
## bool subst'ed = subst(string, substring, replace)
##
## The string ($_[0]) is modified in place.  When replace is a CODE
## reference, it is called with the matching part and its return value
## is used as the replacement.
##
sub subst
{
    my $self = shift;
    my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;

    if (my($pos,$len) = $self->index($_[0], $_[1])) {
        if ($code) {
            my $mat = substr($_[0], $pos, $len);
            substr($_[0], $pos, $len, $code->($mat));
        } else {
            substr($_[0], $pos, $len, $_[2]);
        }
        return TRUE;
    }
    else {
        return FALSE;
    }
}

##
## int count = gsubst(string, substring, replace)
##
## Like subst(), but replaces every match and returns their number.
##
sub gsubst
{
    my $self = shift;
    my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
    my $cnt = 0;

    # Replacement is carried out from the end, then use reverse.
    for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
        if ($code) {
            my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
            substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
        } else {
            substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
        }
        $cnt++;
    }
    return $cnt;
}

1;
__END__

=head1 NAME

Unicode::Collate - Unicode Collation Algorithm

=head1 SYNOPSIS

  use Unicode::Collate;

  #construct
  $Collator = Unicode::Collate->new(%tailoring);

  #sort
  @sorted = $Collator->sort(@not_sorted);

  #compare
  $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.

B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
according to Perl's Unicode support. See L<perlunicode>,
L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
Otherwise you can use C<preprocess> or should decode them before.

=head1 DESCRIPTION

This module is an implementation of Unicode Technical Standard #10
(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).

=head2 Constructor and Tailoring

The C<new> method returns a collator object. If new() is called
with no parameters, the collator should do the default collation.
1043 1044 $Collator = Unicode::Collate->new( 1045 UCA_Version => $UCA_Version, 1046 alternate => $alternate, # alias for 'variable' 1047 backwards => $levelNumber, # or \@levelNumbers 1048 entry => $element, 1049 hangul_terminator => $term_primary_weight, 1050 highestFFFF => $bool, 1051 identical => $bool, 1052 ignoreName => qr/$ignoreName/, 1053 ignoreChar => qr/$ignoreChar/, 1054 ignore_level2 => $bool, 1055 katakana_before_hiragana => $bool, 1056 level => $collationLevel, 1057 minimalFFFE => $bool, 1058 normalization => $normalization_form, 1059 overrideCJK => \&overrideCJK, 1060 overrideHangul => \&overrideHangul, 1061 preprocess => \&preprocess, 1062 rearrange => \@charList, 1063 rewrite => \&rewrite, 1064 suppress => \@charList, 1065 table => $filename, 1066 undefName => qr/$undefName/, 1067 undefChar => qr/$undefChar/, 1068 upper_before_lower => $bool, 1069 variable => $variable, 1070 ); 1071 1072=over 4 1073 1074=item UCA_Version 1075 1076If the revision (previously "tracking version") number of UCA is given, 1077behavior of that revision is emulated on collating. 1078If omitted, the return value of C<UCA_Version()> is used. 1079 1080The following revisions are supported. The default is 26. 1081 1082 UCA Unicode Standard DUCET (@version) 1083 ------------------------------------------------------- 1084 8 3.1 3.0.1 (3.0.1d9) 1085 9 3.1 with Corrigendum 3 3.1.1 (3.1.1) 1086 11 4.0 4.0.0 (4.0.0) 1087 14 4.1.0 4.1.0 (4.1.0) 1088 16 5.0 5.0.0 (5.0.0) 1089 18 5.1.0 5.1.0 (5.1.0) 1090 20 5.2.0 5.2.0 (5.2.0) 1091 22 6.0.0 6.0.0 (6.0.0) 1092 24 6.1.0 6.1.0 (6.1.0) 1093 26 6.2.0 6.2.0 (6.2.0) 1094 1095* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden 1096since C<UCA_Version> 22. 1097 1098* Fully ignorable characters were ignored, and would not interrupt 1099contractions with C<UCA_Version> 9 and 11. 1100 1101* Treatment of ignorables after variables and some behaviors 1102were changed at C<UCA_Version> 9. 
1103 1104* Characters regarded as CJK unified ideographs (cf. C<overrideCJK>) 1105depend on C<UCA_Version>. 1106 1107* Many hangul jamo are assigned at C<UCA_Version> 20, that will affect 1108C<hangul_terminator>. 1109 1110=item alternate 1111 1112-- see 3.2.2 Alternate Weighting, version 8 of UTS #10 1113 1114For backward compatibility, C<alternate> (old name) can be used 1115as an alias for C<variable>. 1116 1117=item backwards 1118 1119-- see 3.4 Backward Accents, UTS #10. 1120 1121 backwards => $levelNumber or \@levelNumbers 1122 1123Weights in reverse order; ex. level 2 (diacritic ordering) in French. 1124If omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>), 1125forwards at all the levels. 1126 1127=item entry 1128 1129-- see 5 Tailoring; 3.6.1 File Format, UTS #10. 1130 1131If the same character (or a sequence of characters) exists 1132in the collation element table through C<table>, 1133mapping to collation elements is overridden. 1134If it does not exist, the mapping is defined additionally. 1135 1136 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt) 11370063 0068 ; [.0E6A.0020.0002.0063] # ch 11380043 0068 ; [.0E6A.0020.0007.0043] # Ch 11390043 0048 ; [.0E6A.0020.0008.0043] # CH 1140006C 006C ; [.0F4C.0020.0002.006C] # ll 1141004C 006C ; [.0F4C.0020.0007.004C] # Ll 1142004C 004C ; [.0F4C.0020.0008.004C] # LL 114300F1 ; [.0F7B.0020.0002.00F1] # n-tilde 1144006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde 114500D1 ; [.0F7B.0020.0008.00D1] # N-tilde 1146004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde 1147ENTRY 1148 1149 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt) 115000E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e> 115100C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E> 1152ENTRY 1153 1154B<NOTE:> The code point in the UCA file format (before C<';'>) 1155B<must> be a Unicode code point (defined as hexadecimal), 1156but not a native code point. 
1157So C<0063> must always denote C<U+0063>, 1158but not a character of C<"\x63">. 1159 1160Weighting may vary depending on collation element table. 1161So ensure the weights defined in C<entry> will be consistent with 1162those in the collation element table loaded via C<table>. 1163 1164In DUCET v4.0.0, primary weight of C<C> is C<0E60> 1165and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A> 1166(as a value between C<0E60> and C<0E6D>) 1167makes ordering as C<C E<lt> CH E<lt> D>. 1168Exactly speaking DUCET already has some characters between C<C> and C<D>: 1169C<small capital C> (C<U+1D04>) with primary weight C<0E64>, 1170C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>, 1171and C<c-curl> (C<U+0255>) with C<0E69>. 1172Then primary weight C<0E6A> for C<CH> makes C<CH> 1173ordered between C<c-curl> and C<D>. 1174 1175=item hangul_terminator 1176 1177-- see 7.1.4 Trailing Weights, UTS #10. 1178 1179If a true value is given (non-zero but should be positive), 1180it will be added as a terminator primary weight to the end of 1181every standard Hangul syllable. Secondary and any higher weights 1182for terminator are set to zero. 1183If the value is false or C<hangul_terminator> key does not exist, 1184insertion of terminator weights will not be performed. 1185 1186Boundaries of Hangul syllables are determined 1187according to conjoining Jamo behavior in F<the Unicode Standard> 1188and F<HangulSyllableType.txt>. 1189 1190B<Implementation Note:> 1191(1) For expansion mapping (Unicode character mapped 1192to a sequence of collation elements), a terminator will not be added 1193between collation elements, even if Hangul syllable boundary exists there. 1194Addition of terminator is restricted to the next position 1195to the last collation element. 1196 1197(2) Non-conjoining Hangul letters 1198(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not 1199automatically terminated with a terminator primary weight. 
These characters may need terminator included in a collation element 1201table beforehand. 1202 1203=item highestFFFF 1204 1205-- see 5.14 Collation Elements, UTS #35. 1206 1207If the parameter is made true, C<U+FFFF> has a highest primary weight. 1208When a boolean of C<$coll-E<gt>ge($str, "abc")> and 1209C<$coll-E<gt>le($str, "abc\x{FFFF}")> is true, it is expected that C<$str> 1210begins with C<"abc">, or another primary equivalent. 1211C<$str> may be C<"abcd">, C<"abc012">, but should not include C<U+FFFF> 1212such as C<"abc\x{FFFF}xyz">. 1213 1214C<$coll-E<gt>le($str, "abc\x{FFFF}")> works almost like C<$coll-E<gt>lt($str, "abd")>, 1215but the latter has a problem that you should know which letter is 1216next to C<c>. For a certain language where C<ch> is the next letter, 1217C<"abch"> is greater than C<"abc\x{FFFF}">, but lesser than C<"abd">. 1218 1219Note: This is equivalent to C<entry =E<gt> 'FFFF ; [.FFFE.0020.0005.FFFF]'>. 1220Any other character than C<U+FFFF> can be tailored by C<entry>. 1221 1222=item identical 1223 1224-- see A.3 Deterministic Comparison, UTS #10. 1225 1226By default, strings whose weights are equal should be equal, 1227even though their code points are not equal. 1228Completely ignorable characters are ignored. 1229 1230If the parameter is made true, a final, tie-breaking level is used. 1231If no difference of weights is found after the comparison through 1232all the levels specified by C<level>, the comparison with code points 1233will be performed. 1234For the tie-breaking comparison, the sort key has code points 1235of the original string appended. 1236Completely ignorable characters are not ignored. 1237 1238If C<preprocess> and/or C<normalization> is applied, the code points 1239of the string after them (in NFD by default) are used. 1240 1241=item ignoreChar 1242 1243=item ignoreName 1244 1245-- see 3.6.2 Variable Weighting, UTS #10. 1246 1247Makes the entry in the table completely ignorable; 1248i.e.
as if the weights were zero at all levels. 1249 1250Through C<ignoreChar>, any character matching C<qr/$ignoreChar/> 1251will be ignored. Through C<ignoreName>, any character whose name 1252(given in the C<table> file as a comment) matches C<qr/$ignoreName/> 1253will be ignored. 1254 1255E.g. when 'a' and 'e' are ignorable, 1256'element' is equal to 'lament' (or 'lmnt'). 1257 1258=item ignore_level2 1259 1260-- see 5.1 Parametric Tailoring, UTS #10. 1261 1262By default, case-sensitive comparison (that is level 3 difference) 1263won't ignore accents (that is level 2 difference). 1264 1265If the parameter is made true, accents (and other primary ignorable 1266characters) are ignored, even though cases are taken into account. 1267 1268B<NOTE>: C<level> should be 3 or greater. 1269 1270=item katakana_before_hiragana 1271 1272-- see 7.2 Tertiary Weight Table, UTS #10. 1273 1274By default, hiragana is before katakana. 1275If the parameter is made true, this is reversed. 1276 1277B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana 1278distinctions must occur in level 3, and their weights at level 3 must be 1279the same as those mentioned in 7.3.1, UTS #10. 1280If you define your collation elements which violate this requirement, 1281this parameter does not work validly. 1282 1283=item level 1284 1285-- see 4.3 Form Sort Key, UTS #10. 1286 1287Set the maximum level. 1288Any higher levels than the specified one are ignored. 1289 1290 Level 1: alphabetic ordering 1291 Level 2: diacritic ordering 1292 Level 3: case ordering 1293 Level 4: tie-breaking (e.g. in the case when variable is 'shifted') 1294 1295 ex.level => 2, 1296 1297If omitted, the maximum is the 4th. 1298 1299B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level. 1300But this module only uses weights within 0xFFFF. 1301When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted' 1302and 'shift-trimmed'), the level 4 may be unreliable. 1303 1304See also C<identical>.
1305 1306=item minimalFFFE 1307 1308-- see 5.14 Collation Elements, UTS #35. 1309 1310If the parameter is made true, C<U+FFFE> has a minimal primary weight. 1311The comparison between C<"$a1\x{FFFE}$a2"> and C<"$b1\x{FFFE}$b2"> 1312first compares C<$a1> and C<$b1> at level 1, and 1313then C<$a2> and C<$b2> at level 1, as follows: 1314 1315 "ab\x{FFFE}a" 1316 "Ab\x{FFFE}a" 1317 "ab\x{FFFE}c" 1318 "Ab\x{FFFE}c" 1319 "ab\x{FFFE}xyz" 1320 "abc\x{FFFE}def" 1321 "abc\x{FFFE}xYz" 1322 "aBc\x{FFFE}xyz" 1323 "abcX\x{FFFE}def" 1324 "abcx\x{FFFE}xyz" 1325 "b\x{FFFE}aaa" 1326 "bbb\x{FFFE}a" 1327 1328Note: This is equivalent to C<entry =E<gt> 'FFFE ; [.0001.0020.0005.FFFE]'>. 1329Any other character than C<U+FFFE> can be tailored by C<entry>. 1330 1331=item normalization 1332 1333-- see 4.1 Normalize, UTS #10. 1334 1335If specified, strings are normalized before preparation of sort keys 1336(the normalization is executed after preprocess). 1337 1338A form name C<Unicode::Normalize::normalize()> accepts will be applied 1339as C<$normalization_form>. 1340Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>. 1341See C<Unicode::Normalize::normalize()> for detail. 1342If omitted, C<'NFD'> is used. 1343 1344C<normalization> is performed after C<preprocess> (if defined). 1345 1346Furthermore, special values, C<undef> and C<"prenormalized">, can be used, 1347though they are not concerned with C<Unicode::Normalize::normalize()>. 1348 1349If C<undef> (not a string C<"undef">) is passed explicitly 1350as the value for this key, 1351no normalization is carried out (this may make tailoring easier 1352if no normalization is desired). Under C<(normalization =E<gt> undef)>, 1353only contiguous contractions are resolved; 1354e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>, 1355C<A-cedilla-ring> would be primary equal to C<A>.
1356In this point, 1357C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })> 1358B<is not> equivalent to C<(normalization =E<gt> 'NFD')>. 1359 1360In the case of C<(normalization =E<gt> "prenormalized")>, 1361any normalization is not performed, but 1362discontiguous contractions with combining characters are performed. 1363Therefore 1364C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })> 1365B<is> equivalent to C<(normalization =E<gt> 'NFD')>. 1366If source strings are finely prenormalized, 1367C<(normalization =E<gt> 'prenormalized')> may save time for normalization. 1368 1369Except C<(normalization =E<gt> undef)>, 1370B<Unicode::Normalize> is required (see also B<CAVEAT>). 1371 1372=item overrideCJK 1373 1374-- see 7.1 Derived Collation Elements, UTS #10. 1375 1376By default, CJK unified ideographs are ordered in Unicode codepoint 1377order, but those in the CJK Unified Ideographs block are lesser than 1378those in the CJK Unified Ideographs Extension A etc. 1379 1380 In the CJK Unified Ideographs block: 1381 U+4E00..U+9FA5 if UCA_Version is 8, 9 or 11. 1382 U+4E00..U+9FBB if UCA_Version is 14 or 16. 1383 U+4E00..U+9FC3 if UCA_Version is 18. 1384 U+4E00..U+9FCB if UCA_Version is 20 or 22. 1385 U+4E00..U+9FCC if UCA_Version is 24 or 26. 1386 1387 In the CJK Unified Ideographs Extension blocks: 1388 Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version. 1389 Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or greater. 1390 Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or greater. 1391 1392Through C<overrideCJK>, ordering of CJK unified ideographs (including 1393extensions) can be overridden. 1394 1395ex. CJK unified ideographs in the JIS code point order. 
1396 1397 overrideCJK => sub { 1398 my $u = shift; # get a Unicode codepoint 1399 my $b = pack('n', $u); # to UTF-16BE 1400 my $s = your_unicode_to_sjis_converter($b); # convert 1401 my $n = unpack('n', $s); # convert sjis to short 1402 [ $n, 0x20, 0x2, $u ]; # return the collation element 1403 }, 1404 1405The return value may be an arrayref of 1st to 4th weights as shown 1406above. The return value may be an integer as the primary weight 1407as shown below. If C<undef> is returned, the default derived 1408collation element will be used. 1409 1410 overrideCJK => sub { 1411 my $u = shift; # get a Unicode codepoint 1412 my $b = pack('n', $u); # to UTF-16BE 1413 my $s = your_unicode_to_sjis_converter($b); # convert 1414 my $n = unpack('n', $s); # convert sjis to short 1415 return $n; # return the primary weight 1416 }, 1417 1418The return value may be a list containing zero or more of 1419an arrayref, an integer, or C<undef>. 1420 1421ex. ignores all CJK unified ideographs. 1422 1423 overrideCJK => sub {()}, # CODEREF returning empty list 1424 1425 # where ->eq("Pe\x{4E00}rl", "Perl") is true 1426 # as U+4E00 is a CJK unified ideograph and to be ignorable. 1427 1428If C<undef> is passed explicitly as the value for this key, 1429weights for CJK unified ideographs are treated as undefined. 1430But assignment of weight for CJK unified ideographs 1431in C<table> or C<entry> is still valid. 1432 1433B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>, 1434C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>, 1435C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified 1436ideographs. But they can't be overridden via C<overrideCJK> when you use 1437DUCET, as the table includes weights for them. C<table> or C<entry> has 1438priority over C<overrideCJK>. 1439 1440=item overrideHangul 1441 1442-- see 7.1 Derived Collation Elements, UTS #10. 
1443 1444By default, Hangul syllables are decomposed into Hangul Jamo, 1445even if C<(normalization =E<gt> undef)>. 1446But the mapping of Hangul syllables may be overridden. 1447 1448This parameter works like C<overrideCJK>, so see there for examples. 1449 1450If you want to override the mapping of Hangul syllables, 1451NFD and NFKD are not appropriate, since NFD and NFKD will decompose 1452Hangul syllables before overriding. FCD may decompose Hangul syllables 1453as the case may be. 1454 1455If C<undef> is passed explicitly as the value for this key, 1456weight for Hangul syllables is treated as undefined 1457without decomposition into Hangul Jamo. 1458But definition of weight for Hangul syllables 1459in C<table> or C<entry> is still valid. 1460 1461=item preprocess 1462 1463-- see 5.4 Preprocessing, UTS #10. 1464 1465If specified, the coderef is used to preprocess each string 1466before the formation of sort keys. 1467 1468ex. dropping English articles, such as "a" or "the". 1469Then, "the pen" is before "a pencil". 1470 1471 preprocess => sub { 1472 my $str = shift; 1473 $str =~ s/\b(?:an?|the)\s+//gi; 1474 return $str; 1475 }, 1476 1477C<preprocess> is performed before C<normalization> (if defined). 1478 1479ex. decoding strings in a legacy encoding such as shift-jis: 1480 1481 $sjis_collator = Unicode::Collate->new( 1482 preprocess => \&your_shiftjis_to_unicode_decoder, 1483 ); 1484 @result = $sjis_collator->sort(@shiftjis_strings); 1485 1486B<Note:> Strings returned from the coderef will be interpreted 1487according to Perl's Unicode support. See L<perlunicode>, 1488L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>. 1489 1490=item rearrange 1491 1492-- see 3.5 Rearrangement, UTS #10. 1493 1494Characters that are not coded in logical order and to be rearranged. 
1495If C<UCA_Version> is equal to or lesser than 11, default is: 1496 1497 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ], 1498 1499If you want to disallow any rearrangement, pass C<undef> or C<[]> 1500(a reference to an empty list) as the value for this key. 1501 1502If C<UCA_Version> is equal to or greater than 14, default is C<[]> 1503(i.e. no rearrangement). 1504 1505B<According to the version 9 of UCA, this parameter shall not be used; 1506but it is not warned at present.> 1507 1508=item rewrite 1509 1510If specified, the coderef is used to rewrite lines in C<table> or C<entry>. 1511The coderef will get each line, and then should return a rewritten line 1512according to the UCA file format. 1513If the coderef returns an empty line, the line will be skipped. 1514 1515e.g. any primary ignorable characters into tertiary ignorable: 1516 1517 rewrite => sub { 1518 my $line = shift; 1519 $line =~ s/\[\.0000\..{4}\..{4}\./[.0000.0000.0000./g; 1520 return $line; 1521 }, 1522 1523This example shows rewriting weights. C<rewrite> is allowed to 1524affect code points, weights, and the name. 1525 1526B<NOTE>: C<table> is available to use another table file; 1527preparing a modified table once would be more efficient than 1528rewriting lines on reading an unmodified table every time. 1529 1530=item suppress 1531 1532-- see suppress contractions in 5.14.11 Special-Purpose Commands, 1533UTS #35 (LDML). 1534 1535Contractions beginning with the specified characters are suppressed, 1536even if those contractions are defined in C<table>. 1537 1538An example for Russian and some languages using the Cyrillic script: 1539 1540 suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F], 1541 1542where 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE. 1543 1544B<NOTE>: Contractions via C<entry> are not suppressed. 1545 1546=item table 1547 1548-- see 3.6 Default Unicode Collation Element Table, UTS #10.
1549 1550You can use another collation element table if desired. 1551 1552The table file should be located in the F<Unicode/Collate> directory 1553on C<@INC>. Say, if the filename is F<Foo.txt>, 1554the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>. 1555 1556By default, F<allkeys.txt> (as the filename of DUCET) is used. 1557If you prepare your own table file, any name other than F<allkeys.txt> 1558may be better to avoid namespace conflict. 1559 1560B<NOTE>: When XSUB is used, the DUCET is compiled on building this 1561module, and it may save time at the run time. 1562Explicitly saying C<table =E<gt> 'allkeys.txt'> (or using another table), 1563or using C<ignoreChar>, C<ignoreName>, C<undefChar>, C<undefName> or 1564C<rewrite> will prevent this module from using the compiled DUCET. 1565 1566If C<undef> is passed explicitly as the value for this key, 1567no file is read (but you can define collation elements via C<entry>). 1568 1569A typical way to define a collation element table 1570without any file of table: 1571 1572 $onlyABC = Unicode::Collate->new( 1573 table => undef, 1574 entry => << 'ENTRIES', 15750061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A 15760041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A 15770062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B 15780042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B 15790063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C 15800043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C 1581ENTRIES 1582 ); 1583 1584If C<ignoreName> or C<undefName> is used, character names should be 1585specified as a comment (following C<#>) on each line. 1586 1587=item undefChar 1588 1589=item undefName 1590 1591-- see 6.3.4 Reducing the Repertoire, UTS #10. 1592 1593Undefines the collation element as if it were unassigned in the C<table>. 1594This reduces the size of the table.
1595If an unassigned character appears in the string to be collated, 1596the sort key is made from its codepoint 1597as a single-character collation element, 1598as it is greater than any other assigned collation elements 1599(in the codepoint order among the unassigned characters). 1600But, it'd be better to ignore characters 1601unfamiliar to you and maybe never used. 1602 1603Through C<undefChar>, any character matching C<qr/$undefChar/> 1604will be undefined. Through C<undefName>, any character whose name 1605(given in the C<table> file as a comment) matches C<qr/$undefName/> 1606will be undefined. 1607 1608ex. Collation weights for beyond-BMP characters are not stored in the object: 1609 1610 undefChar => qr/[^\0-\x{fffd}]/, 1611 1612=item upper_before_lower 1613 1614-- see 6.6 Case Comparisons, UTS #10. 1615 1616By default, lowercase is before uppercase. 1617If the parameter is made true, this is reversed. 1618 1619B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase 1620distinctions must occur in level 3, and their weights at level 3 must be 1621the same as those mentioned in 7.3.1, UTS #10. 1622If you define your collation elements which differ from this requirement, 1623this parameter doesn't work validly. 1624 1625=item variable 1626 1627-- see 3.6.2 Variable Weighting, UTS #10. 1628 1629This key allows for variable weighting of variable collation elements, 1630which are marked with an ASTERISK in the table 1631(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>). 1632 1633 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'. 1634 1635These names are case-insensitive. 1636By default (if specification is omitted), 'shifted' is adopted. 1637 1638 'Blanked' Variable elements are made ignorable at levels 1 through 3; 1639 considered at the 4th level. 1640 1641 'Non-Ignorable' Variable elements are not reset to ignorable.
1642 1643 'Shifted' Variable elements are made ignorable at levels 1 through 3; 1644 their level 4 weight is replaced by the old level 1 weight. 1645 Level 4 weight for Non-Variable elements is 0xFFFF. 1646 1647 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level 1648 are trimmed. 1649 1650=back 1651 1652=head2 Methods for Collation 1653 1654=over 4 1655 1656=item C<@sorted = $Collator-E<gt>sort(@not_sorted)> 1657 1658Sorts a list of strings. 1659 1660=item C<$result = $Collator-E<gt>cmp($a, $b)> 1661 1662Returns 1 (when C<$a> is greater than C<$b>) 1663or 0 (when C<$a> is equal to C<$b>) 1664or -1 (when C<$a> is lesser than C<$b>). 1665 1666=item C<$result = $Collator-E<gt>eq($a, $b)> 1667 1668=item C<$result = $Collator-E<gt>ne($a, $b)> 1669 1670=item C<$result = $Collator-E<gt>lt($a, $b)> 1671 1672=item C<$result = $Collator-E<gt>le($a, $b)> 1673 1674=item C<$result = $Collator-E<gt>gt($a, $b)> 1675 1676=item C<$result = $Collator-E<gt>ge($a, $b)> 1677 1678They work like the operators of the same names. 1679 1680 eq : whether $a is equal to $b. 1681 ne : whether $a is not equal to $b. 1682 lt : whether $a is lesser than $b. 1683 le : whether $a is lesser than $b or equal to $b. 1684 gt : whether $a is greater than $b. 1685 ge : whether $a is greater than $b or equal to $b. 1686 1687=item C<$sortKey = $Collator-E<gt>getSortKey($string)> 1688 1689-- see 4.3 Form Sort Key, UTS #10. 1690 1691Returns a sort key. 1692 1693You compare the sort keys using a binary comparison 1694and get the result of the comparison of the strings using UCA. 1695 1696 $Collator->getSortKey($a) cmp $Collator->getSortKey($b) 1697 1698 is equivalent to 1699 1700 $Collator->cmp($a, $b) 1701 1702=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)> 1703 1704Converts a sorting key into its representation form. 1705If C<UCA_Version> is 8, the output is slightly different.
1706 1707 use Unicode::Collate; 1708 my $c = Unicode::Collate->new(); 1709 print $c->viewSortKey("Perl"),"\n"; 1710 1711 # output: 1712 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF] 1713 # Level 1 Level 2 Level 3 Level 4 1714 1715=back 1716 1717=head2 Methods for Searching 1718 1719The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work 1720like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively, 1721but they are not aware of any pattern, but only a literal substring. 1722 1723B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true 1724for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>, 1725C<subst>, C<gsubst>) is croaked, as the position and the length might 1726differ from those on the specified string. 1727 1728C<rearrange> and C<hangul_terminator> parameters are neglected. 1729C<katakana_before_hiragana> and C<upper_before_lower> don't affect 1730matching and searching, as it doesn't matter whether greater or lesser. 1731 1732=over 4 1733 1734=item C<$position = $Collator-E<gt>index($string, $substring[, $position])> 1735 1736=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])> 1737 1738If C<$substring> matches a part of C<$string>, returns 1739the position of the first occurrence of the matching part in scalar context; 1740in list context, returns a two-element list of 1741the position and the length of the matching part. 1742 1743If C<$substring> does not match any part of C<$string>, 1744returns C<-1> in scalar context and 1745an empty list in list context. 1746 1747e.g. you say 1748 1749 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 ); 1750 # (normalization => undef) is REQUIRED. 
1751 my $str = "Ich muß studieren Perl."; 1752 my $sub = "MÜSS"; 1753 my $match; 1754 if (my($pos,$len) = $Collator->index($str, $sub)) { 1755 $match = substr($str, $pos, $len); 1756 } 1757 1758and get C<"muß"> in C<$match> since C<"muß"> 1759is primary equal to C<"MÜSS">. 1760 1761=item C<$match_ref = $Collator-E<gt>match($string, $substring)> 1762 1763=item C<($match) = $Collator-E<gt>match($string, $substring)> 1764 1765If C<$substring> matches a part of C<$string>, in scalar context, returns 1766B<a reference to> the first occurrence of the matching part 1767(C<$match_ref> is always true if matches, 1768since every reference is B<true>); 1769in list context, returns the first occurrence of the matching part. 1770 1771If C<$substring> does not match any part of C<$string>, 1772returns C<undef> in scalar context and 1773an empty list in list context. 1774 1775e.g. 1776 1777 if ($match_ref = $Collator->match($str, $sub)) { # scalar context 1778 print "matches [$$match_ref].\n"; 1779 } else { 1780 print "doesn't match.\n"; 1781 } 1782 1783 or 1784 1785 if (($match) = $Collator->match($str, $sub)) { # list context 1786 print "matches [$match].\n"; 1787 } else { 1788 print "doesn't match.\n"; 1789 } 1790 1791=item C<@match = $Collator-E<gt>gmatch($string, $substring)> 1792 1793If C<$substring> matches a part of C<$string>, returns 1794all the matching parts (or matching count in scalar context). 1795 1796If C<$substring> does not match any part of C<$string>, 1797returns an empty list. 1798 1799=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)> 1800 1801If C<$substring> matches a part of C<$string>, 1802the first occurrence of the matching part is replaced by C<$replacement> 1803(C<$string> is modified) and C<$count> (always equal to C<1>) is returned.
1804 1805C<$replacement> can be a C<CODEREF>, 1806taking the matching part as an argument, 1807and returning a string to replace the matching part 1808(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>). 1809 1810=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)> 1811 1812If C<$substring> matches a part of C<$string>, 1813all the occurrences of the matching part are replaced by C<$replacement> 1814(C<$string> is modified) and C<$count> is returned. 1815 1816C<$replacement> can be a C<CODEREF>, 1817taking the matching part as an argument, 1818and returning a string to replace the matching part 1819(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>). 1820 1821e.g. 1822 1823 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 ); 1824 # (normalization => undef) is REQUIRED. 1825 my $str = "Camel donkey zebra came\x{301}l CAMEL horse cam\0e\0l..."; 1826 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" }); 1827 1828 # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cam\0e\0l</b>..."; 1829 # i.e., all the camels are made bold-faced. 1830 1831 Examples: levels and ignore_level2 - what does camel match? 1832 --------------------------------------------------------------------------- 1833 level ignore_level2 | camel Camel came\x{301}l c-a-m-e-l cam\0e\0l 1834 -----------------------|--------------------------------------------------- 1835 1 false | yes yes yes yes yes 1836 2 false | yes yes no yes yes 1837 3 false | yes no no yes yes 1838 4 false | yes no no no yes 1839 -----------------------|--------------------------------------------------- 1840 1 true | yes yes yes yes yes 1841 2 true | yes yes yes yes yes 1842 3 true | yes no yes yes yes 1843 4 true | yes no yes no yes 1844 --------------------------------------------------------------------------- 1845 note: if variable => non-ignorable, camel doesn't match c-a-m-e-l 1846 at any level. 
1847 1848=back 1849 1850=head2 Other Methods 1851 1852=over 4 1853 1854=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)> 1855 1856=item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)> 1857 1858Changes the value of specified keys and returns the changed part. 1859 1860 $Collator = Unicode::Collate->new(level => 4); 1861 1862 $Collator->eq("perl", "PERL"); # false 1863 1864 %old = $Collator->change(level => 2); # returns (level => 4). 1865 1866 $Collator->eq("perl", "PERL"); # true 1867 1868 $Collator->change(%old); # returns (level => 2). 1869 1870 $Collator->eq("perl", "PERL"); # false 1871 1872Not all C<(key,value)>s are allowed to be changed. 1873See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>. 1874 1875In the scalar context, returns the modified collator 1876(but it is B<not> a clone from the original). 1877 1878 $Collator->change(level => 2)->eq("perl", "PERL"); # true 1879 1880 $Collator->eq("perl", "PERL"); # true; now max level is 2nd. 1881 1882 $Collator->change(level => 4)->eq("perl", "PERL"); # false 1883 1884=item C<$version = $Collator-E<gt>version()> 1885 1886Returns the version number (a string) of the Unicode Standard 1887which the C<table> file used by the collator object is based on. 1888If the table does not include a version line (starting with C<@version>), 1889returns C<"unknown">. 1890 1891=item C<UCA_Version()> 1892 1893Returns the revision number of UTS #10 this module consults, 1894that should correspond with the DUCET incorporated. 1895 1896=item C<Base_Unicode_Version()> 1897 1898Returns the version number of UTS #10 this module consults, 1899that should correspond with the DUCET incorporated. 1900 1901=back 1902 1903=head1 EXPORT 1904 1905No method will be exported. 
1906 1907=head1 INSTALL 1908 1909Though this module can be used without any C<table> file, 1910to use this module easily, it is recommended to install a table file 1911in the UCA format, by copying it under the directory 1912<a place in @INC>/Unicode/Collate. 1913 1914The most preferable one is "The Default Unicode Collation Element Table" 1915(aka DUCET), available from the Unicode Consortium's website: 1916 1917 http://www.unicode.org/Public/UCA/ 1918 1919 http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version) 1920 1921If DUCET is not installed, it is recommended to copy the file 1922from http://www.unicode.org/Public/UCA/latest/allkeys.txt 1923to <a place in @INC>/Unicode/Collate/allkeys.txt 1924manually. 1925 1926=head1 CAVEATS 1927 1928=over 4 1929 1930=item Normalization 1931 1932Use of the C<normalization> parameter requires the B<Unicode::Normalize> 1933module (see L<Unicode::Normalize>). 1934 1935If you do not need it (say, in the case when you need not 1936handle any combining characters), 1937assign C<normalization =E<gt> undef> explicitly. 1938 1939-- see 6.5 Avoiding Normalization, UTS #10. 1940 1941=item Conformance Test 1942 1943The Conformance Test for the UCA is available 1944under L<http://www.unicode.org/Public/UCA/>. 1945 1946For F<CollationTest_SHIFTED.txt>, 1947a collator via C<Unicode::Collate-E<gt>new( )> should be used; 1948for F<CollationTest_NON_IGNORABLE.txt>, a collator via 1949C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>. 1950 1951If C<UCA_Version> is 26 or later, the C<identical> level is preferred; 1952C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and 1953C<Unicode::Collate-E<gt>new(identical =E<gt> 1,> 1954C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used.
1955 1956B<Unicode::Normalize is required to try The Conformance Test.> 1957 1958=back 1959 1960=head1 AUTHOR, COPYRIGHT AND LICENSE 1961 1962The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki, 1963<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2012, 1964SADAHIRO Tomoyuki. Japan. All rights reserved. 1965 1966This module is free software; you can redistribute it and/or 1967modify it under the same terms as Perl itself. 1968 1969The file Unicode/Collate/allkeys.txt was copied verbatim 1970from L<http://www.unicode.org/Public/UCA/6.2.0/allkeys.txt>. 1971For this file, Copyright (c) 2001-2012 Unicode, Inc. 1972Distributed under the Terms of Use in L<http://www.unicode.org/copyright.html>. 1973 1974=head1 SEE ALSO 1975 1976=over 4 1977 1978=item Unicode Collation Algorithm - UTS #10 1979 1980L<http://www.unicode.org/reports/tr10/> 1981 1982=item The Default Unicode Collation Element Table (DUCET) 1983 1984L<http://www.unicode.org/Public/UCA/latest/allkeys.txt> 1985 1986=item The conformance test for the UCA 1987 1988L<http://www.unicode.org/Public/UCA/latest/CollationTest.html> 1989 1990L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip> 1991 1992=item Hangul Syllable Type 1993 1994L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt> 1995 1996=item Unicode Normalization Forms - UAX #15 1997 1998L<http://www.unicode.org/reports/tr15/> 1999 2000=item Unicode Locale Data Markup Language (LDML) - UTS #35 2001 2002L<http://www.unicode.org/reports/tr35/> 2003 2004=back 2005 2006=cut 2007