package Unicode::Collate;

use 5.006;
use strict;
use warnings;
use Carp;
use File::Spec;

no warnings 'utf8';

our $VERSION = '1.31';
our $PACKAGE = __PACKAGE__;

### begin XS only ###
use XSLoader ();
XSLoader::load('Unicode::Collate', $VERSION);
### end XS only ###

# Directory components (relative to each @INC entry) and default file name
# used by read_table() to locate the collation element table.
my @Path = qw(Unicode Collate);
my $KeyFile = 'allkeys.txt';

# Perl's boolean
use constant TRUE  => 1;
use constant FALSE => "";
use constant NOMATCHPOS => -1;

# A coderef to get combining class imported from Unicode::Normalize
# (i.e. \&Unicode::Normalize::getCombinClass).
# This is also used as a HAS_UNICODE_NORMALIZE flag.
my $CVgetCombinClass; # assigned in checkCollator() after Unicode::Normalize loads

# Supported Levels
use constant MinLevel => 1;
use constant MaxLevel => 4;

# Minimum weights at level 2 and 3, respectively
use constant Min2Wt => 0x20;
use constant Min3Wt => 0x02;

# Shifted weight at 4th level
use constant Shift4Wt => 0xFFFF;

# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
use constant VCE_TEMPLATE => 'Cn4';

# A sort key: 16-bit weights
use constant KEY_TEMPLATE => 'n*';

# The tie-breaking: 32-bit weights
use constant TIE_TEMPLATE => 'N*';

# Level separator in a sort key:
# i.e. pack(KEY_TEMPLATE, 0)
use constant LEVEL_SEP => "\0\0";

# As Unicode code point separator for hash keys.
# A joined code point string (denoted by JCPS below)
# like "65;768" is used for internal processing
# instead of Perl's Unicode string like "\x41\x{300}",
# as the native code point is different from the Unicode code point
# on EBCDIC platform.
# This character must not be included in any stringified
# representation of an integer.
use constant CODE_SEP => ';';
    # NOTE: in regex /;/ is used for $jcps!

# boolean values of variable weights
use constant NON_VAR => 0; # Non-Variable character
use constant VAR     => 1; # Variable character

# specific code points
use constant Hangul_SIni   => 0xAC00;
use constant Hangul_SFin   => 0xD7A3;

# Logical_Order_Exception in PropList.txt
my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];

# for highestFFFF and minimalFFFE
my $HighestVCE = pack(VCE_TEMPLATE, 0, 0xFFFE, 0x20, 0x5, 0xFFFF);
my $minimalVCE = pack(VCE_TEMPLATE, 0,      1, 0x20, 0x5, 0xFFFE);

# UCA version used when the caller does not pass UCA_Version to new().
sub UCA_Version { '43' }

sub Base_Unicode_Version { '13.0.0' }

######

# Code point converters: identity when $::IS_ASCII is true or on perls
# older than 5.8, otherwise the utf8:: conversion functions.
my $native_to_unicode = ($::IS_ASCII || $] < 5.008)
    ? sub { return shift }
    : sub { utf8::native_to_unicode(shift) };

my $unicode_to_native = ($::IS_ASCII || $] < 5.008)
    ? sub { return shift }
    : sub { utf8::unicode_to_native(shift) };

# pack_U() should get Unicode code points.
# Returns the string made of those code points.
sub pack_U {
    return pack('U*', map $unicode_to_native->($_), @_);
}

# unpack_U() should return Unicode code points.
# Takes one string; the appended pack('U*') is explained below.
sub unpack_U {
    return map $native_to_unicode->($_), unpack('U*', shift(@_).pack('U*'));
}
# for older perl version, pack('U*') generates empty string with utf8 flag.

######

# Accepted values (lowercased) of the 'variable' / 'alternate' parameter.
my (%VariableOK);
@VariableOK{ qw/
    blanked  non-ignorable  shifted  shift-trimmed
  / } = (); # keys lowercased

# Attributes that change() is allowed to modify.
our @ChangeOK = qw/
    alternate backwards level normalization rearrange
    katakana_before_hiragana upper_before_lower ignore_level2
    overrideCJK overrideHangul overrideOut preprocess UCA_Version
    hangul_terminator variable identical highestFFFF minimalFFFE
    long_contraction
  /;

# Attributes that change() refuses to modify (it croaks).
our @ChangeNG = qw/
    entry mapping table maxlength contraction
    ignoreChar ignoreName undefChar undefName rewrite
    versionTable alternateTable backwardsTable forwardsTable
    rearrangeTable variableTable
    derivCode normCode rearrangeHash backwardsFlag
    suppress suppressHash
    __useXS /; ### XS only
# The hash key 'ignored' was deleted at v 0.21.
# The hash key 'isShift' was deleted at v 0.23.
# The hash key 'combining' was deleted at v 0.24.
# The hash key 'entries' was deleted at v 0.30.
# The hash key 'L3_ignorable' was deleted at v 0.40.
##
## string version = version()
## Returns the '@version' value read from the table, or 'unknown'.
##
sub version {
    my $self = shift;
    return $self->{versionTable} || 'unknown';
}

# Lookup hashes built from @ChangeOK / @ChangeNG (only the keys matter).
my (%ChangeOK, %ChangeNG);
@ChangeOK{ @ChangeOK } = ();
@ChangeNG{ @ChangeNG } = ();

##
## change(key => value, ...)
## Updates the attributes listed in @ChangeOK, croaks on those listed in
## @ChangeNG, silently ignores any other key, then re-validates through
## checkCollator().
## Returns the previous values (key/value list) in list context,
## otherwise the collator itself.
##
sub change {
    my $self = shift;
    my %hash = @_;
    my %old;

    # 'alternate' is an alias of 'variable'; 'variable' wins if both given.
    if (exists $hash{alternate}) {
        if (exists $hash{variable}) {
            delete $hash{alternate};
        } else {
            $hash{variable} = $hash{alternate};
        }
    }
    foreach my $k (keys %hash) {
        if (exists $ChangeOK{$k}) {
            $old{$k} = $self->{$k};
            $self->{$k} = $hash{$k};
        } elsif (exists $ChangeNG{$k}) {
            croak "change of $k via change() is not allowed!";
        }
        # else => ignored
    }
    $self->checkCollator();
    return wantarray ? %old : $self;
}

##
## _checkLevel(level, key)
## Croaks unless MinLevel <= level <= MaxLevel; key names the parameter
## in the error message.
##
sub _checkLevel {
    my $level = shift;
    my $key   = shift; # 'level' or 'backwards'
    MinLevel <= $level or croak sprintf
        "Illegal level %d (in value for key '%s') lower than %d.",
            $level, $key, MinLevel;
    $level <= MaxLevel or croak sprintf
        "Unsupported level %d (in value for key '%s') higher than %d.",
            $level, $key, MaxLevel;
}

# UCA_Version => coderef producing derived collation elements.
my %DerivCode = (
    8  => \&_derivCE_8,
    9  => \&_derivCE_9,
   11  => \&_derivCE_9, # 11 == 9
   14  => \&_derivCE_14,
   16  => \&_derivCE_14, # 16 == 14
   18  => \&_derivCE_18,
   20  => \&_derivCE_20,
   22  => \&_derivCE_22,
   24  => \&_derivCE_24,
   26  => \&_derivCE_24, # 26 == 24
   28  => \&_derivCE_24, # 28 == 24
   30  => \&_derivCE_24, # 30 == 24
   32  => \&_derivCE_32,
   34  => \&_derivCE_34,
   36  => \&_derivCE_36,
   38  => \&_derivCE_38,
   40  => \&_derivCE_40,
   41  => \&_derivCE_40, # 41 == 40
   43  => \&_derivCE_43,
);

##
## checkCollator()
## Validates the attributes and (re)computes the derived ones:
## derivCode, variable/alternate, backwardsFlag, rearrangeHash, normCode.
## Croaks on an illegal level, UCA version, variable name or normalization
## form, or when Unicode::Normalize is needed but cannot be loaded.
##
sub checkCollator {
    my $self = shift;
    _checkLevel($self->{level}, 'level');

    $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
        or croak "Illegal UCA version (passed $self->{UCA_Version}).";

    $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
                                $self->{alternateTable} || 'shifted';
    $self->{variable} = $self->{alternate} = lc($self->{variable});
    exists $VariableOK{ $self->{variable} }
        or croak "$PACKAGE unknown variable parameter name: $self->{variable}";

    # backwardsFlag is a bit mask: bit N is set when level N is backwards.
    if (! defined $self->{backwards}) {
        $self->{backwardsFlag} = 0;
    } elsif (! ref $self->{backwards}) {
        _checkLevel($self->{backwards}, 'backwards');
        $self->{backwardsFlag} = 1 << $self->{backwards};
    } else {
        my %level;
        $self->{backwardsFlag} = 0;
        for my $lv (@{ $self->{backwards} }) {
            _checkLevel($lv, 'backwards');
            $level{$lv} = 1; # de-duplicate so each bit is added once
        }
        for my $v (sort keys %level) {
            $self->{backwardsFlag} += 1 << $v;
        }
    }

    defined $self->{rearrange} or $self->{rearrange} = [];
    ref $self->{rearrange}
        or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";

    # keys of $self->{rearrangeHash} are $self->{rearrange}.
    $self->{rearrangeHash} = undef;

    if (@{ $self->{rearrange} }) {
        @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
    }

    $self->{normCode} = undef;

    if (defined $self->{normalization}) {
        eval { require Unicode::Normalize };
        $@ and croak "Unicode::Normalize is required to normalize strings";

        $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;

        if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
            $self->{normCode} = \&Unicode::Normalize::NFD;
        }
        elsif ($self->{normalization} ne 'prenormalized') {
            my $norm = $self->{normalization};
            $self->{normCode} = sub {
                Unicode::Normalize::normalize($norm, shift);
            };
            eval { $self->{normCode}->("") }; # try
            $@ and croak "$PACKAGE unknown normalization form name: $norm";
        }
    }
    return;
}

##
## collator = new(key => value, ...)
## Builds a collator from the given attributes, reads the table
## (unless table => undef was passed), applies the 'entry' tailoring,
## fills in defaults and validates through checkCollator().
##
sub new
{
    my $class = shift;
    my $self = bless { @_ }, $class;

### begin XS only ###
    # The XS fast path is used only when no table is named and none of the
    # options that alter table parsing is given.
    if (! exists $self->{table}     && !defined $self->{rewrite} &&
        !defined $self->{undefName} && !defined $self->{ignoreName} &&
        !defined $self->{undefChar} && !defined $self->{ignoreChar}) {
        $self->{__useXS} = \&_fetch_simple;
    } else {
        $self->{__useXS} = undef;
    }
### end XS only ###

    # keys of $self->{suppressHash} are $self->{suppress}.
    if ($self->{suppress} && @{ $self->{suppress} }) {
        @{ $self->{suppressHash} }{ @{ $self->{suppress} } } = ();
    } # before read_table()

    # If undef is passed explicitly, no file is read.
    $self->{table} = $KeyFile if ! exists $self->{table};
    $self->read_table() if defined $self->{table};

    if ($self->{entry}) {
        while ($self->{entry} =~ /([^\n]+)/g) {
            $self->parseEntry($1, TRUE);
        }
    }

    # only in new(), not in change()
    $self->{level} ||= MaxLevel;
    $self->{UCA_Version} ||= UCA_Version();

    $self->{overrideHangul} = FALSE
        if ! exists $self->{overrideHangul};
    $self->{overrideCJK} = FALSE
        if ! exists $self->{overrideCJK};
    $self->{normalization} = 'NFD'
        if ! exists $self->{normalization};
    $self->{rearrange} = $self->{rearrangeTable} ||
        ($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
        if ! exists $self->{rearrange};
    $self->{backwards} = $self->{backwardsTable}
        if ! exists $self->{backwards};
    exists $self->{long_contraction} or $self->{long_contraction}
        = 22 <= $self->{UCA_Version} && $self->{UCA_Version} <= 24;

    # checkCollator() will be called in change()
    $self->checkCollator();

    return $self;
}

##
## parseAtmark(line)
## Handles one '@' directive of the table (the leading '@' is already
## removed by the caller) and stores its value in the *Table attributes.
##
sub parseAtmark {
    my $self = shift;
    my $line = shift; # after s/^\s*\@//

    if ($line =~ /^version\s*(\S*)/) {
        $self->{versionTable} ||= $1;
    }
    elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
        $self->{variableTable} ||= $1;
    }
    elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
        $self->{alternateTable} ||= $1;
    }
    elsif ($line =~ /^backwards\s+(\S*)/) {
        push @{ $self->{backwardsTable} }, $1;
    }
    elsif ($line =~ /^forwards\s+(\S*)/) { # perhaps no use
        push @{ $self->{forwardsTable} }, $1;
    }
    elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
        push @{ $self->{rearrangeTable} }, _getHexArray($1);
    }
}

##
## read_table()
## Feeds every non-comment line of the table to parseAtmark() or
## parseEntry().  With the XS fast path only the lines returned by
## _fetch_rest() are parsed; otherwise $self->{table} is searched for
## under Unicode/Collate in each @INC directory.
## Croaks when the table file cannot be opened anywhere in @INC.
##
sub read_table {
    my $self = shift;

### begin XS only ###
    if ($self->{__useXS}) {
        my @rest = _fetch_rest(); # complex matter need to parse
        for my $line (@rest) {
            next if $line =~ /^\s*#/;

            if ($line =~ s/^\s*\@//) {
                $self->parseAtmark($line);
            } else {
                $self->parseEntry($line);
            }
        }
        return;
    }
### end XS only ###

    my($f, $fh);
    foreach my $d (@INC) {
        $f = File::Spec->catfile($d, @Path, $self->{table});
        # three-argument open: the path is never interpreted as a mode/pipe
        last if open($fh, '<', $f);
        $f = undef;
    }
    if (!defined $f) {
        $f = File::Spec->catfile(@Path, $self->{table});
        croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
    }

    while (my $line = <$fh>) {
        next if $line =~ /^\s*#/;

        if ($line =~ s/^\s*\@//) {
            $self->parseAtmark($line);
        } else {
            $self->parseEntry($line);
        }
    }
    close $fh;
}


##
## get $line, parse it, and write an entry in $self
##
## parseEntry(line, tailoring)
## A true $tailoring marks a line coming from the 'entry' parameter;
## such lines are not subject to 'suppress'.
## Updates $self->{mapping}, $self->{maxlength} and $self->{contraction}.
## Croaks when the line has no ';' separated collation element part.
##
sub parseEntry
{
    my $self = shift;
    my $line = shift;
    my $tailoring = shift;
    my($entry, @uv, @key);

    # Entries without a trailing comment have no name; use '' so that the
    # undefName/ignoreName matches below do not warn about undef.
    my $name = '';

    if (defined $self->{rewrite}) {
        $line = $self->{rewrite}->($line);
    }

    return if $line !~ /^\s*[0-9A-Fa-f]/;

    # removes comment and gets name
    $name = $1
        if $line =~ s/[#%]\s*(.*)//;
    return if defined $self->{undefName} && $name =~ /$self->{undefName}/;

    # gets element
    my($e, $k) = split /;/, $line;
    croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
        if ! $k;

    @uv = _getHexArray($e);
    return if !@uv;
    return if @uv > 1 && $self->{suppressHash} && !$tailoring &&
              exists $self->{suppressHash}{$uv[0]};
    $entry = join(CODE_SEP, @uv); # in JCPS

    if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
        my $ele = pack_U(@uv);

        # regarded as if it were not stored in the table
        return
            if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;

        # replaced as completely ignorable
        $k = '[.0000.0000.0000.0000]'
            if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
    }

    # replaced as completely ignorable
    $k = '[.0000.0000.0000.0000]'
        if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;

    my $is_L3_ignorable = TRUE;

    foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
        my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
        my @wt = _getHexArray($arr);
        push @key, pack(VCE_TEMPLATE, $var, @wt);
        $is_L3_ignorable = FALSE
            if $wt[0] || $wt[1] || $wt[2];
        # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
        # is completely ignorable.
        # For expansion, an entry $is_L3_ignorable
        # if and only if "all" CEs are [.0000.0000.0000].
    }

    # mapping: be an array ref or not exists (any false value is disallowed)
    $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;

    # maxlength: be more than 1 or not exists (any false value is disallowed)
    if (@uv > 1) {
        if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) {
            $self->{maxlength}{$uv[0]} = @uv;
        }
    }

    # contraction: be 1 or not exists (any false value is disallowed)
    # Every proper prefix of length >= 2 is recorded as a "fake" entry.
    while (@uv > 2) {
        pop @uv;
        my $fake_entry = join(CODE_SEP, @uv); # in JCPS
        $self->{contraction}{$fake_entry} = 1;
    }
}


##
## viewSortKey(string)
## Passes the sort key of the string to visualizeSortKey().
##
sub viewSortKey
{
    my $self = shift;
    my $str  = shift;
    $self->visualizeSortKey($self->getSortKey($str));
}


##
## string processed = process(string)
## Applies the 'preprocess' coderef and then the normalization coderef,
## each only when it is set.
##
sub process
{
    my $self = shift;
    my $str  = shift;
    my $prep = $self->{preprocess};
    my $norm = $self->{normCode};

    $str = $prep->($str) if ref $prep;
    $str = $norm->($str) if ref $norm;
    return $str;
}

##
## arrayref of JCPS   = splitEnt(string to be collated)
## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE)
##
sub splitEnt
{
    my $self = shift;
    my $str  = shift;
    my $wLen = shift; # with Length

    my $map  = $self->{mapping};
    my $max  = $self->{maxlength};
    my $reH  = $self->{rearrangeHash};
    my $vers = $self->{UCA_Version};
    my $ver9 = $vers >= 9 && $vers <= 11;
    my $long = $self->{long_contraction};
    my $uXS  = $self->{__useXS}; ### XS only

    my @buf;

    # get array of Unicode code point of string.
    my @src = unpack_U($str);

    # rearrangement:
    # Character positions are not kept if rearranged,
    # then neglected if $wLen is true.
    if ($reH && ! $wLen) {
        for (my $i = 0; $i < @src; $i++) {
            if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
                ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
                $i++;
            }
        }
    }

    # remove a code point marked as a completely ignorable.
    for (my $i = 0; $i < @src; $i++) {
        if ($vers <= 20 && _isIllegal($src[$i])) {
            $src[$i] = undef;
        } elsif ($ver9) {
            $src[$i] = undef if exists $map->{ $src[$i] }
                           ? @{ $map->{ $src[$i] } } == 0
                           : $uXS && _ignorable_simple($src[$i]); ### XS only
        }
    }

    for (my $i = 0; $i < @src; $i++) {
        my $jcps = $src[$i];

        # skip removed code point
        if (! defined $jcps) {
            if ($wLen && @buf) {
                $buf[-1][2] = $i + 1;
            }
            next;
        }

        my $i_orig = $i;

        # find contraction
        if (exists $max->{$jcps}) {
            my $temp_jcps = $jcps;
            my $jcpsLen = 1;
            my $maxLen = $max->{$jcps};

            for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
                next if ! defined $src[$p];
                $temp_jcps .= CODE_SEP . $src[$p];
                $jcpsLen++;
                if (exists $map->{$temp_jcps}) {
                    $jcps = $temp_jcps;
                    $i = $p;
                }
            }

        # discontiguous contraction with Combining Char (cf. UTS#10, S2.1).
        # This process requires Unicode::Normalize.
        # If "normalization" is undef, here should be skipped *always*
        # (in spite of bool value of $CVgetCombinClass),
        # since canonical ordering cannot be expected.
        # Blocked combining character should not be contracted.

            # $self->{normCode} is false in the case of "prenormalized".
            if ($self->{normalization}) {
                my $cont = $self->{contraction};
                my $preCC = 0;
                my $preCC_uc = 0;
                my $jcps_uc = $jcps;
                my(@out, @out_uc);

                for (my $p = $i + 1; $p < @src; $p++) {
                    next if ! defined $src[$p];
                    my $curCC = $CVgetCombinClass->($src[$p]);
                    last unless $curCC;
                    my $tail = CODE_SEP . $src[$p];

                    if ($preCC != $curCC && exists $map->{$jcps.$tail}) {
                        $jcps .= $tail;
                        push @out, $p;
                    } else {
                        $preCC = $curCC;
                    }

                    next if !$long;

                    if ($preCC_uc != $curCC &&
                            (exists $map->{$jcps_uc.$tail} ||
                            exists $cont->{$jcps_uc.$tail})) {
                        $jcps_uc .= $tail;
                        push @out_uc, $p;
                    } else {
                        $preCC_uc = $curCC;
                    }
                }

                if (@out_uc && exists $map->{$jcps_uc}) {
                    $jcps = $jcps_uc;
                    $src[$_] = undef for @out_uc;
                } else {
                    $src[$_] = undef for @out;
                }
            }
        }

        # skip completely ignorable
        if (exists $map->{$jcps} ? @{ $map->{$jcps} } == 0 :
            $uXS && $jcps !~ /;/ && _ignorable_simple($jcps)) { ### XS only
            if ($wLen && @buf) {
                $buf[-1][2] = $i + 1;
            }
            next;
        }

        push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
    }
    return \@buf;
}

##
## VCE = _pack_override(input, codepoint, derivCode)
##
## input is one value returned by an override coderef:
##   an array ref -> its elements are packed as the weights,
##   a defined scalar -> primary weight, with Min2Wt, Min3Wt and the
##                       code point as the remaining weights,
##   undef -> the derived CE(s) from derivCode (code points above
##            0x10FFFF are treated as 0xFFFD).
##
sub _pack_override ($$$) {
    my $r = shift;
    my $u = shift;
    my $der = shift;

    if (ref $r) {
        return pack(VCE_TEMPLATE, NON_VAR, @$r);
    } elsif (defined $r) {
        return pack(VCE_TEMPLATE, NON_VAR, $r, Min2Wt, Min3Wt, $u);
    } else {
        $u = 0xFFFD if 0x10FFFF < $u;
        return $der->($u);
    }
}

##
## list of VCE = getWt(JCPS)
##
sub getWt
{
    my $self = shift;
    my $u    = shift;
    my $map  = $self->{mapping};
    my $der  = $self->{derivCode};
    my $out  = $self->{overrideOut};
    my $uXS  = $self->{__useXS}; ### XS only

    return if !defined $u;
    # 'eq' (not '==') because $u may be a JCPS such as "65;768".
    return $self->varCE($HighestVCE) if $u eq 0xFFFF && $self->{highestFFFF};
    return $self->varCE($minimalVCE) if $u eq 0xFFFE && $self->{minimalFFFE};
    $u = 0xFFFD if $u !~ /;/ && 0x10FFFF < $u && !$out;

    my @ce;
    if (exists $map->{$u}) {
        @ce = @{ $map->{$u} }; # $u may be a contraction
### begin XS only ###
    } elsif ($uXS && _exists_simple($u)) {
        @ce = _fetch_simple($u);
### end XS only ###
    } elsif (Hangul_SIni <= $u && $u <= Hangul_SFin) {
        my $hang = $self->{overrideHangul};
        if ($hang) {
            @ce = map _pack_override($_, $u, $der), $hang->($u);
        } elsif (!defined $hang) {
            @ce = $der->($u);
        } else {
            my $max  = $self->{maxlength};
            my @decH = _decompHangul($u);

            if (@decH == 2) {
                my $contract = join(CODE_SEP, @decH);
                @decH = ($contract) if exists $map->{$contract};
            } else { # must be <@decH == 3>
                if (exists $max->{$decH[0]}) {
                    my $contract = join(CODE_SEP, @decH);
                    if (exists $map->{$contract}) {
                        @decH = ($contract);
                    } else {
                        $contract = join(CODE_SEP, @decH[0,1]);
                        exists $map->{$contract} and @decH = ($contract, $decH[2]);
                    }
                    # even if V's ignorable, LT contraction is not supported.
                    # If such a situation were required, NFD should be used.
                }
                if (@decH == 3 && exists $max->{$decH[1]}) {
                    my $contract = join(CODE_SEP, @decH[1,2]);
                    exists $map->{$contract} and @decH = ($decH[0], $contract);
                }
            }

            @ce = map({
                    exists $map->{$_} ? @{ $map->{$_} } :
                $uXS && _exists_simple($_) ? _fetch_simple($_) : ### XS only
                    $der->($_);
                } @decH);
        }
    } elsif ($out && 0x10FFFF < $u) {
        @ce = map _pack_override($_, $u, $der), $out->($u);
    } else {
        my $cjk  = $self->{overrideCJK};
        my $vers = $self->{UCA_Version};
        if ($cjk && _isUIdeo($u, $vers)) {
            @ce = map _pack_override($_, $u, $der), $cjk->($u);
        } elsif ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) {
            @ce = _uideoCE_8($u);
        } else {
            @ce = $der->($u);
        }
    }
    return map $self->varCE($_), @ce;
}


##
## string sortkey = getSortKey(string arg)
##
sub getSortKey
{
    my $self = shift;
    my $orig = shift;
    my $str  = $self->process($orig);
    my $rEnt = $self->splitEnt($str); # get an arrayref of JCPS
    my $vers = $self->{UCA_Version};
    my $term = $self->{hangul_terminator};
    my $lev  = $self->{level};
    my $iden = $self->{identical};

    my @buf; # weight arrays
    if ($term) {
        my $preHST = '';
        my $termCE = $self->varCE(pack(VCE_TEMPLATE, NON_VAR, $term, 0,0,0));
        foreach my $jcps (@$rEnt) {
            # weird things like VL, TL-contraction are not considered!
            my $curHST = join '', map getHST($_, $vers), split /;/, $jcps;
            if ($preHST && !$curHST || # hangul before non-hangul
                $preHST =~ /L\z/ && $curHST =~ /^T/ ||
                $preHST =~ /V\z/ && $curHST =~ /^L/ ||
                $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
                push @buf, $termCE;
            }
            $preHST = $curHST;
            push @buf, $self->getWt($jcps);
        }
        push @buf, $termCE if $preHST; # end at hangul
    } else {
        foreach my $jcps (@$rEnt) {
            push @buf, $self->getWt($jcps);
        }
    }

    my $rkey = $self->mk_SortKey(\@buf); ### XS only

    # Append the level separator, plus the code points of the processed
    # string as a tie-breaker when 'identical' is set.
    if ($iden || $vers >= 26 && $lev == MaxLevel) {
        $rkey .= LEVEL_SEP;
        $rkey .= pack(TIE_TEMPLATE, unpack_U($str)) if $iden;
    }
    return $rkey;
}


##
## int compare = cmp(string a, string b)
##
sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }

##
## list[strings] sorted = sort(list[strings] arg)
##
sub sort
{ 784b39c5158Smillert my $obj = shift; 785b39c5158Smillert return 786b39c5158Smillert map { $_->[1] } 787b39c5158Smillert sort{ $a->[0] cmp $b->[0] } 788b39c5158Smillert map [ $obj->getSortKey($_), $_ ], @_; 789b39c5158Smillert} 790b39c5158Smillert 791b39c5158Smillert 792b39c5158Smillert## 793b39c5158Smillert## bool _nonIgnorAtLevel(arrayref weights, int level) 794b39c5158Smillert## 795b39c5158Smillertsub _nonIgnorAtLevel($$) 796b39c5158Smillert{ 797b39c5158Smillert my $wt = shift; 798b39c5158Smillert return if ! defined $wt; 799b39c5158Smillert my $lv = shift; 800b39c5158Smillert return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE; 801b39c5158Smillert} 802b39c5158Smillert 803b39c5158Smillert## 804b39c5158Smillert## bool _eqArray( 805b39c5158Smillert## arrayref of arrayref[weights] source, 806b39c5158Smillert## arrayref of arrayref[weights] substr, 807b39c5158Smillert## int level) 808b39c5158Smillert## * comparison of graphemes vs graphemes. 809b39c5158Smillert## @$source >= @$substr must be true (check it before call this); 810b39c5158Smillert## 811b39c5158Smillertsub _eqArray($$$) 812b39c5158Smillert{ 813b39c5158Smillert my $source = shift; 814b39c5158Smillert my $substr = shift; 815b39c5158Smillert my $lev = shift; 816b39c5158Smillert 817b39c5158Smillert for my $g (0..@$substr-1){ 818898184e3Ssthen # Do the $g'th graphemes have the same number of AV weights? 
819b39c5158Smillert return if @{ $source->[$g] } != @{ $substr->[$g] }; 820b39c5158Smillert 821b39c5158Smillert for my $w (0..@{ $substr->[$g] }-1) { 822b39c5158Smillert for my $v (0..$lev-1) { 823b39c5158Smillert return if $source->[$g][$w][$v] != $substr->[$g][$w][$v]; 824b39c5158Smillert } 825b39c5158Smillert } 826b39c5158Smillert } 827b39c5158Smillert return 1; 828b39c5158Smillert} 829b39c5158Smillert 830b39c5158Smillert## 831b39c5158Smillert## (int position, int length) 832898184e3Ssthen## int position = index(string, substring, position, [undoc'ed global]) 833b39c5158Smillert## 834898184e3Ssthen## With "global" (only for the list context), 835b39c5158Smillert## returns list of arrayref[position, length]. 836b39c5158Smillert## 837b39c5158Smillertsub index 838b39c5158Smillert{ 839b39c5158Smillert my $self = shift; 84091f110e0Safresh1 $self->{preprocess} and 84191f110e0Safresh1 croak "Don't use Preprocess with index(), match(), etc."; 84291f110e0Safresh1 $self->{normCode} and 84391f110e0Safresh1 croak "Don't use Normalization with index(), match(), etc."; 84491f110e0Safresh1 845b39c5158Smillert my $str = shift; 846b39c5158Smillert my $len = length($str); 84791f110e0Safresh1 my $sub = shift; 84891f110e0Safresh1 my $subE = $self->splitEnt($sub); 849b39c5158Smillert my $pos = @_ ? shift : 0; 850b39c5158Smillert $pos = 0 if $pos < 0; 851898184e3Ssthen my $glob = shift; 852b39c5158Smillert 853b39c5158Smillert my $lev = $self->{level}; 854b39c5158Smillert my $v2i = $self->{UCA_Version} >= 9 && 855b39c5158Smillert $self->{variable} ne 'non-ignorable'; 856b39c5158Smillert 857b39c5158Smillert if (! @$subE) { 858b39c5158Smillert my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos; 859898184e3Ssthen return $glob 860b39c5158Smillert ? map([$_, 0], $temp..$len) 861b39c5158Smillert : wantarray ? ($temp,0) : $temp; 862b39c5158Smillert } 863b39c5158Smillert $len < $pos 864b39c5158Smillert and return wantarray ? 
() : NOMATCHPOS; 865b39c5158Smillert my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE); 866b39c5158Smillert @$strE 867b39c5158Smillert or return wantarray ? () : NOMATCHPOS; 868b39c5158Smillert 869b39c5158Smillert my(@strWt, @iniPos, @finPos, @subWt, @g_ret); 870b39c5158Smillert 871b39c5158Smillert my $last_is_variable; 872b39c5158Smillert for my $vwt (map $self->getWt($_), @$subE) { 873b39c5158Smillert my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); 874b39c5158Smillert my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev); 875b39c5158Smillert 876b39c5158Smillert # "Ignorable (L1, L2) after Variable" since track. v. 9 877b39c5158Smillert if ($v2i) { 878b39c5158Smillert if ($var) { 879b39c5158Smillert $last_is_variable = TRUE; 880b39c5158Smillert } 881b39c5158Smillert elsif (!$wt[0]) { # ignorable 882b39c5158Smillert $to_be_pushed = FALSE if $last_is_variable; 883b39c5158Smillert } 884b39c5158Smillert else { 885b39c5158Smillert $last_is_variable = FALSE; 886b39c5158Smillert } 887b39c5158Smillert } 888b39c5158Smillert 889b39c5158Smillert if (@subWt && !$var && !$wt[0]) { 890b39c5158Smillert push @{ $subWt[-1] }, \@wt if $to_be_pushed; 891898184e3Ssthen } elsif ($to_be_pushed) { 892b39c5158Smillert push @subWt, [ \@wt ]; 893b39c5158Smillert } 894898184e3Ssthen # else ===> skipped 895b39c5158Smillert } 896b39c5158Smillert 897b39c5158Smillert my $count = 0; 898b39c5158Smillert my $end = @$strE - 1; 899b39c5158Smillert 900b39c5158Smillert $last_is_variable = FALSE; # reuse 901b39c5158Smillert for (my $i = 0; $i <= $end; ) { # no $i++ 902b39c5158Smillert my $found_base = 0; 903b39c5158Smillert 904b39c5158Smillert # fetch a grapheme 905b39c5158Smillert while ($i <= $end && $found_base == 0) { 906b39c5158Smillert for my $vwt ($self->getWt($strE->[$i][0])) { 907b39c5158Smillert my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); 908b39c5158Smillert my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev); 909b39c5158Smillert 910b39c5158Smillert # "Ignorable (L1, L2) after Variable" 
since track. v. 9 911b39c5158Smillert if ($v2i) { 912b39c5158Smillert if ($var) { 913b39c5158Smillert $last_is_variable = TRUE; 914b39c5158Smillert } 915b39c5158Smillert elsif (!$wt[0]) { # ignorable 916b39c5158Smillert $to_be_pushed = FALSE if $last_is_variable; 917b39c5158Smillert } 918b39c5158Smillert else { 919b39c5158Smillert $last_is_variable = FALSE; 920b39c5158Smillert } 921b39c5158Smillert } 922b39c5158Smillert 923b39c5158Smillert if (@strWt && !$var && !$wt[0]) { 924b39c5158Smillert push @{ $strWt[-1] }, \@wt if $to_be_pushed; 925b39c5158Smillert $finPos[-1] = $strE->[$i][2]; 926b39c5158Smillert } elsif ($to_be_pushed) { 927b39c5158Smillert push @strWt, [ \@wt ]; 928b39c5158Smillert push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1]; 929b39c5158Smillert $finPos[-1] = NOMATCHPOS if $found_base; 930b39c5158Smillert push @finPos, $strE->[$i][2]; 931b39c5158Smillert $found_base++; 932b39c5158Smillert } 933b39c5158Smillert # else ===> no-op 934b39c5158Smillert } 935b39c5158Smillert $i++; 936b39c5158Smillert } 937b39c5158Smillert 938b39c5158Smillert # try to match 939b39c5158Smillert while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) { 940b39c5158Smillert if ($iniPos[0] != NOMATCHPOS && 941b39c5158Smillert $finPos[$#subWt] != NOMATCHPOS && 942b39c5158Smillert _eqArray(\@strWt, \@subWt, $lev)) { 943b39c5158Smillert my $temp = $iniPos[0] + $pos; 944b39c5158Smillert 945898184e3Ssthen if ($glob) { 946b39c5158Smillert push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]]; 947b39c5158Smillert splice @strWt, 0, $#subWt; 948b39c5158Smillert splice @iniPos, 0, $#subWt; 949b39c5158Smillert splice @finPos, 0, $#subWt; 950b39c5158Smillert } 951b39c5158Smillert else { 952b39c5158Smillert return wantarray 953b39c5158Smillert ? 
($temp, $finPos[$#subWt] - $iniPos[0]) 954b39c5158Smillert : $temp; 955b39c5158Smillert } 956b39c5158Smillert } 957b39c5158Smillert shift @strWt; 958b39c5158Smillert shift @iniPos; 959b39c5158Smillert shift @finPos; 960b39c5158Smillert } 961b39c5158Smillert } 962b39c5158Smillert 963898184e3Ssthen return $glob 964b39c5158Smillert ? @g_ret 965b39c5158Smillert : wantarray ? () : NOMATCHPOS; 966b39c5158Smillert} 967b39c5158Smillert 968b39c5158Smillert## 969b39c5158Smillert## scalarref to matching part = match(string, substring) 970b39c5158Smillert## 971b39c5158Smillertsub match 972b39c5158Smillert{ 973b39c5158Smillert my $self = shift; 974b39c5158Smillert if (my($pos,$len) = $self->index($_[0], $_[1])) { 975b39c5158Smillert my $temp = substr($_[0], $pos, $len); 976b39c5158Smillert return wantarray ? $temp : \$temp; 977b39c5158Smillert # An lvalue ref \substr should be avoided, 978b39c5158Smillert # since its value is affected by modification of its referent. 979b39c5158Smillert } 980b39c5158Smillert else { 981b39c5158Smillert return; 982b39c5158Smillert } 983b39c5158Smillert} 984b39c5158Smillert 985b39c5158Smillert## 986b39c5158Smillert## arrayref matching parts = gmatch(string, substring) 987b39c5158Smillert## 988b39c5158Smillertsub gmatch 989b39c5158Smillert{ 990b39c5158Smillert my $self = shift; 991b39c5158Smillert my $str = shift; 992b39c5158Smillert my $sub = shift; 993b39c5158Smillert return map substr($str, $_->[0], $_->[1]), 994b39c5158Smillert $self->index($str, $sub, 0, 'g'); 995b39c5158Smillert} 996b39c5158Smillert 997b39c5158Smillert## 998b39c5158Smillert## bool subst'ed = subst(string, substring, replace) 999b39c5158Smillert## 1000b39c5158Smillertsub subst 1001b39c5158Smillert{ 1002b39c5158Smillert my $self = shift; 1003b39c5158Smillert my $code = ref $_[2] eq 'CODE' ? 
$_[2] : FALSE; 1004b39c5158Smillert 1005b39c5158Smillert if (my($pos,$len) = $self->index($_[0], $_[1])) { 1006b39c5158Smillert if ($code) { 1007b39c5158Smillert my $mat = substr($_[0], $pos, $len); 1008b39c5158Smillert substr($_[0], $pos, $len, $code->($mat)); 1009b39c5158Smillert } else { 1010b39c5158Smillert substr($_[0], $pos, $len, $_[2]); 1011b39c5158Smillert } 1012b39c5158Smillert return TRUE; 1013b39c5158Smillert } 1014b39c5158Smillert else { 1015b39c5158Smillert return FALSE; 1016b39c5158Smillert } 1017b39c5158Smillert} 1018b39c5158Smillert 1019b39c5158Smillert## 1020b39c5158Smillert## int count = gsubst(string, substring, replace) 1021b39c5158Smillert## 1022b39c5158Smillertsub gsubst 1023b39c5158Smillert{ 1024b39c5158Smillert my $self = shift; 1025b39c5158Smillert my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE; 1026b39c5158Smillert my $cnt = 0; 1027b39c5158Smillert 1028b39c5158Smillert # Replacement is carried out from the end, then use reverse. 1029b39c5158Smillert for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) { 1030b39c5158Smillert if ($code) { 1031b39c5158Smillert my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]); 1032b39c5158Smillert substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat)); 1033b39c5158Smillert } else { 1034b39c5158Smillert substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]); 1035b39c5158Smillert } 1036b39c5158Smillert $cnt++; 1037b39c5158Smillert } 1038b39c5158Smillert return $cnt; 1039b39c5158Smillert} 1040b39c5158Smillert 1041b39c5158Smillert1; 1042b39c5158Smillert__END__ 1043b39c5158Smillert 1044b39c5158Smillert=head1 NAME 1045b39c5158Smillert 1046b39c5158SmillertUnicode::Collate - Unicode Collation Algorithm 1047b39c5158Smillert 1048b39c5158Smillert=head1 SYNOPSIS 1049b39c5158Smillert 1050b39c5158Smillert use Unicode::Collate; 1051b39c5158Smillert 1052b39c5158Smillert #construct 1053b39c5158Smillert $Collator = Unicode::Collate->new(%tailoring); 1054b39c5158Smillert 1055b39c5158Smillert #sort 
1056b39c5158Smillert @sorted = $Collator->sort(@not_sorted); 1057b39c5158Smillert 1058b39c5158Smillert #compare 1059b39c5158Smillert $result = $Collator->cmp($a, $b); # returns 1, 0, or -1. 1060b39c5158Smillert 1061898184e3SsthenB<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted 1062898184e3Ssthenaccording to Perl's Unicode support. See L<perlunicode>, 1063898184e3SsthenL<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>. 1064898184e3SsthenOtherwise you can use C<preprocess> or should decode them before. 1065b39c5158Smillert 1066b39c5158Smillert=head1 DESCRIPTION 1067b39c5158Smillert 1068b39c5158SmillertThis module is an implementation of Unicode Technical Standard #10 1069b39c5158Smillert(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA). 1070b39c5158Smillert 1071b39c5158Smillert=head2 Constructor and Tailoring 1072b39c5158Smillert 1073898184e3SsthenThe C<new> method returns a collator object. If new() is called 1074898184e3Ssthenwith no parameters, the collator should do the default collation. 
1075b39c5158Smillert 1076b39c5158Smillert $Collator = Unicode::Collate->new( 1077b39c5158Smillert UCA_Version => $UCA_Version, 1078898184e3Ssthen alternate => $alternate, # alias for 'variable' 1079b39c5158Smillert backwards => $levelNumber, # or \@levelNumbers 1080b39c5158Smillert entry => $element, 1081b39c5158Smillert hangul_terminator => $term_primary_weight, 108291f110e0Safresh1 highestFFFF => $bool, 108391f110e0Safresh1 identical => $bool, 1084b39c5158Smillert ignoreName => qr/$ignoreName/, 1085b39c5158Smillert ignoreChar => qr/$ignoreChar/, 1086898184e3Ssthen ignore_level2 => $bool, 1087b39c5158Smillert katakana_before_hiragana => $bool, 1088b39c5158Smillert level => $collationLevel, 1089b8851fccSafresh1 long_contraction => $bool, 109091f110e0Safresh1 minimalFFFE => $bool, 1091b39c5158Smillert normalization => $normalization_form, 1092b39c5158Smillert overrideCJK => \&overrideCJK, 1093b39c5158Smillert overrideHangul => \&overrideHangul, 1094b39c5158Smillert preprocess => \&preprocess, 1095b39c5158Smillert rearrange => \@charList, 1096898184e3Ssthen rewrite => \&rewrite, 1097898184e3Ssthen suppress => \@charList, 1098b39c5158Smillert table => $filename, 1099b39c5158Smillert undefName => qr/$undefName/, 1100b39c5158Smillert undefChar => qr/$undefChar/, 1101b39c5158Smillert upper_before_lower => $bool, 1102b39c5158Smillert variable => $variable, 1103b39c5158Smillert ); 1104b39c5158Smillert 1105b39c5158Smillert=over 4 1106b39c5158Smillert 1107b39c5158Smillert=item UCA_Version 1108b39c5158Smillert 1109898184e3SsthenIf the revision (previously "tracking version") number of UCA is given, 1110898184e3Ssthenbehavior of that revision is emulated on collating. 1111b39c5158SmillertIf omitted, the return value of C<UCA_Version()> is used. 1112b39c5158Smillert 1113*eac174f2Safresh1The following revisions are supported. The default is 43. 
1114b39c5158Smillert 1115b39c5158Smillert UCA Unicode Standard DUCET (@version) 1116898184e3Ssthen ------------------------------------------------------- 1117b39c5158Smillert 8 3.1 3.0.1 (3.0.1d9) 1118*eac174f2Safresh1 9 3.1 with Corrigendum 3 3.1.1 1119*eac174f2Safresh1 11 4.0.0 1120*eac174f2Safresh1 14 4.1.0 1121*eac174f2Safresh1 16 5.0.0 1122*eac174f2Safresh1 18 5.1.0 1123*eac174f2Safresh1 20 5.2.0 1124*eac174f2Safresh1 22 6.0.0 1125*eac174f2Safresh1 24 6.1.0 1126*eac174f2Safresh1 26 6.2.0 1127*eac174f2Safresh1 28 6.3.0 1128*eac174f2Safresh1 30 7.0.0 1129*eac174f2Safresh1 32 8.0.0 1130*eac174f2Safresh1 34 9.0.0 1131*eac174f2Safresh1 36 10.0.0 1132*eac174f2Safresh1 38 11.0.0 1133*eac174f2Safresh1 40 12.0.0 1134*eac174f2Safresh1 41 12.1.0 1135*eac174f2Safresh1 43 13.0.0 1136b8851fccSafresh1 11379f11ffb7Safresh1* See below for C<long_contraction> with C<UCA_Version> 22 and 24. 1138b39c5158Smillert 1139898184e3Ssthen* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden 1140898184e3Ssthensince C<UCA_Version> 22. 1141898184e3Ssthen 11426fb12b70Safresh1* Out-of-range codepoints (greater than U+10FFFF) are not ignored, 11436fb12b70Safresh1and can be overridden since C<UCA_Version> 22. 11446fb12b70Safresh1 1145898184e3Ssthen* Fully ignorable characters were ignored, and would not interrupt 1146898184e3Ssthencontractions with C<UCA_Version> 9 and 11. 1147898184e3Ssthen 1148898184e3Ssthen* Treatment of ignorables after variables and some behaviors 1149898184e3Ssthenwere changed at C<UCA_Version> 9. 1150898184e3Ssthen 1151898184e3Ssthen* Characters regarded as CJK unified ideographs (cf. C<overrideCJK>) 1152898184e3Ssthendepend on C<UCA_Version>. 1153898184e3Ssthen 1154898184e3Ssthen* Many hangul jamo are assigned at C<UCA_Version> 20, that will affect 1155898184e3SsthenC<hangul_terminator>. 
1156b39c5158Smillert 1157b39c5158Smillert=item alternate 1158b39c5158Smillert 1159b39c5158Smillert-- see 3.2.2 Alternate Weighting, version 8 of UTS #10 1160b39c5158Smillert 1161b39c5158SmillertFor backward compatibility, C<alternate> (old name) can be used 1162b39c5158Smillertas an alias for C<variable>. 1163b39c5158Smillert 1164b39c5158Smillert=item backwards 1165b39c5158Smillert 116691f110e0Safresh1-- see 3.4 Backward Accents, UTS #10. 1167b39c5158Smillert 1168b39c5158Smillert backwards => $levelNumber or \@levelNumbers 1169b39c5158Smillert 1170b39c5158SmillertWeights in reverse order; ex. level 2 (diacritic ordering) in French. 1171898184e3SsthenIf omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>), 1172898184e3Ssthenforwards at all the levels. 1173b39c5158Smillert 1174b39c5158Smillert=item entry 1175b39c5158Smillert 1176b8851fccSafresh1-- see 5 Tailoring; 9.1 Allkeys File Format, UTS #10. 1177b39c5158Smillert 1178b39c5158SmillertIf the same character (or a sequence of characters) exists 1179b39c5158Smillertin the collation element table through C<table>, 1180898184e3Ssthenmapping to collation elements is overridden. 1181b39c5158SmillertIf it does not exist, the mapping is defined additionally. 
1182b39c5158Smillert 1183b39c5158Smillert entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt) 1184b39c5158Smillert0063 0068 ; [.0E6A.0020.0002.0063] # ch 1185b39c5158Smillert0043 0068 ; [.0E6A.0020.0007.0043] # Ch 1186b39c5158Smillert0043 0048 ; [.0E6A.0020.0008.0043] # CH 1187b39c5158Smillert006C 006C ; [.0F4C.0020.0002.006C] # ll 1188b39c5158Smillert004C 006C ; [.0F4C.0020.0007.004C] # Ll 1189b39c5158Smillert004C 004C ; [.0F4C.0020.0008.004C] # LL 1190b39c5158Smillert00F1 ; [.0F7B.0020.0002.00F1] # n-tilde 1191b39c5158Smillert006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde 1192b39c5158Smillert00D1 ; [.0F7B.0020.0008.00D1] # N-tilde 1193b39c5158Smillert004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde 1194b39c5158SmillertENTRY 1195b39c5158Smillert 1196b39c5158Smillert entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt) 1197b39c5158Smillert00E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e> 1198b39c5158Smillert00C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E> 1199b39c5158SmillertENTRY 1200b39c5158Smillert 1201b39c5158SmillertB<NOTE:> The code point in the UCA file format (before C<';'>) 1202b39c5158SmillertB<must> be a Unicode code point (defined as hexadecimal), 1203b39c5158Smillertbut not a native code point. 1204b39c5158SmillertSo C<0063> must always denote C<U+0063>, 1205b39c5158Smillertbut not a character of C<"\x63">. 1206b39c5158Smillert 1207b39c5158SmillertWeighting may vary depending on collation element table. 1208b39c5158SmillertSo ensure the weights defined in C<entry> will be consistent with 1209b39c5158Smillertthose in the collation element table loaded via C<table>. 1210b39c5158Smillert 1211b39c5158SmillertIn DUCET v4.0.0, primary weight of C<C> is C<0E60> 1212b39c5158Smillertand that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A> 1213b39c5158Smillert(as a value between C<0E60> and C<0E6D>) 1214b39c5158Smillertmakes ordering as C<C E<lt> CH E<lt> D>. 
1215b39c5158SmillertExactly speaking DUCET already has some characters between C<C> and C<D>: 1216b39c5158SmillertC<small capital C> (C<U+1D04>) with primary weight C<0E64>, 1217b39c5158SmillertC<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>, 1218b39c5158Smillertand C<c-curl> (C<U+0255>) with C<0E69>. 1219b39c5158SmillertThen primary weight C<0E6A> for C<CH> makes C<CH> 1220b39c5158Smillertordered between C<c-curl> and C<D>. 1221b39c5158Smillert 1222b39c5158Smillert=item hangul_terminator 1223b39c5158Smillert 1224b39c5158Smillert-- see 7.1.4 Trailing Weights, UTS #10. 1225b39c5158Smillert 1226b39c5158SmillertIf a true value is given (non-zero but should be positive), 1227b39c5158Smillertit will be added as a terminator primary weight to the end of 1228b39c5158Smillertevery standard Hangul syllable. Secondary and any higher weights 1229b39c5158Smillertfor terminator are set to zero. 1230b39c5158SmillertIf the value is false or C<hangul_terminator> key does not exist, 1231b39c5158Smillertinsertion of terminator weights will not be performed. 1232b39c5158Smillert 1233b39c5158SmillertBoundaries of Hangul syllables are determined 1234b39c5158Smillertaccording to conjoining Jamo behavior in F<the Unicode Standard> 1235b39c5158Smillertand F<HangulSyllableType.txt>. 1236b39c5158Smillert 1237b39c5158SmillertB<Implementation Note:> 1238b39c5158Smillert(1) For expansion mapping (Unicode character mapped 1239b39c5158Smillertto a sequence of collation elements), a terminator will not be added 1240b39c5158Smillertbetween collation elements, even if Hangul syllable boundary exists there. 1241b39c5158SmillertAddition of terminator is restricted to the next position 1242b39c5158Smillertto the last collation element. 1243b39c5158Smillert 1244b39c5158Smillert(2) Non-conjoining Hangul letters 1245b39c5158Smillert(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not 1246b39c5158Smillertautomatically terminated with a terminator primary weight. 
1247b39c5158SmillertThese characters may need terminator included in a collation element 1248b39c5158Smillerttable beforehand. 1249b39c5158Smillert 125091f110e0Safresh1=item highestFFFF 125191f110e0Safresh1 12529f11ffb7Safresh1-- see 2.4 Tailored noncharacter weights, UTS #35 (LDML) Part 5: Collation. 125391f110e0Safresh1 125491f110e0Safresh1If the parameter is made true, C<U+FFFF> has a highest primary weight. 125591f110e0Safresh1When a boolean of C<$coll-E<gt>ge($str, "abc")> and 125691f110e0Safresh1C<$coll-E<gt>le($str, "abc\x{FFFF}")> is true, it is expected that C<$str> 125791f110e0Safresh1begins with C<"abc">, or another primary equivalent. 125891f110e0Safresh1C<$str> may be C<"abcd">, C<"abc012">, but should not include C<U+FFFF> 125991f110e0Safresh1such as C<"abc\x{FFFF}xyz">. 126091f110e0Safresh1 126191f110e0Safresh1C<$coll-E<gt>le($str, "abc\x{FFFF}")> works almost like C<$coll-E<gt>lt($str, "abd")>, 12626fb12b70Safresh1but the latter has a problem that you should know which letter is 126391f110e0Safresh1next to C<c>. For a certain language where C<ch> is the next letter, 12646fb12b70Safresh1C<"abch"> is greater than C<"abc\x{FFFF}">, but less than C<"abd">. 126591f110e0Safresh1 12666fb12b70Safresh1Note: 12676fb12b70Safresh1This is equivalent to C<(entry =E<gt> 'FFFF ; [.FFFE.0020.0005.FFFF]')>. 126891f110e0Safresh1Any other character than C<U+FFFF> can be tailored by C<entry>. 126991f110e0Safresh1 127091f110e0Safresh1=item identical 127191f110e0Safresh1 127291f110e0Safresh1-- see A.3 Deterministic Comparison, UTS #10. 127391f110e0Safresh1 127491f110e0Safresh1By default, strings whose weights are equal should be equal, 127591f110e0Safresh1even though their code points are not equal. 127691f110e0Safresh1Completely ignorable characters are ignored. 127791f110e0Safresh1 127891f110e0Safresh1If the parameter is made true, a final, tie-breaking level is used. 
127991f110e0Safresh1If no difference of weights is found after the comparison through 128091f110e0Safresh1all the levels specified by C<level>, the comparison with code points 128191f110e0Safresh1will be performed. 12826fb12b70Safresh1For the tie-breaking comparison, the sort key has code points 128391f110e0Safresh1of the original string appended. 128491f110e0Safresh1Completely ignorable characters are not ignored. 128591f110e0Safresh1 128691f110e0Safresh1If C<preprocess> and/or C<normalization> is applied, the code points 128791f110e0Safresh1of the string after them (in NFD by default) are used. 128891f110e0Safresh1 1289b39c5158Smillert=item ignoreChar 1290b39c5158Smillert 1291b39c5158Smillert=item ignoreName 1292b39c5158Smillert 1293b8851fccSafresh1-- see 3.6 Variable Weighting, UTS #10. 1294b39c5158Smillert 1295b39c5158SmillertMakes the entry in the table completely ignorable; 1296b39c5158Smillerti.e. as if the weights were zero at all levels. 1297b39c5158Smillert 1298b39c5158SmillertThrough C<ignoreChar>, any character matching C<qr/$ignoreChar/> 1299b39c5158Smillertwill be ignored. Through C<ignoreName>, any character whose name 1300b39c5158Smillert(given in the C<table> file as a comment) matches C<qr/$ignoreName/> 1301b39c5158Smillertwill be ignored. 1302b39c5158Smillert 1303b39c5158SmillertE.g. when 'a' and 'e' are ignorable, 1304b39c5158Smillert'element' is equal to 'lament' (or 'lmnt'). 1305b39c5158Smillert 1306898184e3Ssthen=item ignore_level2 1307898184e3Ssthen 1308898184e3Ssthen-- see 5.1 Parametric Tailoring, UTS #10. 1309898184e3Ssthen 1310898184e3SsthenBy default, case-sensitive comparison (that is level 3 difference) 1311898184e3Ssthenwon't ignore accents (that is level 2 difference). 1312898184e3Ssthen 1313898184e3SsthenIf the parameter is made true, accents (and other primary ignorable 1314898184e3Ssthencharacters) are ignored, even though cases are taken into account. 1315898184e3Ssthen 1316898184e3SsthenB<NOTE>: C<level> should be 3 or greater. 
1317898184e3Ssthen 1318b39c5158Smillert=item katakana_before_hiragana 1319b39c5158Smillert 132091f110e0Safresh1-- see 7.2 Tertiary Weight Table, UTS #10. 1321b39c5158Smillert 1322b39c5158SmillertBy default, hiragana is before katakana. 1323b39c5158SmillertIf the parameter is made true, this is reversed. 1324b39c5158Smillert 1325b39c5158SmillertB<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana 1326b39c5158Smillertdistinctions must occur in level 3, and their weights at level 3 must be 1327b39c5158Smillertsame as those mentioned in 7.3.1, UTS #10. 1328b39c5158SmillertIf you define your collation elements which violate this requirement, 1329b39c5158Smillertthis parameter does not work validly. 1330b39c5158Smillert 1331b39c5158Smillert=item level 1332b39c5158Smillert 1333b39c5158Smillert-- see 4.3 Form Sort Key, UTS #10. 1334b39c5158Smillert 1335b39c5158SmillertSet the maximum level. 1336b39c5158SmillertAny higher levels than the specified one are ignored. 1337b39c5158Smillert 1338b39c5158Smillert Level 1: alphabetic ordering 1339b39c5158Smillert Level 2: diacritic ordering 1340b39c5158Smillert Level 3: case ordering 1341b39c5158Smillert Level 4: tie-breaking (e.g. in the case when variable is 'shifted') 1342b39c5158Smillert 1343b39c5158Smillert ex.level => 2, 1344b39c5158Smillert 1345b39c5158SmillertIf omitted, the maximum is the 4th. 1346b39c5158Smillert 134791f110e0Safresh1B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level. 134891f110e0Safresh1But this module only uses weights within 0xFFFF. 134991f110e0Safresh1When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted' 135091f110e0Safresh1and 'shift-trimmed'), the level 4 may be unreliable. 135191f110e0Safresh1 135291f110e0Safresh1See also C<identical>. 135391f110e0Safresh1 1354b8851fccSafresh1=item long_contraction 1355b8851fccSafresh1 1356b8851fccSafresh1-- see 3.8.2 Well-Formedness of the DUCET, 4.2 Produce Array, UTS #10. 
1357b8851fccSafresh1 1358b8851fccSafresh1If the parameter is made true, for a contraction with three or more 1359b8851fccSafresh1characters (here nicknamed "long contraction"), initial substrings 1360b8851fccSafresh1will be handled. 1361b8851fccSafresh1For example, a contraction ABC, where A is a starter, and B and C 1362b8851fccSafresh1are non-starters (characters with non-zero combining character class), 1363b8851fccSafresh1will be detected even if there is not AB as a contraction. 1364b8851fccSafresh1 1365b8851fccSafresh1B<Default:> Usually false. 1366b8851fccSafresh1If C<UCA_Version> is 22 or 24, and the value of C<long_contraction> 1367b8851fccSafresh1is not specified in C<new()>, a true value is set implicitly. 1368b8851fccSafresh1This is a workaround to pass Conformance Tests for Unicode 6.0.0 and 6.1.0. 1369b8851fccSafresh1 1370b8851fccSafresh1C<change()> handles C<long_contraction> explicitly only. 1371b8851fccSafresh1If C<long_contraction> is not specified in C<change()>, even though 1372b8851fccSafresh1C<UCA_Version> is changed, C<long_contraction> will not be changed. 1373b8851fccSafresh1 1374b8851fccSafresh1B<Limitation:> Scanning non-starters is one-way (no backtracking). 1375b8851fccSafresh1If AB is found but ABC is not found, other long contractions where 1376b8851fccSafresh1the first character is A and the second is not B may not be found. 1377b8851fccSafresh1 1378b8851fccSafresh1Under C<(normalization =E<gt> undef)>, detection step of discontiguous 1379b8851fccSafresh1contractions will be skipped. 1380b8851fccSafresh1 1381b8851fccSafresh1B<Note:> The following contractions in DUCET are not considered 1382b8851fccSafresh1in steps S2.1.1 to S2.1.3, where they are discontiguous. 
1383b8851fccSafresh1 1384b8851fccSafresh1 0FB2 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC RR) 1385b8851fccSafresh1 0FB3 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC LL) 1386b8851fccSafresh1 1387b8851fccSafresh1For example C<TIBETAN VOWEL SIGN VOCALIC RR> with C<COMBINING TILDE OVERLAY> 1388b8851fccSafresh1(C<U+0334>) is C<0FB2 0334 0F71 0F80> in NFD. 1389b8851fccSafresh1In this case C<0FB2 0F80> (C<TIBETAN VOWEL SIGN VOCALIC R>) is detected, 1390b8851fccSafresh1instead of C<0FB2 0F71 0F80>. 1391b8851fccSafresh1Inserted C<0334> makes C<0FB2 0F71 0F80> discontiguous and lack of 1392b8851fccSafresh1contraction C<0FB2 0F71> prohibits C<0FB2 0F71 0F80> from being detected. 1393b8851fccSafresh1 139491f110e0Safresh1=item minimalFFFE 139591f110e0Safresh1 13969f11ffb7Safresh1-- see 1.1.1 U+FFFE, UTS #35 (LDML) Part 5: Collation. 139791f110e0Safresh1 139891f110e0Safresh1If the parameter is made true, C<U+FFFE> has a minimal primary weight. 139991f110e0Safresh1The comparison between C<"$a1\x{FFFE}$a2"> and C<"$b1\x{FFFE}$b2"> 140091f110e0Safresh1first compares C<$a1> and C<$b1> at level 1, and 140191f110e0Safresh1then C<$a2> and C<$b2> at level 1, as follows. 140291f110e0Safresh1 140391f110e0Safresh1 "ab\x{FFFE}a" 140491f110e0Safresh1 "Ab\x{FFFE}a" 140591f110e0Safresh1 "ab\x{FFFE}c" 140691f110e0Safresh1 "Ab\x{FFFE}c" 140791f110e0Safresh1 "ab\x{FFFE}xyz" 140891f110e0Safresh1 "abc\x{FFFE}def" 140991f110e0Safresh1 "abc\x{FFFE}xYz" 141091f110e0Safresh1 "aBc\x{FFFE}xyz" 141191f110e0Safresh1 "abcX\x{FFFE}def" 141291f110e0Safresh1 "abcx\x{FFFE}xyz" 141391f110e0Safresh1 "b\x{FFFE}aaa" 141491f110e0Safresh1 "bbb\x{FFFE}a" 141591f110e0Safresh1 14166fb12b70Safresh1Note: 14176fb12b70Safresh1This is equivalent to C<(entry =E<gt> 'FFFE ; [.0001.0020.0005.FFFE]')>. 141891f110e0Safresh1Any other character than C<U+FFFE> can be tailored by C<entry>. 141991f110e0Safresh1 1420b39c5158Smillert=item normalization 1421b39c5158Smillert 1422b39c5158Smillert-- see 4.1 Normalize, UTS #10. 
1423b39c5158Smillert 1424b39c5158SmillertIf specified, strings are normalized before preparation of sort keys 1425b39c5158Smillert(the normalization is executed after preprocess). 1426b39c5158Smillert 1427b39c5158SmillertA form name C<Unicode::Normalize::normalize()> accepts will be applied 1428b39c5158Smillertas C<$normalization_form>. 1429b39c5158SmillertAcceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>. 1430b39c5158SmillertSee C<Unicode::Normalize::normalize()> for detail. 1431b39c5158SmillertIf omitted, C<'NFD'> is used. 1432b39c5158Smillert 1433b39c5158SmillertC<normalization> is performed after C<preprocess> (if defined). 1434b39c5158Smillert 1435b39c5158SmillertFurthermore, special values, C<undef> and C<"prenormalized">, can be used, 1436b39c5158Smillertthough they are not concerned with C<Unicode::Normalize::normalize()>. 1437b39c5158Smillert 1438b39c5158SmillertIf C<undef> (not a string C<"undef">) is passed explicitly 1439b39c5158Smillertas the value for this key, 1440b39c5158Smillertany normalization is not carried out (this may make tailoring easier 1441b39c5158Smillertif any normalization is not desired). Under C<(normalization =E<gt> undef)>, 1442b39c5158Smillertonly contiguous contractions are resolved; 1443b39c5158Smillerte.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>, 1444b39c5158SmillertC<A-cedilla-ring> would be primary equal to C<A>. 1445b39c5158SmillertIn this point, 1446b39c5158SmillertC<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })> 1447b39c5158SmillertB<is not> equivalent to C<(normalization =E<gt> 'NFD')>. 1448b39c5158Smillert 1449b39c5158SmillertIn the case of C<(normalization =E<gt> "prenormalized")>, 1450b39c5158Smillertany normalization is not performed, but 1451898184e3Ssthendiscontiguous contractions with combining characters are performed. 
1452b39c5158SmillertTherefore 1453b39c5158SmillertC<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })> 1454b39c5158SmillertB<is> equivalent to C<(normalization =E<gt> 'NFD')>. 1455b39c5158SmillertIf source strings are finely prenormalized, 1456b39c5158SmillertC<(normalization =E<gt> 'prenormalized')> may save time for normalization. 1457b39c5158Smillert 1458b39c5158SmillertExcept C<(normalization =E<gt> undef)>, 1459b39c5158SmillertB<Unicode::Normalize> is required (see also B<CAVEAT>). 1460b39c5158Smillert 1461b39c5158Smillert=item overrideCJK 1462b39c5158Smillert 1463b39c5158Smillert-- see 7.1 Derived Collation Elements, UTS #10. 1464b39c5158Smillert 1465898184e3SsthenBy default, CJK unified ideographs are ordered in Unicode codepoint 14666fb12b70Safresh1order, but those in the CJK Unified Ideographs block are less than 1467898184e3Ssthenthose in the CJK Unified Ideographs Extension A etc. 1468b39c5158Smillert 1469898184e3Ssthen In the CJK Unified Ideographs block: 1470898184e3Ssthen U+4E00..U+9FA5 if UCA_Version is 8, 9 or 11. 1471898184e3Ssthen U+4E00..U+9FBB if UCA_Version is 14 or 16. 1472898184e3Ssthen U+4E00..U+9FC3 if UCA_Version is 18. 1473898184e3Ssthen U+4E00..U+9FCB if UCA_Version is 20 or 22. 14749f11ffb7Safresh1 U+4E00..U+9FCC if UCA_Version is 24 to 30. 14759f11ffb7Safresh1 U+4E00..U+9FD5 if UCA_Version is 32 or 34. 14769f11ffb7Safresh1 U+4E00..U+9FEA if UCA_Version is 36. 1477*eac174f2Safresh1 U+4E00..U+9FEF if UCA_Version is 38, 40 or 41. 1478*eac174f2Safresh1 U+4E00..U+9FFC if UCA_Version is 43. 1479b39c5158Smillert 1480898184e3Ssthen In the CJK Unified Ideographs Extension blocks: 1481*eac174f2Safresh1 Ext.A (U+3400..U+4DB5) if UCA_Version is 8 to 41. 1482*eac174f2Safresh1 Ext.A (U+3400..U+4DBF) if UCA_Version is 43. 1483*eac174f2Safresh1 Ext.B (U+20000..U+2A6D6) if UCA_Version is 8 to 41. 1484*eac174f2Safresh1 Ext.B (U+20000..U+2A6DD) if UCA_Version is 43. 
14856fb12b70Safresh1 Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or later. 14866fb12b70Safresh1 Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or later. 14879f11ffb7Safresh1 Ext.E (U+2B820..U+2CEA1) if UCA_Version is 32 or later. 1488*eac174f2Safresh1 Ext.F (U+2CEB0..U+2EBE0) if UCA_Version is 36 or later. 1489*eac174f2Safresh1 Ext.G (U+30000..U+3134A) if UCA_Version is 43. 1490898184e3Ssthen 1491898184e3SsthenThrough C<overrideCJK>, ordering of CJK unified ideographs (including 1492898184e3Ssthenextensions) can be overridden. 1493898184e3Ssthen 1494898184e3Ssthenex. CJK unified ideographs in the JIS code point order. 1495b39c5158Smillert 1496b39c5158Smillert overrideCJK => sub { 1497b39c5158Smillert my $u = shift; # get a Unicode codepoint 1498b39c5158Smillert my $b = pack('n', $u); # to UTF-16BE 1499b39c5158Smillert my $s = your_unicode_to_sjis_converter($b); # convert 1500b39c5158Smillert my $n = unpack('n', $s); # convert sjis to short 1501b39c5158Smillert [ $n, 0x20, 0x2, $u ]; # return the collation element 1502b39c5158Smillert }, 1503b39c5158Smillert 1504898184e3SsthenThe return value may be an arrayref of 1st to 4th weights as shown 1505898184e3Ssthenabove. The return value may be an integer as the primary weight 1506898184e3Ssthenas shown below. If C<undef> is returned, the default derived 1507898184e3Ssthencollation element will be used. 1508898184e3Ssthen 1509898184e3Ssthen overrideCJK => sub { 1510898184e3Ssthen my $u = shift; # get a Unicode codepoint 1511898184e3Ssthen my $b = pack('n', $u); # to UTF-16BE 1512898184e3Ssthen my $s = your_unicode_to_sjis_converter($b); # convert 1513898184e3Ssthen my $n = unpack('n', $s); # convert sjis to short 1514898184e3Ssthen return $n; # return the primary weight 1515898184e3Ssthen }, 1516898184e3Ssthen 1517898184e3SsthenThe return value may be a list containing zero or more of 1518898184e3Ssthenan arrayref, an integer, or C<undef>. 1519898184e3Ssthen 1520898184e3Ssthenex. ignores all CJK unified ideographs. 
1521b39c5158Smillert 1522b39c5158Smillert overrideCJK => sub {()}, # CODEREF returning empty list 1523b39c5158Smillert 1524b39c5158Smillert # where ->eq("Pe\x{4E00}rl", "Perl") is true 1525898184e3Ssthen # as U+4E00 is a CJK unified ideograph and to be ignorable. 1526b39c5158Smillert 15276fb12b70Safresh1If a false value (including C<undef>) is passed, C<overrideCJK> 15286fb12b70Safresh1has no effect. 15296fb12b70Safresh1C<$Collator-E<gt>change(overrideCJK =E<gt> 0)> resets the old one. 15306fb12b70Safresh1 1531898184e3SsthenBut assignment of weight for CJK unified ideographs 1532898184e3Ssthenin C<table> or C<entry> is still valid. 15336fb12b70Safresh1If C<undef> is passed explicitly as the value for this key, 15346fb12b70Safresh1weights for CJK unified ideographs are treated as undefined. 15356fb12b70Safresh1However when C<UCA_Version> E<gt> 8, C<(overrideCJK =E<gt> undef)> 15366fb12b70Safresh1has no special meaning. 1537898184e3Ssthen 1538898184e3SsthenB<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>, 1539898184e3SsthenC<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>, 1540898184e3SsthenC<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified 1541898184e3Ssthenideographs. But they can't be overridden via C<overrideCJK> when you use 1542898184e3SsthenDUCET, as the table includes weights for them. C<table> or C<entry> has 1543898184e3Ssthenpriority over C<overrideCJK>. 1544b39c5158Smillert 1545b39c5158Smillert=item overrideHangul 1546b39c5158Smillert 1547b39c5158Smillert-- see 7.1 Derived Collation Elements, UTS #10. 1548b39c5158Smillert 1549898184e3SsthenBy default, Hangul syllables are decomposed into Hangul Jamo, 1550b39c5158Smillerteven if C<(normalization =E<gt> undef)>. 1551898184e3SsthenBut the mapping of Hangul syllables may be overridden. 1552b39c5158Smillert 1553b39c5158SmillertThis parameter works like C<overrideCJK>, so see there for examples. 
1554b39c5158Smillert 1555898184e3SsthenIf you want to override the mapping of Hangul syllables, 1556898184e3SsthenNFD and NFKD are not appropriate, since NFD and NFKD will decompose 1557898184e3SsthenHangul syllables before overriding. FCD may decompose Hangul syllables 1558898184e3Ssthenas the case may be. 1559b39c5158Smillert 15606fb12b70Safresh1If a false value (but not C<undef>) is passed, C<overrideHangul> 15616fb12b70Safresh1has no effect. 15626fb12b70Safresh1C<$Collator-E<gt>change(overrideHangul =E<gt> 0)> resets the old one. 15636fb12b70Safresh1 1564b39c5158SmillertIf C<undef> is passed explicitly as the value for this key, 1565898184e3Ssthenweight for Hangul syllables is treated as undefined 1566b39c5158Smillertwithout decomposition into Hangul Jamo. 1567898184e3SsthenBut definition of weight for Hangul syllables 1568898184e3Ssthenin C<table> or C<entry> is still valid. 1569b39c5158Smillert 15706fb12b70Safresh1=item overrideOut 15716fb12b70Safresh1 15726fb12b70Safresh1-- see 7.1.1 Handling Ill-Formed Code Unit Sequences, UTS #10. 15736fb12b70Safresh1 15746fb12b70Safresh1Perl seems to allow out-of-range values (greater than 0x10FFFF). 15756fb12b70Safresh1By default, out-of-range values are replaced with C<U+FFFD> 15766fb12b70Safresh1(REPLACEMENT CHARACTER) when C<UCA_Version> E<gt>= 22, 15776fb12b70Safresh1or ignored when C<UCA_Version> E<lt>= 20. 15786fb12b70Safresh1 15796fb12b70Safresh1When C<UCA_Version> E<gt>= 22, the weights of out-of-range values 15806fb12b70Safresh1can be overridden. Though C<table> or C<entry> are available for them, 15816fb12b70Safresh1out-of-range values are too many. 15826fb12b70Safresh1 15836fb12b70Safresh1C<overrideOut> can perform it algorithmically. 15846fb12b70Safresh1This parameter works like C<overrideCJK>, so see there for examples. 15856fb12b70Safresh1 15866fb12b70Safresh1ex. ignores all out-of-range values. 
15876fb12b70Safresh1 15886fb12b70Safresh1 overrideOut => sub {()}, # CODEREF returning empty list 15896fb12b70Safresh1 15906fb12b70Safresh1If a false value (including C<undef>) is passed, C<overrideOut> 15916fb12b70Safresh1has no effect. 15926fb12b70Safresh1C<$Collator-E<gt>change(overrideOut =E<gt> 0)> resets the old one. 15936fb12b70Safresh1 15946fb12b70Safresh1B<NOTE ABOUT U+FFFD:> 15956fb12b70Safresh1 15966fb12b70Safresh1UCA recommends that out-of-range values should not be ignored for security 15976fb12b70Safresh1reasons. Say, C<"pe\x{110000}rl"> should not be equal to C<"perl">. 15986fb12b70Safresh1However, C<U+FFFD> is wrongly mapped to a variable collation element 15996fb12b70Safresh1in DUCET for Unicode 6.0.0 to 6.2.0, that means out-of-range values will be 16006fb12b70Safresh1ignored when C<variable> isn't C<Non-ignorable>. 16016fb12b70Safresh1 16026fb12b70Safresh1The mapping of C<U+FFFD> is corrected in Unicode 6.3.0. 16036fb12b70Safresh1see L<http://www.unicode.org/reports/tr10/tr10-28.html#Trailing_Weights> 16046fb12b70Safresh1(7.1.4 Trailing Weights). Such a correction is reproduced by this. 16056fb12b70Safresh1 16066fb12b70Safresh1 overrideOut => sub { 0xFFFD }, # CODEREF returning a very large integer 16076fb12b70Safresh1 16086fb12b70Safresh1This workaround is unnecessary since Unicode 6.3.0. 16096fb12b70Safresh1 1610b39c5158Smillert=item preprocess 1611b39c5158Smillert 161291f110e0Safresh1-- see 5.4 Preprocessing, UTS #10. 1613b39c5158Smillert 1614898184e3SsthenIf specified, the coderef is used to preprocess each string 1615b39c5158Smillertbefore the formation of sort keys. 1616b39c5158Smillert 1617b39c5158Smillertex. dropping English articles, such as "a" or "the". 1618b39c5158SmillertThen, "the pen" is before "a pencil". 
1619b39c5158Smillert 1620b39c5158Smillert preprocess => sub { 1621b39c5158Smillert my $str = shift; 1622b39c5158Smillert $str =~ s/\b(?:an?|the)\s+//gi; 1623b39c5158Smillert return $str; 1624b39c5158Smillert }, 1625b39c5158Smillert 1626b39c5158SmillertC<preprocess> is performed before C<normalization> (if defined). 1627b39c5158Smillert 1628898184e3Ssthenex. decoding strings in a legacy encoding such as shift-jis: 1629898184e3Ssthen 1630898184e3Ssthen $sjis_collator = Unicode::Collate->new( 1631898184e3Ssthen preprocess => \&your_shiftjis_to_unicode_decoder, 1632898184e3Ssthen ); 1633898184e3Ssthen @result = $sjis_collator->sort(@shiftjis_strings); 1634898184e3Ssthen 1635898184e3SsthenB<Note:> Strings returned from the coderef will be interpreted 1636898184e3Ssthenaccording to Perl's Unicode support. See L<perlunicode>, 1637898184e3SsthenL<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>. 1638898184e3Ssthen 1639b39c5158Smillert=item rearrange 1640b39c5158Smillert 164191f110e0Safresh1-- see 3.5 Rearrangement, UTS #10. 1642b39c5158Smillert 1643b39c5158SmillertCharacters that are not coded in logical order and to be rearranged. 16446fb12b70Safresh1If C<UCA_Version> is equal to or less than 11, default is: 1645b39c5158Smillert 1646b39c5158Smillert rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ], 1647b39c5158Smillert 1648b39c5158SmillertIf you want to disallow any rearrangement, pass C<undef> or C<[]> 1649b39c5158Smillert(a reference to empty list) as the value for this key. 1650b39c5158Smillert 1651898184e3SsthenIf C<UCA_Version> is equal to or greater than 14, default is C<[]> 1652898184e3Ssthen(i.e. no rearrangement). 1653b39c5158Smillert 1654b39c5158SmillertB<According to the version 9 of UCA, this parameter shall not be used; 1655b39c5158Smillertbut it is not warned at present.> 1656b39c5158Smillert 1657898184e3Ssthen=item rewrite 1658898184e3Ssthen 1659898184e3SsthenIf specified, the coderef is used to rewrite lines in C<table> or C<entry>. 
1660898184e3SsthenThe coderef will get each line, and then should return a rewritten line 1661898184e3Ssthenaccording to the UCA file format. 1662898184e3SsthenIf the coderef returns an empty line, the line will be skipped. 1663898184e3Ssthen 1664898184e3Ssthene.g. any primary ignorable characters into tertiary ignorable: 1665898184e3Ssthen 1666898184e3Ssthen rewrite => sub { 1667898184e3Ssthen my $line = shift; 1668898184e3Ssthen $line =~ s/\[\.0000\..{4}\..{4}\./[.0000.0000.0000./g; 1669898184e3Ssthen return $line; 1670898184e3Ssthen }, 1671898184e3Ssthen 1672898184e3SsthenThis example shows rewriting weights. C<rewrite> is allowed to 1673898184e3Ssthenaffect code points, weights, and the name. 1674898184e3Ssthen 1675898184e3SsthenB<NOTE>: C<table> is available to use another table file; 1676898184e3Ssthenpreparing a modified table once would be more efficient than 1677898184e3Ssthenrewriting lines on reading an unmodified table every time. 1678898184e3Ssthen 1679898184e3Ssthen=item suppress 1680898184e3Ssthen 16819f11ffb7Safresh1-- see 3.12 Special-Purpose Commands, UTS #35 (LDML) Part 5: Collation. 1682898184e3Ssthen 1683898184e3SsthenContractions beginning with the specified characters are suppressed, 1684898184e3Sstheneven if those contractions are defined in C<table>. 1685898184e3Ssthen 1686898184e3SsthenAn example for Russian and some languages using the Cyrillic script: 1687898184e3Ssthen 1688898184e3Ssthen suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F], 1689898184e3Ssthen 1690898184e3Ssthenwhere 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE. 1691898184e3Ssthen 16929f11ffb7Safresh1B<NOTE>: Contractions via C<entry> will not be suppressed. 1693898184e3Ssthen 1694b39c5158Smillert=item table 1695b39c5158Smillert 1696b8851fccSafresh1-- see 3.8 Default Unicode Collation Element Table, UTS #10. 1697b39c5158Smillert 1698b39c5158SmillertYou can use another collation element table if desired. 
1699b39c5158Smillert 1700b39c5158SmillertThe table file should be located in the F<Unicode/Collate> directory 1701b39c5158Smillerton C<@INC>. Say, if the filename is F<Foo.txt>, 1702b39c5158Smillertthe table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>. 1703b39c5158Smillert 1704b39c5158SmillertBy default, F<allkeys.txt> (as the filename of DUCET) is used. 1705b39c5158SmillertIf you will prepare your own table file, any name other than F<allkeys.txt> 1706b39c5158Smillertmay be better to avoid namespace conflict. 1707b39c5158Smillert 1708898184e3SsthenB<NOTE>: When XSUB is used, the DUCET is compiled on building this 1709898184e3Ssthenmodule, and it may save time at the run time. 17106fb12b70Safresh1Explicitly specifying C<(table =E<gt> 'allkeys.txt')>, or using another table, 1711898184e3Ssthenor using C<ignoreChar>, C<ignoreName>, C<undefChar>, C<undefName> or 1712898184e3SsthenC<rewrite> will prevent this module from using the compiled DUCET. 1713898184e3Ssthen 1714b39c5158SmillertIf C<undef> is passed explicitly as the value for this key, 1715b39c5158Smillertno file is read (but you can define collation elements via C<entry>). 
1716b39c5158Smillert 1717b39c5158SmillertA typical way to define a collation element table 1718b39c5158Smillertwithout any file of table: 1719b39c5158Smillert 1720b39c5158Smillert $onlyABC = Unicode::Collate->new( 1721b39c5158Smillert table => undef, 1722b39c5158Smillert entry => << 'ENTRIES', 1723b39c5158Smillert0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A 1724b39c5158Smillert0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A 1725b39c5158Smillert0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B 1726b39c5158Smillert0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B 1727b39c5158Smillert0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C 1728b39c5158Smillert0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C 1729b39c5158SmillertENTRIES 1730b39c5158Smillert ); 1731b39c5158Smillert 1732b39c5158SmillertIf C<ignoreName> or C<undefName> is used, character names should be 1733b39c5158Smillertspecified as a comment (following C<#>) on each line. 1734b39c5158Smillert 1735b39c5158Smillert=item undefChar 1736b39c5158Smillert 1737b39c5158Smillert=item undefName 1738b39c5158Smillert 17399f11ffb7Safresh1-- see 6.3.3 Reducing the Repertoire, UTS #10. 1740b39c5158Smillert 1741898184e3SsthenUndefines the collation element as if it were unassigned in the C<table>. 1742b39c5158SmillertThis reduces the size of the table. 1743b39c5158SmillertIf an unassigned character appears in the string to be collated, 1744b39c5158Smillertthe sort key is made from its codepoint 1745b39c5158Smillertas a single-character collation element, 1746b39c5158Smillertas it is greater than any other assigned collation elements 1747b39c5158Smillert(in the codepoint order among the unassigned characters). 1748b39c5158SmillertBut, it'd be better to ignore characters 1749b39c5158Smillertunfamiliar to you and maybe never used. 1750b39c5158Smillert 1751b39c5158SmillertThrough C<undefChar>, any character matching C<qr/$undefChar/> 1752b39c5158Smillertwill be undefined. 
Through C<undefName>, any character whose name 1753b39c5158Smillert(given in the C<table> file as a comment) matches C<qr/$undefName/> 1754b39c5158Smillertwill be undefined. 1755b39c5158Smillert 1756b39c5158Smillertex. Collation weights for beyond-BMP characters are not stored in object: 1757b39c5158Smillert 1758b39c5158Smillert undefChar => qr/[^\0-\x{fffd}]/, 1759b39c5158Smillert 1760b39c5158Smillert=item upper_before_lower 1761b39c5158Smillert 1762b39c5158Smillert-- see 6.6 Case Comparisons, UTS #10. 1763b39c5158Smillert 1764b39c5158SmillertBy default, lowercase is before uppercase. 1765b39c5158SmillertIf the parameter is made true, this is reversed. 1766b39c5158Smillert 1767b39c5158SmillertB<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase 1768b39c5158Smillertdistinctions must occur in level 3, and their weights at level 3 must be 1769b39c5158Smillertthe same as those mentioned in 7.3.1, UTS #10. 1770b39c5158SmillertIf you define collation elements that differ from this requirement, 1771b39c5158Smillertthis parameter doesn't work properly. 1772b39c5158Smillert 1773b39c5158Smillert=item variable 1774b39c5158Smillert 1775b8851fccSafresh1-- see 3.6 Variable Weighting, UTS #10. 1776b39c5158Smillert 1777898184e3SsthenThis key allows for variable weighting of variable collation elements, 1778b39c5158Smillertwhich are marked with an ASTERISK in the table 1779898184e3Ssthen(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>). 1780b39c5158Smillert 1781b39c5158Smillert variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'. 1782b39c5158Smillert 1783b39c5158SmillertThese names are case-insensitive. 1784b39c5158SmillertBy default (if specification is omitted), 'shifted' is adopted. 1785b39c5158Smillert 1786b39c5158Smillert 'Blanked' Variable elements are made ignorable at levels 1 through 3; 1787b39c5158Smillert considered at the 4th level. 
1788b39c5158Smillert 1789b39c5158Smillert 'Non-Ignorable' Variable elements are not reset to ignorable. 1790b39c5158Smillert 1791b39c5158Smillert 'Shifted' Variable elements are made ignorable at levels 1 through 3; 1792b39c5158Smillert their level 4 weight is replaced by the old level 1 weight. 1793b39c5158Smillert Level 4 weight for Non-Variable elements is 0xFFFF. 1794b39c5158Smillert 1795b39c5158Smillert 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level 1796b39c5158Smillert are trimmed. 1797b39c5158Smillert 1798b39c5158Smillert=back 1799b39c5158Smillert 1800b39c5158Smillert=head2 Methods for Collation 1801b39c5158Smillert 1802b39c5158Smillert=over 4 1803b39c5158Smillert 1804b39c5158Smillert=item C<@sorted = $Collator-E<gt>sort(@not_sorted)> 1805b39c5158Smillert 1806b39c5158SmillertSorts a list of strings. 1807b39c5158Smillert 1808b39c5158Smillert=item C<$result = $Collator-E<gt>cmp($a, $b)> 1809b39c5158Smillert 1810b39c5158SmillertReturns 1 (when C<$a> is greater than C<$b>) 1811b39c5158Smillertor 0 (when C<$a> is equal to C<$b>) 18126fb12b70Safresh1or -1 (when C<$a> is less than C<$b>). 1813b39c5158Smillert 1814b39c5158Smillert=item C<$result = $Collator-E<gt>eq($a, $b)> 1815b39c5158Smillert 1816b39c5158Smillert=item C<$result = $Collator-E<gt>ne($a, $b)> 1817b39c5158Smillert 1818b39c5158Smillert=item C<$result = $Collator-E<gt>lt($a, $b)> 1819b39c5158Smillert 1820b39c5158Smillert=item C<$result = $Collator-E<gt>le($a, $b)> 1821b39c5158Smillert 1822b39c5158Smillert=item C<$result = $Collator-E<gt>gt($a, $b)> 1823b39c5158Smillert 1824b39c5158Smillert=item C<$result = $Collator-E<gt>ge($a, $b)> 1825b39c5158Smillert 1826b39c5158SmillertThey work like the Perl operators of the same names. 1827b39c5158Smillert 1828b39c5158Smillert eq : whether $a is equal to $b. 1829b39c5158Smillert ne : whether $a is not equal to $b. 18306fb12b70Safresh1 lt : whether $a is less than $b. 18316fb12b70Safresh1 le : whether $a is less than $b or equal to $b. 
1832b39c5158Smillert gt : whether $a is greater than $b. 1833b39c5158Smillert ge : whether $a is greater than $b or equal to $b. 1834b39c5158Smillert 1835b39c5158Smillert=item C<$sortKey = $Collator-E<gt>getSortKey($string)> 1836b39c5158Smillert 1837b39c5158Smillert-- see 4.3 Form Sort Key, UTS #10. 1838b39c5158Smillert 1839b39c5158SmillertReturns a sort key. 1840b39c5158Smillert 1841b39c5158SmillertYou compare the sort keys using a binary comparison 1842b39c5158Smillertand get the result of the comparison of the strings using UCA. 1843b39c5158Smillert 1844b39c5158Smillert $Collator->getSortKey($a) cmp $Collator->getSortKey($b) 1845b39c5158Smillert 1846b39c5158Smillert is equivalent to 1847b39c5158Smillert 1848b39c5158Smillert $Collator->cmp($a, $b) 1849b39c5158Smillert 1850b39c5158Smillert=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)> 1851b39c5158Smillert 1852b39c5158SmillertConverts a sorting key into its representation form. 1853b39c5158SmillertIf C<UCA_Version> is 8, the output is slightly different. 1854b39c5158Smillert 1855b39c5158Smillert use Unicode::Collate; 1856b39c5158Smillert my $c = Unicode::Collate->new(); 1857b39c5158Smillert print $c->viewSortKey("Perl"),"\n"; 1858b39c5158Smillert 1859b39c5158Smillert # output: 1860b39c5158Smillert # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF] 1861b39c5158Smillert # Level 1 Level 2 Level 3 Level 4 1862b39c5158Smillert 1863b39c5158Smillert=back 1864b39c5158Smillert 1865b39c5158Smillert=head2 Methods for Searching 1866b39c5158Smillert 1867b39c5158SmillertThe C<match>, C<gmatch>, C<subst>, C<gsubst> methods work 1868b39c5158Smillertlike C<m//>, C<m//g>, C<s///>, C<s///g>, respectively, 1869b39c5158Smillertbut they are not aware of any pattern, but only a literal substring. 
1870b39c5158Smillert 1871898184e3SsthenB<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true 1872898184e3Ssthenfor C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>, 1873898184e3SsthenC<subst>, C<gsubst>) is croaked, as the position and the length might 1874898184e3Ssthendiffer from those on the specified string. 1875898184e3Ssthen 1876898184e3SsthenC<rearrange> and C<hangul_terminator> parameters are neglected. 1877898184e3SsthenC<katakana_before_hiragana> and C<upper_before_lower> don't affect 18786fb12b70Safresh1matching and searching, as it doesn't matter whether greater or less. 1879898184e3Ssthen 1880b39c5158Smillert=over 4 1881b39c5158Smillert 1882b39c5158Smillert=item C<$position = $Collator-E<gt>index($string, $substring[, $position])> 1883b39c5158Smillert 1884b39c5158Smillert=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])> 1885b39c5158Smillert 1886b39c5158SmillertIf C<$substring> matches a part of C<$string>, returns 1887b39c5158Smillertthe position of the first occurrence of the matching part in scalar context; 1888b39c5158Smillertin list context, returns a two-element list of 1889b39c5158Smillertthe position and the length of the matching part. 1890b39c5158Smillert 1891b39c5158SmillertIf C<$substring> does not match any part of C<$string>, 1892b39c5158Smillertreturns C<-1> in scalar context and 1893b39c5158Smillertan empty list in list context. 1894b39c5158Smillert 18956fb12b70Safresh1e.g. when the content of C<$str> is C<"Ich mu>E<szlig>C< studieren Perl.">, 18966fb12b70Safresh1you say the following where C<$sub> is C<"M>E<uuml>C<SS">, 1897b39c5158Smillert 1898b39c5158Smillert my $Collator = Unicode::Collate->new( normalization => undef, level => 1 ); 1899b39c5158Smillert # (normalization => undef) is REQUIRED. 
1900b39c5158Smillert my $match; 1901b39c5158Smillert if (my($pos,$len) = $Collator->index($str, $sub)) { 1902b39c5158Smillert $match = substr($str, $pos, $len); 1903b39c5158Smillert } 1904b39c5158Smillert 19056fb12b70Safresh1and get C<"mu>E<szlig>C<"> in C<$match>, since C<"mu>E<szlig>C<"> 19066fb12b70Safresh1is primary equal to C<"M>E<uuml>C<SS">. 1907b39c5158Smillert 1908b39c5158Smillert=item C<$match_ref = $Collator-E<gt>match($string, $substring)> 1909b39c5158Smillert 1910b39c5158Smillert=item C<($match) = $Collator-E<gt>match($string, $substring)> 1911b39c5158Smillert 1912b39c5158SmillertIf C<$substring> matches a part of C<$string>, in scalar context, returns 1913b39c5158SmillertB<a reference to> the first occurrence of the matching part 1914b39c5158Smillert(C<$match_ref> is always true if matches, 1915b39c5158Smillertsince every reference is B<true>); 1916b39c5158Smillertin list context, returns the first occurrence of the matching part. 1917b39c5158Smillert 1918b39c5158SmillertIf C<$substring> does not match any part of C<$string>, 1919b39c5158Smillertreturns C<undef> in scalar context and 1920b39c5158Smillertan empty list in list context. 1921b39c5158Smillert 1922b39c5158Smillerte.g. 
1923b39c5158Smillert 1924b39c5158Smillert if ($match_ref = $Collator->match($str, $sub)) { # scalar context 1925b39c5158Smillert print "matches [$$match_ref].\n"; 1926b39c5158Smillert } else { 1927b39c5158Smillert print "doesn't match.\n"; 1928b39c5158Smillert } 1929b39c5158Smillert 1930b39c5158Smillert or 1931b39c5158Smillert 1932b39c5158Smillert if (($match) = $Collator->match($str, $sub)) { # list context 1933b39c5158Smillert print "matches [$match].\n"; 1934b39c5158Smillert } else { 1935b39c5158Smillert print "doesn't match.\n"; 1936b39c5158Smillert } 1937b39c5158Smillert 1938b39c5158Smillert=item C<@match = $Collator-E<gt>gmatch($string, $substring)> 1939b39c5158Smillert 1940b39c5158SmillertIf C<$substring> matches a part of C<$string>, returns 1941b39c5158Smillertall the matching parts (or matching count in scalar context). 1942b39c5158Smillert 1943b39c5158SmillertIf C<$substring> does not match any part of C<$string>, 1944b39c5158Smillertreturns an empty list. 1945b39c5158Smillert 1946b39c5158Smillert=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)> 1947b39c5158Smillert 1948b39c5158SmillertIf C<$substring> matches a part of C<$string>, 1949b39c5158Smillertthe first occurrence of the matching part is replaced by C<$replacement> 1950898184e3Ssthen(C<$string> is modified) and C<$count> (always equals to C<1>) is returned. 1951b39c5158Smillert 1952b39c5158SmillertC<$replacement> can be a C<CODEREF>, 1953b39c5158Smillerttaking the matching part as an argument, 1954b39c5158Smillertand returning a string to replace the matching part 1955b39c5158Smillert(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>). 
1956b39c5158Smillert 1957b39c5158Smillert=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)> 1958b39c5158Smillert 1959b39c5158SmillertIf C<$substring> matches a part of C<$string>, 1960898184e3Ssthenall the occurrences of the matching part are replaced by C<$replacement> 1961898184e3Ssthen(C<$string> is modified) and C<$count> is returned. 1962b39c5158Smillert 1963b39c5158SmillertC<$replacement> can be a C<CODEREF>, 1964b39c5158Smillerttaking the matching part as an argument, 1965b39c5158Smillertand returning a string to replace the matching part 1966b39c5158Smillert(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>). 1967b39c5158Smillert 1968b39c5158Smillerte.g. 1969b39c5158Smillert 1970b39c5158Smillert my $Collator = Unicode::Collate->new( normalization => undef, level => 1 ); 1971b39c5158Smillert # (normalization => undef) is REQUIRED. 1972898184e3Ssthen my $str = "Camel donkey zebra came\x{301}l CAMEL horse cam\0e\0l..."; 1973b39c5158Smillert $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" }); 1974b39c5158Smillert 1975898184e3Ssthen # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cam\0e\0l</b>..."; 1976b39c5158Smillert # i.e., all the camels are made bold-faced. 1977b39c5158Smillert 1978898184e3Ssthen Examples: levels and ignore_level2 - what does camel match? 
1979898184e3Ssthen --------------------------------------------------------------------------- 1980898184e3Ssthen level ignore_level2 | camel Camel came\x{301}l c-a-m-e-l cam\0e\0l 1981898184e3Ssthen -----------------------|--------------------------------------------------- 1982898184e3Ssthen 1 false | yes yes yes yes yes 1983898184e3Ssthen 2 false | yes yes no yes yes 1984898184e3Ssthen 3 false | yes no no yes yes 1985898184e3Ssthen 4 false | yes no no no yes 1986898184e3Ssthen -----------------------|--------------------------------------------------- 1987898184e3Ssthen 1 true | yes yes yes yes yes 1988898184e3Ssthen 2 true | yes yes yes yes yes 1989898184e3Ssthen 3 true | yes no yes yes yes 1990898184e3Ssthen 4 true | yes no yes no yes 1991898184e3Ssthen --------------------------------------------------------------------------- 1992898184e3Ssthen note: if variable => non-ignorable, camel doesn't match c-a-m-e-l 1993898184e3Ssthen at any level. 1994898184e3Ssthen 1995b39c5158Smillert=back 1996b39c5158Smillert 1997b39c5158Smillert=head2 Other Methods 1998b39c5158Smillert 1999b39c5158Smillert=over 4 2000b39c5158Smillert 2001b39c5158Smillert=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)> 2002b39c5158Smillert 2003898184e3Ssthen=item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)> 2004898184e3Ssthen 2005898184e3SsthenChanges the value of specified keys and returns the changed part. 2006b39c5158Smillert 2007b39c5158Smillert $Collator = Unicode::Collate->new(level => 4); 2008b39c5158Smillert 2009b39c5158Smillert $Collator->eq("perl", "PERL"); # false 2010b39c5158Smillert 2011b39c5158Smillert %old = $Collator->change(level => 2); # returns (level => 4). 2012b39c5158Smillert 2013b39c5158Smillert $Collator->eq("perl", "PERL"); # true 2014b39c5158Smillert 2015b39c5158Smillert $Collator->change(%old); # returns (level => 2). 
2016b39c5158Smillert 2017b39c5158Smillert $Collator->eq("perl", "PERL"); # false 2018b39c5158Smillert 2019b39c5158SmillertNot all C<(key,value)>s are allowed to be changed. 2020b39c5158SmillertSee also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>. 2021b39c5158Smillert 2022b39c5158SmillertIn the scalar context, returns the modified collator 2023b39c5158Smillert(but it is B<not> a clone from the original). 2024b39c5158Smillert 2025b39c5158Smillert $Collator->change(level => 2)->eq("perl", "PERL"); # true 2026b39c5158Smillert 2027b39c5158Smillert $Collator->eq("perl", "PERL"); # true; now max level is 2nd. 2028b39c5158Smillert 2029b39c5158Smillert $Collator->change(level => 4)->eq("perl", "PERL"); # false 2030b39c5158Smillert 2031b39c5158Smillert=item C<$version = $Collator-E<gt>version()> 2032b39c5158Smillert 2033b39c5158SmillertReturns the version number (a string) of the Unicode Standard 2034b39c5158Smillertwhich the C<table> file used by the collator object is based on. 2035b39c5158SmillertIf the table does not include a version line (starting with C<@version>), 2036b39c5158Smillertreturns C<"unknown">. 2037b39c5158Smillert 2038b39c5158Smillert=item C<UCA_Version()> 2039b39c5158Smillert 2040898184e3SsthenReturns the revision number of UTS #10 this module consults, 2041898184e3Ssthenthat should correspond with the DUCET incorporated. 2042b39c5158Smillert 2043b39c5158Smillert=item C<Base_Unicode_Version()> 2044b39c5158Smillert 2045898184e3SsthenReturns the version number of UTS #10 this module consults, 2046898184e3Ssthenthat should correspond with the DUCET incorporated. 2047b39c5158Smillert 2048b39c5158Smillert=back 2049b39c5158Smillert 2050b39c5158Smillert=head1 EXPORT 2051b39c5158Smillert 2052b39c5158SmillertNo method will be exported. 
2053b39c5158Smillert 2054b39c5158Smillert=head1 INSTALL 2055b39c5158Smillert 2056b39c5158SmillertThough this module can be used without any C<table> file, 2057b39c5158Smillertto use this module easily, it is recommended to install a table file 2058b39c5158Smillertin the UCA format, by copying it under the directory 2059b39c5158Smillert<a place in @INC>/Unicode/Collate. 2060b39c5158Smillert 2061b39c5158SmillertThe most preferable one is "The Default Unicode Collation Element Table" 2062b39c5158Smillert(aka DUCET), available from the Unicode Consortium's website: 2063b39c5158Smillert 2064b39c5158Smillert http://www.unicode.org/Public/UCA/ 2065b39c5158Smillert 20669f11ffb7Safresh1 http://www.unicode.org/Public/UCA/latest/allkeys.txt 20679f11ffb7Safresh1 (latest version) 2068b39c5158Smillert 2069b39c5158SmillertIf DUCET is not installed, it is recommended to copy the file 2070b39c5158Smillertfrom http://www.unicode.org/Public/UCA/latest/allkeys.txt 2071b39c5158Smillertto <a place in @INC>/Unicode/Collate/allkeys.txt 2072b39c5158Smillertmanually. 2073b39c5158Smillert 2074b39c5158Smillert=head1 CAVEATS 2075b39c5158Smillert 2076b39c5158Smillert=over 4 2077b39c5158Smillert 2078b39c5158Smillert=item Normalization 2079b39c5158Smillert 2080b39c5158SmillertUse of the C<normalization> parameter requires the B<Unicode::Normalize> 2081b39c5158Smillertmodule (see L<Unicode::Normalize>). 2082b39c5158Smillert 2083b39c5158SmillertIf you do not need it (say, when you do not need to 2084b39c5158Smillerthandle any combining characters), 20856fb12b70Safresh1assign C<(normalization =E<gt> undef)> explicitly. 2086b39c5158Smillert 2087b39c5158Smillert-- see 6.5 Avoiding Normalization, UTS #10. 2088b39c5158Smillert 2089b39c5158Smillert=item Conformance Test 2090b39c5158Smillert 2091b39c5158SmillertThe Conformance Test for the UCA is available 2092b39c5158Smillertunder L<http://www.unicode.org/Public/UCA/>.
2093b39c5158Smillert 2094b39c5158SmillertFor F<CollationTest_SHIFTED.txt>, 2095b39c5158Smillerta collator via C<Unicode::Collate-E<gt>new( )> should be used; 2096b39c5158Smillertfor F<CollationTest_NON_IGNORABLE.txt>, a collator via 2097b39c5158SmillertC<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>. 2098b39c5158Smillert 209991f110e0Safresh1If C<UCA_Version> is 26 or later, the C<identical> level is preferred; 210091f110e0Safresh1C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and 210191f110e0Safresh1C<Unicode::Collate-E<gt>new(identical =E<gt> 1,> 210291f110e0Safresh1C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used. 210391f110e0Safresh1 2104b39c5158SmillertB<Unicode::Normalize is required to try The Conformance Test.> 2105b39c5158Smillert 2106*eac174f2Safresh1B<EBCDIC-SUPPORT IS EXPERIMENTAL.> 2107*eac174f2Safresh1 2108b39c5158Smillert=back 2109b39c5158Smillert 2110b39c5158Smillert=head1 AUTHOR, COPYRIGHT AND LICENSE 2111b39c5158Smillert 2112b39c5158SmillertThe Unicode::Collate module for perl was written by SADAHIRO Tomoyuki, 2113*eac174f2Safresh1<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2021, 2114b39c5158SmillertSADAHIRO Tomoyuki. Japan. All rights reserved. 2115b39c5158Smillert 2116b39c5158SmillertThis module is free software; you can redistribute it and/or 2117b39c5158Smillertmodify it under the same terms as Perl itself. 2118b39c5158Smillert 2119898184e3SsthenThe file Unicode/Collate/allkeys.txt was copied verbatim 2120*eac174f2Safresh1from L<http://www.unicode.org/Public/UCA/13.0.0/allkeys.txt>. 
2121*eac174f2Safresh1For this file, Copyright (c) 2020 Unicode, Inc.; distributed 21229f11ffb7Safresh1under the Terms of Use in L<http://www.unicode.org/terms_of_use.html>. 2123b39c5158Smillert 2124b39c5158Smillert=head1 SEE ALSO 2125b39c5158Smillert 2126b39c5158Smillert=over 4 2127b39c5158Smillert 2128b39c5158Smillert=item Unicode Collation Algorithm - UTS #10 2129b39c5158Smillert 2130b39c5158SmillertL<http://www.unicode.org/reports/tr10/> 2131b39c5158Smillert 2132b39c5158Smillert=item The Default Unicode Collation Element Table (DUCET) 2133b39c5158Smillert 2134b39c5158SmillertL<http://www.unicode.org/Public/UCA/latest/allkeys.txt> 2135b39c5158Smillert 2136b39c5158Smillert=item The conformance test for the UCA 2137b39c5158Smillert 2138b39c5158SmillertL<http://www.unicode.org/Public/UCA/latest/CollationTest.html> 2139b39c5158Smillert 2140b39c5158SmillertL<http://www.unicode.org/Public/UCA/latest/CollationTest.zip> 2141b39c5158Smillert 2142b39c5158Smillert=item Hangul Syllable Type 2143b39c5158Smillert 2144b39c5158SmillertL<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt> 2145b39c5158Smillert 2146b39c5158Smillert=item Unicode Normalization Forms - UAX #15 2147b39c5158Smillert 2148b39c5158SmillertL<http://www.unicode.org/reports/tr15/> 2149b39c5158Smillert 2150898184e3Ssthen=item Unicode Locale Data Markup Language (LDML) - UTS #35 2151898184e3Ssthen 2152898184e3SsthenL<http://www.unicode.org/reports/tr35/> 2153898184e3Ssthen 2154b39c5158Smillert=back 2155b39c5158Smillert 2156b39c5158Smillert=cut 2157