xref: /openbsd-src/gnu/usr.bin/perl/cpan/Unicode-Collate/Collate.pm (revision eac174f2741a08d8deb8aae59a7f778ef9b5d770)
1b39c5158Smillertpackage Unicode::Collate;
2b39c5158Smillert
3b39c5158Smillertuse 5.006;
4b39c5158Smillertuse strict;
5b39c5158Smillertuse warnings;
6b39c5158Smillertuse Carp;
7b39c5158Smillertuse File::Spec;
8b39c5158Smillert
9b39c5158Smillertno warnings 'utf8';
10b39c5158Smillert
11*eac174f2Safresh1our $VERSION = '1.31';
12b39c5158Smillertour $PACKAGE = __PACKAGE__;
13b39c5158Smillert
14898184e3Ssthen### begin XS only ###
159f11ffb7Safresh1use XSLoader ();
169f11ffb7Safresh1XSLoader::load('Unicode::Collate', $VERSION);
17898184e3Ssthen### end XS only ###
18898184e3Ssthen
# Module-wide settings.  read_table() looks for the key file at
# File::Spec->catfile($dir, @Path, $table) for each $dir in @INC;
# new() uses $KeyFile as the table name when no 'table' key is passed.
19b39c5158Smillertmy @Path = qw(Unicode Collate);
209f11ffb7Safresh1my $KeyFile = 'allkeys.txt';
21b39c5158Smillert
22b39c5158Smillert# Perl's boolean
23b39c5158Smillertuse constant TRUE  => 1;
24b39c5158Smillertuse constant FALSE => "";
25b39c5158Smillertuse constant NOMATCHPOS => -1;
26b39c5158Smillert
27b39c5158Smillert# A coderef to get combining class imported from Unicode::Normalize
28b39c5158Smillert# (i.e. \&Unicode::Normalize::getCombinClass).
29b39c5158Smillert# This is also used as a HAS_UNICODE_NORMALIZE flag.
30b39c5158Smillertmy $CVgetCombinClass;
31b39c5158Smillert
32b39c5158Smillert# Supported Levels
33b39c5158Smillertuse constant MinLevel => 1;
34b39c5158Smillertuse constant MaxLevel => 4;
35b39c5158Smillert
36b39c5158Smillert# Minimum weights at level 2 and 3, respectively
37b39c5158Smillertuse constant Min2Wt => 0x20;
38b39c5158Smillertuse constant Min3Wt => 0x02;
39b39c5158Smillert
40b39c5158Smillert# Shifted weight at 4th level
41b39c5158Smillertuse constant Shift4Wt => 0xFFFF;
42b39c5158Smillert
43b39c5158Smillert# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
44b39c5158Smillertuse constant VCE_TEMPLATE => 'Cn4';
45b39c5158Smillert
46b39c5158Smillert# A sort key: 16-bit weights
47b39c5158Smillertuse constant KEY_TEMPLATE => 'n*';
48b39c5158Smillert
4991f110e0Safresh1# The tie-breaking: 32-bit weights
5091f110e0Safresh1use constant TIE_TEMPLATE => 'N*';
5191f110e0Safresh1
52b39c5158Smillert# Level separator in a sort key:
53b39c5158Smillert# i.e. pack(KEY_TEMPLATE, 0)
54b39c5158Smillertuse constant LEVEL_SEP => "\0\0";
55b39c5158Smillert
56b39c5158Smillert# As Unicode code point separator for hash keys.
57b39c5158Smillert# A joined code point string (denoted by JCPS below)
58b39c5158Smillert# like "65;768" is used for internal processing
59b39c5158Smillert# instead of Perl's Unicode string like "\x41\x{300}",
60b39c5158Smillert# as the native code point is different from the Unicode code point
61b39c5158Smillert# on EBCDIC platform.
62b39c5158Smillert# This character must not be included in any stringified
63b39c5158Smillert# representation of an integer.
64b39c5158Smillertuse constant CODE_SEP => ';';
65898184e3Ssthen	# NOTE: in regex /;/ is used for $jcps!
66b39c5158Smillert
67b39c5158Smillert# boolean values of variable weights
68b39c5158Smillertuse constant NON_VAR => 0; # Non-Variable character
69b39c5158Smillertuse constant VAR     => 1; # Variable character
70b39c5158Smillert
71b39c5158Smillert# specific code points
72b39c5158Smillertuse constant Hangul_SIni   => 0xAC00;
73b39c5158Smillertuse constant Hangul_SFin   => 0xD7A3;
74b39c5158Smillert
75b39c5158Smillert# Logical_Order_Exception in PropList.txt
# new() uses this list as the default 'rearrange' value when
# UCA_Version <= 11 and the table supplies no rearrange list.
76b39c5158Smillertmy $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
77b39c5158Smillert
7891f110e0Safresh1# for highestFFFF and minimalFFFE
# Both are packed with VCE_TEMPLATE ('Cn4'): one byte for the variable
# flag (0 here) followed by four 16-bit weights.  getWt() returns them
# for code points 0xFFFF / 0xFFFE when the matching option is set.
7991f110e0Safresh1my $HighestVCE = pack(VCE_TEMPLATE, 0, 0xFFFE, 0x20, 0x5, 0xFFFF);
8091f110e0Safresh1my $minimalVCE = pack(VCE_TEMPLATE, 0,      1, 0x20, 0x5, 0xFFFE);
81b39c5158Smillert
# Default UCA version; new() stores this in $self->{UCA_Version}
# when the caller does not supply one.
sub UCA_Version {
    return '43';
}
8391f110e0Safresh1
# Returns the version string '13.0.0'
# (presumably the Unicode version this release is based on -- confirm).
sub Base_Unicode_Version {
    return '13.0.0';
}
85b39c5158Smillert
86b39c5158Smillert######
87b39c5158Smillert
# Code point converters used by pack_U() and unpack_U().
# When $::IS_ASCII is true, or on perls older than 5.8, they return the
# code point unchanged; otherwise they go through the utf8:: functions.
my $native_to_unicode;
if ($::IS_ASCII || $] < 5.008) {
    $native_to_unicode = sub { my ($cp) = @_; return $cp };
} else {
    $native_to_unicode = sub { my ($cp) = @_; return utf8::native_to_unicode($cp) };
}

my $unicode_to_native;
if ($::IS_ASCII || $] < 5.008) {
    $unicode_to_native = sub { my ($cp) = @_; return $cp };
} else {
    $unicode_to_native = sub { my ($cp) = @_; return utf8::unicode_to_native($cp) };
}

# pack_U(LIST of Unicode code points) returns the corresponding string.
sub pack_U {
    my @native = map { $unicode_to_native->($_) } @_;
    return pack('U*', @native);
}

# unpack_U(STRING) returns the list of Unicode code points of STRING.
# An empty pack('U*') is appended before unpacking because, on older
# perl versions, pack('U*') generates an empty string with the utf8 flag.
sub unpack_U {
    my $str = shift;
    my @native = unpack('U*', $str . pack('U*'));
    return map { $native_to_unicode->($_) } @native;
}
1066fb12b70Safresh1
107b39c5158Smillert######
108b39c5158Smillert
# Accepted values of the 'variable' (a.k.a. 'alternate') parameter.
# checkCollator() lowercases the value and croaks unless it is a key here.
109b39c5158Smillertmy (%VariableOK);
110b39c5158Smillert@VariableOK{ qw/
111b39c5158Smillert    blanked  non-ignorable  shifted  shift-trimmed
112b39c5158Smillert  / } = (); # keys lowercased
113b39c5158Smillert
# Parameters that change() is allowed to modify.
114b39c5158Smillertour @ChangeOK = qw/
115b39c5158Smillert    alternate backwards level normalization rearrange
116898184e3Ssthen    katakana_before_hiragana upper_before_lower ignore_level2
1176fb12b70Safresh1    overrideCJK overrideHangul overrideOut preprocess UCA_Version
11891f110e0Safresh1    hangul_terminator variable identical highestFFFF minimalFFFE
119b8851fccSafresh1    long_contraction
120b39c5158Smillert  /;
121b39c5158Smillert
# Parameters that change() refuses to modify (it croaks).
# Keys in neither list are silently ignored by change().
122b39c5158Smillertour @ChangeNG = qw/
123898184e3Ssthen    entry mapping table maxlength contraction
124898184e3Ssthen    ignoreChar ignoreName undefChar undefName rewrite
125898184e3Ssthen    versionTable alternateTable backwardsTable forwardsTable
126898184e3Ssthen    rearrangeTable variableTable
127898184e3Ssthen    derivCode normCode rearrangeHash backwardsFlag
128898184e3Ssthen    suppress suppressHash
129898184e3Ssthen    __useXS /; ### XS only
1306fb12b70Safresh1# The hash key 'ignored' was deleted at v 0.21.
1316fb12b70Safresh1# The hash key 'isShift' was deleted at v 0.23.
1326fb12b70Safresh1# The hash key 'combining' was deleted at v 0.24.
1336fb12b70Safresh1# The hash key 'entries' was deleted at v 0.30.
1346fb12b70Safresh1# The hash key 'L3_ignorable' was deleted at v 0.40.
135b39c5158Smillert
# Returns the version string recorded from the table's "@version" line
# ($self->{versionTable}), or 'unknown' when that value is false.
sub version {
    my ($self) = @_;
    my $table_version = $self->{versionTable};
    return $table_version ? $table_version : 'unknown';
}
140b39c5158Smillert
# Lookup sets built from @ChangeOK / @ChangeNG (only the keys matter).
my (%ChangeOK, %ChangeNG);
@ChangeOK{ @ChangeOK } = ();
@ChangeNG{ @ChangeNG } = ();

##
## %old = $self->change(key => value, ...)   (list context)
## $self = $self->change(key => value, ...)  (otherwise)
##
## Modifies the parameters listed in @ChangeOK and re-runs
## checkCollator().  Croaks if any key listed in @ChangeNG is passed;
## keys in neither list are ignored.  'alternate' is an alias of
## 'variable': it is dropped when 'variable' is also given, otherwise
## its value is copied to 'variable'.
##
sub change {
    my $self = shift;
    my %hash = @_;
    my %old;
    if (exists $hash{alternate}) {
        if (exists $hash{variable}) {
            delete $hash{alternate};
        } else {
            $hash{variable} = $hash{alternate};
        }
    }

    # Reject forbidden keys before touching $self, so that a croak never
    # leaves the collator half-modified (hash iteration order is not
    # defined, so otherwise an arbitrary subset could already be applied).
    # Sorting makes the reported key deterministic.
    foreach my $k (sort keys %hash) {
        croak "change of $k via change() is not allowed!"
            if exists $ChangeNG{$k} && !exists $ChangeOK{$k};
    }

    foreach my $k (keys %hash) {
        next if !exists $ChangeOK{$k}; # else => ignored
        $old{$k} = $self->{$k};
        $self->{$k} = $hash{$k};
    }
    $self->checkCollator();
    return wantarray ? %old : $self;
}
168b39c5158Smillert
# _checkLevel($level, $key)
# Croaks unless MinLevel <= $level <= MaxLevel.  $key ('level' or
# 'backwards') names the parameter in the error message.
sub _checkLevel {
    my ($level, $key) = @_;

    unless (MinLevel <= $level) {
        croak sprintf(
            "Illegal level %d (in value for key '%s') lower than %d.",
            $level, $key, MinLevel);
    }
    unless ($level <= MaxLevel) {
        croak sprintf(
            "Unsupported level %d (in value for key '%s') higher than %d.",
            $level, $key, MaxLevel);
    }
    return 1;
}
179b39c5158Smillert
# Maps a UCA_Version value to a _derivCE_* coderef (those subs are
# defined outside this part of the file).  checkCollator() stores the
# selected coderef in $self->{derivCode} and croaks with
# "Illegal UCA version" when the version has no entry here.
180b39c5158Smillertmy %DerivCode = (
181b39c5158Smillert    8 => \&_derivCE_8,
182b39c5158Smillert    9 => \&_derivCE_9,
183b39c5158Smillert   11 => \&_derivCE_9, # 11 == 9
184b39c5158Smillert   14 => \&_derivCE_14,
185898184e3Ssthen   16 => \&_derivCE_14, # 16 == 14
186898184e3Ssthen   18 => \&_derivCE_18,
187898184e3Ssthen   20 => \&_derivCE_20,
188898184e3Ssthen   22 => \&_derivCE_22,
189898184e3Ssthen   24 => \&_derivCE_24,
19091f110e0Safresh1   26 => \&_derivCE_24, # 26 == 24
1916fb12b70Safresh1   28 => \&_derivCE_24, # 28 == 24
192b8851fccSafresh1   30 => \&_derivCE_24, # 30 == 24
1939f11ffb7Safresh1   32 => \&_derivCE_32,
1949f11ffb7Safresh1   34 => \&_derivCE_34,
1959f11ffb7Safresh1   36 => \&_derivCE_36,
196*eac174f2Safresh1   38 => \&_derivCE_38,
197*eac174f2Safresh1   40 => \&_derivCE_40,
198*eac174f2Safresh1   41 => \&_derivCE_40, # 41 == 40
199*eac174f2Safresh1   43 => \&_derivCE_43,
200b39c5158Smillert);
201b39c5158Smillert
##
## $self->checkCollator()
##
## Validates the user-settable parameters and recomputes the values
## derived from them: derivCode, variable/alternate, backwardsFlag,
## rearrangeHash and normCode.  Croaks on an invalid parameter.
## Called from new() and change().
##
sub checkCollator {
    my $self = shift;

    _checkLevel($self->{level}, 'level');

    # pick the derivation routine for the requested UCA version
    $self->{derivCode} = $DerivCode{ $self->{UCA_Version} };
    croak "Illegal UCA version (passed $self->{UCA_Version})."
        unless $self->{derivCode};

    # 'variable' falls back to 'alternate', then to the values read
    # from the table, then to 'shifted'
    if (! $self->{variable}) {
        $self->{variable} = $self->{alternate}
                         || $self->{variableTable}
                         || $self->{alternateTable}
                         || 'shifted';
    }
    my $variable = lc($self->{variable});
    $self->{alternate} = $variable;
    $self->{variable}  = $variable;
    croak "$PACKAGE unknown variable parameter name: $self->{variable}"
        unless exists $VariableOK{ $self->{variable} };

    # backwardsFlag is a bit mask with bit N set for each backwards level N
    my $backwards = $self->{backwards};
    if (! defined $backwards) {
        $self->{backwardsFlag} = 0;
    }
    elsif (! ref $backwards) {
        _checkLevel($backwards, 'backwards');
        $self->{backwardsFlag} = 1 << $backwards;
    }
    else {
        my %seen;
        $self->{backwardsFlag} = 0;
        foreach my $lv (@$backwards) {
            _checkLevel($lv, 'backwards');
            $seen{$lv} = 1;
        }
        # each distinct level contributes its bit exactly once
        $self->{backwardsFlag} += 1 << $_ for sort keys %seen;
    }

    $self->{rearrange} = [] unless defined $self->{rearrange};
    croak "$PACKAGE: list for rearrangement must be store in ARRAYREF"
        unless ref $self->{rearrange};

    # keys of $self->{rearrangeHash} are $self->{rearrange};
    # it stays undef when the list is empty.
    $self->{rearrangeHash} = undef;
    my $rearrange = $self->{rearrange};
    if (@$rearrange) {
        my %set;
        @set{ @$rearrange } = ();
        $self->{rearrangeHash} = \%set;
    }

    # normCode stays undef when normalization is undef or 'prenormalized'
    $self->{normCode} = undef;

    if (defined $self->{normalization}) {
        eval { require Unicode::Normalize };
        croak "Unicode::Normalize is required to normalize strings" if $@;

        $CVgetCombinClass = \&Unicode::Normalize::getCombinClass
            unless $CVgetCombinClass;

        my $form = $self->{normalization};
        if ($form =~ /^(?:NF)D\z/) { # tweak for default
            $self->{normCode} = \&Unicode::Normalize::NFD;
        }
        elsif ($form ne 'prenormalized') {
            $self->{normCode} = sub {
                Unicode::Normalize::normalize($form, shift);
            };
            # trial call: an unknown form name makes normalize() die
            eval { $self->{normCode}->("") };
            croak "$PACKAGE unknown normalization form name: $form" if $@;
        }
    }
    return;
}
265b39c5158Smillert
##
## $collator = CLASS->new(key => value, ...)
##
## Builds a collator from the given parameters, reads the key table
## (unless table => undef was passed explicitly), applies any 'entry'
## tailoring lines, fills in defaults and validates the result with
## checkCollator().
##
sub new
{
    my $class = shift;
    my $self = bless { @_ }, $class;

### begin XS only ###
    # The XS lookup is used only when the default table is requested
    # and none of the options that alter table entries is defined.
    my $customized = exists($self->{table})
        || grep { defined $self->{$_} }
               qw(rewrite undefName ignoreName undefChar ignoreChar);
    $self->{__useXS} = $customized ? undef : \&_fetch_simple;
### end XS only ###

    # keys of $self->{suppressHash} are $self->{suppress};
    # this must be prepared before read_table().
    my $suppress = $self->{suppress};
    if ($suppress && @$suppress) {
        @{ $self->{suppressHash} }{ @$suppress } = ();
    }

    # If undef is passed explicitly, no file is read.
    if (! exists $self->{table}) {
        $self->{table} = $KeyFile;
    }
    if (defined $self->{table}) {
        $self->read_table();
    }

    # each non-empty line of 'entry' is parsed as a tailoring entry
    if ($self->{entry}) {
        foreach my $entry_line ($self->{entry} =~ /([^\n]+)/g) {
            $self->parseEntry($entry_line, TRUE);
        }
    }

    # only in new(), not in change()
    $self->{level}       = MaxLevel      unless $self->{level};
    $self->{UCA_Version} = UCA_Version() unless $self->{UCA_Version};

    if (! exists $self->{overrideHangul}) {
        $self->{overrideHangul} = FALSE;
    }
    if (! exists $self->{overrideCJK}) {
        $self->{overrideCJK} = FALSE;
    }
    if (! exists $self->{normalization}) {
        $self->{normalization} = 'NFD';
    }
    if (! exists $self->{rearrange}) {
        $self->{rearrange} = $self->{rearrangeTable}
            || ($self->{UCA_Version} <= 11 ? $DefaultRearrange : []);
    }
    if (! exists $self->{backwards}) {
        $self->{backwards} = $self->{backwardsTable};
    }
    if (! exists $self->{long_contraction}) {
        my $vers = $self->{UCA_Version};
        $self->{long_contraction} = (22 <= $vers && $vers <= 24);
    }

    # checkCollator() will be called in change()
    $self->checkCollator();

    return $self;
}
319b39c5158Smillert
##
## $self->parseAtmark($directive)
##
## Handles one "@..." line of a key table; the caller has already
## removed the leading '@' (s/^\s*\@//).  Recognized directives store
## their argument in $self; anything else is ignored.
##
sub parseAtmark {
    my ($self, $directive) = @_;

    if ($directive =~ /^version\s*(\S*)/) {
        # the first version seen wins
        $self->{versionTable} ||= $1;
    }
    elsif ($directive =~ /^variable\s+(\S*)/) { # since UTS #10-9
        $self->{variableTable} ||= $1;
    }
    elsif ($directive =~ /^alternate\s+(\S*)/) { # till UTS #10-8
        $self->{alternateTable} ||= $1;
    }
    elsif ($directive =~ /^backwards\s+(\S*)/) {
        push @{ $self->{backwardsTable} }, $1;
    }
    elsif ($directive =~ /^forwards\s+(\S*)/) { # perhaps no use
        push @{ $self->{forwardsTable} }, $1;
    }
    elsif ($directive =~ /^rearrange\s+(.*)/) { # (\S*) is NG
        push @{ $self->{rearrangeTable} }, _getHexArray($1);
    }
}
343898184e3Ssthen
##
## $self->read_table()
##
## Loads the collation key table named by $self->{table}.
## With the XS lookup enabled, only the lines returned by _fetch_rest()
## are parsed.  Otherwise the file is searched for under each @INC
## directory as <dir>/Unicode/Collate/<table>, and every line is handed
## to parseAtmark() ("@" directives) or parseEntry() (everything else).
## Comment lines starting with '#' are skipped.
## Croaks if the table file cannot be opened in any @INC directory.
##
sub read_table {
    my $self = shift;

### begin XS only ###
    if ($self->{__useXS}) {
        my @rest = _fetch_rest(); # complex matter need to parse
        for my $line (@rest) {
            next if $line =~ /^\s*#/;

            if ($line =~ s/^\s*\@//) {
                $self->parseAtmark($line);
            } else {
                $self->parseEntry($line);
            }
        }
        return;
    }
### end XS only ###

    my($f, $fh);
    foreach my $d (@INC) {
        $f = File::Spec->catfile($d, @Path, $self->{table});
        # three-argument open: the table name is caller-supplied, so the
        # mode must never be parsed out of the file name.
        last if open($fh, '<', $f);
        $f = undef;
    }
    if (!defined $f) {
        $f = File::Spec->catfile(@Path, $self->{table});
        croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
    }

    while (my $line = <$fh>) {
        next if $line =~ /^\s*#/;

        if ($line =~ s/^\s*\@//) {
            $self->parseAtmark($line);
        } else {
            $self->parseEntry($line);
        }
    }
    close $fh;
}
385b39c5158Smillert
386b39c5158Smillert
387b39c5158Smillert##
388b39c5158Smillert## get $line, parse it, and write an entry in $self
389b39c5158Smillert##
##
## $self->parseEntry($line, $tailoring)
##
## Parses one table line of the form
##     <charList> ; <collElement>... # <name>
## and records it in $self->{mapping} (plus {maxlength} and
## {contraction} for multi-code-point entries).
## $tailoring is true for lines coming from the 'entry' parameter;
## such lines are exempt from the 'suppress' filter.
## Lines not starting with a hex digit are ignored.  Croaks when the
## ';' separator (or the collation element part) is missing.
##
sub parseEntry
{
    my $self = shift;
    my $line = shift;
    my $tailoring = shift;
    my($entry, @uv, @key);

    # Defaults to an empty string so that the undefName / ignoreName
    # matches below do not warn about an uninitialized value when the
    # line carries no comment; an empty string matches exactly like undef.
    my $name = '';

    if (defined $self->{rewrite}) {
        $line = $self->{rewrite}->($line);
    }

    return if $line !~ /^\s*[0-9A-Fa-f]/;

    # removes comment and gets name
    $name = $1
        if $line =~ s/[#%]\s*(.*)//;
    return if defined $self->{undefName} && $name =~ /$self->{undefName}/;

    # gets element
    my($e, $k) = split /;/, $line;
    croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
        if ! $k;

    @uv = _getHexArray($e);
    return if !@uv;
    # contractions starting with a suppressed code point are dropped,
    # unless the line is a tailoring entry
    return if @uv > 1 && $self->{suppressHash} && !$tailoring &&
                  exists $self->{suppressHash}{$uv[0]};
    $entry = join(CODE_SEP, @uv); # in JCPS

    if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
        my $ele = pack_U(@uv);

        # regarded as if it were not stored in the table
        return
            if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;

        # replaced as completely ignorable
        $k = '[.0000.0000.0000.0000]'
            if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
    }

    # replaced as completely ignorable
    $k = '[.0000.0000.0000.0000]'
        if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;

    my $is_L3_ignorable = TRUE;

    foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
        my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
        my @wt = _getHexArray($arr);
        push @key, pack(VCE_TEMPLATE, $var, @wt);
        $is_L3_ignorable = FALSE
            if $wt[0] || $wt[1] || $wt[2];
        # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
        # is completely ignorable.
        # For expansion, an entry $is_L3_ignorable
        # if and only if "all" CEs are [.0000.0000.0000].
    }

    # mapping: be an array ref or not exists (any false value is disallowed)
    $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;

    # maxlength: be more than 1 or not exists (any false value is disallowed)
    if (@uv > 1) {
        if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) {
            $self->{maxlength}{$uv[0]} = @uv;
        }
    }

    # contraction: be 1 or not exists (any false value is disallowed)
    # every proper prefix of two or more code points is registered
    while (@uv > 2) {
        pop @uv;
        my $fake_entry = join(CODE_SEP, @uv); # in JCPS
        $self->{contraction}{$fake_entry} = 1;
    }
}
466b39c5158Smillert
467b39c5158Smillert
# $self->viewSortKey($str)
# Passes the result of getSortKey($str) to visualizeSortKey()
# and returns what that method returns.
sub viewSortKey
{
    my ($self, $str) = @_;
    my @key = $self->getSortKey($str);
    return $self->visualizeSortKey(@key);
}
474b39c5158Smillert
475b39c5158Smillert
# $str = $self->process($str)
# Applies the 'preprocess' callback and then the normalization routine
# ($self->{normCode}) to the string; a step is skipped when its value
# is not a reference.
sub process
{
    my ($self, $str) = @_;

    foreach my $step ($self->{preprocess}, $self->{normCode}) {
        next unless ref $step;
        $str = $step->($str);
    }
    return $str;
}
48791f110e0Safresh1
488b39c5158Smillert##
489b39c5158Smillert## arrayref of JCPS   = splitEnt(string to be collated)
49091f110e0Safresh1## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE)
491b39c5158Smillert##
# Splits a string into the units that getWt() looks up: each unit is a
# JCPS, i.e. one code point or several joined by CODE_SEP when they form
# a contraction found in $self->{mapping}.  Steps, in order:
#   1. optional rearrangement (only when $wLen is false),
#   2. removal of illegal / ignorable code points (version dependent),
#   3. contiguous, then discontiguous, contraction matching,
#   4. dropping of units whose mapping is an empty list.
# With $wLen true each unit comes with its [start, end) index range
# into the code point array of the string.
492b39c5158Smillertsub splitEnt
493b39c5158Smillert{
494b39c5158Smillert    my $self = shift;
49591f110e0Safresh1    my $str  = shift;
49691f110e0Safresh1    my $wLen = shift; # with Length
497b39c5158Smillert
498b39c5158Smillert    my $map  = $self->{mapping};
499b39c5158Smillert    my $max  = $self->{maxlength};
500b39c5158Smillert    my $reH  = $self->{rearrangeHash};
501898184e3Ssthen    my $vers = $self->{UCA_Version};
502898184e3Ssthen    my $ver9 = $vers >= 9 && $vers <= 11;
503b8851fccSafresh1    my $long = $self->{long_contraction};
504898184e3Ssthen    my $uXS  = $self->{__useXS}; ### XS only
505b39c5158Smillert
50691f110e0Safresh1    my @buf;
507b39c5158Smillert
508b39c5158Smillert    # get array of Unicode code point of string.
509b39c5158Smillert    my @src = unpack_U($str);
510b39c5158Smillert
511b39c5158Smillert    # rearrangement:
512b39c5158Smillert    # Character positions are not kept if rearranged,
513b39c5158Smillert    # then neglected if $wLen is true.
514b39c5158Smillert    if ($reH && ! $wLen) {
515b39c5158Smillert	for (my $i = 0; $i < @src; $i++) {
516b39c5158Smillert	    if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
		# swap with the following code point, then step over both
517b39c5158Smillert		($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
518b39c5158Smillert		$i++;
519b39c5158Smillert	    }
520b39c5158Smillert	}
521b39c5158Smillert    }
522b39c5158Smillert
523b39c5158Smillert    # remove a code point marked as a completely ignorable.
    # Removed code points are set to undef (not spliced out) so that the
    # indices of the remaining code points stay valid.
524b39c5158Smillert    for (my $i = 0; $i < @src; $i++) {
5256fb12b70Safresh1	if ($vers <= 20 && _isIllegal($src[$i])) {
526898184e3Ssthen	    $src[$i] = undef;
527898184e3Ssthen	} elsif ($ver9) {
5289f11ffb7Safresh1	    $src[$i] = undef if exists $map->{ $src[$i] }
52991f110e0Safresh1			   ? @{ $map->{ $src[$i] } } == 0
53091f110e0Safresh1			   : $uXS && _ignorable_simple($src[$i]); ### XS only
531898184e3Ssthen	}
532b39c5158Smillert    }
533b39c5158Smillert
534b39c5158Smillert    for (my $i = 0; $i < @src; $i++) {
535b39c5158Smillert	my $jcps = $src[$i];
536b39c5158Smillert
537b39c5158Smillert	# skip removed code point
538b39c5158Smillert	if (! defined $jcps) {
539b39c5158Smillert	    if ($wLen && @buf) {
		# extend the end position of the previous unit over it
540b39c5158Smillert		$buf[-1][2] = $i + 1;
541b39c5158Smillert	    }
542b39c5158Smillert	    next;
543b39c5158Smillert	}
544b39c5158Smillert
545b39c5158Smillert	my $i_orig = $i;
546b39c5158Smillert
547b39c5158Smillert	# find contraction
	# $max is keyed by the first code point of multi-code-point entries
	# and holds the length of the longest such entry.
5489f11ffb7Safresh1	if (exists $max->{$jcps}) {
549b39c5158Smillert	    my $temp_jcps = $jcps;
550b39c5158Smillert	    my $jcpsLen = 1;
551b39c5158Smillert	    my $maxLen = $max->{$jcps};
552b39c5158Smillert
	    # Keep appending following code points up to $maxLen; remember
	    # the longest joined string that exists in the mapping, and
	    # advance $i to its last code point.
553b39c5158Smillert	    for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
554b39c5158Smillert		next if ! defined $src[$p];
555b39c5158Smillert		$temp_jcps .= CODE_SEP . $src[$p];
556b39c5158Smillert		$jcpsLen++;
5579f11ffb7Safresh1		if (exists $map->{$temp_jcps}) {
558b39c5158Smillert		    $jcps = $temp_jcps;
559b39c5158Smillert		    $i = $p;
560b39c5158Smillert		}
561b39c5158Smillert	    }
562b39c5158Smillert
563898184e3Ssthen	# discontiguous contraction with Combining Char (cf. UTS#10, S2.1).
564b39c5158Smillert	# This process requires Unicode::Normalize.
565b39c5158Smillert	# If "normalization" is undef, here should be skipped *always*
566b39c5158Smillert	# (in spite of bool value of $CVgetCombinClass),
567b39c5158Smillert	# since canonical ordering cannot be expected.
568b39c5158Smillert	# Blocked combining character should not be contracted.
569b39c5158Smillert
570b39c5158Smillert	    # $self->{normCode} is false in the case of "prenormalized".
571898184e3Ssthen	    if ($self->{normalization}) {
572898184e3Ssthen		my $cont = $self->{contraction};
573b39c5158Smillert		my $preCC = 0;
574898184e3Ssthen		my $preCC_uc = 0;
575898184e3Ssthen		my $jcps_uc = $jcps;
		# @out / @out_uc collect the indices of the code points
		# absorbed by the plain and the long-contraction candidate.
576898184e3Ssthen		my(@out, @out_uc);
577b39c5158Smillert
578b39c5158Smillert		for (my $p = $i + 1; $p < @src; $p++) {
579b39c5158Smillert		    next if ! defined $src[$p];
580898184e3Ssthen		    my $curCC = $CVgetCombinClass->($src[$p]);
		    # stop at the first code point with combining class 0
581b39c5158Smillert		    last unless $curCC;
582b39c5158Smillert		    my $tail = CODE_SEP . $src[$p];
583898184e3Ssthen
		    # a code point with the same combining class as the last
		    # skipped one is not contracted
5849f11ffb7Safresh1		    if ($preCC != $curCC && exists $map->{$jcps.$tail}) {
585b8851fccSafresh1			$jcps .= $tail;
586b8851fccSafresh1			push @out, $p;
587b8851fccSafresh1		    } else {
588b8851fccSafresh1			$preCC = $curCC;
589b8851fccSafresh1		    }
590b8851fccSafresh1
591b8851fccSafresh1		    next if !$long;
592b8851fccSafresh1
		    # long_contraction: also follow prefixes recorded in
		    # $self->{contraction} that are not mapping entries
5939f11ffb7Safresh1		    if ($preCC_uc != $curCC &&
5949f11ffb7Safresh1			    (exists $map->{$jcps_uc.$tail} ||
5959f11ffb7Safresh1			    exists $cont->{$jcps_uc.$tail})) {
596898184e3Ssthen			$jcps_uc .= $tail;
597898184e3Ssthen			push @out_uc, $p;
598898184e3Ssthen		    } else {
599898184e3Ssthen			$preCC_uc = $curCC;
600898184e3Ssthen		    }
601b39c5158Smillert		}
602898184e3Ssthen
		# prefer the long candidate when it ends on a real entry;
		# absorbed code points are marked as removed
6039f11ffb7Safresh1		if (@out_uc && exists $map->{$jcps_uc}) {
604898184e3Ssthen		    $jcps = $jcps_uc;
605898184e3Ssthen		    $src[$_] = undef for @out_uc;
606898184e3Ssthen		} else {
607898184e3Ssthen		    $src[$_] = undef for @out;
608898184e3Ssthen		}
609b39c5158Smillert	    }
610b39c5158Smillert	}
611b39c5158Smillert
612b39c5158Smillert	# skip completely ignorable
6139f11ffb7Safresh1	if (exists $map->{$jcps} ? @{ $map->{$jcps} } == 0 :
61491f110e0Safresh1	    $uXS && $jcps !~ /;/ && _ignorable_simple($jcps)) { ### XS only
615b39c5158Smillert	    if ($wLen && @buf) {
616b39c5158Smillert		$buf[-1][2] = $i + 1;
617b39c5158Smillert	    }
618b39c5158Smillert	    next;
619b39c5158Smillert	}
620b39c5158Smillert
621b39c5158Smillert	push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
622b39c5158Smillert    }
623b39c5158Smillert    return \@buf;
624b39c5158Smillert}
625b39c5158Smillert
626898184e3Ssthen##
627898184e3Ssthen## VCE = _pack_override(input, codepoint, derivCode)
628898184e3Ssthen##
# Converts one value returned by an override callback into a packed VCE.
#   $r   - array ref: its elements are packed as the weights
#          (expected to be the four weights of VCE_TEMPLATE);
#          defined scalar: used as the first weight, followed by
#          Min2Wt, Min3Wt and the code point;
#          undef: fall back to the derivation routine.
#   $u   - the code point being weighted
#   $der - coderef deriving collation elements from a code point
# In the fallback case a code point above 0x10FFFF is replaced by 0xFFFD.
sub _pack_override ($$$) {
    my ($r, $u, $der) = @_;

    return pack(VCE_TEMPLATE, NON_VAR, @$r)
        if ref $r;

    return pack(VCE_TEMPLATE, NON_VAR, $r, Min2Wt, Min3Wt, $u)
        if defined $r;

    return $der->(0x10FFFF < $u ? 0xFFFD : $u);
}
643b39c5158Smillert
644b39c5158Smillert##
645b39c5158Smillert## list of VCE = getWt(JCPS)
646b39c5158Smillert##
# Returns the collation elements for one JCPS, each passed through
# $self->varCE().  Sources are tried in this order: the special
# 0xFFFF / 0xFFFE handling, $self->{mapping}, the XS table, Hangul
# syllables, the overrideOut callback (code points above 0x10FFFF),
# and finally CJK override / derived weights.
# Returns an empty list (or undef in scalar context) when the argument
# is undef.
647b39c5158Smillertsub getWt
648b39c5158Smillert{
649b39c5158Smillert    my $self = shift;
650b39c5158Smillert    my $u    = shift;
651b39c5158Smillert    my $map  = $self->{mapping};
652b39c5158Smillert    my $der  = $self->{derivCode};
6536fb12b70Safresh1    my $out  = $self->{overrideOut};
654898184e3Ssthen    my $uXS  = $self->{__useXS}; ### XS only
655b39c5158Smillert
656b39c5158Smillert    return if !defined $u;
    # $u may be a JCPS containing ';', hence the string comparison (eq)
65791f110e0Safresh1    return $self->varCE($HighestVCE) if $u eq 0xFFFF && $self->{highestFFFF};
65891f110e0Safresh1    return $self->varCE($minimalVCE) if $u eq 0xFFFE && $self->{minimalFFFE};
    # a single out-of-range code point becomes 0xFFFD unless overrideOut is set
6596fb12b70Safresh1    $u = 0xFFFD if $u !~ /;/ && 0x10FFFF < $u && !$out;
660b39c5158Smillert
6616fb12b70Safresh1    my @ce;
6629f11ffb7Safresh1    if (exists $map->{$u}) {
6636fb12b70Safresh1	@ce = @{ $map->{$u} }; # $u may be a contraction
6646fb12b70Safresh1### begin XS only ###
6656fb12b70Safresh1    } elsif ($uXS && _exists_simple($u)) {
6666fb12b70Safresh1	@ce = _fetch_simple($u);
6676fb12b70Safresh1### end XS only ###
6686fb12b70Safresh1    } elsif (Hangul_SIni <= $u && $u <= Hangul_SFin) {
	# overrideHangul: a true value is called as a callback; undef
	# selects the derived weights; a defined false value (the default
	# set by new()) decomposes the syllable and weights each part.
669b39c5158Smillert	my $hang = $self->{overrideHangul};
670b39c5158Smillert	if ($hang) {
6716fb12b70Safresh1	    @ce = map _pack_override($_, $u, $der), $hang->($u);
672898184e3Ssthen	} elsif (!defined $hang) {
6736fb12b70Safresh1	    @ce = $der->($u);
674898184e3Ssthen	} else {
675b39c5158Smillert	    my $max  = $self->{maxlength};
676b39c5158Smillert	    my @decH = _decompHangul($u);
677b39c5158Smillert
	    # merge adjacent parts that form a contraction in the mapping
678b39c5158Smillert	    if (@decH == 2) {
679b39c5158Smillert		my $contract = join(CODE_SEP, @decH);
6809f11ffb7Safresh1		@decH = ($contract) if exists $map->{$contract};
681b39c5158Smillert	    } else { # must be <@decH == 3>
6829f11ffb7Safresh1		if (exists $max->{$decH[0]}) {
683b39c5158Smillert		    my $contract = join(CODE_SEP, @decH);
6849f11ffb7Safresh1		    if (exists $map->{$contract}) {
685b39c5158Smillert			@decH = ($contract);
686b39c5158Smillert		    } else {
687b39c5158Smillert			$contract = join(CODE_SEP, @decH[0,1]);
6889f11ffb7Safresh1			exists $map->{$contract} and @decH = ($contract, $decH[2]);
689b39c5158Smillert		    }
690b39c5158Smillert		    # even if V's ignorable, LT contraction is not supported.
691898184e3Ssthen		    # If such a situation were required, NFD should be used.
692b39c5158Smillert		}
6939f11ffb7Safresh1		if (@decH == 3 && exists $max->{$decH[1]}) {
694b39c5158Smillert		    my $contract = join(CODE_SEP, @decH[1,2]);
6959f11ffb7Safresh1		    exists $map->{$contract} and @decH = ($decH[0], $contract);
696b39c5158Smillert		}
697b39c5158Smillert	    }
698b39c5158Smillert
	    # weight each remaining part: mapping, then XS table, then derived
6996fb12b70Safresh1	    @ce = map({
7009f11ffb7Safresh1		    exists $map->{$_} ? @{ $map->{$_} } :
701898184e3Ssthen		$uXS && _exists_simple($_) ? _fetch_simple($_) : ### XS only
702898184e3Ssthen		    $der->($_);
703b39c5158Smillert		} @decH);
704b39c5158Smillert	}
7056fb12b70Safresh1    } elsif ($out && 0x10FFFF < $u) {
7066fb12b70Safresh1	@ce = map _pack_override($_, $u, $der), $out->($u);
707898184e3Ssthen    } else {
708b39c5158Smillert	my $cjk  = $self->{overrideCJK};
709898184e3Ssthen	my $vers = $self->{UCA_Version};
710898184e3Ssthen	if ($cjk && _isUIdeo($u, $vers)) {
7116fb12b70Safresh1	    @ce = map _pack_override($_, $u, $der), $cjk->($u);
7126fb12b70Safresh1	} elsif ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) {
7136fb12b70Safresh1	    @ce = _uideoCE_8($u);
7146fb12b70Safresh1	} else {
7156fb12b70Safresh1	    @ce = $der->($u);
716b39c5158Smillert	}
717898184e3Ssthen    }
7186fb12b70Safresh1    return map $self->varCE($_), @ce;
719b39c5158Smillert}
720b39c5158Smillert
721b39c5158Smillert
##
## string sortkey = getSortKey(string arg)
##
## Builds the sort key of one string.  The argument is passed through
## process() and splitEnt(); each resulting entry is converted to collation
## elements by getWt() and the collected elements are packed by mk_SortKey().
## When hangul_terminator is set, a terminator collation element is inserted
## at the Hangul syllable boundaries detected via getHST().
## When "identical" is set (or UCA_Version >= 26 at the maximum level),
## LEVEL_SEP is appended; with "identical" the code points of the processed
## string follow it as a tie-breaker.
##
sub getSortKey
{
    my ($self, $orig) = @_;

    my $str     = $self->process($orig);
    my $entries = $self->splitEnt($str); # get an arrayref of JCPS
    my ($vers, $term, $lev, $iden) =
        @{$self}{qw(UCA_Version hangul_terminator level identical)};

    my @ces; # weight arrays
    if (!$term) {
        foreach my $entry (@$entries) {
            push @ces, $self->getWt($entry);
        }
    }
    else {
        my $termCE  = $self->varCE(pack(VCE_TEMPLATE, NON_VAR, $term, 0, 0, 0));
        my $prevHST = '';

        foreach my $entry (@$entries) {
            # weird things like VL, TL-contraction are not considered!
            my $thisHST = join('', map(getHST($_, $vers), split(/;/, $entry)));

            # a syllable ends here when hangul is followed by non-hangul,
            # or when the jamo sequence cannot continue the same syllable
            my $syllableEnds =
                   ($prevHST && !$thisHST)
                || ($prevHST =~ /L\z/ && $thisHST =~ /^T/)
                || ($prevHST =~ /V\z/ && $thisHST =~ /^L/)
                || ($prevHST =~ /T\z/ && $thisHST =~ /^[LV]/);
            push @ces, $termCE if $syllableEnds;

            $prevHST = $thisHST;
            push @ces, $self->getWt($entry);
        }

        # the string ends at hangul
        push @ces, $termCE if $prevHST;
    }

    my $key = $self->mk_SortKey(\@ces); ### XS only

    return $key unless $iden || ($vers >= 26 && $lev == MaxLevel);

    $key .= LEVEL_SEP;
    $key .= pack(TIE_TEMPLATE, unpack_U($str)) if $iden;
    return $key;
}
767b39c5158Smillert
768b39c5158Smillert
##
## int compare = cmp(string a, string b)
##
## cmp and its relatives (eq, ne, lt, le, gt, ge) build the sort keys of both
## strings with getSortKey() and apply the string operator of the same name
## to the two keys.
##
sub cmp {
    my ($self, $lhs, $rhs) = @_;
    return $self->getSortKey($lhs) cmp $self->getSortKey($rhs);
}

sub eq {
    my ($self, $lhs, $rhs) = @_;
    return $self->getSortKey($lhs) eq $self->getSortKey($rhs);
}

sub ne {
    my ($self, $lhs, $rhs) = @_;
    return $self->getSortKey($lhs) ne $self->getSortKey($rhs);
}

sub lt {
    my ($self, $lhs, $rhs) = @_;
    return $self->getSortKey($lhs) lt $self->getSortKey($rhs);
}

sub le {
    my ($self, $lhs, $rhs) = @_;
    return $self->getSortKey($lhs) le $self->getSortKey($rhs);
}

sub gt {
    my ($self, $lhs, $rhs) = @_;
    return $self->getSortKey($lhs) gt $self->getSortKey($rhs);
}

sub ge {
    my ($self, $lhs, $rhs) = @_;
    return $self->getSortKey($lhs) ge $self->getSortKey($rhs);
}
779b39c5158Smillert
##
## list[strings] sorted = sort(list[strings] arg)
##
## Returns the given strings ordered by their sort keys.  Each key is
## computed once per string with getSortKey(); the keys are then compared
## as plain strings.
##
sub sort {
    my $obj = shift;

    # pair every string with its sort key: [ key, string ]
    my @keyed;
    foreach my $str (@_) {
        push @keyed, [ $obj->getSortKey($str), $str ];
    }

    my @ordered = sort { $a->[0] cmp $b->[0] } @keyed;
    return map { $_->[1] } @ordered;
}
790b39c5158Smillert
791b39c5158Smillert
##
## bool _nonIgnorAtLevel(arrayref weights, int level)
##
## Tells whether any weight from MinLevel up to the given level is non-zero.
## Returns TRUE or FALSE; returns nothing (an empty list, or undef in scalar
## context) when the weights argument is undefined.
##
sub _nonIgnorAtLevel($$)
{
    my ($weights, $level) = @_;
    return unless defined $weights;

    # grep in scalar context counts the levels that carry a non-zero weight
    my $nonzero = grep { $weights->[$_ - 1] != 0 } MinLevel .. $level;
    return $nonzero ? TRUE : FALSE;
}
802b39c5158Smillert
##
## bool _eqArray(
##    arrayref of arrayref[weights] source,
##    arrayref of arrayref[weights] substr,
##    int level)
## * comparison of graphemes vs graphemes.
##   @$source >= @$substr must be true (check it before calling this).
## * Only the first @$substr graphemes of source are inspected, and only the
##   first <level> weights of each weight array are compared.
## * Returns 1 on equality, otherwise nothing (empty list / undef).
##
sub _eqArray($$$)
{
    my ($source, $substr, $lev) = @_;
    my $lastLevel = $lev - 1;

    foreach my $gi (0 .. $#{$substr}) {
        my $srcGrapheme = $source->[$gi];
        my $subGrapheme = $substr->[$gi];

        # Do the $gi'th graphemes have the same number of AV weights?
        return unless @$srcGrapheme == @$subGrapheme;

        foreach my $wi (0 .. $#{$subGrapheme}) {
            my $srcWeights = $srcGrapheme->[$wi];
            my $subWeights = $subGrapheme->[$wi];

            foreach my $li (0 .. $lastLevel) {
                return unless $srcWeights->[$li] == $subWeights->[$li];
            }
        }
    }
    return 1;
}
829b39c5158Smillert
830b39c5158Smillert##
831b39c5158Smillert## (int position, int length)
832898184e3Ssthen## int position = index(string, substring, position, [undoc'ed global])
833b39c5158Smillert##
834898184e3Ssthen## With "global" (only for the list context),
835b39c5158Smillert##  returns list of arrayref[position, length].
836b39c5158Smillert##
837b39c5158Smillertsub index
838b39c5158Smillert{
    # Collation-aware substring search.  Arguments after the invocant, in
    # order: the string to search, the substring to look for, an optional
    # start position (default 0, negative values are clamped to 0) and an
    # optional true "global" flag.
    # Without the flag: returns (position, length) in list context or the
    # position alone in scalar context; when nothing matches, an empty list
    # or NOMATCHPOS respectively.
    # With the flag: returns a list of [position, length] pairs.
    # Croaks when the collator has preprocess or normCode set.
839b39c5158Smillert    my $self = shift;
84091f110e0Safresh1    $self->{preprocess} and
84191f110e0Safresh1	croak "Don't use Preprocess with index(), match(), etc.";
84291f110e0Safresh1    $self->{normCode} and
84391f110e0Safresh1	croak "Don't use Normalization with index(), match(), etc.";
84491f110e0Safresh1
845b39c5158Smillert    my $str  = shift;
846b39c5158Smillert    my $len  = length($str);
84791f110e0Safresh1    my $sub  = shift;
84891f110e0Safresh1    my $subE = $self->splitEnt($sub);
849b39c5158Smillert    my $pos  = @_ ? shift : 0;
850b39c5158Smillert       $pos  = 0 if $pos < 0;
851898184e3Ssthen    my $glob = shift;
852b39c5158Smillert
853b39c5158Smillert    my $lev  = $self->{level};
    # $v2i switches on the "ignorable after variable" rule used in both
    # weight-collecting loops below.
854b39c5158Smillert    my $v2i  = $self->{UCA_Version} >= 9 &&
855b39c5158Smillert		$self->{variable} ne 'non-ignorable';
856b39c5158Smillert
    # The substring produced no entries: report a zero-length match at the
    # start position clamped into 0..$len (in global mode, at every position
    # from there up to $len).
857b39c5158Smillert    if (! @$subE) {
858b39c5158Smillert	my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
859898184e3Ssthen	return $glob
860b39c5158Smillert	    ? map([$_, 0], $temp..$len)
861b39c5158Smillert	    : wantarray ? ($temp,0) : $temp;
862b39c5158Smillert    }
    # A start position beyond the end of the string cannot match.
863b39c5158Smillert    $len < $pos
864b39c5158Smillert	and return wantarray ? () : NOMATCHPOS;
    # Only the part of the string from $pos on is split.  Each entry is used
    # below as [0] the value handed to getWt() and [1]/[2] a start and an end
    # offset -- presumably relative to that part, since $pos is added back
    # when a match is reported (TODO confirm against splitEnt).
865b39c5158Smillert    my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
866b39c5158Smillert    @$strE
867b39c5158Smillert	or return wantarray ? () : NOMATCHPOS;
868b39c5158Smillert
869b39c5158Smillert    my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
870b39c5158Smillert
    # Build @subWt: one element per grapheme of the substring, each being an
    # arrayref of weight arrays (the unpacked weights without the leading
    # variable flag).
871b39c5158Smillert    my $last_is_variable;
872b39c5158Smillert    for my $vwt (map $self->getWt($_), @$subE) {
873b39c5158Smillert	my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
874b39c5158Smillert	my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
875b39c5158Smillert
876b39c5158Smillert	# "Ignorable (L1, L2) after Variable" since track. v. 9
877b39c5158Smillert	if ($v2i) {
878b39c5158Smillert	    if ($var) {
879b39c5158Smillert		$last_is_variable = TRUE;
880b39c5158Smillert	    }
881b39c5158Smillert	    elsif (!$wt[0]) { # ignorable
882b39c5158Smillert		$to_be_pushed = FALSE if $last_is_variable;
883b39c5158Smillert	    }
884b39c5158Smillert	    else {
885b39c5158Smillert		$last_is_variable = FALSE;
886b39c5158Smillert	    }
887b39c5158Smillert	}
888b39c5158Smillert
	# An element that is not variable and has a zero primary weight is
	# attached to the previous grapheme (when there is one) instead of
	# starting a new grapheme.
889b39c5158Smillert	if (@subWt && !$var && !$wt[0]) {
890b39c5158Smillert	    push @{ $subWt[-1] }, \@wt if $to_be_pushed;
891898184e3Ssthen	} elsif ($to_be_pushed) {
892b39c5158Smillert	    push @subWt, [ \@wt ];
893b39c5158Smillert	}
894898184e3Ssthen	# else ===> skipped
895b39c5158Smillert    }
896b39c5158Smillert
    # NOTE(review): $count is never read in this sub.
897b39c5158Smillert    my $count = 0;
898b39c5158Smillert    my $end = @$strE - 1;
899b39c5158Smillert
    # Main scan: @strWt is a sliding window of graphemes of the string, with
    # @iniPos/@finPos holding the start and end offset of each of them.
900b39c5158Smillert    $last_is_variable = FALSE; # reuse
901b39c5158Smillert    for (my $i = 0; $i <= $end; ) { # no $i++
902b39c5158Smillert	my $found_base = 0;
903b39c5158Smillert
904b39c5158Smillert	# fetch a grapheme
	# (entries are consumed until one of them adds at least one new
	# grapheme to the window)
905b39c5158Smillert	while ($i <= $end && $found_base == 0) {
906b39c5158Smillert	    for my $vwt ($self->getWt($strE->[$i][0])) {
907b39c5158Smillert		my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
908b39c5158Smillert		my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
909b39c5158Smillert
910b39c5158Smillert		# "Ignorable (L1, L2) after Variable" since track. v. 9
911b39c5158Smillert		if ($v2i) {
912b39c5158Smillert		    if ($var) {
913b39c5158Smillert			$last_is_variable = TRUE;
914b39c5158Smillert		    }
915b39c5158Smillert		    elsif (!$wt[0]) { # ignorable
916b39c5158Smillert			$to_be_pushed = FALSE if $last_is_variable;
917b39c5158Smillert		    }
918b39c5158Smillert		    else {
919b39c5158Smillert			$last_is_variable = FALSE;
920b39c5158Smillert		    }
921b39c5158Smillert		}
922b39c5158Smillert
		# Same grouping rule as for the substring.  The end offset of
		# the previous grapheme is extended even when the element
		# itself is not pushed.
		# When one entry yields several graphemes, only the first one
		# keeps a start offset and only the last one an end offset;
		# the inner boundaries are set to NOMATCHPOS so that no match
		# can begin or end inside the entry.
923b39c5158Smillert		if (@strWt && !$var && !$wt[0]) {
924b39c5158Smillert		    push @{ $strWt[-1] }, \@wt if $to_be_pushed;
925b39c5158Smillert		    $finPos[-1] = $strE->[$i][2];
926b39c5158Smillert		} elsif ($to_be_pushed) {
927b39c5158Smillert		    push @strWt, [ \@wt ];
928b39c5158Smillert		    push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
929b39c5158Smillert		    $finPos[-1] = NOMATCHPOS if $found_base;
930b39c5158Smillert		    push @finPos, $strE->[$i][2];
931b39c5158Smillert		    $found_base++;
932b39c5158Smillert		}
933b39c5158Smillert		# else ===> no-op
934b39c5158Smillert	    }
935b39c5158Smillert	    $i++;
936b39c5158Smillert	}
937b39c5158Smillert
938b39c5158Smillert	# try to match
	# A comparison is made once the window holds more graphemes than the
	# substring, or exactly as many when the input is exhausted --
	# presumably so that trailing ignorable elements have already been
	# attached to the last compared grapheme.  After every attempt the
	# window slides forward by one grapheme.
939b39c5158Smillert	while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
940b39c5158Smillert	    if ($iniPos[0] != NOMATCHPOS &&
941b39c5158Smillert		    $finPos[$#subWt] != NOMATCHPOS &&
942b39c5158Smillert			_eqArray(\@strWt, \@subWt, $lev)) {
943b39c5158Smillert		my $temp = $iniPos[0] + $pos;
944b39c5158Smillert
		# Global mode records the hit and drops all matched graphemes
		# but one; the shifts after this block remove that last one,
		# so the next attempt starts behind the match.
945898184e3Ssthen		if ($glob) {
946b39c5158Smillert		    push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
947b39c5158Smillert		    splice @strWt,  0, $#subWt;
948b39c5158Smillert		    splice @iniPos, 0, $#subWt;
949b39c5158Smillert		    splice @finPos, 0, $#subWt;
950b39c5158Smillert		}
951b39c5158Smillert		else {
952b39c5158Smillert		    return wantarray
953b39c5158Smillert			? ($temp, $finPos[$#subWt] - $iniPos[0])
954b39c5158Smillert			:  $temp;
955b39c5158Smillert		}
956b39c5158Smillert	    }
957b39c5158Smillert	    shift @strWt;
958b39c5158Smillert	    shift @iniPos;
959b39c5158Smillert	    shift @finPos;
960b39c5158Smillert	}
961b39c5158Smillert    }
962b39c5158Smillert
    # Reached in global mode (return whatever was collected) or when a
    # non-global search found nothing.
963898184e3Ssthen    return $glob
964b39c5158Smillert	? @g_ret
965b39c5158Smillert	: wantarray ? () : NOMATCHPOS;
966b39c5158Smillert}
967b39c5158Smillert
##
## scalarref to matching part = match(string, substring)
##
## Looks the substring up with index().  On success returns the matching
## part of the string: the string itself in list context, a reference to a
## copy of it in scalar context.  Returns nothing when there is no match.
##
sub match
{
    my $self = shift;

    my @hit = $self->index($_[0], $_[1]);
    return unless @hit;

    my ($start, $length) = @hit;

    # A copy is taken on purpose: an lvalue ref \substr should be avoided,
    # since its value is affected by modification of its referent.
    my $matched = substr($_[0], $start, $length);
    return wantarray ? $matched : \$matched;
}
984b39c5158Smillert
##
## arrayref matching parts = gmatch(string, substring)
##
## Runs index() from position 0 in global mode and returns the part of the
## string covered by each reported [position, length] pair.
##
sub gmatch
{
    my ($self, $str, $sub) = @_;

    my @spans = $self->index($str, $sub, 0, 'g');
    return map { substr($str, $_->[0], $_->[1]) } @spans;
}
996b39c5158Smillert
##
## bool subst'ed = subst(string, substring, replace)
##
## Replaces the first part of the string found by index() and returns TRUE;
## returns FALSE, leaving the string untouched, when nothing is found.
## The string is modified in place through $_[0].  When the replacement is
## a plain code reference, it is called with the matched text and its result
## is used as the replacement.
##
sub subst
{
    my $self     = shift;
    my $callback = ref $_[2] eq 'CODE' ? $_[2] : undef;

    my @hit = $self->index($_[0], $_[1]);
    return FALSE unless @hit;

    my ($start, $length) = @hit;
    if ($callback) {
        # hand the callback a copy of the matched text
        my $matched = substr($_[0], $start, $length);
        substr($_[0], $start, $length, $callback->($matched));
    }
    else {
        substr($_[0], $start, $length, $_[2]);
    }
    return TRUE;
}
1018b39c5158Smillert
##
## int count = gsubst(string, substring, replace)
##
## Replaces every part of the string reported by index() in global mode and
## returns the number of replacements.  The string is modified in place
## through $_[0].  When the replacement is a plain code reference, it is
## called with each matched text and its result is used as the replacement.
##
sub gsubst
{
    my $self     = shift;
    my $callback = ref $_[2] eq 'CODE' ? $_[2] : undef;

    my @spans = $self->index($_[0], $_[1], 0, 'g');

    # Replacement is carried out from the end, so that the positions of the
    # spans still to be processed stay valid.
    foreach my $span (reverse @spans) {
        my ($start, $length) = @$span;
        if ($callback) {
            # hand the callback a copy of the matched text
            my $matched = substr($_[0], $start, $length);
            substr($_[0], $start, $length, $callback->($matched));
        }
        else {
            substr($_[0], $start, $length, $_[2]);
        }
    }
    return scalar @spans;
}
1040b39c5158Smillert
1041b39c5158Smillert1;
1042b39c5158Smillert__END__
1043b39c5158Smillert
1044b39c5158Smillert=head1 NAME
1045b39c5158Smillert
1046b39c5158SmillertUnicode::Collate - Unicode Collation Algorithm
1047b39c5158Smillert
1048b39c5158Smillert=head1 SYNOPSIS
1049b39c5158Smillert
1050b39c5158Smillert  use Unicode::Collate;
1051b39c5158Smillert
1052b39c5158Smillert  #construct
1053b39c5158Smillert  $Collator = Unicode::Collate->new(%tailoring);
1054b39c5158Smillert
1055b39c5158Smillert  #sort
1056b39c5158Smillert  @sorted = $Collator->sort(@not_sorted);
1057b39c5158Smillert
1058b39c5158Smillert  #compare
1059b39c5158Smillert  $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1060b39c5158Smillert
1061898184e3SsthenB<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
1062898184e3Ssthenaccording to Perl's Unicode support. See L<perlunicode>,
1063898184e3SsthenL<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1064898184e3SsthenOtherwise you can use C<preprocess>, or you should decode them beforehand.
1065b39c5158Smillert
1066b39c5158Smillert=head1 DESCRIPTION
1067b39c5158Smillert
1068b39c5158SmillertThis module is an implementation of Unicode Technical Standard #10
1069b39c5158Smillert(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
1070b39c5158Smillert
1071b39c5158Smillert=head2 Constructor and Tailoring
1072b39c5158Smillert
1073898184e3SsthenThe C<new> method returns a collator object. If new() is called
1074898184e3Ssthenwith no parameters, the collator should do the default collation.
1075b39c5158Smillert
1076b39c5158Smillert   $Collator = Unicode::Collate->new(
1077b39c5158Smillert      UCA_Version => $UCA_Version,
1078898184e3Ssthen      alternate => $alternate, # alias for 'variable'
1079b39c5158Smillert      backwards => $levelNumber, # or \@levelNumbers
1080b39c5158Smillert      entry => $element,
1081b39c5158Smillert      hangul_terminator => $term_primary_weight,
108291f110e0Safresh1      highestFFFF => $bool,
108391f110e0Safresh1      identical => $bool,
1084b39c5158Smillert      ignoreName => qr/$ignoreName/,
1085b39c5158Smillert      ignoreChar => qr/$ignoreChar/,
1086898184e3Ssthen      ignore_level2 => $bool,
1087b39c5158Smillert      katakana_before_hiragana => $bool,
1088b39c5158Smillert      level => $collationLevel,
1089b8851fccSafresh1      long_contraction => $bool,
109091f110e0Safresh1      minimalFFFE => $bool,
1091b39c5158Smillert      normalization  => $normalization_form,
1092b39c5158Smillert      overrideCJK => \&overrideCJK,
1093b39c5158Smillert      overrideHangul => \&overrideHangul,
1094b39c5158Smillert      preprocess => \&preprocess,
1095b39c5158Smillert      rearrange => \@charList,
1096898184e3Ssthen      rewrite => \&rewrite,
1097898184e3Ssthen      suppress => \@charList,
1098b39c5158Smillert      table => $filename,
1099b39c5158Smillert      undefName => qr/$undefName/,
1100b39c5158Smillert      undefChar => qr/$undefChar/,
1101b39c5158Smillert      upper_before_lower => $bool,
1102b39c5158Smillert      variable => $variable,
1103b39c5158Smillert   );
1104b39c5158Smillert
1105b39c5158Smillert=over 4
1106b39c5158Smillert
1107b39c5158Smillert=item UCA_Version
1108b39c5158Smillert
1109898184e3SsthenIf the revision (previously "tracking version") number of UCA is given,
1110898184e3Ssthenbehavior of that revision is emulated on collating.
1111b39c5158SmillertIf omitted, the return value of C<UCA_Version()> is used.
1112b39c5158Smillert
1113*eac174f2Safresh1The following revisions are supported.  The default is 43.
1114b39c5158Smillert
1115b39c5158Smillert     UCA       Unicode Standard         DUCET (@version)
1116898184e3Ssthen   -------------------------------------------------------
1117b39c5158Smillert      8              3.1                3.0.1 (3.0.1d9)
1118*eac174f2Safresh1      9     3.1 with Corrigendum 3      3.1.1
1119*eac174f2Safresh1     11             4.0.0
1120*eac174f2Safresh1     14             4.1.0
1121*eac174f2Safresh1     16             5.0.0
1122*eac174f2Safresh1     18             5.1.0
1123*eac174f2Safresh1     20             5.2.0
1124*eac174f2Safresh1     22             6.0.0
1125*eac174f2Safresh1     24             6.1.0
1126*eac174f2Safresh1     26             6.2.0
1127*eac174f2Safresh1     28             6.3.0
1128*eac174f2Safresh1     30             7.0.0
1129*eac174f2Safresh1     32             8.0.0
1130*eac174f2Safresh1     34             9.0.0
1131*eac174f2Safresh1     36            10.0.0
1132*eac174f2Safresh1     38            11.0.0
1133*eac174f2Safresh1     40            12.0.0
1134*eac174f2Safresh1     41            12.1.0
1135*eac174f2Safresh1     43            13.0.0
1136b8851fccSafresh1
11379f11ffb7Safresh1* See below for C<long_contraction> with C<UCA_Version> 22 and 24.
1138b39c5158Smillert
1139898184e3Ssthen* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden
1140898184e3Ssthensince C<UCA_Version> 22.
1141898184e3Ssthen
11426fb12b70Safresh1* Out-of-range codepoints (greater than U+10FFFF) are not ignored,
11436fb12b70Safresh1and can be overridden since C<UCA_Version> 22.
11446fb12b70Safresh1
1145898184e3Ssthen* Fully ignorable characters were ignored, and would not interrupt
1146898184e3Ssthencontractions with C<UCA_Version> 9 and 11.
1147898184e3Ssthen
1148898184e3Ssthen* Treatment of ignorables after variables and some behaviors
1149898184e3Ssthenwere changed at C<UCA_Version> 9.
1150898184e3Ssthen
1151898184e3Ssthen* Characters regarded as CJK unified ideographs (cf. C<overrideCJK>)
1152898184e3Ssthendepend on C<UCA_Version>.
1153898184e3Ssthen
1154898184e3Ssthen* Many hangul jamo are assigned at C<UCA_Version> 20, which affects
1155898184e3SsthenC<hangul_terminator>.
1156b39c5158Smillert
1157b39c5158Smillert=item alternate
1158b39c5158Smillert
1159b39c5158Smillert-- see 3.2.2 Alternate Weighting, version 8 of UTS #10
1160b39c5158Smillert
1161b39c5158SmillertFor backward compatibility, C<alternate> (old name) can be used
1162b39c5158Smillertas an alias for C<variable>.
1163b39c5158Smillert
1164b39c5158Smillert=item backwards
1165b39c5158Smillert
116691f110e0Safresh1-- see 3.4 Backward Accents, UTS #10.
1167b39c5158Smillert
1168b39c5158Smillert     backwards => $levelNumber or \@levelNumbers
1169b39c5158Smillert
1170b39c5158SmillertWeights in reverse order; ex. level 2 (diacritic ordering) in French.
1171898184e3SsthenIf omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>),
1172898184e3Ssthenforwards at all the levels.
1173b39c5158Smillert
1174b39c5158Smillert=item entry
1175b39c5158Smillert
1176b8851fccSafresh1-- see 5 Tailoring; 9.1 Allkeys File Format, UTS #10.
1177b39c5158Smillert
1178b39c5158SmillertIf the same character (or a sequence of characters) exists
1179b39c5158Smillertin the collation element table through C<table>,
1180898184e3Ssthenmapping to collation elements is overridden.
1181b39c5158SmillertIf it does not exist, the mapping is defined additionally.
1182b39c5158Smillert
1183b39c5158Smillert    entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1184b39c5158Smillert0063 0068 ; [.0E6A.0020.0002.0063] # ch
1185b39c5158Smillert0043 0068 ; [.0E6A.0020.0007.0043] # Ch
1186b39c5158Smillert0043 0048 ; [.0E6A.0020.0008.0043] # CH
1187b39c5158Smillert006C 006C ; [.0F4C.0020.0002.006C] # ll
1188b39c5158Smillert004C 006C ; [.0F4C.0020.0007.004C] # Ll
1189b39c5158Smillert004C 004C ; [.0F4C.0020.0008.004C] # LL
1190b39c5158Smillert00F1      ; [.0F7B.0020.0002.00F1] # n-tilde
1191b39c5158Smillert006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
1192b39c5158Smillert00D1      ; [.0F7B.0020.0008.00D1] # N-tilde
1193b39c5158Smillert004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
1194b39c5158SmillertENTRY
1195b39c5158Smillert
1196b39c5158Smillert    entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1197b39c5158Smillert00E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
1198b39c5158Smillert00C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1199b39c5158SmillertENTRY
1200b39c5158Smillert
1201b39c5158SmillertB<NOTE:> The code point in the UCA file format (before C<';'>)
1202b39c5158SmillertB<must> be a Unicode code point (defined as hexadecimal),
1203b39c5158Smillertbut not a native code point.
1204b39c5158SmillertSo C<0063> must always denote C<U+0063>,
1205b39c5158Smillertbut not a character of C<"\x63">.
1206b39c5158Smillert
1207b39c5158SmillertWeighting may vary depending on collation element table.
1208b39c5158SmillertSo ensure the weights defined in C<entry> will be consistent with
1209b39c5158Smillertthose in the collation element table loaded via C<table>.
1210b39c5158Smillert
1211b39c5158SmillertIn DUCET v4.0.0, primary weight of C<C> is C<0E60>
1212b39c5158Smillertand that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1213b39c5158Smillert(as a value between C<0E60> and C<0E6D>)
1214b39c5158Smillertmakes ordering as C<C E<lt> CH E<lt> D>.
1215b39c5158SmillertStrictly speaking, DUCET already has some characters between C<C> and C<D>:
1216b39c5158SmillertC<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1217b39c5158SmillertC<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1218b39c5158Smillertand C<c-curl> (C<U+0255>) with C<0E69>.
1219b39c5158SmillertThen primary weight C<0E6A> for C<CH> makes C<CH>
1220b39c5158Smillertordered between C<c-curl> and C<D>.
1221b39c5158Smillert
1222b39c5158Smillert=item hangul_terminator
1223b39c5158Smillert
1224b39c5158Smillert-- see 7.1.4 Trailing Weights, UTS #10.
1225b39c5158Smillert
1226b39c5158SmillertIf a true value is given (non-zero but should be positive),
1227b39c5158Smillertit will be added as a terminator primary weight to the end of
1228b39c5158Smillertevery standard Hangul syllable. Secondary and any higher weights
1229b39c5158Smillertfor terminator are set to zero.
1230b39c5158SmillertIf the value is false or C<hangul_terminator> key does not exist,
1231b39c5158Smillertinsertion of terminator weights will not be performed.
1232b39c5158Smillert
1233b39c5158SmillertBoundaries of Hangul syllables are determined
1234b39c5158Smillertaccording to conjoining Jamo behavior in F<the Unicode Standard>
1235b39c5158Smillertand F<HangulSyllableType.txt>.
1236b39c5158Smillert
1237b39c5158SmillertB<Implementation Note:>
1238b39c5158Smillert(1) For expansion mapping (Unicode character mapped
1239b39c5158Smillertto a sequence of collation elements), a terminator will not be added
1240b39c5158Smillertbetween collation elements, even if Hangul syllable boundary exists there.
1241b39c5158SmillertAddition of terminator is restricted to the next position
1242b39c5158Smillertto the last collation element.
1243b39c5158Smillert
1244b39c5158Smillert(2) Non-conjoining Hangul letters
1245b39c5158Smillert(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1246b39c5158Smillertautomatically terminated with a terminator primary weight.
1247b39c5158SmillertThese characters may need terminator included in a collation element
1248b39c5158Smillerttable beforehand.
1249b39c5158Smillert
125091f110e0Safresh1=item highestFFFF
125191f110e0Safresh1
12529f11ffb7Safresh1-- see 2.4 Tailored noncharacter weights, UTS #35 (LDML) Part 5: Collation.
125391f110e0Safresh1
125491f110e0Safresh1If the parameter is made true, C<U+FFFF> has a highest primary weight.
125591f110e0Safresh1When a boolean of C<$coll-E<gt>ge($str, "abc")> and
125691f110e0Safresh1C<$coll-E<gt>le($str, "abc\x{FFFF}")> is true, it is expected that C<$str>
125791f110e0Safresh1begins with C<"abc">, or another primary equivalent.
125891f110e0Safresh1C<$str> may be C<"abcd">, C<"abc012">, but should not include C<U+FFFF>
125991f110e0Safresh1such as C<"abc\x{FFFF}xyz">.
126091f110e0Safresh1
126191f110e0Safresh1C<$coll-E<gt>le($str, "abc\x{FFFF}")> works like C<$coll-E<gt>lt($str, "abd")>
12626fb12b70Safresh1almost, but the latter has the problem that you need to know which letter
126391f110e0Safresh1comes next after C<c>. For a certain language where C<ch> is the next letter,
12646fb12b70Safresh1C<"abch"> is greater than C<"abc\x{FFFF}">, but less than C<"abd">.
126591f110e0Safresh1
12666fb12b70Safresh1Note:
12676fb12b70Safresh1This is equivalent to C<(entry =E<gt> 'FFFF ; [.FFFE.0020.0005.FFFF]')>.
126891f110e0Safresh1Any other character than C<U+FFFF> can be tailored by C<entry>.
126991f110e0Safresh1
127091f110e0Safresh1=item identical
127191f110e0Safresh1
127291f110e0Safresh1-- see A.3 Deterministic Comparison, UTS #10.
127391f110e0Safresh1
127491f110e0Safresh1By default, strings whose weights are equal should be equal,
127591f110e0Safresh1even though their code points are not equal.
127691f110e0Safresh1Completely ignorable characters are ignored.
127791f110e0Safresh1
127891f110e0Safresh1If the parameter is made true, a final, tie-breaking level is used.
127991f110e0Safresh1If no difference of weights is found after the comparison through
128091f110e0Safresh1all the levels specified by C<level>, the comparison with code points
128191f110e0Safresh1will be performed.
12826fb12b70Safresh1For the tie-breaking comparison, the sort key has code points
128391f110e0Safresh1of the original string appended.
128491f110e0Safresh1Completely ignorable characters are not ignored.
128591f110e0Safresh1
128691f110e0Safresh1If C<preprocess> and/or C<normalization> is applied, the code points
128791f110e0Safresh1of the string after them (in NFD by default) are used.
128891f110e0Safresh1
1289b39c5158Smillert=item ignoreChar
1290b39c5158Smillert
1291b39c5158Smillert=item ignoreName
1292b39c5158Smillert
1293b8851fccSafresh1-- see 3.6 Variable Weighting, UTS #10.
1294b39c5158Smillert
1295b39c5158SmillertMakes the entry in the table completely ignorable;
1296b39c5158Smillerti.e. as if the weights were zero at all levels.
1297b39c5158Smillert
1298b39c5158SmillertThrough C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1299b39c5158Smillertwill be ignored. Through C<ignoreName>, any character whose name
1300b39c5158Smillert(given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1301b39c5158Smillertwill be ignored.
1302b39c5158Smillert
1303b39c5158SmillertE.g. when 'a' and 'e' are ignorable,
1304b39c5158Smillert'element' is equal to 'lament' (or 'lmnt').
1305b39c5158Smillert
1306898184e3Ssthen=item ignore_level2
1307898184e3Ssthen
1308898184e3Ssthen-- see 5.1 Parametric Tailoring, UTS #10.
1309898184e3Ssthen
1310898184e3SsthenBy default, case-sensitive comparison (that is level 3 difference)
1311898184e3Ssthenwon't ignore accents (that is level 2 difference).
1312898184e3Ssthen
1313898184e3SsthenIf the parameter is made true, accents (and other primary ignorable
1314898184e3Ssthencharacters) are ignored, even though cases are taken into account.
1315898184e3Ssthen
1316898184e3SsthenB<NOTE>: C<level> should be 3 or greater.
1317898184e3Ssthen
1318b39c5158Smillert=item katakana_before_hiragana
1319b39c5158Smillert
132091f110e0Safresh1-- see 7.2 Tertiary Weight Table, UTS #10.
1321b39c5158Smillert
1322b39c5158SmillertBy default, hiragana is before katakana.
1323b39c5158SmillertIf the parameter is made true, this is reversed.
1324b39c5158Smillert
1325b39c5158SmillertB<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1326b39c5158Smillertdistinctions must occur in level 3, and their weights at level 3 must be
1327b39c5158Smillertthe same as those mentioned in 7.3.1, UTS #10.
1328b39c5158SmillertIf you define your collation elements which violate this requirement,
1329b39c5158Smillertthis parameter does not work validly.
1330b39c5158Smillert
1331b39c5158Smillert=item level
1332b39c5158Smillert
1333b39c5158Smillert-- see 4.3 Form Sort Key, UTS #10.
1334b39c5158Smillert
1335b39c5158SmillertSet the maximum level.
1336b39c5158SmillertAny higher levels than the specified one are ignored.
1337b39c5158Smillert
1338b39c5158Smillert  Level 1: alphabetic ordering
1339b39c5158Smillert  Level 2: diacritic ordering
1340b39c5158Smillert  Level 3: case ordering
1341b39c5158Smillert  Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1342b39c5158Smillert
1343b39c5158Smillert  ex.level => 2,
1344b39c5158Smillert
1345b39c5158SmillertIf omitted, the maximum is the 4th.
1346b39c5158Smillert
134791f110e0Safresh1B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level.
134891f110e0Safresh1But this module only uses weights within 0xFFFF.
134991f110e0Safresh1When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted'
135091f110e0Safresh1and 'shift-trimmed'), the level 4 may be unreliable.
135191f110e0Safresh1
135291f110e0Safresh1See also C<identical>.
135391f110e0Safresh1
1354b8851fccSafresh1=item long_contraction
1355b8851fccSafresh1
1356b8851fccSafresh1-- see 3.8.2 Well-Formedness of the DUCET, 4.2 Produce Array, UTS #10.
1357b8851fccSafresh1
1358b8851fccSafresh1If the parameter is made true, for a contraction with three or more
1359b8851fccSafresh1characters (here nicknamed "long contraction"), initial substrings
1360b8851fccSafresh1will be handled.
1361b8851fccSafresh1For example, a contraction ABC, where A is a starter, and B and C
1362b8851fccSafresh1are non-starters (character with non-zero combining character class),
1363b8851fccSafresh1will be detected even if there is not AB as a contraction.
1364b8851fccSafresh1
1365b8851fccSafresh1B<Default:> Usually false.
1366b8851fccSafresh1If C<UCA_Version> is 22 or 24, and the value of C<long_contraction>
1367b8851fccSafresh1is not specified in C<new()>, a true value is set implicitly.
1368b8851fccSafresh1This is a workaround to pass Conformance Tests for Unicode 6.0.0 and 6.1.0.
1369b8851fccSafresh1
1370b8851fccSafresh1C<change()> handles C<long_contraction> explicitly only.
1371b8851fccSafresh1If C<long_contraction> is not specified in C<change()>, even though
1372b8851fccSafresh1C<UCA_Version> is changed, C<long_contraction> will not be changed.
1373b8851fccSafresh1
1374b8851fccSafresh1B<Limitation:> Scanning non-starters is one-way (no back tracking).
1375b8851fccSafresh1If AB is found but ABC is not found, other long contractions where
1376b8851fccSafresh1the first character is A and the second is not B may not be found.
1377b8851fccSafresh1
1378b8851fccSafresh1Under C<(normalization =E<gt> undef)>, detection step of discontiguous
1379b8851fccSafresh1contractions will be skipped.
1380b8851fccSafresh1
1381b8851fccSafresh1B<Note:> The following contractions in DUCET are not considered
1382b8851fccSafresh1in steps S2.1.1 to S2.1.3, where they are discontiguous.
1383b8851fccSafresh1
1384b8851fccSafresh1    0FB2 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC RR)
1385b8851fccSafresh1    0FB3 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC LL)
1386b8851fccSafresh1
1387b8851fccSafresh1For example C<TIBETAN VOWEL SIGN VOCALIC RR> with C<COMBINING TILDE OVERLAY>
1388b8851fccSafresh1(C<U+0344>) is C<0FB2 0344 0F71 0F80> in NFD.
1389b8851fccSafresh1In this case C<0FB2 0F80> (C<TIBETAN VOWEL SIGN VOCALIC R>) is detected,
1390b8851fccSafresh1instead of C<0FB2 0F71 0F80>.
1391b8851fccSafresh1Inserted C<0344> makes C<0FB2 0F71 0F80> discontiguous and lack of
1392b8851fccSafresh1contraction C<0FB2 0F71> prohibits C<0FB2 0F71 0F80> from being detected.
1393b8851fccSafresh1
139491f110e0Safresh1=item minimalFFFE
139591f110e0Safresh1
13969f11ffb7Safresh1-- see 1.1.1 U+FFFE, UTS #35 (LDML) Part 5: Collation.
139791f110e0Safresh1
139891f110e0Safresh1If the parameter is made true, C<U+FFFE> has a minimal primary weight.
139991f110e0Safresh1The comparison between C<"$a1\x{FFFE}$a2"> and C<"$b1\x{FFFE}$b2">
140091f110e0Safresh1first compares C<$a1> and C<$b1> at level 1, and
140191f110e0Safresh1then C<$a2> and C<$b2> at level 1, as follows.
140291f110e0Safresh1
140391f110e0Safresh1        "ab\x{FFFE}a"
140491f110e0Safresh1        "Ab\x{FFFE}a"
140591f110e0Safresh1        "ab\x{FFFE}c"
140691f110e0Safresh1        "Ab\x{FFFE}c"
140791f110e0Safresh1        "ab\x{FFFE}xyz"
140891f110e0Safresh1        "abc\x{FFFE}def"
140991f110e0Safresh1        "abc\x{FFFE}xYz"
141091f110e0Safresh1        "aBc\x{FFFE}xyz"
141191f110e0Safresh1        "abcX\x{FFFE}def"
141291f110e0Safresh1        "abcx\x{FFFE}xyz"
141391f110e0Safresh1        "b\x{FFFE}aaa"
141491f110e0Safresh1        "bbb\x{FFFE}a"
141591f110e0Safresh1
14166fb12b70Safresh1Note:
14176fb12b70Safresh1This is equivalent to C<(entry =E<gt> 'FFFE ; [.0001.0020.0005.FFFE]')>.
141891f110e0Safresh1Any other character than C<U+FFFE> can be tailored by C<entry>.
141991f110e0Safresh1
1420b39c5158Smillert=item normalization
1421b39c5158Smillert
1422b39c5158Smillert-- see 4.1 Normalize, UTS #10.
1423b39c5158Smillert
1424b39c5158SmillertIf specified, strings are normalized before preparation of sort keys
1425b39c5158Smillert(the normalization is executed after preprocess).
1426b39c5158Smillert
1427b39c5158SmillertA form name C<Unicode::Normalize::normalize()> accepts will be applied
1428b39c5158Smillertas C<$normalization_form>.
1429b39c5158SmillertAcceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1430b39c5158SmillertSee C<Unicode::Normalize::normalize()> for detail.
1431b39c5158SmillertIf omitted, C<'NFD'> is used.
1432b39c5158Smillert
1433b39c5158SmillertC<normalization> is performed after C<preprocess> (if defined).
1434b39c5158Smillert
1435b39c5158SmillertFurthermore, special values, C<undef> and C<"prenormalized">, can be used,
1436b39c5158Smillertthough they are not concerned with C<Unicode::Normalize::normalize()>.
1437b39c5158Smillert
1438b39c5158SmillertIf C<undef> (not a string C<"undef">) is passed explicitly
1439b39c5158Smillertas the value for this key,
1440b39c5158Smillertno normalization is carried out (this may make tailoring easier
1441b39c5158Smillertif normalization is not desired). Under C<(normalization =E<gt> undef)>,
1442b39c5158Smillertonly contiguous contractions are resolved;
1443b39c5158Smillerte.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1444b39c5158SmillertC<A-cedilla-ring> would be primary equal to C<A>.
1445b39c5158SmillertIn this point,
1446b39c5158SmillertC<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1447b39c5158SmillertB<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1448b39c5158Smillert
1449b39c5158SmillertIn the case of C<(normalization =E<gt> "prenormalized")>,
1450b39c5158Smillertno normalization is performed, but
1451898184e3Ssthendiscontiguous contractions with combining characters are still resolved.
1452b39c5158SmillertTherefore
1453b39c5158SmillertC<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1454b39c5158SmillertB<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1455b39c5158SmillertIf source strings are finely prenormalized,
1456b39c5158SmillertC<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1457b39c5158Smillert
1458b39c5158SmillertExcept C<(normalization =E<gt> undef)>,
1459b39c5158SmillertB<Unicode::Normalize> is required (see also B<CAVEAT>).
1460b39c5158Smillert
1461b39c5158Smillert=item overrideCJK
1462b39c5158Smillert
1463b39c5158Smillert-- see 7.1 Derived Collation Elements, UTS #10.
1464b39c5158Smillert
1465898184e3SsthenBy default, CJK unified ideographs are ordered in Unicode codepoint
14666fb12b70Safresh1order, but those in the CJK Unified Ideographs block are less than
1467898184e3Ssthenthose in the CJK Unified Ideographs Extension A etc.
1468b39c5158Smillert
1469898184e3Ssthen    In the CJK Unified Ideographs block:
1470898184e3Ssthen    U+4E00..U+9FA5 if UCA_Version is 8, 9 or 11.
1471898184e3Ssthen    U+4E00..U+9FBB if UCA_Version is 14 or 16.
1472898184e3Ssthen    U+4E00..U+9FC3 if UCA_Version is 18.
1473898184e3Ssthen    U+4E00..U+9FCB if UCA_Version is 20 or 22.
14749f11ffb7Safresh1    U+4E00..U+9FCC if UCA_Version is 24 to 30.
14759f11ffb7Safresh1    U+4E00..U+9FD5 if UCA_Version is 32 or 34.
14769f11ffb7Safresh1    U+4E00..U+9FEA if UCA_Version is 36.
1477*eac174f2Safresh1    U+4E00..U+9FEF if UCA_Version is 38, 40 or 41.
1478*eac174f2Safresh1    U+4E00..U+9FFC if UCA_Version is 43.
1479b39c5158Smillert
1480898184e3Ssthen    In the CJK Unified Ideographs Extension blocks:
1481*eac174f2Safresh1    Ext.A (U+3400..U+4DB5)   if UCA_Version is  8 to 41.
1482*eac174f2Safresh1    Ext.A (U+3400..U+4DBF)   if UCA_Version is 43.
1483*eac174f2Safresh1    Ext.B (U+20000..U+2A6D6) if UCA_Version is  8 to 41.
1484*eac174f2Safresh1    Ext.B (U+20000..U+2A6DD) if UCA_Version is 43.
14856fb12b70Safresh1    Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or later.
14866fb12b70Safresh1    Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or later.
14879f11ffb7Safresh1    Ext.E (U+2B820..U+2CEA1) if UCA_Version is 32 or later.
1488*eac174f2Safresh1    Ext.F (U+2CEB0..U+2EBE0) if UCA_Version is 36 or later.
1489*eac174f2Safresh1    Ext.G (U+30000..U+3134A) if UCA_Version is 43.
1490898184e3Ssthen
1491898184e3SsthenThrough C<overrideCJK>, ordering of CJK unified ideographs (including
1492898184e3Ssthenextensions) can be overridden.
1493898184e3Ssthen
1494898184e3Ssthenex. CJK unified ideographs in the JIS code point order.
1495b39c5158Smillert
1496b39c5158Smillert  overrideCJK => sub {
1497b39c5158Smillert      my $u = shift;             # get a Unicode codepoint
1498b39c5158Smillert      my $b = pack('n', $u);     # to UTF-16BE
1499b39c5158Smillert      my $s = your_unicode_to_sjis_converter($b); # convert
1500b39c5158Smillert      my $n = unpack('n', $s);   # convert sjis to short
1501b39c5158Smillert      [ $n, 0x20, 0x2, $u ];     # return the collation element
1502b39c5158Smillert  },
1503b39c5158Smillert
1504898184e3SsthenThe return value may be an arrayref of 1st to 4th weights as shown
1505898184e3Ssthenabove. The return value may be an integer as the primary weight
1506898184e3Ssthenas shown below.  If C<undef> is returned, the default derived
1507898184e3Ssthencollation element will be used.
1508898184e3Ssthen
1509898184e3Ssthen  overrideCJK => sub {
1510898184e3Ssthen      my $u = shift;             # get a Unicode codepoint
1511898184e3Ssthen      my $b = pack('n', $u);     # to UTF-16BE
1512898184e3Ssthen      my $s = your_unicode_to_sjis_converter($b); # convert
1513898184e3Ssthen      my $n = unpack('n', $s);   # convert sjis to short
1514898184e3Ssthen      return $n;                 # return the primary weight
1515898184e3Ssthen  },
1516898184e3Ssthen
1517898184e3SsthenThe return value may be a list containing zero or more of
1518898184e3Ssthenan arrayref, an integer, or C<undef>.
1519898184e3Ssthen
1520898184e3Ssthenex. ignores all CJK unified ideographs.
1521b39c5158Smillert
1522b39c5158Smillert  overrideCJK => sub {()}, # CODEREF returning empty list
1523b39c5158Smillert
1524b39c5158Smillert   # where ->eq("Pe\x{4E00}rl", "Perl") is true
1525898184e3Ssthen   # as U+4E00 is a CJK unified ideograph and to be ignorable.
1526b39c5158Smillert
15276fb12b70Safresh1If a false value (including C<undef>) is passed, C<overrideCJK>
15286fb12b70Safresh1has no effect.
15296fb12b70Safresh1C<$Collator-E<gt>change(overrideCJK =E<gt> 0)> resets the old one.
15306fb12b70Safresh1
1531898184e3SsthenBut assignment of weight for CJK unified ideographs
1532898184e3Ssthenin C<table> or C<entry> is still valid.
15336fb12b70Safresh1If C<undef> is passed explicitly as the value for this key,
15346fb12b70Safresh1weights for CJK unified ideographs are treated as undefined.
15356fb12b70Safresh1However when C<UCA_Version> E<gt> 8, C<(overrideCJK =E<gt> undef)>
15366fb12b70Safresh1has no special meaning.
1537898184e3Ssthen
1538898184e3SsthenB<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>,
1539898184e3SsthenC<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>,
1540898184e3SsthenC<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified
1541898184e3Ssthenideographs. But they can't be overridden via C<overrideCJK> when you use
1542898184e3SsthenDUCET, as the table includes weights for them. C<table> or C<entry> has
1543898184e3Ssthenpriority over C<overrideCJK>.
1544b39c5158Smillert
1545b39c5158Smillert=item overrideHangul
1546b39c5158Smillert
1547b39c5158Smillert-- see 7.1 Derived Collation Elements, UTS #10.
1548b39c5158Smillert
1549898184e3SsthenBy default, Hangul syllables are decomposed into Hangul Jamo,
1550b39c5158Smillerteven if C<(normalization =E<gt> undef)>.
1551898184e3SsthenBut the mapping of Hangul syllables may be overridden.
1552b39c5158Smillert
1553b39c5158SmillertThis parameter works like C<overrideCJK>, so see there for examples.
1554b39c5158Smillert
1555898184e3SsthenIf you want to override the mapping of Hangul syllables,
1556898184e3SsthenNFD and NFKD are not appropriate, since NFD and NFKD will decompose
1557898184e3SsthenHangul syllables before overriding. FCD may decompose Hangul syllables
1558898184e3Ssthenas the case may be.
1559b39c5158Smillert
15606fb12b70Safresh1If a false value (but not C<undef>) is passed, C<overrideHangul>
15616fb12b70Safresh1has no effect.
15626fb12b70Safresh1C<$Collator-E<gt>change(overrideHangul =E<gt> 0)> resets the old one.
15636fb12b70Safresh1
1564b39c5158SmillertIf C<undef> is passed explicitly as the value for this key,
1565898184e3Ssthenweight for Hangul syllables is treated as undefined
1566b39c5158Smillertwithout decomposition into Hangul Jamo.
1567898184e3SsthenBut definition of weight for Hangul syllables
1568898184e3Ssthenin C<table> or C<entry> is still valid.
1569b39c5158Smillert
15706fb12b70Safresh1=item overrideOut
15716fb12b70Safresh1
15726fb12b70Safresh1-- see 7.1.1 Handling Ill-Formed Code Unit Sequences, UTS #10.
15736fb12b70Safresh1
15746fb12b70Safresh1Perl seems to allow out-of-range values (greater than 0x10FFFF).
15756fb12b70Safresh1By default, out-of-range values are replaced with C<U+FFFD>
15766fb12b70Safresh1(REPLACEMENT CHARACTER) when C<UCA_Version> E<gt>= 22,
15776fb12b70Safresh1or ignored when C<UCA_Version> E<lt>= 20.
15786fb12b70Safresh1
15796fb12b70Safresh1When C<UCA_Version> E<gt>= 22, the weights of out-of-range values
15806fb12b70Safresh1can be overridden. Though C<table> or C<entry> are available for them,
15816fb12b70Safresh1out-of-range values are too many.
15826fb12b70Safresh1
15836fb12b70Safresh1C<overrideOut> can perform it algorithmically.
15846fb12b70Safresh1This parameter works like C<overrideCJK>, so see there for examples.
15856fb12b70Safresh1
15866fb12b70Safresh1ex. ignores all out-of-range values.
15876fb12b70Safresh1
15886fb12b70Safresh1  overrideOut => sub {()}, # CODEREF returning empty list
15896fb12b70Safresh1
15906fb12b70Safresh1If a false value (including C<undef>) is passed, C<overrideOut>
15916fb12b70Safresh1has no effect.
15926fb12b70Safresh1C<$Collator-E<gt>change(overrideOut =E<gt> 0)> resets the old one.
15936fb12b70Safresh1
15946fb12b70Safresh1B<NOTE ABOUT U+FFFD:>
15956fb12b70Safresh1
15966fb12b70Safresh1UCA recommends that out-of-range values should not be ignored for security
15976fb12b70Safresh1reasons. Say, C<"pe\x{110000}rl"> should not be equal to C<"perl">.
15986fb12b70Safresh1However, C<U+FFFD> is wrongly mapped to a variable collation element
15996fb12b70Safresh1in DUCET for Unicode 6.0.0 to 6.2.0, that means out-of-range values will be
16006fb12b70Safresh1ignored when C<variable> isn't C<Non-ignorable>.
16016fb12b70Safresh1
16026fb12b70Safresh1The mapping of C<U+FFFD> is corrected in Unicode 6.3.0.
16036fb12b70Safresh1See L<http://www.unicode.org/reports/tr10/tr10-28.html#Trailing_Weights>
16046fb12b70Safresh1(7.1.4 Trailing Weights). Such a correction is reproduced by this.
16056fb12b70Safresh1
16066fb12b70Safresh1  overrideOut => sub { 0xFFFD }, # CODEREF returning a very large integer
16076fb12b70Safresh1
16086fb12b70Safresh1This workaround is unnecessary since Unicode 6.3.0.
16096fb12b70Safresh1
1610b39c5158Smillert=item preprocess
1611b39c5158Smillert
161291f110e0Safresh1-- see 5.4 Preprocessing, UTS #10.
1613b39c5158Smillert
1614898184e3SsthenIf specified, the coderef is used to preprocess each string
1615b39c5158Smillertbefore the formation of sort keys.
1616b39c5158Smillert
1617b39c5158Smillertex. dropping English articles, such as "a" or "the".
1618b39c5158SmillertThen, "the pen" is before "a pencil".
1619b39c5158Smillert
1620b39c5158Smillert     preprocess => sub {
1621b39c5158Smillert           my $str = shift;
1622b39c5158Smillert           $str =~ s/\b(?:an?|the)\s+//gi;
1623b39c5158Smillert           return $str;
1624b39c5158Smillert        },
1625b39c5158Smillert
1626b39c5158SmillertC<preprocess> is performed before C<normalization> (if defined).
1627b39c5158Smillert
1628898184e3Ssthenex. decoding strings in a legacy encoding such as shift-jis:
1629898184e3Ssthen
1630898184e3Ssthen    $sjis_collator = Unicode::Collate->new(
1631898184e3Ssthen        preprocess => \&your_shiftjis_to_unicode_decoder,
1632898184e3Ssthen    );
1633898184e3Ssthen    @result = $sjis_collator->sort(@shiftjis_strings);
1634898184e3Ssthen
1635898184e3SsthenB<Note:> Strings returned from the coderef will be interpreted
1636898184e3Ssthenaccording to Perl's Unicode support. See L<perlunicode>,
1637898184e3SsthenL<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1638898184e3Ssthen
1639b39c5158Smillert=item rearrange
1640b39c5158Smillert
164191f110e0Safresh1-- see 3.5 Rearrangement, UTS #10.
1642b39c5158Smillert
1643b39c5158SmillertCharacters that are not coded in logical order and are to be rearranged.
16446fb12b70Safresh1If C<UCA_Version> is equal to or less than 11, default is:
1645b39c5158Smillert
1646b39c5158Smillert    rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1647b39c5158Smillert
1648b39c5158SmillertIf you want to disallow any rearrangement, pass C<undef> or C<[]>
1649b39c5158Smillert(a reference to empty list) as the value for this key.
1650b39c5158Smillert
1651898184e3SsthenIf C<UCA_Version> is equal to or greater than 14, default is C<[]>
1652898184e3Ssthen(i.e. no rearrangement).
1653b39c5158Smillert
1654b39c5158SmillertB<According to the version 9 of UCA, this parameter shall not be used;
1655b39c5158Smillertbut it is not warned at present.>
1656b39c5158Smillert
1657898184e3Ssthen=item rewrite
1658898184e3Ssthen
1659898184e3SsthenIf specified, the coderef is used to rewrite lines in C<table> or C<entry>.
1660898184e3SsthenThe coderef will get each line, and then should return a rewritten line
1661898184e3Ssthenaccording to the UCA file format.
1662898184e3SsthenIf the coderef returns an empty line, the line will be skipped.
1663898184e3Ssthen
1664898184e3Ssthene.g. any primary ignorable characters into tertiary ignorable:
1665898184e3Ssthen
1666898184e3Ssthen    rewrite => sub {
1667898184e3Ssthen        my $line = shift;
1668898184e3Ssthen        $line =~ s/\[\.0000\..{4}\..{4}\./[.0000.0000.0000./g;
1669898184e3Ssthen        return $line;
1670898184e3Ssthen    },
1671898184e3Ssthen
1672898184e3SsthenThis example shows rewriting weights. C<rewrite> is allowed to
1673898184e3Ssthenaffect code points, weights, and the name.
1674898184e3Ssthen
1675898184e3SsthenB<NOTE>: C<table> is available to use another table file;
1676898184e3Ssthenpreparing a modified table once would be more efficient than
1677898184e3Ssthenrewriting lines on reading an unmodified table every time.
1678898184e3Ssthen
1679898184e3Ssthen=item suppress
1680898184e3Ssthen
16819f11ffb7Safresh1-- see 3.12 Special-Purpose Commands, UTS #35 (LDML) Part 5: Collation.
1682898184e3Ssthen
1683898184e3SsthenContractions beginning with the specified characters are suppressed,
1684898184e3Sstheneven if those contractions are defined in C<table>.
1685898184e3Ssthen
1686898184e3SsthenAn example for Russian and some languages using the Cyrillic script:
1687898184e3Ssthen
1688898184e3Ssthen    suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F],
1689898184e3Ssthen
1690898184e3Ssthenwhere 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE.
1691898184e3Ssthen
16929f11ffb7Safresh1B<NOTE>: Contractions via C<entry> will not be suppressed.
1693898184e3Ssthen
1694b39c5158Smillert=item table
1695b39c5158Smillert
1696b8851fccSafresh1-- see 3.8 Default Unicode Collation Element Table, UTS #10.
1697b39c5158Smillert
1698b39c5158SmillertYou can use another collation element table if desired.
1699b39c5158Smillert
1700b39c5158SmillertThe table file should be located in the F<Unicode/Collate> directory
1701b39c5158Smillerton C<@INC>. Say, if the filename is F<Foo.txt>,
1702b39c5158Smillertthe table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
1703b39c5158Smillert
1704b39c5158SmillertBy default, F<allkeys.txt> (as the filename of DUCET) is used.
1705b39c5158SmillertIf you will prepare your own table file, any name other than F<allkeys.txt>
1706b39c5158Smillertmay be better to avoid namespace conflict.
1707b39c5158Smillert
1708898184e3SsthenB<NOTE>: When XSUB is used, the DUCET is compiled on building this
1709898184e3Ssthenmodule, and it may save time at the run time.
17106fb12b70Safresh1Explicitly saying C<(table =E<gt> 'allkeys.txt')>, or using another table,
1711898184e3Ssthenor using C<ignoreChar>, C<ignoreName>, C<undefChar>, C<undefName> or
1712898184e3SsthenC<rewrite> will prevent this module from using the compiled DUCET.
1713898184e3Ssthen
1714b39c5158SmillertIf C<undef> is passed explicitly as the value for this key,
1715b39c5158Smillertno file is read (but you can define collation elements via C<entry>).
1716b39c5158Smillert
1717b39c5158SmillertA typical way to define a collation element table
1718b39c5158Smillertwithout any file of table:
1719b39c5158Smillert
1720b39c5158Smillert   $onlyABC = Unicode::Collate->new(
1721b39c5158Smillert       table => undef,
1722b39c5158Smillert       entry => << 'ENTRIES',
1723b39c5158Smillert0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
1724b39c5158Smillert0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
1725b39c5158Smillert0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
1726b39c5158Smillert0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
1727b39c5158Smillert0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
1728b39c5158Smillert0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1729b39c5158SmillertENTRIES
1730b39c5158Smillert    );
1731b39c5158Smillert
1732b39c5158SmillertIf C<ignoreName> or C<undefName> is used, character names should be
1733b39c5158Smillertspecified as a comment (following C<#>) on each line.
1734b39c5158Smillert
1735b39c5158Smillert=item undefChar
1736b39c5158Smillert
1737b39c5158Smillert=item undefName
1738b39c5158Smillert
17399f11ffb7Safresh1-- see 6.3.3 Reducing the Repertoire, UTS #10.
1740b39c5158Smillert
1741898184e3SsthenUndefines the collation element as if it were unassigned in the C<table>.
1742b39c5158SmillertThis reduces the size of the table.
1743b39c5158SmillertIf an unassigned character appears in the string to be collated,
1744b39c5158Smillertthe sort key is made from its codepoint
1745b39c5158Smillertas a single-character collation element,
1746b39c5158Smillertas it is greater than any other assigned collation elements
1747b39c5158Smillert(in the codepoint order among the unassigned characters).
1748b39c5158SmillertBut, it'd be better to ignore characters
1749b39c5158Smillertunfamiliar to you and maybe never used.
1750b39c5158Smillert
1751b39c5158SmillertThrough C<undefChar>, any character matching C<qr/$undefChar/>
1752b39c5158Smillertwill be undefined. Through C<undefName>, any character whose name
1753b39c5158Smillert(given in the C<table> file as a comment) matches C<qr/$undefName/>
1754b39c5158Smillertwill be undefined.
1755b39c5158Smillert
1756b39c5158Smillertex. Collation weights for beyond-BMP characters are not stored in object:
1757b39c5158Smillert
1758b39c5158Smillert    undefChar => qr/[^\0-\x{fffd}]/,
1759b39c5158Smillert
1760b39c5158Smillert=item upper_before_lower
1761b39c5158Smillert
1762b39c5158Smillert-- see 6.6 Case Comparisons, UTS #10.
1763b39c5158Smillert
1764b39c5158SmillertBy default, lowercase is before uppercase.
1765b39c5158SmillertIf the parameter is made true, this is reversed.
1766b39c5158Smillert
1767b39c5158SmillertB<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1768b39c5158Smillertdistinctions must occur in level 3, and their weights at level 3 must be
1769b39c5158Smillertsame as those mentioned in 7.3.1, UTS #10.
1770b39c5158SmillertIf you define your collation elements which differ from this requirement,
1771b39c5158Smillertthis parameter doesn't work validly.
1772b39c5158Smillert
1773b39c5158Smillert=item variable
1774b39c5158Smillert
1775b8851fccSafresh1-- see 3.6 Variable Weighting, UTS #10.
1776b39c5158Smillert
1777898184e3SsthenThis key allows for variable weighting of variable collation elements,
1778b39c5158Smillertwhich are marked with an ASTERISK in the table
1779898184e3Ssthen(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>).
1780b39c5158Smillert
1781b39c5158Smillert   variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1782b39c5158Smillert
1783b39c5158SmillertThese names are case-insensitive.
1784b39c5158SmillertBy default (if specification is omitted), 'shifted' is adopted.
1785b39c5158Smillert
1786b39c5158Smillert   'Blanked'        Variable elements are made ignorable at levels 1 through 3;
1787b39c5158Smillert                    considered at the 4th level.
1788b39c5158Smillert
1789b39c5158Smillert   'Non-Ignorable'  Variable elements are not reset to ignorable.
1790b39c5158Smillert
1791b39c5158Smillert   'Shifted'        Variable elements are made ignorable at levels 1 through 3;
1792b39c5158Smillert                    their level 4 weight is replaced by the old level 1 weight.
1793b39c5158Smillert                    Level 4 weight for Non-Variable elements is 0xFFFF.
1794b39c5158Smillert
1795b39c5158Smillert   'Shift-Trimmed'  Same as 'shifted', but all FFFF's at the 4th level
1796b39c5158Smillert                    are trimmed.
1797b39c5158Smillert
1798b39c5158Smillert=back
1799b39c5158Smillert
1800b39c5158Smillert=head2 Methods for Collation
1801b39c5158Smillert
1802b39c5158Smillert=over 4
1803b39c5158Smillert
1804b39c5158Smillert=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1805b39c5158Smillert
1806b39c5158SmillertSorts a list of strings.
1807b39c5158Smillert
1808b39c5158Smillert=item C<$result = $Collator-E<gt>cmp($a, $b)>
1809b39c5158Smillert
1810b39c5158SmillertReturns 1 (when C<$a> is greater than C<$b>)
1811b39c5158Smillertor 0 (when C<$a> is equal to C<$b>)
18126fb12b70Safresh1or -1 (when C<$a> is less than C<$b>).
1813b39c5158Smillert
1814b39c5158Smillert=item C<$result = $Collator-E<gt>eq($a, $b)>
1815b39c5158Smillert
1816b39c5158Smillert=item C<$result = $Collator-E<gt>ne($a, $b)>
1817b39c5158Smillert
1818b39c5158Smillert=item C<$result = $Collator-E<gt>lt($a, $b)>
1819b39c5158Smillert
1820b39c5158Smillert=item C<$result = $Collator-E<gt>le($a, $b)>
1821b39c5158Smillert
1822b39c5158Smillert=item C<$result = $Collator-E<gt>gt($a, $b)>
1823b39c5158Smillert
1824b39c5158Smillert=item C<$result = $Collator-E<gt>ge($a, $b)>
1825b39c5158Smillert
1826b39c5158SmillertThey work like the operators of the same names.
1827b39c5158Smillert
1828b39c5158Smillert   eq : whether $a is equal to $b.
1829b39c5158Smillert   ne : whether $a is not equal to $b.
18306fb12b70Safresh1   lt : whether $a is less than $b.
18316fb12b70Safresh1   le : whether $a is less than $b or equal to $b.
1832b39c5158Smillert   gt : whether $a is greater than $b.
1833b39c5158Smillert   ge : whether $a is greater than $b or equal to $b.
1834b39c5158Smillert
1835b39c5158Smillert=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1836b39c5158Smillert
1837b39c5158Smillert-- see 4.3 Form Sort Key, UTS #10.
1838b39c5158Smillert
1839b39c5158SmillertReturns a sort key.
1840b39c5158Smillert
1841b39c5158SmillertYou compare the sort keys using a binary comparison
1842b39c5158Smillertand get the result of the comparison of the strings using UCA.
1843b39c5158Smillert
1844b39c5158Smillert   $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1845b39c5158Smillert
1846b39c5158Smillert      is equivalent to
1847b39c5158Smillert
1848b39c5158Smillert   $Collator->cmp($a, $b)
1849b39c5158Smillert
1850b39c5158Smillert=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1851b39c5158Smillert
1852b39c5158SmillertConverts a sorting key into its representation form.
1853b39c5158SmillertIf C<UCA_Version> is 8, the output is slightly different.
1854b39c5158Smillert
1855b39c5158Smillert   use Unicode::Collate;
1856b39c5158Smillert   my $c = Unicode::Collate->new();
1857b39c5158Smillert   print $c->viewSortKey("Perl"),"\n";
1858b39c5158Smillert
1859b39c5158Smillert   # output:
1860b39c5158Smillert   # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1861b39c5158Smillert   #  Level 1               Level 2               Level 3               Level 4
1862b39c5158Smillert
1863b39c5158Smillert=back
1864b39c5158Smillert
1865b39c5158Smillert=head2 Methods for Searching
1866b39c5158Smillert
1867b39c5158SmillertThe C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1868b39c5158Smillertlike C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1869b39c5158Smillertbut they do not interpret any pattern; they match only a literal substring.
1870b39c5158Smillert
1871898184e3SsthenB<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
1872898184e3Ssthenfor C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1873898184e3SsthenC<subst>, C<gsubst>) will croak, as the position and the length might
1874898184e3Ssthendiffer from those on the specified string.
1875898184e3Ssthen
1876898184e3SsthenC<rearrange> and C<hangul_terminator> parameters are neglected.
1877898184e3SsthenC<katakana_before_hiragana> and C<upper_before_lower> don't affect
18786fb12b70Safresh1matching and searching, as it doesn't matter whether greater or less.
1879898184e3Ssthen
1880b39c5158Smillert=over 4
1881b39c5158Smillert
1882b39c5158Smillert=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1883b39c5158Smillert
1884b39c5158Smillert=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1885b39c5158Smillert
1886b39c5158SmillertIf C<$substring> matches a part of C<$string>, returns
1887b39c5158Smillertthe position of the first occurrence of the matching part in scalar context;
1888b39c5158Smillertin list context, returns a two-element list of
1889b39c5158Smillertthe position and the length of the matching part.
1890b39c5158Smillert
1891b39c5158SmillertIf C<$substring> does not match any part of C<$string>,
1892b39c5158Smillertreturns C<-1> in scalar context and
1893b39c5158Smillertan empty list in list context.
1894b39c5158Smillert
18956fb12b70Safresh1e.g. when the content of C<$str> is C<"Ich mu>E<szlig>C< studieren Perl.">,
18966fb12b70Safresh1you say the following where C<$sub> is C<"M>E<uuml>C<SS">,
1897b39c5158Smillert
1898b39c5158Smillert  my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1899b39c5158Smillert                                     # (normalization => undef) is REQUIRED.
1900b39c5158Smillert  my $match;
1901b39c5158Smillert  if (my($pos,$len) = $Collator->index($str, $sub)) {
1902b39c5158Smillert      $match = substr($str, $pos, $len);
1903b39c5158Smillert  }
1904b39c5158Smillert
19056fb12b70Safresh1and get C<"mu>E<szlig>C<"> in C<$match>, since C<"mu>E<szlig>C<">
19066fb12b70Safresh1is primary equal to C<"M>E<uuml>C<SS">.
1907b39c5158Smillert
1908b39c5158Smillert=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1909b39c5158Smillert
1910b39c5158Smillert=item C<($match)   = $Collator-E<gt>match($string, $substring)>
1911b39c5158Smillert
1912b39c5158SmillertIf C<$substring> matches a part of C<$string>, in scalar context, returns
1913b39c5158SmillertB<a reference to> the first occurrence of the matching part
1914b39c5158Smillert(C<$match_ref> is always true if matches,
1915b39c5158Smillertsince every reference is B<true>);
1916b39c5158Smillertin list context, returns the first occurrence of the matching part.
1917b39c5158Smillert
1918b39c5158SmillertIf C<$substring> does not match any part of C<$string>,
1919b39c5158Smillertreturns C<undef> in scalar context and
1920b39c5158Smillertan empty list in list context.
1921b39c5158Smillert
1922b39c5158Smillerte.g.
1923b39c5158Smillert
1924b39c5158Smillert    if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1925b39c5158Smillert	print "matches [$$match_ref].\n";
1926b39c5158Smillert    } else {
1927b39c5158Smillert	print "doesn't match.\n";
1928b39c5158Smillert    }
1929b39c5158Smillert
1930b39c5158Smillert     or
1931b39c5158Smillert
1932b39c5158Smillert    if (($match) = $Collator->match($str, $sub)) { # list context
1933b39c5158Smillert	print "matches [$match].\n";
1934b39c5158Smillert    } else {
1935b39c5158Smillert	print "doesn't match.\n";
1936b39c5158Smillert    }
1937b39c5158Smillert
1938b39c5158Smillert=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1939b39c5158Smillert
1940b39c5158SmillertIf C<$substring> matches a part of C<$string>, returns
1941b39c5158Smillertall the matching parts (or matching count in scalar context).
1942b39c5158Smillert
1943b39c5158SmillertIf C<$substring> does not match any part of C<$string>,
1944b39c5158Smillertreturns an empty list.
1945b39c5158Smillert
1946b39c5158Smillert=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1947b39c5158Smillert
1948b39c5158SmillertIf C<$substring> matches a part of C<$string>,
1949b39c5158Smillertthe first occurrence of the matching part is replaced by C<$replacement>
1950898184e3Ssthen(C<$string> is modified) and C<$count> (always equal to C<1>) is returned.
1951b39c5158Smillert
1952b39c5158SmillertC<$replacement> can be a C<CODEREF>,
1953b39c5158Smillerttaking the matching part as an argument,
1954b39c5158Smillertand returning a string to replace the matching part
1955b39c5158Smillert(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1956b39c5158Smillert
1957b39c5158Smillert=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1958b39c5158Smillert
1959b39c5158SmillertIf C<$substring> matches a part of C<$string>,
1960898184e3Ssthenall the occurrences of the matching part are replaced by C<$replacement>
1961898184e3Ssthen(C<$string> is modified) and C<$count> is returned.
1962b39c5158Smillert
1963b39c5158SmillertC<$replacement> can be a C<CODEREF>,
1964b39c5158Smillerttaking the matching part as an argument,
1965b39c5158Smillertand returning a string to replace the matching part
1966b39c5158Smillert(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1967b39c5158Smillert
1968b39c5158Smillerte.g.
1969b39c5158Smillert
1970b39c5158Smillert  my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1971b39c5158Smillert                                     # (normalization => undef) is REQUIRED.
1972898184e3Ssthen  my $str = "Camel donkey zebra came\x{301}l CAMEL horse cam\0e\0l...";
1973b39c5158Smillert  $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1974b39c5158Smillert
1975898184e3Ssthen  # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cam\0e\0l</b>...";
1976b39c5158Smillert  # i.e., all the camels are made bold-faced.
1977b39c5158Smillert
1978898184e3Ssthen   Examples: levels and ignore_level2 - what does camel match?
1979898184e3Ssthen  ---------------------------------------------------------------------------
1980898184e3Ssthen   level  ignore_level2  |  camel  Camel  came\x{301}l  c-a-m-e-l  cam\0e\0l
1981898184e3Ssthen  -----------------------|---------------------------------------------------
1982898184e3Ssthen     1        false      |   yes    yes      yes          yes        yes
1983898184e3Ssthen     2        false      |   yes    yes      no           yes        yes
1984898184e3Ssthen     3        false      |   yes    no       no           yes        yes
1985898184e3Ssthen     4        false      |   yes    no       no           no         yes
1986898184e3Ssthen  -----------------------|---------------------------------------------------
1987898184e3Ssthen     1        true       |   yes    yes      yes          yes        yes
1988898184e3Ssthen     2        true       |   yes    yes      yes          yes        yes
1989898184e3Ssthen     3        true       |   yes    no       yes          yes        yes
1990898184e3Ssthen     4        true       |   yes    no       yes          no         yes
1991898184e3Ssthen  ---------------------------------------------------------------------------
1992898184e3Ssthen   note: if variable => non-ignorable, camel doesn't match c-a-m-e-l
1993898184e3Ssthen         at any level.
1994898184e3Ssthen
1995b39c5158Smillert=back
1996b39c5158Smillert
1997b39c5158Smillert=head2 Other Methods
1998b39c5158Smillert
1999b39c5158Smillert=over 4
2000b39c5158Smillert
2001b39c5158Smillert=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
2002b39c5158Smillert
2003898184e3Ssthen=item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)>
2004898184e3Ssthen
2005898184e3SsthenChanges the values of the specified keys and, in list context, returns the previous values of the changed keys.
2006b39c5158Smillert
2007b39c5158Smillert    $Collator = Unicode::Collate->new(level => 4);
2008b39c5158Smillert
2009b39c5158Smillert    $Collator->eq("perl", "PERL"); # false
2010b39c5158Smillert
2011b39c5158Smillert    %old = $Collator->change(level => 2); # returns (level => 4).
2012b39c5158Smillert
2013b39c5158Smillert    $Collator->eq("perl", "PERL"); # true
2014b39c5158Smillert
2015b39c5158Smillert    $Collator->change(%old); # returns (level => 2).
2016b39c5158Smillert
2017b39c5158Smillert    $Collator->eq("perl", "PERL"); # false
2018b39c5158Smillert
2019b39c5158SmillertNot all C<(key,value)>s are allowed to be changed.
2020b39c5158SmillertSee also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
2021b39c5158Smillert
2022b39c5158SmillertIn scalar context, returns the modified collator
2023b39c5158Smillert(but it is B<not> a clone of the original).
2024b39c5158Smillert
2025b39c5158Smillert    $Collator->change(level => 2)->eq("perl", "PERL"); # true
2026b39c5158Smillert
2027b39c5158Smillert    $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
2028b39c5158Smillert
2029b39c5158Smillert    $Collator->change(level => 4)->eq("perl", "PERL"); # false
2030b39c5158Smillert
2031b39c5158Smillert=item C<$version = $Collator-E<gt>version()>
2032b39c5158Smillert
2033b39c5158SmillertReturns the version number (a string) of the Unicode Standard
2034b39c5158Smillertwhich the C<table> file used by the collator object is based on.
2035b39c5158SmillertIf the table does not include a version line (starting with C<@version>),
2036b39c5158Smillertreturns C<"unknown">.
2037b39c5158Smillert
2038b39c5158Smillert=item C<UCA_Version()>
2039b39c5158Smillert
2040898184e3SsthenReturns the revision number of UTS #10 this module consults,
2041898184e3Ssthenthat should correspond with the DUCET incorporated.
2042b39c5158Smillert
2043b39c5158Smillert=item C<Base_Unicode_Version()>
2044b39c5158Smillert
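2045898184e3SsthenReturns the version number (as opposed to the revision number) of UTS #10 this module consults,
2046898184e3Ssthenwhich should correspond with the DUCET incorporated (presumably the same as the version of the Unicode Standard it is based on).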
2047b39c5158Smillert
2048b39c5158Smillert=back
2049b39c5158Smillert
2050b39c5158Smillert=head1 EXPORT
2051b39c5158Smillert
2052b39c5158SmillertNo method will be exported.
2053b39c5158Smillert
2054b39c5158Smillert=head1 INSTALL
2055b39c5158Smillert
2056b39c5158SmillertThough this module can be used without any C<table> file,
2057b39c5158Smillertto use this module easily, it is recommended to install a table file
2058b39c5158Smillertin the UCA format, by copying it under the directory
2059b39c5158Smillert<a place in @INC>/Unicode/Collate.
2060b39c5158Smillert
2061b39c5158SmillertThe most preferable one is "The Default Unicode Collation Element Table"
2062b39c5158Smillert(aka DUCET), available from the Unicode Consortium's website:
2063b39c5158Smillert
2064b39c5158Smillert   http://www.unicode.org/Public/UCA/
2065b39c5158Smillert
20669f11ffb7Safresh1   http://www.unicode.org/Public/UCA/latest/allkeys.txt
20679f11ffb7Safresh1   (latest version)
2068b39c5158Smillert
2069b39c5158SmillertIf DUCET is not installed, it is recommended to copy the file
2070b39c5158Smillertfrom http://www.unicode.org/Public/UCA/latest/allkeys.txt
2071b39c5158Smillertto <a place in @INC>/Unicode/Collate/allkeys.txt
2072b39c5158Smillertmanually.
2073b39c5158Smillert
2074b39c5158Smillert=head1 CAVEATS
2075b39c5158Smillert
2076b39c5158Smillert=over 4
2077b39c5158Smillert
2078b39c5158Smillert=item Normalization
2079b39c5158Smillert
2080b39c5158SmillertUse of the C<normalization> parameter requires the B<Unicode::Normalize>
2081b39c5158Smillertmodule (see L<Unicode::Normalize>).
2082b39c5158Smillert
2083b39c5158SmillertIf you do not need it (say, in the case when you need not
2084b39c5158Smillerthandle any combining characters),
20856fb12b70Safresh1assign C<(normalization =E<gt> undef)> explicitly.
2086b39c5158Smillert
2087b39c5158Smillert-- see 6.5 Avoiding Normalization, UTS #10.
2088b39c5158Smillert
2089b39c5158Smillert=item Conformance Test
2090b39c5158Smillert
2091b39c5158SmillertThe Conformance Test for the UCA is available
2092b39c5158Smillertunder L<http://www.unicode.org/Public/UCA/>.
2093b39c5158Smillert
2094b39c5158SmillertFor F<CollationTest_SHIFTED.txt>,
2095b39c5158Smillerta collator via C<Unicode::Collate-E<gt>new( )> should be used;
2096b39c5158Smillertfor F<CollationTest_NON_IGNORABLE.txt>, a collator via
2097b39c5158SmillertC<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
2098b39c5158Smillert
209991f110e0Safresh1If C<UCA_Version> is 26 or later, the C<identical> level is preferred;
210091f110e0Safresh1C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and
210191f110e0Safresh1C<Unicode::Collate-E<gt>new(identical =E<gt> 1,>
210291f110e0Safresh1C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used.
210391f110e0Safresh1
2104b39c5158SmillertB<Unicode::Normalize is required to try The Conformance Test.>
2105b39c5158Smillert
2106*eac174f2Safresh1B<EBCDIC-SUPPORT IS EXPERIMENTAL.>
2107*eac174f2Safresh1
2108b39c5158Smillert=back
2109b39c5158Smillert
2110b39c5158Smillert=head1 AUTHOR, COPYRIGHT AND LICENSE
2111b39c5158Smillert
2112b39c5158SmillertThe Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
2113*eac174f2Safresh1<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2021,
2114b39c5158SmillertSADAHIRO Tomoyuki. Japan. All rights reserved.
2115b39c5158Smillert
2116b39c5158SmillertThis module is free software; you can redistribute it and/or
2117b39c5158Smillertmodify it under the same terms as Perl itself.
2118b39c5158Smillert
2119898184e3SsthenThe file Unicode/Collate/allkeys.txt was copied verbatim
2120*eac174f2Safresh1from L<http://www.unicode.org/Public/UCA/13.0.0/allkeys.txt>.
2121*eac174f2Safresh1For this file, Copyright (c) 2020 Unicode, Inc.; distributed
21229f11ffb7Safresh1under the Terms of Use in L<http://www.unicode.org/terms_of_use.html>.
2123b39c5158Smillert
2124b39c5158Smillert=head1 SEE ALSO
2125b39c5158Smillert
2126b39c5158Smillert=over 4
2127b39c5158Smillert
2128b39c5158Smillert=item Unicode Collation Algorithm - UTS #10
2129b39c5158Smillert
2130b39c5158SmillertL<http://www.unicode.org/reports/tr10/>
2131b39c5158Smillert
2132b39c5158Smillert=item The Default Unicode Collation Element Table (DUCET)
2133b39c5158Smillert
2134b39c5158SmillertL<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
2135b39c5158Smillert
2136b39c5158Smillert=item The conformance test for the UCA
2137b39c5158Smillert
2138b39c5158SmillertL<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
2139b39c5158Smillert
2140b39c5158SmillertL<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
2141b39c5158Smillert
2142b39c5158Smillert=item Hangul Syllable Type
2143b39c5158Smillert
2144b39c5158SmillertL<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
2145b39c5158Smillert
2146b39c5158Smillert=item Unicode Normalization Forms - UAX #15
2147b39c5158Smillert
2148b39c5158SmillertL<http://www.unicode.org/reports/tr15/>
2149b39c5158Smillert
2150898184e3Ssthen=item Unicode Locale Data Markup Language (LDML) - UTS #35
2151898184e3Ssthen
2152898184e3SsthenL<http://www.unicode.org/reports/tr35/>
2153898184e3Ssthen
2154b39c5158Smillert=back
2155b39c5158Smillert
2156b39c5158Smillert=cut
2157