xref: /openbsd-src/gnu/usr.bin/perl/cpan/Unicode-Collate/Collate.pm (revision f2da64fbbbf1b03f09f390ab01267c93dfd77c4c)
1package Unicode::Collate;
2
BEGIN {
    # Sanity check at compile time: the module relies on pack/unpack 'U'
    # converting between a character and its Unicode code point, so it
    # refuses to load when either direction does not round-trip for "A".
    if (pack('U', 0x41) ne "A") {
	die "Unicode::Collate cannot stringify a Unicode code point\n";
    }
    if (unpack('U', 'A') != 0x41) {
	die "Unicode::Collate cannot get a Unicode code point\n";
    }
}
11
12use 5.006;
13use strict;
14use warnings;
15use Carp;
16use File::Spec;
17
18no warnings 'utf8';
19
20our $VERSION = '1.04';
21our $PACKAGE = __PACKAGE__;
22
23### begin XS only ###
24require DynaLoader;
25our @ISA = qw(DynaLoader);
26bootstrap Unicode::Collate $VERSION;
27### end XS only ###
28
# Sub-directory components (below each @INC entry) and the default file name
# used by read_table() to locate the collation element table.
# $KeyFile is used by new() when no 'table' parameter is given.
my @Path = qw(Unicode Collate);
my $KeyFile = "allkeys.txt";

# Perl's boolean
use constant TRUE  => 1;
use constant FALSE => "";
# Position returned by index() in scalar context when nothing matches.
use constant NOMATCHPOS => -1;

# A coderef to get combining class imported from Unicode::Normalize
# (i.e. \&Unicode::Normalize::getCombinClass).
# This is also used as a HAS_UNICODE_NORMALIZE flag.
# It is assigned lazily by checkCollator() when normalization is defined.
my $CVgetCombinClass;

# Supported Levels (the bounds enforced by _checkLevel)
use constant MinLevel => 1;
use constant MaxLevel => 4;

# Minimum weights at level 2 and 3, respectively
use constant Min2Wt => 0x20;
use constant Min3Wt => 0x02;

# Shifted weight at 4th level
use constant Shift4Wt => 0xFFFF;

# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
# (pack template: one unsigned char, then four big-endian 16-bit integers)
use constant VCE_TEMPLATE => 'Cn4';

# A sort key: 16-bit weights
use constant KEY_TEMPLATE => 'n*';

# The tie-breaking: 32-bit weights
use constant TIE_TEMPLATE => 'N*';

# Level separator in a sort key:
# i.e. pack(KEY_TEMPLATE, 0)
use constant LEVEL_SEP => "\0\0";

# As Unicode code point separator for hash keys.
# A joined code point string (denoted by JCPS below)
# like "65;768" is used for internal processing
# instead of Perl's Unicode string like "\x41\x{300}",
# as the native code point is different from the Unicode code point
# on EBCDIC platform.
# This character must not be included in any stringified
# representation of an integer.
use constant CODE_SEP => ';';
	# NOTE: in regex /;/ is used for $jcps!

# boolean values of variable weights
use constant NON_VAR => 0; # Non-Variable character
use constant VAR     => 1; # Variable character

# specific code points (first and last precomposed Hangul syllables,
# as tested by getWt)
use constant Hangul_SIni   => 0xAC00;
use constant Hangul_SFin   => 0xD7A3;

# Logical_Order_Exception in PropList.txt
# Used by new() as the default 'rearrange' list when UCA_Version <= 11.
my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];

# for highestFFFF and minimalFFFE
# (the VCEs getWt() substitutes for U+FFFF / U+FFFE when those options are on)
my $HighestVCE = pack(VCE_TEMPLATE, 0, 0xFFFE, 0x20, 0x5, 0xFFFF);
my $minimalVCE = pack(VCE_TEMPLATE, 0,      1, 0x20, 0x5, 0xFFFE);
91
# UCA revision number that new() falls back to when the UCA_Version
# parameter is omitted or false.
sub UCA_Version {
    return "28";
}
93
# Version string of the Unicode Standard this module release is based on.
sub Base_Unicode_Version {
    return "6.3.0";
}
95
96######
97
# string = pack_U(list of code points)
# Builds a string from the given code points with pack's 'U*' template.
sub pack_U {
    my @code_points = @_;
    return pack 'U*', @code_points;
}
101
102### begin XS only ###
# Install unpack_U(): list of code points = unpack_U(string).
# The XS routine unpackUfor56 is chosen only when this package defines
# bootstrap, perl is older than 5.8, and unpackUfor56('A') really yields 0x41;
# in every other case a pure-Perl unpack('U*', ...) closure is installed.
# NOTE(review): appending pack('U*') (an empty string) presumably forces the
# argument to be treated as a character string before unpacking -- confirm.
*unpack_U = exists &Unicode::Collate::bootstrap &&
	$] < 5.008 && \&unpackUfor56 && 0x41 == unpackUfor56('A')
    ? \&unpackUfor56 : sub { return unpack('U*', shift(@_).pack('U*')) };
106### end XS only ###
107
108######
109
# Set of accepted values for the 'variable' (a.k.a. 'alternate') parameter;
# checkCollator() lowercases the value before looking it up here.
my (%VariableOK);
@VariableOK{ qw/
    blanked  non-ignorable  shifted  shift-trimmed
  / } = (); # keys lowercased

# Attributes that may be modified after construction through change().
our @ChangeOK = qw/
    alternate backwards level normalization rearrange
    katakana_before_hiragana upper_before_lower ignore_level2
    overrideCJK overrideHangul overrideOut preprocess UCA_Version
    hangul_terminator variable identical highestFFFF minimalFFFE
  /;

# Attributes for which change() croaks instead of modifying them.
our @ChangeNG = qw/
    entry mapping table maxlength contraction
    ignoreChar ignoreName undefChar undefName rewrite
    versionTable alternateTable backwardsTable forwardsTable
    rearrangeTable variableTable
    derivCode normCode rearrangeHash backwardsFlag
    suppress suppressHash
    __useXS /; ### XS only
130# The hash key 'ignored' was deleted at v 0.21.
131# The hash key 'isShift' was deleted at v 0.23.
132# The hash key 'combining' was deleted at v 0.24.
133# The hash key 'entries' was deleted at v 0.30.
134# The hash key 'L3_ignorable' was deleted at v 0.40.
135
# Returns the version recorded from the table's "@version" line
# (stored in versionTable), or the string 'unknown' when none is set.
sub version {
    my ($self) = @_;
    my $table_version = $self->{versionTable};
    return $table_version ? $table_version : 'unknown';
}
140
# Lookup sets built from @ChangeOK / @ChangeNG; only key existence matters
# (change() tests them with exists), so all values are undef.
my (%ChangeOK, %ChangeNG);
@ChangeOK{ @ChangeOK } = ();
@ChangeNG{ @ChangeNG } = ();
144
##
## %old  = $Collator->change(%new_attributes);  # list context
## $self = $Collator->change(%new_attributes);  # scalar context
##
## Updates the attributes listed in @ChangeOK, croaks on an attribute
## listed in @ChangeNG, silently ignores any other key, and then
## re-validates the collator via checkCollator().
##
sub change {
    my ($self, %param) = @_;

    # 'alternate' is the old alias of 'variable': an explicit 'variable'
    # wins, otherwise 'alternate' also supplies the value of 'variable'.
    if (exists $param{alternate}) {
	if (exists $param{variable}) {
	    delete $param{alternate};
	}
	else {
	    $param{variable} = $param{alternate};
	}
    }

    my %previous;
    for my $name (keys %param) {
	if (exists $ChangeOK{$name}) {
	    # remember the old value so that the caller can restore it
	    $previous{$name} = $self->{$name};
	    $self->{$name}   = $param{$name};
	    next;
	}
	croak "change of $name via change() is not allowed!"
	    if exists $ChangeNG{$name};
	# any other key is ignored
    }

    $self->checkCollator();
    return wantarray ? %previous : $self;
}
168
# _checkLevel(level, key)
# Croaks unless MinLevel <= level <= MaxLevel; 'key' (the parameter name,
# 'level' or 'backwards') is only used in the error message.
sub _checkLevel {
    my ($level, $key) = @_;

    if ($level < MinLevel) {
	croak sprintf
	    "Illegal level %d (in value for key '%s') lower than %d.",
	    $level, $key, MinLevel;
    }
    if ($level > MaxLevel) {
	croak sprintf
	    "Unsupported level %d (in value for key '%s') higher than %d.",
	    $level, $key, MaxLevel;
    }
    return 1;
}
179
# Maps each supported UCA_Version to the routine that derives collation
# elements for code points without a table entry; checkCollator() stores the
# selected coderef in $self->{derivCode} and croaks for any other version.
my %DerivCode = (
    8 => \&_derivCE_8,
    9 => \&_derivCE_9,
   11 => \&_derivCE_9, # 11 == 9
   14 => \&_derivCE_14,
   16 => \&_derivCE_14, # 16 == 14
   18 => \&_derivCE_18,
   20 => \&_derivCE_20,
   22 => \&_derivCE_22,
   24 => \&_derivCE_24,
   26 => \&_derivCE_24, # 26 == 24
   28 => \&_derivCE_24, # 28 == 24
);
193
##
## $self->checkCollator()
##
## Validates the user-visible attributes and (re)computes the derived ones:
## derivCode, variable/alternate, backwardsFlag, rearrangeHash and normCode.
## Croaks on an illegal level, UCA version, variable name, rearrange value
## or normalization form.
##
sub checkCollator {
    my ($self) = @_;

    _checkLevel($self->{level}, "level");

    # derivation routine for code points without a table entry
    $self->{derivCode} = $DerivCode{ $self->{UCA_Version} };
    croak "Illegal UCA version (passed $self->{UCA_Version})."
	unless $self->{derivCode};

    # 'variable' falls back to 'alternate', then to the table's settings
    if (! $self->{variable}) {
	$self->{variable} = $self->{alternate}
	    || $self->{variableTable}
	    || $self->{alternateTable}
	    || 'shifted';
    }
    my $variable = lc($self->{variable});
    $self->{alternate} = $variable;
    $self->{variable}  = $variable;
    croak "$PACKAGE unknown variable parameter name: $self->{variable}"
	unless exists $VariableOK{$variable};

    # backwardsFlag is a bit mask with bit N set for each backward level N
    my $backwards = $self->{backwards};
    if (ref $backwards) {
	$self->{backwardsFlag} = 0;
	my %seen;
	foreach my $lv (@$backwards) {
	    _checkLevel($lv, "backwards");
	    $seen{$lv} = 1;
	}
	$self->{backwardsFlag} += 1 << $_ for sort keys %seen;
    }
    elsif (defined $backwards) {
	_checkLevel($backwards, "backwards");
	$self->{backwardsFlag} = 1 << $backwards;
    }
    else {
	$self->{backwardsFlag} = 0;
    }

    $self->{rearrange} = [] unless defined $self->{rearrange};
    croak "$PACKAGE: list for rearrangement must be store in ARRAYREF"
	unless ref $self->{rearrange};

    # keys of $self->{rearrangeHash} are the elements of $self->{rearrange};
    # it stays undef when the list is empty.
    $self->{rearrangeHash} = undef;
    if (@{ $self->{rearrange} }) {
	@{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
    }

    $self->{normCode} = undef;
    my $form = $self->{normalization};
    return if ! defined $form;

    eval { require Unicode::Normalize };
    croak "Unicode::Normalize is required to normalize strings" if $@;

    $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;

    if ($form =~ /^(?:NF)D\z/) { # tweak for default
	$self->{normCode} = \&Unicode::Normalize::NFD;
    }
    elsif ($form ne 'prenormalized') {
	$self->{normCode} = sub {
	    Unicode::Normalize::normalize($form, shift);
	};
	# try it once on an empty string to detect an unknown form name
	eval { $self->{normCode}->("") };
	croak "$PACKAGE unknown normalization form name: $form" if $@;
    }
    return;
}
257
##
## $Collator = Unicode::Collate->new(%tailoring)
##
## Builds the collator object from the given key/value pairs, loads the
## collation element table (unless 'table' is explicitly undef), applies the
## 'entry' tailoring, fills in defaults and validates via checkCollator().
##
sub new
{
    my ($class, @param) = @_;
    my $self = bless { @param }, $class;

### begin XS only ###
    # The XS-embedded table is usable only when no table was named and none
    # of the options that alter table parsing is defined.
    my $custom_table = exists $self->{table};
    foreach my $opt (qw(rewrite undefName ignoreName undefChar ignoreChar)) {
	$custom_table ||= defined $self->{$opt};
    }
    $self->{__useXS} = $custom_table ? undef : \&_fetch_simple;
### end XS only ###

    # keys of $self->{suppressHash} are $self->{suppress}.
    my $suppress = $self->{suppress};
    if ($suppress && @$suppress) {
	$self->{suppressHash}{$_} = undef for @$suppress;
    } # before read_table()

    # If undef is passed explicitly, no file is read.
    $self->{table} = $KeyFile unless exists $self->{table};
    $self->read_table() if defined $self->{table};

    # each non-empty line of 'entry' is parsed as a tailoring entry
    if ($self->{entry}) {
	for my $entry_line ($self->{entry} =~ /([^\n]+)/g) {
	    $self->parseEntry($entry_line, TRUE);
	}
    }

    $self->{level} ||= MaxLevel;
    $self->{UCA_Version} ||= UCA_Version();

    # defaults applied only when the key was not passed at all
    my %default = (
	overrideHangul => FALSE,
	overrideCJK    => FALSE,
	normalization  => 'NFD',
    );
    for my $key (keys %default) {
	$self->{$key} = $default{$key} unless exists $self->{$key};
    }

    if (! exists $self->{rearrange}) {
	$self->{rearrange} = $self->{rearrangeTable}
	    || ($self->{UCA_Version} <= 11 ? $DefaultRearrange : []);
    }
    if (! exists $self->{backwards}) {
	$self->{backwards} = $self->{backwardsTable};
    }

    $self->checkCollator();

    return $self;
}
307
##
## $self->parseAtmark($line)
##
## Handles one "@..." directive line of a collation element table
## ($line is the text that follows the leading "@").
## Scalar settings keep the first true value seen; list settings accumulate.
## Unrecognized directives are ignored.
##
sub parseAtmark {
    my ($self, $line) = @_; # after s/^\s*\@//

    if ($line =~ /^version\s*(\S*)/) {
	$self->{versionTable} = $1 unless $self->{versionTable};
	return;
    }
    if ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
	$self->{variableTable} = $1 unless $self->{variableTable};
	return;
    }
    if ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
	$self->{alternateTable} = $1 unless $self->{alternateTable};
	return;
    }
    if ($line =~ /^backwards\s+(\S*)/) {
	push @{ $self->{backwardsTable} }, $1;
	return;
    }
    if ($line =~ /^forwards\s+(\S*)/) { # perhaps no use
	push @{ $self->{forwardsTable} }, $1;
	return;
    }
    if ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
	push @{ $self->{rearrangeTable} }, _getHexArray($1);
    }
    return;
}
331
##
## $self->read_table()
##
## Loads the collation element table into $self.
## With the XS table in use ($self->{__useXS}), only the lines returned by
## _fetch_rest() are parsed.  Otherwise the file $self->{table} is searched
## for under "Unicode/Collate" in each directory of @INC and parsed line by
## line; croaks when the file cannot be opened anywhere.
##
sub read_table {
    my $self = shift;

    # Dispatch one table line: comment lines are skipped, "@..." lines go
    # to parseAtmark() and everything else goes to parseEntry().
    my $parse_line = sub {
	my $line = shift;
	return if $line =~ /^\s*#/;

	if ($line =~ s/^\s*\@//) {
	    $self->parseAtmark($line);
	} else {
	    $self->parseEntry($line);
	}
	return;
    };

### begin XS only ###
    if ($self->{__useXS}) {
	my @rest = _fetch_rest(); # complex matter need to parse
	$parse_line->($_) for @rest;
	return;
    }
### end XS only ###

    my($f, $fh);
    foreach my $d (@INC) {
	next if ref $d; # an @INC hook is not a directory
	$f = File::Spec->catfile($d, @Path, $self->{table});
	# Three-argument open: the table name is supplied by the caller and
	# must never be interpreted as an open mode or a pipe.
	last if open($fh, '<', $f);
	$f = undef;
    }
    if (!defined $f) {
	$f = File::Spec->catfile(@Path, $self->{table});
	croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
    }

    while (my $line = <$fh>) {
	$parse_line->($line);
    }
    close $fh;
}
373
374
375##
376## get $line, parse it, and write an entry in $self
377##
# $self->parseEntry($line, $tailoring)
#
# Parses one table line "<charList> ; <collElement>+ [#|% name]" and stores
# it in $self->{mapping} under the JCPS of the character list; also updates
# $self->{maxlength} and $self->{contraction} for multi-character entries.
# $tailoring is true for lines coming from the 'entry' parameter; such
# entries are not affected by 'suppress'.
# Croaks when the line lacks a ';' separated collation element part.
sub parseEntry
{
    my $self = shift;
    my $line = shift;
    my $tailoring = shift;
    my($entry, @uv, @key);

    if (defined $self->{rewrite}) {
	$line = $self->{rewrite}->($line);
    }

    return if $line !~ /^\s*[0-9A-Fa-f]/;

    # removes comment and gets name.
    # A line without a comment gets an empty name, so that the matches
    # against undefName/ignoreName below never see an undefined value.
    my $name = $line =~ s/[#%]\s*(.*)// ? $1 : '';
    return if defined $self->{undefName} && $name =~ /$self->{undefName}/;

    # gets element
    my($e, $k) = split /;/, $line;
    croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
	if ! $k;

    @uv = _getHexArray($e);
    return if !@uv;
    # a contraction starting with a suppressed code point is dropped,
    # unless it comes from tailoring
    return if @uv > 1 && $self->{suppressHash} && !$tailoring &&
		  exists $self->{suppressHash}{$uv[0]};
    $entry = join(CODE_SEP, @uv); # in JCPS

    if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
	my $ele = pack_U(@uv);

	# regarded as if it were not stored in the table
	return
	    if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;

	# replaced as completely ignorable
	$k = '[.0000.0000.0000.0000]'
	    if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
    }

    # replaced as completely ignorable
    $k = '[.0000.0000.0000.0000]'
	if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;

    my $is_L3_ignorable = TRUE;

    foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
	my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
	my @wt = _getHexArray($arr);
	push @key, pack(VCE_TEMPLATE, $var, @wt);
	$is_L3_ignorable = FALSE
	    if $wt[0] || $wt[1] || $wt[2];
	# Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
	# is completely ignorable.
	# For expansion, an entry $is_L3_ignorable
	# if and only if "all" CEs are [.0000.0000.0000].
    }

    $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;

    # longest entry (in code points) starting with this code point
    if (@uv > 1) {
	if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) {
	    $self->{maxlength}{$uv[0]} = @uv;
	}
    }
    # every proper prefix of an entry longer than two code points is
    # recorded in 'contraction' (the loop goes down to the empty prefix)
    if (@uv > 2) {
	while (@uv) {
	    pop @uv;
	    my $fake_entry = join(CODE_SEP, @uv); # in JCPS
	    $self->{contraction}{$fake_entry} = 1;
	}
    }
}
452
453
# string = $self->viewSortKey(string)
# Returns visualizeSortKey() applied to the sort key of the given string.
sub viewSortKey
{
    my ($self, $str) = @_;
    my @sort_key = $self->getSortKey($str);
    return $self->visualizeSortKey(@sort_key);
}
460
461
# string = $self->process(string)
# Applies the 'preprocess' callback and then the normalization routine
# (normCode) to the string; each step is skipped unless it is a reference.
sub process
{
    my ($self, $str) = @_;
    my $prep = $self->{preprocess};
    my $norm = $self->{normCode};

    if (ref $prep) {
	$str = $prep->($str);
    }
    if (ref $norm) {
	$str = $norm->($str);
    }
    return $str;
}
473
##
## arrayref of JCPS   = splitEnt(string to be collated)
## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE)
##
## Splits a string into collation entries: each entry is either a single
## code point or a contraction found in $self->{mapping}, expressed as a
## JCPS.  Removed and completely ignorable code points produce no entry;
## with the second argument true, each entry also carries the index of its
## first code point and the index just past its last one (which is extended
## over any removed/ignored code points that follow it).
##
sub splitEnt
{
    my $self = shift;
    my $str  = shift;
    my $wLen = shift; # with Length

    my $map  = $self->{mapping};
    my $max  = $self->{maxlength};
    my $reH  = $self->{rearrangeHash};
    my $vers = $self->{UCA_Version};
    my $ver9 = $vers >= 9 && $vers <= 11;
    my $uXS  = $self->{__useXS}; ### XS only

    my @buf;

    # get array of Unicode code point of string.
    my @src = unpack_U($str);

    # rearrangement:
    # Character positions are not kept if rearranged,
    # then neglected if $wLen is true.
    if ($reH && ! $wLen) {
	for (my $i = 0; $i < @src; $i++) {
	    # swap a rearranging code point with its successor,
	    # then step over the swapped pair
	    if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
		($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
		$i++;
	    }
	}
    }

    # remove a code point marked as a completely ignorable.
    # (removed code points are represented by undef in @src)
    for (my $i = 0; $i < @src; $i++) {
	if ($vers <= 20 && _isIllegal($src[$i])) {
	    $src[$i] = undef;
	} elsif ($ver9) {
	    # only for UCA_Version 9..11: drop code points whose mapping is
	    # an empty list, or that the XS table reports as ignorable
	    $src[$i] = undef if $map->{ $src[$i] }
			   ? @{ $map->{ $src[$i] } } == 0
			   : $uXS && _ignorable_simple($src[$i]); ### XS only
	}
    }

    for (my $i = 0; $i < @src; $i++) {
	my $jcps = $src[$i];

	# skip removed code point
	if (! defined $jcps) {
	    # extend the end position of the previous entry over it
	    if ($wLen && @buf) {
		$buf[-1][2] = $i + 1;
	    }
	    next;
	}

	my $i_orig = $i;

	# find contraction
	if ($max->{$jcps}) {
	    my $temp_jcps = $jcps;
	    my $jcpsLen = 1;
	    my $maxLen = $max->{$jcps};

	    # extend the candidate up to $maxLen code points (removed code
	    # points are skipped) and keep the longest one found in $map
	    for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
		next if ! defined $src[$p];
		$temp_jcps .= CODE_SEP . $src[$p];
		$jcpsLen++;
		if ($map->{$temp_jcps}) {
		    $jcps = $temp_jcps;
		    $i = $p;
		}
	    }

	# discontiguous contraction with Combining Char (cf. UTS#10, S2.1).
	# This process requires Unicode::Normalize.
	# If "normalization" is undef, here should be skipped *always*
	# (in spite of bool value of $CVgetCombinClass),
	# since canonical ordering cannot be expected.
	# Blocked combining character should not be contracted.

	    # $self->{normCode} is false in the case of "prenormalized".
	    if ($self->{normalization}) {
		my $cont = $self->{contraction};
		my $preCC = 0;
		my $preCC_uc = 0;
		my $jcps_uc = $jcps;
		my(@out, @out_uc);

		# Two candidates are grown in parallel over the following
		# combining characters (nonzero combining class):
		# $jcps_uc may also pass through prefixes recorded in
		# $self->{contraction}, $jcps only through real mappings.
		for (my $p = $i + 1; $p < @src; $p++) {
		    next if ! defined $src[$p];
		    my $curCC = $CVgetCombinClass->($src[$p]);
		    last unless $curCC;
		    my $tail = CODE_SEP . $src[$p];

		    if ($preCC_uc != $curCC && ($map->{$jcps_uc.$tail} ||
					       $cont->{$jcps_uc.$tail})) {
			$jcps_uc .= $tail;
			push @out_uc, $p;
		    } else {
			$preCC_uc = $curCC;
		    }

		    if ($preCC != $curCC && $map->{$jcps.$tail}) {
			$jcps .= $tail;
			push @out, $p;
		    } else {
			$preCC = $curCC;
		    }
		}

		# prefer the $jcps_uc candidate when it ends on a real
		# mapping; the code points consumed are removed from @src
		if ($map->{$jcps_uc}) {
		    $jcps = $jcps_uc;
		    $src[$_] = undef for @out_uc;
		} else {
		    $src[$_] = undef for @out;
		}
	    }
	}

	# skip completely ignorable
	if ($map->{$jcps} ? @{ $map->{$jcps} } == 0 :
	    $uXS && $jcps !~ /;/ && _ignorable_simple($jcps)) { ### XS only
	    if ($wLen && @buf) {
		$buf[-1][2] = $i + 1;
	    }
	    next;
	}

	push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
    }
    return \@buf;
}
607
608##
609## VCE = _pack_override(input, codepoint, derivCode)
610##
# An override value is packed according to its kind:
#   arrayref -> the weights it holds, as a non-variable VCE;
#   defined  -> that value as primary weight, with the minimum level 2 and
#               level 3 weights and the code point as 4th weight;
#   undef    -> the derived CE(s) of the code point (a code point above
#               0x10FFFF is replaced by 0xFFFD first).
sub _pack_override ($$$) {
    my ($r, $u, $der) = @_;

    return pack(VCE_TEMPLATE, NON_VAR, @$r)
	if ref $r;
    return pack(VCE_TEMPLATE, NON_VAR, $r, Min2Wt, Min3Wt, $u)
	if defined $r;

    my $cp = $u > 0x10FFFF ? 0xFFFD : $u;
    return $der->($cp);
}
625
##
## list of VCE = getWt(JCPS)
##
## Returns the collation elements of one entry (a code point or a
## contraction in JCPS form), each passed through $self->varCE().
## Sources are tried in this order: the special U+FFFF/U+FFFE handling,
## $self->{mapping}, the XS table, Hangul syllable handling, overrideOut
## (for code points above 0x10FFFF), overrideCJK, and finally the derived
## collation elements.  Returns nothing for an undefined argument.
##
sub getWt
{
    my $self = shift;
    my $u    = shift;
    my $map  = $self->{mapping};
    my $der  = $self->{derivCode};
    my $out  = $self->{overrideOut};
    my $uXS  = $self->{__useXS}; ### XS only

    return if !defined $u;
    return $self->varCE($HighestVCE) if $u eq 0xFFFF && $self->{highestFFFF};
    return $self->varCE($minimalVCE) if $u eq 0xFFFE && $self->{minimalFFFE};
    # an out-of-range single code point becomes U+FFFD unless overrideOut
    # is set
    $u = 0xFFFD if $u !~ /;/ && 0x10FFFF < $u && !$out;

    my @ce;
    if ($map->{$u}) {
	@ce = @{ $map->{$u} }; # $u may be a contraction
### begin XS only ###
    } elsif ($uXS && _exists_simple($u)) {
	@ce = _fetch_simple($u);
### end XS only ###
    } elsif (Hangul_SIni <= $u && $u <= Hangul_SFin) {
	# precomposed Hangul syllable without an entry of its own
	my $hang = $self->{overrideHangul};
	if ($hang) {
	    # user callback supplies the weights
	    @ce = map _pack_override($_, $u, $der), $hang->($u);
	} elsif (!defined $hang) {
	    # explicit undef: treat the syllable like any unassigned code point
	    @ce = $der->($u);
	} else {
	    # false but defined (the default): decompose into jamo and
	    # collate those, honoring contractions between them
	    my $max  = $self->{maxlength};
	    my @decH = _decompHangul($u);

	    if (@decH == 2) {
		my $contract = join(CODE_SEP, @decH);
		@decH = ($contract) if $map->{$contract};
	    } else { # must be <@decH == 3>
		if ($max->{$decH[0]}) {
		    my $contract = join(CODE_SEP, @decH);
		    if ($map->{$contract}) {
			@decH = ($contract);
		    } else {
			$contract = join(CODE_SEP, @decH[0,1]);
			$map->{$contract} and @decH = ($contract, $decH[2]);
		    }
		    # even if V's ignorable, LT contraction is not supported.
		    # If such a situation were required, NFD should be used.
		}
		if (@decH == 3 && $max->{$decH[1]}) {
		    my $contract = join(CODE_SEP, @decH[1,2]);
		    $map->{$contract} and @decH = ($decH[0], $contract);
		}
	    }

	    # weights of each remaining piece: mapping, XS table, or derived
	    @ce = map({
		    $map->{$_} ? @{ $map->{$_} } :
		$uXS && _exists_simple($_) ? _fetch_simple($_) : ### XS only
		    $der->($_);
		} @decH);
	}
    } elsif ($out && 0x10FFFF < $u) {
	@ce = map _pack_override($_, $u, $der), $out->($u);
    } else {
	my $cjk  = $self->{overrideCJK};
	my $vers = $self->{UCA_Version};
	if ($cjk && _isUIdeo($u, $vers)) {
	    @ce = map _pack_override($_, $u, $der), $cjk->($u);
	} elsif ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) {
	    @ce = _uideoCE_8($u);
	} else {
	    @ce = $der->($u);
	}
    }
    return map $self->varCE($_), @ce;
}
702
703
704##
705## string sortkey = getSortKey(string arg)
706##
# string sortkey = $self->getSortKey(string)
#
# The string is preprocessed/normalized by process(), split into entries by
# splitEnt() and turned into weights by getWt(); mk_SortKey() then builds
# the key.  With 'hangul_terminator' set, a terminator CE is inserted
# wherever a Hangul syllable sequence ends.  A trailing level separator is
# added for 'identical', or for UCA_Version >= 26 at the maximum level; for
# 'identical' the code points of the processed string follow it.
sub getSortKey
{
    my ($self, $orig) = @_;
    my $str     = $self->process($orig);
    my $entries = $self->splitEnt($str); # an arrayref of JCPS
    my $vers    = $self->{UCA_Version};
    my $term    = $self->{hangul_terminator};
    my $lev     = $self->{level};
    my $iden    = $self->{identical};

    my @weights; # weight arrays
    if (! $term) {
	for my $jcps (@$entries) {
	    push @weights, $self->getWt($jcps);
	}
    }
    else {
	my $termCE = $self->varCE(pack(VCE_TEMPLATE, NON_VAR, $term, 0,0,0));
	my $prev = ''; # Hangul syllable type string of the previous entry
	for my $jcps (@$entries) {
	    # weird things like VL, TL-contraction are not considered!
	    my $cur = join '', map getHST($_, $vers), split /;/, $jcps;
	    my $boundary =
		   ($prev && !$cur) # hangul before non-hangul
		|| ($prev =~ /L\z/ && $cur =~ /^T/)
		|| ($prev =~ /V\z/ && $cur =~ /^L/)
		|| ($prev =~ /T\z/ && $cur =~ /^[LV]/);
	    push @weights, $termCE if $boundary;
	    $prev = $cur;
	    push @weights, $self->getWt($jcps);
	}
	push @weights, $termCE if $prev; # end at hangul
    }

    my $rkey = $self->mk_SortKey(\@weights); ### XS only

    if ($iden) {
	$rkey .= LEVEL_SEP;
	$rkey .= pack(TIE_TEMPLATE, unpack_U($str));
    }
    elsif ($vers >= 26 && $lev == MaxLevel) {
	$rkey .= LEVEL_SEP;
    }
    return $rkey;
}
749
750
751##
752## int compare = cmp(string a, string b)
753##
# Each comparison method applies the corresponding string operator to the
# sort keys of its two string arguments.
sub cmp {
    my ($self, $x, $y) = @_;
    return $self->getSortKey($x) cmp $self->getSortKey($y);
}
sub eq {
    my ($self, $x, $y) = @_;
    return $self->getSortKey($x) eq $self->getSortKey($y);
}
sub ne {
    my ($self, $x, $y) = @_;
    return $self->getSortKey($x) ne $self->getSortKey($y);
}
sub lt {
    my ($self, $x, $y) = @_;
    return $self->getSortKey($x) lt $self->getSortKey($y);
}
sub le {
    my ($self, $x, $y) = @_;
    return $self->getSortKey($x) le $self->getSortKey($y);
}
sub gt {
    my ($self, $x, $y) = @_;
    return $self->getSortKey($x) gt $self->getSortKey($y);
}
sub ge {
    my ($self, $x, $y) = @_;
    return $self->getSortKey($x) ge $self->getSortKey($y);
}
761
762##
763## list[strings] sorted = sort(list[strings] arg)
764##
# Sorts the given strings by their sort keys: each string is paired with
# its key, the pairs are ordered by key, and the strings are returned.
sub sort {
    my ($obj, @strings) = @_;
    my @keyed   = map { [ $obj->getSortKey($_), $_ ] } @strings;
    my @ordered = sort { $a->[0] cmp $b->[0] } @keyed;
    return map { $_->[1] } @ordered;
}
772
773
774##
775## bool _nonIgnorAtLevel(arrayref weights, int level)
776##
# True when at least one of the weights at levels MinLevel..$lv is nonzero;
# returns nothing when no weight array is given.
sub _nonIgnorAtLevel($$)
{
    my ($wt, $lv) = @_;
    return if ! defined $wt;

    for my $level (MinLevel .. $lv) {
	return TRUE if $wt->[$level - 1] != 0;
    }
    return FALSE;
}
784
785##
786## bool _eqArray(
787##    arrayref of arrayref[weights] source,
788##    arrayref of arrayref[weights] substr,
789##    int level)
790## * comparison of graphemes vs graphemes.
791##   @$source >= @$substr must be true (check it before call this);
792##
# Compares the first @$substr graphemes of $source with those of $substr:
# every pair must hold the same number of weight arrays, and all weights at
# the first $lev levels must be numerically equal.
# Returns 1 on equality, nothing otherwise.
sub _eqArray($$$)
{
    my ($source, $substr, $lev) = @_;

    for my $g (0 .. $#$substr) {
	my $src_g = $source->[$g];
	my $sub_g = $substr->[$g];

	# Do the $g'th graphemes have the same number of AV weights?
	return if @$src_g != @$sub_g;

	for my $w (0 .. $#$sub_g) {
	    for my $v (0 .. $lev - 1) {
		return if $src_g->[$w][$v] != $sub_g->[$w][$v];
	    }
	}
    }
    return 1;
}
811
##
## (int position, int length)
## int position = index(string, substring, position, [undoc'ed global])
##
## With "global" (only for the list context),
##  returns list of arrayref[position, length].
##
## Searches the string, starting at the given position (default 0, negative
## values are treated as 0), for a part that is collation-equal to the
## substring at the collator's level.  Without a match it returns the empty
## list in list context and NOMATCHPOS in scalar context.
## Croaks when 'preprocess' or a normalization routine is in effect.
##
sub index
{
    my $self = shift;
    $self->{preprocess} and
	croak "Don't use Preprocess with index(), match(), etc.";
    $self->{normCode} and
	croak "Don't use Normalization with index(), match(), etc.";

    my $str  = shift;
    my $len  = length($str);
    my $sub  = shift;
    my $subE = $self->splitEnt($sub);
    my $pos  = @_ ? shift : 0;
       $pos  = 0 if $pos < 0;
    my $glob = shift;

    my $lev  = $self->{level};
    # whether an ignorable following a variable is dropped
    my $v2i  = $self->{UCA_Version} >= 9 &&
		$self->{variable} ne 'non-ignorable';

    # a substring without any collation entry matches with length 0
    if (! @$subE) {
	my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
	return $glob
	    ? map([$_, 0], $temp..$len)
	    : wantarray ? ($temp,0) : $temp;
    }
    $len < $pos
	and return wantarray ? () : NOMATCHPOS;
    my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
    @$strE
	or return wantarray ? () : NOMATCHPOS;

    # @strWt/@subWt: lists of graphemes, each a list of weight arrays;
    # @iniPos/@finPos: start/end position of each grapheme of @strWt
    my(@strWt, @iniPos, @finPos, @subWt, @g_ret);

    # build the graphemes of the substring
    my $last_is_variable;
    for my $vwt (map $self->getWt($_), @$subE) {
	my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
	my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);

	# "Ignorable (L1, L2) after Variable" since track. v. 9
	if ($v2i) {
	    if ($var) {
		$last_is_variable = TRUE;
	    }
	    elsif (!$wt[0]) { # ignorable
		$to_be_pushed = FALSE if $last_is_variable;
	    }
	    else {
		$last_is_variable = FALSE;
	    }
	}

	# a non-variable CE with zero primary weight joins the previous
	# grapheme; any other CE to be kept starts a new grapheme
	if (@subWt && !$var && !$wt[0]) {
	    push @{ $subWt[-1] }, \@wt if $to_be_pushed;
	} elsif ($to_be_pushed) {
	    push @subWt, [ \@wt ];
	}
	# else ===> skipped
    }

    my $count = 0;
    my $end = @$strE - 1;

    $last_is_variable = FALSE; # reuse
    for (my $i = 0; $i <= $end; ) { # no $i++
	my $found_base = 0;

	# fetch a grapheme
	while ($i <= $end && $found_base == 0) {
	    for my $vwt ($self->getWt($strE->[$i][0])) {
		my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
		my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);

		# "Ignorable (L1, L2) after Variable" since track. v. 9
		if ($v2i) {
		    if ($var) {
			$last_is_variable = TRUE;
		    }
		    elsif (!$wt[0]) { # ignorable
			$to_be_pushed = FALSE if $last_is_variable;
		    }
		    else {
			$last_is_variable = FALSE;
		    }
		}

		if (@strWt && !$var && !$wt[0]) {
		    push @{ $strWt[-1] }, \@wt if $to_be_pushed;
		    $finPos[-1] = $strE->[$i][2];
		} elsif ($to_be_pushed) {
		    # When one entry yields more than one base, positions
		    # inside the entry are marked NOMATCHPOS so that a match
		    # cannot start or end in the middle of it.
		    push @strWt, [ \@wt ];
		    push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
		    $finPos[-1] = NOMATCHPOS if $found_base;
		    push @finPos, $strE->[$i][2];
		    $found_base++;
		}
		# else ===> no-op
	    }
	    $i++;
	}

	# try to match
	# (compare once more graphemes are buffered than the substring has,
	#  or as many when the end of the string has been reached)
	while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
	    if ($iniPos[0] != NOMATCHPOS &&
		    $finPos[$#subWt] != NOMATCHPOS &&
			_eqArray(\@strWt, \@subWt, $lev)) {
		my $temp = $iniPos[0] + $pos;

		if ($glob) {
		    push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
		    # drop the matched graphemes (the last one is removed by
		    # the shifts below) so that matches do not overlap
		    splice @strWt,  0, $#subWt;
		    splice @iniPos, 0, $#subWt;
		    splice @finPos, 0, $#subWt;
		}
		else {
		    return wantarray
			? ($temp, $finPos[$#subWt] - $iniPos[0])
			:  $temp;
		}
	    }
	    shift @strWt;
	    shift @iniPos;
	    shift @finPos;
	}
    }

    return $glob
	? @g_ret
	: wantarray ? () : NOMATCHPOS;
}
949
950##
951## scalarref to matching part = match(string, substring)
952##
# Returns the first part of the string ($_[0]) matching the substring
# ($_[1]): the matched text itself in list context, a reference to a copy
# of it in scalar context.  Returns nothing when there is no match.
sub match
{
    my $self = shift;
    my($pos, $len) = $self->index($_[0], $_[1])
	or return;

    # An lvalue ref \substr should be avoided,
    # since its value is affected by modification of its referent.
    my $matched = substr($_[0], $pos, $len);
    return wantarray ? $matched : \$matched;
}
966
967##
968## arrayref matching parts = gmatch(string, substring)
969##
# Returns all parts of the string matching the substring, as found by
# index() in its global mode starting at position 0.
sub gmatch
{
    my ($self, $str, $sub) = @_;
    my @found = $self->index($str, $sub, 0, 'g');
    return map { substr($str, $_->[0], $_->[1]) } @found;
}
978
979##
980## bool subst'ed = subst(string, substring, replace)
981##
# Replaces, in place, the first part of the string ($_[0]) that matches the
# substring ($_[1]).  The replacement ($_[2]) is either a string or a code
# reference that is called with the matched text and returns the new text.
# Returns TRUE when a replacement was made, FALSE otherwise.
sub subst
{
    my $self = shift;
    my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;

    my($pos, $len) = $self->index($_[0], $_[1])
	or return FALSE;

    my $replacement;
    if ($code) {
	my $mat = substr($_[0], $pos, $len);
	$replacement = $code->($mat);
    }
    else {
	$replacement = $_[2];
    }
    substr($_[0], $pos, $len, $replacement);
    return TRUE;
}
1000
1001##
1002## int count = gsubst(string, substring, replace)
1003##
# Replaces, in place, every part of the string ($_[0]) that matches the
# substring ($_[1]); the replacement ($_[2]) is a string or a code reference
# called with each matched text.  Returns the number of replacements.
sub gsubst
{
    my $self = shift;
    my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;

    my @found = $self->index($_[0], $_[1], 0, 'g');

    # Replacement is carried out from the end, so that the positions of
    # the matches still to be processed stay valid.
    for my $hit (reverse @found) {
	my($pos, $len) = ($hit->[0], $hit->[1]);
	my $replacement;
	if ($code) {
	    my $mat = substr($_[0], $pos, $len);
	    $replacement = $code->($mat);
	}
	else {
	    $replacement = $_[2];
	}
	substr($_[0], $pos, $len, $replacement);
    }
    return scalar @found;
}
1022
10231;
1024__END__
1025
1026=head1 NAME
1027
1028Unicode::Collate - Unicode Collation Algorithm
1029
1030=head1 SYNOPSIS
1031
1032  use Unicode::Collate;
1033
1034  #construct
1035  $Collator = Unicode::Collate->new(%tailoring);
1036
1037  #sort
1038  @sorted = $Collator->sort(@not_sorted);
1039
1040  #compare
1041  $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1042
1043B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
1044according to Perl's Unicode support. See L<perlunicode>,
1045L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
Otherwise, use C<preprocess> or decode the strings beforehand.
1047
1048=head1 DESCRIPTION
1049
1050This module is an implementation of Unicode Technical Standard #10
1051(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
1052
1053=head2 Constructor and Tailoring
1054
1055The C<new> method returns a collator object. If new() is called
1056with no parameters, the collator should do the default collation.
1057
1058   $Collator = Unicode::Collate->new(
1059      UCA_Version => $UCA_Version,
1060      alternate => $alternate, # alias for 'variable'
1061      backwards => $levelNumber, # or \@levelNumbers
1062      entry => $element,
1063      hangul_terminator => $term_primary_weight,
1064      highestFFFF => $bool,
1065      identical => $bool,
1066      ignoreName => qr/$ignoreName/,
1067      ignoreChar => qr/$ignoreChar/,
1068      ignore_level2 => $bool,
1069      katakana_before_hiragana => $bool,
1070      level => $collationLevel,
1071      minimalFFFE => $bool,
1072      normalization  => $normalization_form,
1073      overrideCJK => \&overrideCJK,
1074      overrideHangul => \&overrideHangul,
1075      preprocess => \&preprocess,
1076      rearrange => \@charList,
1077      rewrite => \&rewrite,
1078      suppress => \@charList,
1079      table => $filename,
1080      undefName => qr/$undefName/,
1081      undefChar => qr/$undefChar/,
1082      upper_before_lower => $bool,
1083      variable => $variable,
1084   );
1085
1086=over 4
1087
1088=item UCA_Version
1089
1090If the revision (previously "tracking version") number of UCA is given,
1091behavior of that revision is emulated on collating.
1092If omitted, the return value of C<UCA_Version()> is used.
1093
1094The following revisions are supported.  The default is 28.
1095
1096     UCA       Unicode Standard         DUCET (@version)
1097   -------------------------------------------------------
1098      8              3.1                3.0.1 (3.0.1d9)
1099      9     3.1 with Corrigendum 3      3.1.1 (3.1.1)
1100     11              4.0                4.0.0 (4.0.0)
1101     14             4.1.0               4.1.0 (4.1.0)
1102     16              5.0                5.0.0 (5.0.0)
1103     18             5.1.0               5.1.0 (5.1.0)
1104     20             5.2.0               5.2.0 (5.2.0)
1105     22             6.0.0               6.0.0 (6.0.0)
1106     24             6.1.0               6.1.0 (6.1.0)
1107     26             6.2.0               6.2.0 (6.2.0)
1108     28             6.3.0               6.3.0 (6.3.0)
1109
1110* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden
1111since C<UCA_Version> 22.
1112
1113* Out-of-range codepoints (greater than U+10FFFF) are not ignored,
1114and can be overridden since C<UCA_Version> 22.
1115
1116* Fully ignorable characters were ignored, and would not interrupt
1117contractions with C<UCA_Version> 9 and 11.
1118
1119* Treatment of ignorables after variables and some behaviors
1120were changed at C<UCA_Version> 9.
1121
1122* Characters regarded as CJK unified ideographs (cf. C<overrideCJK>)
1123depend on C<UCA_Version>.
1124
1125* Many hangul jamo are assigned at C<UCA_Version> 20, which will affect
1126C<hangul_terminator>.
1127
1128=item alternate
1129
1130-- see 3.2.2 Alternate Weighting, version 8 of UTS #10
1131
1132For backward compatibility, C<alternate> (old name) can be used
1133as an alias for C<variable>.
1134
1135=item backwards
1136
1137-- see 3.4 Backward Accents, UTS #10.
1138
1139     backwards => $levelNumber or \@levelNumbers
1140
1141Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1142If omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>),
1143forwards at all the levels.
1144
1145=item entry
1146
1147-- see 5 Tailoring; 3.6.1 File Format, UTS #10.
1148
1149If the same character (or a sequence of characters) exists
1150in the collation element table through C<table>,
1151mapping to collation elements is overridden.
1152If it does not exist, the mapping is defined additionally.
1153
1154    entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
11550063 0068 ; [.0E6A.0020.0002.0063] # ch
11560043 0068 ; [.0E6A.0020.0007.0043] # Ch
11570043 0048 ; [.0E6A.0020.0008.0043] # CH
1158006C 006C ; [.0F4C.0020.0002.006C] # ll
1159004C 006C ; [.0F4C.0020.0007.004C] # Ll
1160004C 004C ; [.0F4C.0020.0008.004C] # LL
116100F1      ; [.0F7B.0020.0002.00F1] # n-tilde
1162006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
116300D1      ; [.0F7B.0020.0008.00D1] # N-tilde
1164004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
1165ENTRY
1166
1167    entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
116800E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
116900C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1170ENTRY
1171
1172B<NOTE:> The code point in the UCA file format (before C<';'>)
1173B<must> be a Unicode code point (defined as hexadecimal),
1174but not a native code point.
1175So C<0063> must always denote C<U+0063>,
1176but not a character of C<"\x63">.
1177
1178Weighting may vary depending on collation element table.
1179So ensure the weights defined in C<entry> will be consistent with
1180those in the collation element table loaded via C<table>.
1181
1182In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1183and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1184(as a value between C<0E60> and C<0E6D>)
1185makes ordering as C<C E<lt> CH E<lt> D>.
1186Exactly speaking DUCET already has some characters between C<C> and C<D>:
1187C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1188C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1189and C<c-curl> (C<U+0255>) with C<0E69>.
1190Then primary weight C<0E6A> for C<CH> makes C<CH>
1191ordered between C<c-curl> and C<D>.
1192
1193=item hangul_terminator
1194
1195-- see 7.1.4 Trailing Weights, UTS #10.
1196
1197If a true value is given (non-zero but should be positive),
1198it will be added as a terminator primary weight to the end of
1199every standard Hangul syllable. Secondary and any higher weights
1200for terminator are set to zero.
1201If the value is false or C<hangul_terminator> key does not exist,
1202insertion of terminator weights will not be performed.
1203
1204Boundaries of Hangul syllables are determined
1205according to conjoining Jamo behavior in F<the Unicode Standard>
1206and F<HangulSyllableType.txt>.
1207
1208B<Implementation Note:>
1209(1) For expansion mapping (Unicode character mapped
1210to a sequence of collation elements), a terminator will not be added
1211between collation elements, even if Hangul syllable boundary exists there.
1212Addition of terminator is restricted to the next position
1213to the last collation element.
1214
1215(2) Non-conjoining Hangul letters
1216(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1217automatically terminated with a terminator primary weight.
1218These characters may need terminator included in a collation element
1219table beforehand.
1220
1221=item highestFFFF
1222
1223-- see 5.14 Collation Elements, UTS #35.
1224
1225If the parameter is made true, C<U+FFFF> has a highest primary weight.
1226When both C<$coll-E<gt>ge($str, "abc")> and
1227C<$coll-E<gt>le($str, "abc\x{FFFF}")> are true, it is expected that C<$str>
1228begins with C<"abc">, or another primary equivalent.
1229C<$str> may be C<"abcd">, C<"abc012">, but should not include C<U+FFFF>
1230such as C<"abc\x{FFFF}xyz">.
1231
1232C<$coll-E<gt>le($str, "abc\x{FFFF}")> works like C<$coll-E<gt>lt($str, "abd")>
1233almost, but the latter has a problem that you should know which letter is
1234next to C<c>. For a certain language where C<ch> is the next letter,
1235C<"abch"> is greater than C<"abc\x{FFFF}">, but less than C<"abd">.
1236
1237Note:
1238This is equivalent to C<(entry =E<gt> 'FFFF ; [.FFFE.0020.0005.FFFF]')>.
1239Any other character than C<U+FFFF> can be tailored by C<entry>.
1240
1241=item identical
1242
1243-- see A.3 Deterministic Comparison, UTS #10.
1244
1245By default, strings whose weights are equal should be equal,
1246even though their code points are not equal.
1247Completely ignorable characters are ignored.
1248
1249If the parameter is made true, a final, tie-breaking level is used.
1250If no difference of weights is found after the comparison through
1251all the levels specified by C<level>, the comparison with code points
1252will be performed.
1253For the tie-breaking comparison, the sort key has code points
1254of the original string appended.
1255Completely ignorable characters are not ignored.
1256
1257If C<preprocess> and/or C<normalization> is applied, the code points
1258of the string after them (in NFD by default) are used.
1259
1260=item ignoreChar
1261
1262=item ignoreName
1263
1264-- see 3.6.2 Variable Weighting, UTS #10.
1265
1266Makes the entry in the table completely ignorable;
1267i.e. as if the weights were zero at all levels.
1268
1269Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1270will be ignored. Through C<ignoreName>, any character whose name
1271(given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1272will be ignored.
1273
1274E.g. when 'a' and 'e' are ignorable,
1275'element' is equal to 'lament' (or 'lmnt').
1276
1277=item ignore_level2
1278
1279-- see 5.1 Parametric Tailoring, UTS #10.
1280
1281By default, case-sensitive comparison (that is level 3 difference)
1282won't ignore accents (that is level 2 difference).
1283
1284If the parameter is made true, accents (and other primary ignorable
1285characters) are ignored, even though cases are taken into account.
1286
1287B<NOTE>: C<level> should be 3 or greater.
1288
1289=item katakana_before_hiragana
1290
1291-- see 7.2 Tertiary Weight Table, UTS #10.
1292
1293By default, hiragana is before katakana.
1294If the parameter is made true, this is reversed.
1295
1296B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1297distinctions must occur in level 3, and their weights at level 3 must be
1298the same as those mentioned in 7.3.1, UTS #10.
1299If you define your collation elements which violate this requirement,
1300this parameter does not work validly.
1301
1302=item level
1303
1304-- see 4.3 Form Sort Key, UTS #10.
1305
1306Set the maximum level.
1307Any higher levels than the specified one are ignored.
1308
1309  Level 1: alphabetic ordering
1310  Level 2: diacritic ordering
1311  Level 3: case ordering
1312  Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1313
1314  ex.level => 2,
1315
1316If omitted, the maximum is the 4th.
1317
1318B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level.
1319But this module only uses weights within 0xFFFF.
1320When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted'
1321and 'shift-trimmed'), the level 4 may be unreliable.
1322
1323See also C<identical>.
1324
1325=item minimalFFFE
1326
1327-- see 5.14 Collation Elements, UTS #35.
1328
1329If the parameter is made true, C<U+FFFE> has a minimal primary weight.
1330The comparison between C<"$a1\x{FFFE}$a2"> and C<"$b1\x{FFFE}$b2">
1331first compares C<$a1> and C<$b1> at level 1, and
1332then C<$a2> and C<$b2> at level 1, as follows.
1333
1334        "ab\x{FFFE}a"
1335        "Ab\x{FFFE}a"
1336        "ab\x{FFFE}c"
1337        "Ab\x{FFFE}c"
1338        "ab\x{FFFE}xyz"
1339        "abc\x{FFFE}def"
1340        "abc\x{FFFE}xYz"
1341        "aBc\x{FFFE}xyz"
1342        "abcX\x{FFFE}def"
1343        "abcx\x{FFFE}xyz"
1344        "b\x{FFFE}aaa"
1345        "bbb\x{FFFE}a"
1346
1347Note:
1348This is equivalent to C<(entry =E<gt> 'FFFE ; [.0001.0020.0005.FFFE]')>.
1349Any other character than C<U+FFFE> can be tailored by C<entry>.
1350
1351=item normalization
1352
1353-- see 4.1 Normalize, UTS #10.
1354
1355If specified, strings are normalized before preparation of sort keys
1356(the normalization is executed after preprocess).
1357
1358A form name C<Unicode::Normalize::normalize()> accepts will be applied
1359as C<$normalization_form>.
1360Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1361See C<Unicode::Normalize::normalize()> for detail.
1362If omitted, C<'NFD'> is used.
1363
1364C<normalization> is performed after C<preprocess> (if defined).
1365
1366Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1367though they are not concerned with C<Unicode::Normalize::normalize()>.
1368
1369If C<undef> (not a string C<"undef">) is passed explicitly
1370as the value for this key,
1371no normalization is carried out (this may make tailoring easier
1372if normalization is not desired). Under C<(normalization =E<gt> undef)>,
1373only contiguous contractions are resolved;
1374e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1375C<A-cedilla-ring> would be primary equal to C<A>.
1376In this point,
1377C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1378B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1379
1380In the case of C<(normalization =E<gt> "prenormalized")>,
1381no normalization is performed, but
1382discontiguous contractions with combining characters are performed.
1383Therefore
1384C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1385B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1386If source strings are finely prenormalized,
1387C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1388
1389Except C<(normalization =E<gt> undef)>,
1390B<Unicode::Normalize> is required (see also B<CAVEAT>).
1391
1392=item overrideCJK
1393
1394-- see 7.1 Derived Collation Elements, UTS #10.
1395
1396By default, CJK unified ideographs are ordered in Unicode codepoint
1397order, but those in the CJK Unified Ideographs block are less than
1398those in the CJK Unified Ideographs Extension A etc.
1399
1400    In the CJK Unified Ideographs block:
1401    U+4E00..U+9FA5 if UCA_Version is 8, 9 or 11.
1402    U+4E00..U+9FBB if UCA_Version is 14 or 16.
1403    U+4E00..U+9FC3 if UCA_Version is 18.
1404    U+4E00..U+9FCB if UCA_Version is 20 or 22.
1405    U+4E00..U+9FCC if UCA_Version is 24 or later.
1406
1407    In the CJK Unified Ideographs Extension blocks:
1408    Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version.
1409    Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or later.
1410    Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or later.
1411
1412Through C<overrideCJK>, ordering of CJK unified ideographs (including
1413extensions) can be overridden.
1414
1415ex. CJK unified ideographs in the JIS code point order.
1416
1417  overrideCJK => sub {
1418      my $u = shift;             # get a Unicode codepoint
1419      my $b = pack('n', $u);     # to UTF-16BE
1420      my $s = your_unicode_to_sjis_converter($b); # convert
1421      my $n = unpack('n', $s);   # convert sjis to short
1422      [ $n, 0x20, 0x2, $u ];     # return the collation element
1423  },
1424
1425The return value may be an arrayref of 1st to 4th weights as shown
1426above. The return value may be an integer as the primary weight
1427as shown below.  If C<undef> is returned, the default derived
1428collation element will be used.
1429
1430  overrideCJK => sub {
1431      my $u = shift;             # get a Unicode codepoint
1432      my $b = pack('n', $u);     # to UTF-16BE
1433      my $s = your_unicode_to_sjis_converter($b); # convert
1434      my $n = unpack('n', $s);   # convert sjis to short
1435      return $n;                 # return the primary weight
1436  },
1437
1438The return value may be a list containing zero or more of
1439an arrayref, an integer, or C<undef>.
1440
1441ex. ignores all CJK unified ideographs.
1442
1443  overrideCJK => sub {()}, # CODEREF returning empty list
1444
1445   # where ->eq("Pe\x{4E00}rl", "Perl") is true
1446   # as U+4E00 is a CJK unified ideograph and to be ignorable.
1447
1448If a false value (including C<undef>) is passed, C<overrideCJK>
1449has no effect.
1450C<$Collator-E<gt>change(overrideCJK =E<gt> 0)> resets the old one.
1451
1452But assignment of weight for CJK unified ideographs
1453in C<table> or C<entry> is still valid.
1454If C<undef> is passed explicitly as the value for this key,
1455weights for CJK unified ideographs are treated as undefined.
1456However when C<UCA_Version> E<gt> 8, C<(overrideCJK =E<gt> undef)>
1457has no special meaning.
1458
1459B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>,
1460C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>,
1461C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified
1462ideographs. But they can't be overridden via C<overrideCJK> when you use
1463DUCET, as the table includes weights for them. C<table> or C<entry> has
1464priority over C<overrideCJK>.
1465
1466=item overrideHangul
1467
1468-- see 7.1 Derived Collation Elements, UTS #10.
1469
1470By default, Hangul syllables are decomposed into Hangul Jamo,
1471even if C<(normalization =E<gt> undef)>.
1472But the mapping of Hangul syllables may be overridden.
1473
1474This parameter works like C<overrideCJK>, so see there for examples.
1475
1476If you want to override the mapping of Hangul syllables,
1477NFD and NFKD are not appropriate, since NFD and NFKD will decompose
1478Hangul syllables before overriding. FCD may decompose Hangul syllables
1479as the case may be.
1480
1481If a false value (but not C<undef>) is passed, C<overrideHangul>
1482has no effect.
1483C<$Collator-E<gt>change(overrideHangul =E<gt> 0)> resets the old one.
1484
1485If C<undef> is passed explicitly as the value for this key,
1486weight for Hangul syllables is treated as undefined
1487without decomposition into Hangul Jamo.
1488But definition of weight for Hangul syllables
1489in C<table> or C<entry> is still valid.
1490
1491=item overrideOut
1492
1493-- see 7.1.1 Handling Ill-Formed Code Unit Sequences, UTS #10.
1494
1495Perl seems to allow out-of-range values (greater than 0x10FFFF).
1496By default, out-of-range values are replaced with C<U+FFFD>
1497(REPLACEMENT CHARACTER) when C<UCA_Version> E<gt>= 22,
1498or ignored when C<UCA_Version> E<lt>= 20.
1499
1500When C<UCA_Version> E<gt>= 22, the weights of out-of-range values
1501can be overridden. Though C<table> or C<entry> are available for them,
1502out-of-range values are too many.
1503
1504C<overrideOut> can perform it algorithmically.
1505This parameter works like C<overrideCJK>, so see there for examples.
1506
1507ex. ignores all out-of-range values.
1508
1509  overrideOut => sub {()}, # CODEREF returning empty list
1510
1511If a false value (including C<undef>) is passed, C<overrideOut>
1512has no effect.
1513C<$Collator-E<gt>change(overrideOut =E<gt> 0)> resets the old one.
1514
1515B<NOTE ABOUT U+FFFD:>
1516
1517UCA recommends that out-of-range values should not be ignored for security
1518reasons. Say, C<"pe\x{110000}rl"> should not be equal to C<"perl">.
1519However, C<U+FFFD> is wrongly mapped to a variable collation element
1520in DUCET for Unicode 6.0.0 to 6.2.0, that means out-of-range values will be
1521ignored when C<variable> isn't C<Non-ignorable>.
1522
1523The mapping of C<U+FFFD> is corrected in Unicode 6.3.0.
1524see L<http://www.unicode.org/reports/tr10/tr10-28.html#Trailing_Weights>
1525(7.1.4 Trailing Weights). Such a correction is reproduced by this.
1526
1527  overrideOut => sub { 0xFFFD }, # CODEREF returning a very large integer
1528
1529This workaround is unnecessary since Unicode 6.3.0.
1530
1531=item preprocess
1532
1533-- see 5.4 Preprocessing, UTS #10.
1534
1535If specified, the coderef is used to preprocess each string
1536before the formation of sort keys.
1537
1538ex. dropping English articles, such as "a" or "the".
1539Then, "the pen" is before "a pencil".
1540
1541     preprocess => sub {
1542           my $str = shift;
1543           $str =~ s/\b(?:an?|the)\s+//gi;
1544           return $str;
1545        },
1546
1547C<preprocess> is performed before C<normalization> (if defined).
1548
1549ex. decoding strings in a legacy encoding such as shift-jis:
1550
1551    $sjis_collator = Unicode::Collate->new(
1552        preprocess => \&your_shiftjis_to_unicode_decoder,
1553    );
1554    @result = $sjis_collator->sort(@shiftjis_strings);
1555
1556B<Note:> Strings returned from the coderef will be interpreted
1557according to Perl's Unicode support. See L<perlunicode>,
1558L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1559
1560=item rearrange
1561
1562-- see 3.5 Rearrangement, UTS #10.
1563
1564Characters that are not coded in logical order and to be rearranged.
1565If C<UCA_Version> is equal to or less than 11, default is:
1566
1567    rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1568
1569If you want to disallow any rearrangement, pass C<undef> or C<[]>
1570(a reference to empty list) as the value for this key.
1571
1572If C<UCA_Version> is equal to or greater than 14, default is C<[]>
1573(i.e. no rearrangement).
1574
1575B<According to the version 9 of UCA, this parameter shall not be used;
1576but it is not warned at present.>
1577
1578=item rewrite
1579
1580If specified, the coderef is used to rewrite lines in C<table> or C<entry>.
1581The coderef will get each line, and then should return a rewritten line
1582according to the UCA file format.
1583If the coderef returns an empty line, the line will be skipped.
1584
1585e.g. any primary ignorable characters into tertiary ignorable:
1586
1587    rewrite => sub {
1588        my $line = shift;
1589        $line =~ s/\[\.0000\..{4}\..{4}\./[.0000.0000.0000./g;
1590        return $line;
1591    },
1592
1593This example shows rewriting weights. C<rewrite> is allowed to
1594affect code points, weights, and the name.
1595
1596B<NOTE>: C<table> is available to use another table file;
1597preparing a modified table once would be more efficient than
1598rewriting lines on reading an unmodified table every time.
1599
1600=item suppress
1601
1602-- see suppress contractions in 5.14.11 Special-Purpose Commands,
1603UTS #35 (LDML).
1604
1605Contractions beginning with the specified characters are suppressed,
1606even if those contractions are defined in C<table>.
1607
1608An example for Russian and some languages using the Cyrillic script:
1609
1610    suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F],
1611
1612where 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE.
1613
1614B<NOTE>: Contractions via C<entry> will not be suppressed.
1615
1616=item table
1617
1618-- see 3.6 Default Unicode Collation Element Table, UTS #10.
1619
1620You can use another collation element table if desired.
1621
1622The table file should be located in the F<Unicode/Collate> directory
1623on C<@INC>. Say, if the filename is F<Foo.txt>,
1624the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
1625
1626By default, F<allkeys.txt> (as the filename of DUCET) is used.
1627If you will prepare your own table file, any name other than F<allkeys.txt>
1628may be better to avoid namespace conflict.
1629
1630B<NOTE>: When XSUB is used, the DUCET is compiled on building this
1631module, and it may save time at the run time.
1632Explicit saying C<(table =E<gt> 'allkeys.txt')>, or using another table,
1633or using C<ignoreChar>, C<ignoreName>, C<undefChar>, C<undefName> or
1634C<rewrite> will prevent this module from using the compiled DUCET.
1635
1636If C<undef> is passed explicitly as the value for this key,
1637no file is read (but you can define collation elements via C<entry>).
1638
1639A typical way to define a collation element table
1640without any file of table:
1641
1642   $onlyABC = Unicode::Collate->new(
1643       table => undef,
1644       entry => << 'ENTRIES',
16450061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
16460041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
16470062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
16480042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
16490063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
16500043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1651ENTRIES
1652    );
1653
1654If C<ignoreName> or C<undefName> is used, character names should be
1655specified as a comment (following C<#>) on each line.
1656
1657=item undefChar
1658
1659=item undefName
1660
1661-- see 6.3.4 Reducing the Repertoire, UTS #10.
1662
1663Undefines the collation element as if it were unassigned in the C<table>.
1664This reduces the size of the table.
1665If an unassigned character appears in the string to be collated,
1666the sort key is made from its codepoint
1667as a single-character collation element,
1668as it is greater than any other assigned collation elements
1669(in the codepoint order among the unassigned characters).
1670But, it'd be better to ignore characters
1671unfamiliar to you and maybe never used.
1672
1673Through C<undefChar>, any character matching C<qr/$undefChar/>
1674will be undefined. Through C<undefName>, any character whose name
1675(given in the C<table> file as a comment) matches C<qr/$undefName/>
1676will be undefined.
1677
1678ex. Collation weights for beyond-BMP characters are not stored in object:
1679
1680    undefChar => qr/[^\0-\x{fffd}]/,
1681
1682=item upper_before_lower
1683
1684-- see 6.6 Case Comparisons, UTS #10.
1685
1686By default, lowercase is before uppercase.
1687If the parameter is made true, this is reversed.
1688
1689B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1690distinctions must occur in level 3, and their weights at level 3 must be
1691the same as those mentioned in 7.3.1, UTS #10.
1692If you define your collation elements which differ from this requirement,
1693this parameter doesn't work validly.
1694
1695=item variable
1696
1697-- see 3.6.2 Variable Weighting, UTS #10.
1698
1699This key allows for variable weighting of variable collation elements,
1700which are marked with an ASTERISK in the table
1701(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>).
1702
1703   variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1704
1705These names are case-insensitive.
1706By default (if specification is omitted), 'shifted' is adopted.
1707
1708   'Blanked'        Variable elements are made ignorable at levels 1 through 3;
1709                    considered at the 4th level.
1710
1711   'Non-Ignorable'  Variable elements are not reset to ignorable.
1712
1713   'Shifted'        Variable elements are made ignorable at levels 1 through 3
1714                    their level 4 weight is replaced by the old level 1 weight.
1715                    Level 4 weight for Non-Variable elements is 0xFFFF.
1716
1717   'Shift-Trimmed'  Same as 'shifted', but all FFFF's at the 4th level
1718                    are trimmed.
1719
1720=back
1721
1722=head2 Methods for Collation
1723
1724=over 4
1725
1726=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1727
1728Sorts a list of strings.
1729
1730=item C<$result = $Collator-E<gt>cmp($a, $b)>
1731
1732Returns 1 (when C<$a> is greater than C<$b>)
1733or 0 (when C<$a> is equal to C<$b>)
1734or -1 (when C<$a> is less than C<$b>).
1735
1736=item C<$result = $Collator-E<gt>eq($a, $b)>
1737
1738=item C<$result = $Collator-E<gt>ne($a, $b)>
1739
1740=item C<$result = $Collator-E<gt>lt($a, $b)>
1741
1742=item C<$result = $Collator-E<gt>le($a, $b)>
1743
1744=item C<$result = $Collator-E<gt>gt($a, $b)>
1745
1746=item C<$result = $Collator-E<gt>ge($a, $b)>
1747
1748They work like the Perl operators of the same names.
1749
1750   eq : whether $a is equal to $b.
1751   ne : whether $a is not equal to $b.
1752   lt : whether $a is less than $b.
1753   le : whether $a is less than $b or equal to $b.
1754   gt : whether $a is greater than $b.
1755   ge : whether $a is greater than $b or equal to $b.
1756
1757=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1758
1759-- see 4.3 Form Sort Key, UTS #10.
1760
1761Returns a sort key.
1762
1763You compare the sort keys using a binary comparison
1764and get the result of the comparison of the strings using UCA.
1765
1766   $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1767
1768      is equivalent to
1769
1770   $Collator->cmp($a, $b)
1771
1772=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1773
1774Converts a sorting key into its representation form.
1775If C<UCA_Version> is 8, the output is slightly different.
1776
1777   use Unicode::Collate;
1778   my $c = Unicode::Collate->new();
1779   print $c->viewSortKey("Perl"),"\n";
1780
1781   # output:
1782   # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1783   #  Level 1               Level 2               Level 3               Level 4
1784
1785=back
1786
1787=head2 Methods for Searching
1788
1789The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1790like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1791but they do not interpret any pattern; they match only a literal substring.
1792
1793B<DISCLAIMER:> If the C<preprocess> or C<normalization> parameter is true
1794for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1795C<subst>, C<gsubst>) will croak, as the position and the length might
1796differ from those on the specified string.
1797
1798C<rearrange> and C<hangul_terminator> parameters are neglected.
1799C<katakana_before_hiragana> and C<upper_before_lower> don't affect
1800matching and searching, as it doesn't matter whether greater or less.
1801
1802=over 4
1803
1804=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1805
1806=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1807
1808If C<$substring> matches a part of C<$string>, returns
1809the position of the first occurrence of the matching part in scalar context;
1810in list context, returns a two-element list of
1811the position and the length of the matching part.
1812
1813If C<$substring> does not match any part of C<$string>,
1814returns C<-1> in scalar context and
1815an empty list in list context.
1816
1817e.g. when the content of C<$str> is C<"Ich mu>E<szlig>C< studieren Perl.">,
1818you say the following where C<$sub> is C<"M>E<uuml>C<SS">,
1819
1820  my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1821                                     # (normalization => undef) is REQUIRED.
1822  my $match;
1823  if (my($pos,$len) = $Collator->index($str, $sub)) {
1824      $match = substr($str, $pos, $len);
1825  }
1826
1827and get C<"mu>E<szlig>C<"> in C<$match>, since C<"mu>E<szlig>C<">
1828is primary equal to C<"M>E<uuml>C<SS">.
1829
1830=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1831
1832=item C<($match)   = $Collator-E<gt>match($string, $substring)>
1833
1834If C<$substring> matches a part of C<$string>, in scalar context, returns
1835B<a reference to> the first occurrence of the matching part
1836(C<$match_ref> is always true if matches,
1837since every reference is B<true>);
1838in list context, returns the first occurrence of the matching part.
1839
1840If C<$substring> does not match any part of C<$string>,
1841returns C<undef> in scalar context and
1842an empty list in list context.
1843
1844e.g.
1845
1846    if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1847	print "matches [$$match_ref].\n";
1848    } else {
1849	print "doesn't match.\n";
1850    }
1851
1852     or
1853
1854    if (($match) = $Collator->match($str, $sub)) { # list context
1855	print "matches [$match].\n";
1856    } else {
1857	print "doesn't match.\n";
1858    }
1859
1860=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1861
1862If C<$substring> matches a part of C<$string>, returns
1863all the matching parts (or matching count in scalar context).
1864
1865If C<$substring> does not match any part of C<$string>,
1866returns an empty list.
1867
1868=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1869
1870If C<$substring> matches a part of C<$string>,
1871the first occurrence of the matching part is replaced by C<$replacement>
1872(C<$string> is modified) and C<$count> (always equal to C<1>) is returned.
1873
1874C<$replacement> can be a C<CODEREF>,
1875taking the matching part as an argument,
1876and returning a string to replace the matching part
1877(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1878
1879=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1880
1881If C<$substring> matches a part of C<$string>,
1882all the occurrences of the matching part are replaced by C<$replacement>
1883(C<$string> is modified) and C<$count> is returned.
1884
1885C<$replacement> can be a C<CODEREF>,
1886taking the matching part as an argument,
1887and returning a string to replace the matching part
1888(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1889
1890e.g.
1891
1892  my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1893                                     # (normalization => undef) is REQUIRED.
1894  my $str = "Camel donkey zebra came\x{301}l CAMEL horse cam\0e\0l...";
1895  $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1896
1897  # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cam\0e\0l</b>...";
1898  # i.e., all the camels are made bold-faced.
1899
1900   Examples: levels and ignore_level2 - what does camel match?
1901  ---------------------------------------------------------------------------
1902   level  ignore_level2  |  camel  Camel  came\x{301}l  c-a-m-e-l  cam\0e\0l
1903  -----------------------|---------------------------------------------------
1904     1        false      |   yes    yes      yes          yes        yes
1905     2        false      |   yes    yes      no           yes        yes
1906     3        false      |   yes    no       no           yes        yes
1907     4        false      |   yes    no       no           no         yes
1908  -----------------------|---------------------------------------------------
1909     1        true       |   yes    yes      yes          yes        yes
1910     2        true       |   yes    yes      yes          yes        yes
1911     3        true       |   yes    no       yes          yes        yes
1912     4        true       |   yes    no       yes          no         yes
1913  ---------------------------------------------------------------------------
1914   note: if variable => non-ignorable, camel doesn't match c-a-m-e-l
1915         at any level.
1916
1917=back
1918
1919=head2 Other Methods
1920
1921=over 4
1922
1923=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1924
1925=item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)>
1926
1927Changes the values of the specified keys and, in list context, returns the old values of the changed keys.
1928
1929    $Collator = Unicode::Collate->new(level => 4);
1930
1931    $Collator->eq("perl", "PERL"); # false
1932
1933    %old = $Collator->change(level => 2); # returns (level => 4).
1934
1935    $Collator->eq("perl", "PERL"); # true
1936
1937    $Collator->change(%old); # returns (level => 2).
1938
1939    $Collator->eq("perl", "PERL"); # false
1940
1941Not all C<(key,value)>s are allowed to be changed.
1942See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1943
1944In scalar context, returns the modified collator itself
1945(it is the same object, B<not> a clone of the original).
1946
1947    $Collator->change(level => 2)->eq("perl", "PERL"); # true
1948
1949    $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1950
1951    $Collator->change(level => 4)->eq("perl", "PERL"); # false
1952
1953=item C<$version = $Collator-E<gt>version()>
1954
1955Returns the version number (a string) of the Unicode Standard
1956which the C<table> file used by the collator object is based on.
1957If the table does not include a version line (starting with C<@version>),
1958returns C<"unknown">.
1959
1960=item C<UCA_Version()>
1961
1962Returns the revision number of UTS #10 that this module consults,
1963which should correspond to the incorporated DUCET.
1964
1965=item C<Base_Unicode_Version()>
1966
1967Returns the version number of UTS #10 that this module consults,
1968which should correspond to the incorporated DUCET.
1969
1970=back
1971
1972=head1 EXPORT
1973
1974No method will be exported.
1975
1976=head1 INSTALL
1977
1978Although this module can be used without any C<table> file,
1979it is recommended, for ease of use, to install a table file
1980in the UCA format by copying it into the directory
1981<a place in @INC>/Unicode/Collate.
1982
1983The most preferable one is "The Default Unicode Collation Element Table"
1984(aka DUCET), available from the Unicode Consortium's website:
1985
1986   http://www.unicode.org/Public/UCA/
1987
1988   http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version)
1989
1990If DUCET is not installed, it is recommended to copy the file
1991from http://www.unicode.org/Public/UCA/latest/allkeys.txt
1992to <a place in @INC>/Unicode/Collate/allkeys.txt
1993manually.
1994
1995=head1 CAVEATS
1996
1997=over 4
1998
1999=item Normalization
2000
2001Use of the C<normalization> parameter requires the B<Unicode::Normalize>
2002module (see L<Unicode::Normalize>).
2003
2004If you do not need it (say, when you do not need to
2005handle any combining characters),
2006assign C<(normalization =E<gt> undef)> explicitly.
2007
2008-- see 6.5 Avoiding Normalization, UTS #10.
2009
2010=item Conformance Test
2011
2012The Conformance Test for the UCA is available
2013under L<http://www.unicode.org/Public/UCA/>.
2014
2015For F<CollationTest_SHIFTED.txt>,
2016a collator via C<Unicode::Collate-E<gt>new( )> should be used;
2017for F<CollationTest_NON_IGNORABLE.txt>, a collator via
2018C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
2019
2020If C<UCA_Version> is 26 or later, the C<identical> level is preferred;
2021C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and
2022C<Unicode::Collate-E<gt>new(identical =E<gt> 1,>
2023C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used.
2024
2025B<Unicode::Normalize is required to run the Conformance Test.>
2026
2027=back
2028
2029=head1 AUTHOR, COPYRIGHT AND LICENSE
2030
2031The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
2032<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2013,
2033SADAHIRO Tomoyuki. Japan. All rights reserved.
2034
2035This module is free software; you can redistribute it and/or
2036modify it under the same terms as Perl itself.
2037
2038The file Unicode/Collate/allkeys.txt was copied verbatim
2039from L<http://www.unicode.org/Public/UCA/6.3.0/allkeys.txt>.
2040For this file, Copyright (c) 2001-2012 Unicode, Inc.
2041Distributed under the Terms of Use in L<http://www.unicode.org/copyright.html>.
2042
2043=head1 SEE ALSO
2044
2045=over 4
2046
2047=item Unicode Collation Algorithm - UTS #10
2048
2049L<http://www.unicode.org/reports/tr10/>
2050
2051=item The Default Unicode Collation Element Table (DUCET)
2052
2053L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
2054
2055=item The conformance test for the UCA
2056
2057L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
2058
2059L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
2060
2061=item Hangul Syllable Type
2062
2063L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
2064
2065=item Unicode Normalization Forms - UAX #15
2066
2067L<http://www.unicode.org/reports/tr15/>
2068
2069=item Unicode Locale Data Markup Language (LDML) - UTS #35
2070
2071L<http://www.unicode.org/reports/tr35/>
2072
2073=back
2074
2075=cut
2076